## **Import Libraries**

In [1]:
#!pip install selenium
#!pip install webdriver-manager
#!pip install pyyaml ua-parser user-agents fake-useragent

################ WEB SCRAPING MODULES ############
from selenium import webdriver
from webdriver_manager.firefox import GeckoDriverManager
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.utils import ChromeType
from selenium.webdriver.common.by import By
import bs4
from fake_useragent import UserAgent
import requests
################ TIME MODLULES ###################
import time
from datetime import date 
import datetime
############## DATA MANIPULATION MODULES #########
import os
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords

## **Define web source**

In [2]:
link = 'https://www.amaze.org.au/understand-autism/about-autism/'

## **Read 100++ questions list**

In [119]:
df = pd.read_excel('ASDquestions8.xlsx',engine='openpyxl')

df[link]=np.nan
df.head(25)

Unnamed: 0,Question,https://birchtreecenter.org/learn/autism,https://www.myautismteam.com/resources/autism-an-overview,https://www.autism.org.uk/advice-and-guidance/what-is-autism/asperger-syndrome,https://iancommunity.org/autism-faq,https://icahn.mssm.edu/research/seaver/resources/autism-faqs,https://otsimo.com/en/frequently-asked-questions-autism/,https://rockmelon.com/about-autism/,https://www.amaze.org.au/understand-autism/about-autism/
0,What are the Autism Spectrum Disorders (ASD)?,ASD refers to a wide spectrum of neurodevelopm...,,,,,,,
1,How common is autism?,According to a 2020 report commissioned by the...,It is estimated that in the United States 1.6 ...,,,,According to the U.S. Centers for Disease Cont...,Recent studies have found that 1 in 59 childre...,
2,What causes autism? Can it be cured?,The causes of this complex disorder remain unc...,,,,,You may think that something you did or ate or...,,
3,Is autism contagious?,,Autism is not a contagious condition. Autism i...,,,,,,
4,Are rates of autism increasing?,,Estimates released by the Centers for Disease ...,,,,,,
5,Is autism a new condition?,,"It is likely that autism has always existed, b...",,,,,,
6,Is there a cure for autism?,,"There is no cure for autism. However, early in...",,,,,,
7,How Is Autism Diagnosed?,,There is no one single conclusive test for aut...,,There is no blood test to diagnose autism spec...,"While there is no medical test for autism, an ...",,There are a number of different therapies avai...,
8,Is autism permanent?,,There is some controversy on the topic of whet...,,,,,,
9,How Is Autism Treated?,,Treatment for autism depends largely on the in...,,,,,,


## **Scrape QA pairs from website**

In [4]:
# Open webpage in a new window for scraping
#driver = webdriver.Chrome(ChromeDriverManager(chrome_type=ChromeType.GOOGLE).install())   #cannot fix in colab

driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get(link)



Current google-chrome version is 92.0.4515
Get LATEST driver version for 92.0.4515
Driver [/home/aceirus/.wdm/drivers/chromedriver/linux64/92.0.4515.107/chromedriver] found in cache


In [120]:
# Parse text in webpage
source = driver.page_source
soup = bs4.BeautifulSoup(source, 'html.parser')

In [121]:
# Search the questions mentioned in webpage
quesList = []
#for ques in soup.find_all('h3'):
for ques in soup.find_all('button',{'class':'btn btn-unstyled'}):

    #if(ques.text[-1]=='?'):
        toSplit = str(ques.text)
        quesSplit = toSplit.split()
        quesJoin = ' '.join(quesSplit)
        
        print(quesJoin)
        quesList.append(quesJoin)

What causes autism?
Secondary conditions and difficulties
Autistic people with complex support needs
Do autistic people look different?
Talking about autism


In [122]:
# Search the answers for corresponding questions in quesList
ansList = []

#for ans in soup.find_all('div', {'class':'field-items'}):
for ans in soup.find_all('div',{'class':'card-body typography'}):

    #if ans.find(string=re.compile("autism")):
            toSplit = str(ans.text)            
            #ansSplit = toSplit.split("<br/>")
            ansClean = toSplit.replace("\n","")
            

            print(ansClean)
            print('*'*100)
            ansList.append(ansClean)

There is no known cause of autism.Much research is being done to try to find out more. Right now, evidence suggests that autism results from changes to the development and growth of the brain. These changes may be caused by a combination of factors, including genetics.Autism is part of who a person is. It isn’t caused by parenting or social circumstances. Autism is also not caused by vaccination or other medical treatment. 
****************************************************************************************************
Autism may be present with other conditions. This can affect people in different ways. Some other conditions autistic people commonly experience are:speech and language challengesintellectual disabilitysleep problemsattention problemshyperactivityepilepsyanxiety and depressionchallenges with fine and gross motor skills.There are other conditions that are associated with autism, including Fragile X Syndrome, Tuberous Sclerosis and other genetic disorders.
************

## **Check which questions are similar**

In [123]:
stop_words = set(stopwords.words('english'))

def clean_text(sent):
    sent = sent.lower() # lowercase
    sent = re.sub(r'[^\w\s]', '', sent) # remove punctuations
    sent = re.sub('Autism Spectrum Disorder','ASD',sent) # Compress term
    sent = [w for w in sent.split() if not w.lower() in stop_words] # Remove stopwords
    sent = " ".join(sent)
    return sent

In [124]:
def jaccard_similarity(list1, list2):
    s1 = set(list1)
    s2 = set(list2)
    return float(len(s1.intersection(s2)) / len(s1.union(s2)))

In [125]:
# try to match websource questions with our own 100 questions list
for c1,i in enumerate(quesList):
    i2 = clean_text(i)
    list1 = i2.split()
    
    temp1 = 0.0
    temp2 = ''
    temp3 = ''
    temp4 = 0
    
    for c2,j in enumerate(df['Question']):
        j2 = clean_text(j)
        list2 = j2.split()
            
        sim = jaccard_similarity(list1, list2)
        if(sim>temp1):
            temp1 = sim
            temp2 = j
            temp3 = i
            temp4 = c2
            
    if(temp1>=0.3):  # sim threshold
        print('Website --> ',temp3,'(Index {})'.format(c1))
        print('100 questions list --> ',temp2,'(Index {})'.format(temp4))
        print('similarity:', temp1)
        print('*'*100)

Website -->  What causes autism? (Index 0)
100 questions list -->  What causes autism? Can it be cured? (Index 2)
similarity: 0.6666666666666666
****************************************************************************************************
Website -->  Do autistic people look different? (Index 3)
100 questions list -->  Where do Autistic People Usually Live? (Index 135)
similarity: 0.3333333333333333
****************************************************************************************************
Website -->  Talking about autism (Index 4)
100 questions list -->  What is Autism? (Index 30)
similarity: 0.5
****************************************************************************************************


In [126]:
for i in range (0,len(quesList)):
    print("Index #{0:d}: {1:s}".format(i,quesList[i]))

Index #0: What causes autism?
Index #1: Secondary conditions and difficulties
Index #2: Autistic people with complex support needs
Index #3: Do autistic people look different?
Index #4: Talking about autism


In [127]:
for i in range (0,len(ansList)):
    print("Index #{0:d}: {1:s}".format(i,ansList[i]))

Index #0: There is no known cause of autism.Much research is being done to try to find out more. Right now, evidence suggests that autism results from changes to the development and growth of the brain. These changes may be caused by a combination of factors, including genetics.Autism is part of who a person is. It isn’t caused by parenting or social circumstances. Autism is also not caused by vaccination or other medical treatment. 
Index #1: Autism may be present with other conditions. This can affect people in different ways. Some other conditions autistic people commonly experience are:speech and language challengesintellectual disabilitysleep problemsattention problemshyperactivityepilepsyanxiety and depressionchallenges with fine and gross motor skills.There are other conditions that are associated with autism, including Fragile X Syndrome, Tuberous Sclerosis and other genetic disorders.
Index #2: Autistic people with complex support needs are people:Whose support needs span mult

In [130]:
# add websource answer to matched question in existing dataframe
df[link].loc[2]=ansList[0]

In [131]:
df[link].loc[2]

'There is no known cause of autism.Much research is being done to try to find out more. Right now, evidence suggests that autism results from changes to the development and growth of the brain. These changes may be caused by a combination of factors, including genetics.Autism is part of who a person is. It isn’t caused by parenting or social circumstances. Autism is also not caused by vaccination or other medical treatment.\xa0'

In [132]:
# update with new valid questions list
quesListUpd = [quesList[1],
               quesList[2],
               quesList[3],
               quesList[4]]

quesListUpd

['Secondary conditions and difficulties',
 'Autistic people with complex support needs',
 'Do autistic people look different?',
 'Talking about autism']

In [133]:
# manually select answers to updated questions list
ansListUpd = [ansList[1],
              ansList[2],
              ansList[3],
              ansList[4]]

ansListUpd

['Autism may be present with other conditions. This can affect people in different ways. Some other conditions autistic people commonly experience are:speech and language challengesintellectual disabilitysleep problemsattention problemshyperactivityepilepsyanxiety and depressionchallenges with fine and gross motor skills.There are other conditions that are associated with autism, including Fragile X Syndrome, Tuberous Sclerosis and other genetic disorders.',
 'Autistic people with complex support needs are people:Whose support needs span multiple domains (i.e. health, mental health, justice etc); and/orWho have high levels of need in one or more areas; and/orWho are more vulnerable or at a great risk of vulnerability than the broader autistic communityComplex support needs may be indicated by the presence of:multiple disabilitiescoexisting mental health issuespersistent and or chronic health conditionsintellectual disabilityspecific learning disabilityexperiences of trauma or neglectbe

In [134]:
# Create new dataframe with QA pairs
df2 = pd.DataFrame(zip(quesListUpd,ansListUpd),columns=['Question',link])
df2

Unnamed: 0,Question,https://www.amaze.org.au/understand-autism/about-autism/
0,Secondary conditions and difficulties,Autism may be present with other conditions. T...
1,Autistic people with complex support needs,Autistic people with complex support needs are...
2,Do autistic people look different?,Autism is a neurodevelopmental (meaning relate...
3,Talking about autism,"The language we useis powerful, because it hel..."


In [135]:
# Concatenate existing and new dataframes
df3 = pd.concat([df,df2],axis=0)
df3 = df3.sort_values(by=list(df3.columns[1:])).reset_index(drop=True)
df3.head(40)

Unnamed: 0,Question,https://birchtreecenter.org/learn/autism,https://www.myautismteam.com/resources/autism-an-overview,https://www.autism.org.uk/advice-and-guidance/what-is-autism/asperger-syndrome,https://iancommunity.org/autism-faq,https://icahn.mssm.edu/research/seaver/resources/autism-faqs,https://otsimo.com/en/frequently-asked-questions-autism/,https://rockmelon.com/about-autism/,https://www.amaze.org.au/understand-autism/about-autism/
0,What are the Autism Spectrum Disorders (ASD)?,ASD refers to a wide spectrum of neurodevelopm...,,,,,,,
1,How common is autism?,According to a 2020 report commissioned by the...,It is estimated that in the United States 1.6 ...,,,,According to the U.S. Centers for Disease Cont...,Recent studies have found that 1 in 59 childre...,
2,What causes autism? Can it be cured?,The causes of this complex disorder remain unc...,,,,,You may think that something you did or ate or...,,There is no known cause of autism.Much researc...
3,Is autism contagious?,,Autism is not a contagious condition. Autism i...,,,,,,
4,Are rates of autism increasing?,,Estimates released by the Centers for Disease ...,,,,,,
5,Is autism a new condition?,,"It is likely that autism has always existed, b...",,,,,,
6,Is there a cure for autism?,,"There is no cure for autism. However, early in...",,,,,,
7,How Is Autism Diagnosed?,,There is no one single conclusive test for aut...,,There is no blood test to diagnose autism spec...,"While there is no medical test for autism, an ...",,There are a number of different therapies avai...,
8,Is autism permanent?,,There is some controversy on the topic of whet...,,,,,,
9,How Is Autism Treated?,,Treatment for autism depends largely on the in...,,,,,,


## **Save Output**

In [136]:
df3.to_excel('ASDquestions9.xlsx',index=False)