## **Import Libraries**

In [51]:
#!pip install selenium
#!pip install webdriver-manager
#!pip install pyyaml ua-parser user-agents fake-useragent

################ WEB SCRAPING MODULES ############
from selenium import webdriver
from webdriver_manager.firefox import GeckoDriverManager
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.utils import ChromeType
from selenium.webdriver.common.by import By
import bs4
from fake_useragent import UserAgent
import requests
################ TIME MODLULES ###################
import time
from datetime import date 
import datetime
############## DATA MANIPULATION MODULES #########
import os
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords

## **Define web source**

In [52]:
link = 'https://icahn.mssm.edu/research/seaver/resources/autism-faqs'

## **Read 100++ questions list**

In [55]:
df = pd.read_excel('ASDquestions5.xlsx',engine='openpyxl')

df[link]=np.nan
df.head(25)

Unnamed: 0,Question,https://birchtreecenter.org/learn/autism,https://www.myautismteam.com/resources/autism-an-overview,https://www.autism.org.uk/advice-and-guidance/what-is-autism/asperger-syndrome,https://iancommunity.org/autism-faq,https://icahn.mssm.edu/research/seaver/resources/autism-faqs
0,What are the Autism Spectrum Disorders (ASD)?,ASD refers to a wide spectrum of neurodevelopm...,,,,
1,How common is autism?,According to a 2020 report commissioned by the...,It is estimated that in the United States 1.6 ...,,,
2,What causes autism? Can it be cured?,The causes of this complex disorder remain unc...,,,,
3,Is autism contagious?,,Autism is not a contagious condition. Autism i...,,,
4,Are rates of autism increasing?,,Estimates released by the Centers for Disease ...,,,
5,Is autism a new condition?,,"It is likely that autism has always existed, b...",,,
6,Is there a cure for autism?,,"There is no cure for autism. However, early in...",,,
7,How Is Autism Diagnosed?,,There is no one single conclusive test for aut...,,There is no blood test to diagnose autism spec...,
8,Is autism permanent?,,There is some controversy on the topic of whet...,,,
9,How Is Autism Treated?,,Treatment for autism depends largely on the in...,,,


## **Scrape QA pairs from website**

In [56]:
# Open webpage in a new window for scraping
#driver = webdriver.Chrome(ChromeDriverManager(chrome_type=ChromeType.GOOGLE).install())   #cannot fix in colab

driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get(link)



Current google-chrome version is 92.0.4515
Get LATEST driver version for 92.0.4515
Get LATEST driver version for 92.0.4515
Trying to download new driver from https://chromedriver.storage.googleapis.com/92.0.4515.107/chromedriver_linux64.zip
Driver has been saved in cache [/home/aceirus/.wdm/drivers/chromedriver/linux64/92.0.4515.107]


In [57]:
# Parse text in webpage
source = driver.page_source
soup = bs4.BeautifulSoup(source, 'html.parser')

In [90]:
# Search the questions mentioned in webpage
quesList = []
for ques in soup.find_all('strong'):
    if(ques.text[-1]=='?'):
        print(ques.text)
        quesList.append(ques.text)

What causes autism spectrum disorder (also known as ASD)?
Can vaccines cause autism?
What are the most common signs of autism?
How is autism diagnosed?
Can diet help children with autism spectrum disorder?
What autism research is being done?
What should I do if I think my child may have autism?
What are the treatments for autism?


In [104]:
# Search the answers for corresponding questions in quesList
ansList = []

#for ans in soup.find_all('div', {'class':'field-items'}):
for ans in soup.find_all('p'):

    if ans.find(string=re.compile("autism")):
        #if(ans.text[-1]=='?'):
            toSplit = str(ans)            
            ansSplit = toSplit.split("<br/>")
            ansClean = ansSplit[1].replace("</p>","")

            print(ansClean)
            print('*'*100)
            ansList.append(ansClean)

The experts at the Seaver Autism Center for Research and Treatment have compiled the following answers to frequently asked questions about autism and related conditions.
****************************************************************************************************
There is strong data indicating that genetics are the major cause of autism. However, not all genetic risk is inherited; some genetic changes leading to autism occur as mutations in the egg or sperm.
****************************************************************************************************
There are several epidemiological studies that have disproven the connection between autism and vaccines.
****************************************************************************************************
In toddlers, obvious symptoms may not be present, so it is often the absence of expected behavior that indicates a developmental problem. Autism has two core symptom domains as defined by the Diagnostic and Statistical Ma

## **Check which questions are similar**

In [92]:
stop_words = set(stopwords.words('english'))

def clean_text(sent):
    sent = sent.lower() # lowercase
    sent = re.sub(r'[^\w\s]', '', sent) # remove punctuations
    sent = re.sub('Autism Spectrum Disorder','ASD',sent) # Compress term
    sent = [w for w in sent.split() if not w.lower() in stop_words] # Remove stopwords
    sent = " ".join(sent)
    return sent

In [93]:
def jaccard_similarity(list1, list2):
    s1 = set(list1)
    s2 = set(list2)
    return float(len(s1.intersection(s2)) / len(s1.union(s2)))

In [94]:
# try to match websource questions with our own 100 questions list
for c1,i in enumerate(quesList):
    i2 = clean_text(i)
    list1 = i2.split()
    
    temp1 = 0.0
    temp2 = ''
    temp3 = ''
    temp4 = 0
    
    for c2,j in enumerate(df['Question']):
        j2 = clean_text(j)
        list2 = j2.split()
            
        sim = jaccard_similarity(list1, list2)
        if(sim>temp1):
            temp1 = sim
            temp2 = j
            temp3 = i
            temp4 = c2
            
    if(temp1>=0.3):  # sim threshold
        print('Website --> ',temp3,'(Index {})'.format(c1))
        print('100 questions list --> ',temp2,'(Index {})'.format(temp4))
        print('similarity:', temp1)
        print('*'*100)

Website -->  What causes autism spectrum disorder (also known as ASD)? (Index 0)
100 questions list -->  What are the Autism Spectrum Disorders (ASD)? (Index 0)
similarity: 0.375
****************************************************************************************************
Website -->  Can vaccines cause autism? (Index 1)
100 questions list -->  Does Thimerosal cause autism? (Index 48)
similarity: 0.5
****************************************************************************************************
Website -->  What are the most common signs of autism? (Index 2)
100 questions list -->  How common is autism? (Index 1)
similarity: 0.6666666666666666
****************************************************************************************************
Website -->  How is autism diagnosed? (Index 3)
100 questions list -->  How Is Autism Diagnosed? (Index 7)
similarity: 1.0
****************************************************************************************************
Website -->

In [95]:
for i in range (0,len(quesList)):
    print("Index #{0:d}: {1:s}".format(i,quesList[i]))

Index #0: What causes autism spectrum disorder (also known as ASD)?
Index #1: Can vaccines cause autism?
Index #2: What are the most common signs of autism?
Index #3: How is autism diagnosed?
Index #4: Can diet help children with autism spectrum disorder?
Index #5: What autism research is being done?
Index #6: What should I do if I think my child may have autism?
Index #7: What are the treatments for autism?


In [96]:
for i in range (0,len(ansList)):
    print("Index #{0:d}: {1:s}".format(i,ansList[i]))

Index #0: The experts at the Seaver Autism Center for Research and Treatment have compiled the following answers to frequently asked questions about autism and related conditions.
Index #1: There is strong data indicating that genetics are the major cause of autism. However, not all genetic risk is inherited; some genetic changes leading to autism occur as mutations in the egg or sperm.
Index #2: There are several epidemiological studies that have disproven the connection between autism and vaccines.
Index #3: In toddlers, obvious symptoms may not be present, so it is often the absence of expected behavior that indicates a developmental problem. Autism has two core symptom domains as defined by the Diagnostic and Statistical Manual of Mental Disorders, 5th Edition (DSM-5): The first is deficits in social communication and social interaction, and the second is repetitive and restricted behaviors and interests. The ways in which these symptoms manifest themselves vary from one child to t

In [97]:
# add websource answer to matched question in existing dataframe
df[link].loc[7]=ansList[4] #3
df[link].loc[19]=ansList[6] #5
df[link].loc[22]=ansList[7] #6

In [98]:
df[link].loc[7]

'While there is no medical test for autism, an autism spectrum diagnosis is made based on observed behavior, developmental history, and autism-specific assessment tools. The “gold standard” assessment includes the Autism Diagnostic Observation Schedule, Second Edition (ADOS-2) and the Autism Diagnostic Interview - Revised (ADI-R). These tools, along with clinical judgment, are used to make a DSM-5 diagnosis of autism spectrum disorder. Early diagnosis is important because it can lead to earlier treatment, which research has linked to more positive outcomes.'

In [109]:
# update with new valid questions list
quesListUpd = [quesList[0],
               quesList[1],
               quesList[2],
               quesList[4],
               quesList[7]]

quesListUpd

['What causes autism spectrum disorder (also known as ASD)?',
 'Can vaccines cause autism?',
 'What are the most common signs of autism?',
 'Can diet help children with autism spectrum disorder?',
 'What are the treatments for autism?']

In [110]:
# manually select answers to updated questions list
ansListUpd = [ansList[1],
              ansList[2],
              ansList[3],
              ansList[5],
              ansList[8]]

ansListUpd

['There is strong data indicating that genetics are the major cause of autism. However, not all genetic risk is inherited; some genetic changes leading to autism occur as mutations in the egg or sperm.',
 'There are several epidemiological studies that have disproven the connection between autism and vaccines.',
 'In toddlers, obvious symptoms may not be present, so it is often the absence of expected behavior that indicates a developmental problem. Autism has two core symptom domains as defined by the Diagnostic and Statistical Manual of Mental Disorders, 5th Edition (DSM-5): The first is deficits in social communication and social interaction, and the second is repetitive and restricted behaviors and interests. The ways in which these symptoms manifest themselves vary from one child to the next. Common symptoms to look for include: fleeting eye contact, limited gesturing such as pointing or waving, limited pretend play, difficulty reading nonverbal cues, odd language such as scripted

In [111]:
# Create new dataframe with QA pairs
df2 = pd.DataFrame(zip(quesListUpd,ansListUpd),columns=['Question',link])
df2

Unnamed: 0,Question,https://icahn.mssm.edu/research/seaver/resources/autism-faqs
0,What causes autism spectrum disorder (also kno...,There is strong data indicating that genetics ...
1,Can vaccines cause autism?,There are several epidemiological studies that...
2,What are the most common signs of autism?,"In toddlers, obvious symptoms may not be prese..."
3,Can diet help children with autism spectrum di...,Families have reported improvements in behavio...
4,What are the treatments for autism?,The primary and first-line treatments include ...


In [113]:
# Concatenate existing and new dataframes
df3 = pd.concat([df,df2],axis=0)
df3 = df3.sort_values(by=list(df3.columns[1:])).reset_index(drop=True)
df3.head(40)

Unnamed: 0,Question,https://birchtreecenter.org/learn/autism,https://www.myautismteam.com/resources/autism-an-overview,https://www.autism.org.uk/advice-and-guidance/what-is-autism/asperger-syndrome,https://iancommunity.org/autism-faq,https://icahn.mssm.edu/research/seaver/resources/autism-faqs
0,What are the Autism Spectrum Disorders (ASD)?,ASD refers to a wide spectrum of neurodevelopm...,,,,
1,How common is autism?,According to a 2020 report commissioned by the...,It is estimated that in the United States 1.6 ...,,,
2,What causes autism? Can it be cured?,The causes of this complex disorder remain unc...,,,,
3,Is autism contagious?,,Autism is not a contagious condition. Autism i...,,,
4,Are rates of autism increasing?,,Estimates released by the Centers for Disease ...,,,
5,Is autism a new condition?,,"It is likely that autism has always existed, b...",,,
6,Is there a cure for autism?,,"There is no cure for autism. However, early in...",,,
7,How Is Autism Diagnosed?,,There is no one single conclusive test for aut...,,There is no blood test to diagnose autism spec...,"While there is no medical test for autism, an ..."
8,Is autism permanent?,,There is some controversy on the topic of whet...,,,
9,How Is Autism Treated?,,Treatment for autism depends largely on the in...,,,


## **Save Output**

In [114]:
df3.to_excel('ASDquestions6.xlsx',index=False)