## **Import Libraries**

In [1]:
#!pip install selenium
#!pip install webdriver-manager
#!pip install pyyaml ua-parser user-agents fake-useragent

################ WEB SCRAPING MODULES ############
from selenium import webdriver
from webdriver_manager.firefox import GeckoDriverManager
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.utils import ChromeType
from selenium.webdriver.common.by import By
import bs4
from fake_useragent import UserAgent
import requests
################ TIME MODULES ####################
import time
from datetime import date 
import datetime
############## DATA MANIPULATION MODULES #########
import os
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords

## **Define web source**

In [2]:
# URL of the FAQ page scraped in this notebook. The same string doubles as the
# name of the dataframe column that holds the answers taken from this page.
link = 'https://otsimo.com/en/frequently-asked-questions-autism/'

## **Read 100++ questions list**

In [3]:
# Load the existing question sheet and attach an all-NaN column, named after
# the source URL, that later cells fill with the scraped answers.
df = (
    pd.read_excel('ASDquestions6.xlsx', engine='openpyxl')
      .assign(**{link: np.nan})
)

df.head(25)

Unnamed: 0,Question,https://birchtreecenter.org/learn/autism,https://www.myautismteam.com/resources/autism-an-overview,https://www.autism.org.uk/advice-and-guidance/what-is-autism/asperger-syndrome,https://iancommunity.org/autism-faq,https://icahn.mssm.edu/research/seaver/resources/autism-faqs,https://otsimo.com/en/frequently-asked-questions-autism/
0,What are the Autism Spectrum Disorders (ASD)?,ASD refers to a wide spectrum of neurodevelopm...,,,,,
1,How common is autism?,According to a 2020 report commissioned by the...,It is estimated that in the United States 1.6 ...,,,,
2,What causes autism? Can it be cured?,The causes of this complex disorder remain unc...,,,,,
3,Is autism contagious?,,Autism is not a contagious condition. Autism i...,,,,
4,Are rates of autism increasing?,,Estimates released by the Centers for Disease ...,,,,
5,Is autism a new condition?,,"It is likely that autism has always existed, b...",,,,
6,Is there a cure for autism?,,"There is no cure for autism. However, early in...",,,,
7,How Is Autism Diagnosed?,,There is no one single conclusive test for aut...,,There is no blood test to diagnose autism spec...,"While there is no medical test for autism, an ...",
8,Is autism permanent?,,There is some controversy on the topic of whet...,,,,
9,How Is Autism Treated?,,Treatment for autism depends largely on the in...,,,,


## **Scrape QA pairs from website**

In [4]:
# Open the web page in a new Chrome window for scraping.
# webdriver-manager resolves (and caches) a chromedriver binary for the
# locally installed browser and returns its path.
# Alternative kept for reference (did not work in Colab):
#   ChromeDriverManager(chrome_type=ChromeType.GOOGLE).install()
chromedriver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(chromedriver_path)
driver.get(link)



Current google-chrome version is 92.0.4515
Get LATEST driver version for 92.0.4515
Driver [/home/aceirus/.wdm/drivers/chromedriver/linux64/92.0.4515.107/chromedriver] found in cache


In [5]:
# Parse text in webpage.
# The rendered HTML is the only thing needed from the browser, so the session
# is shut down as soon as it has been captured (even if reading it fails);
# otherwise the Chrome window and the chromedriver process stay alive after
# the notebook finishes.
try:
    source = driver.page_source
finally:
    driver.quit()

soup = bs4.BeautifulSoup(source, 'html.parser')

In [8]:
# Search the questions mentioned in webpage: every <h3> heading whose text
# ends with a question mark is treated as a question.
quesList = []
for ques in soup.find_all('h3'):
    # endswith() is safe on an empty heading, whereas indexing text[-1]
    # raises IndexError when the <h3> has no text.
    if ques.text.endswith('?'):
        print(ques.text)
        quesList.append(ques.text)

What is Autism?
What Causes Autism?
How Common is Autism?
What is “the Spectrum”?
Does My Child Have Autism?
Why My Child Has Autism?
What if I Have Autism?
What to do After Diagnosis of Autism?
Can My Child with Autism Attend School?


In [29]:
# Search the answers for corresponding questions in quesList.
# Every <p> element for which find(string=...) reports a match against the
# case-sensitive pattern "autism" is kept, in document order. Later cells pick
# answers out of ansList by position, so the filter must stay exactly as is.
autism_pattern = re.compile("autism")
ansList = []

for para in soup.find_all('p'):
    if not para.find(string=autism_pattern):
        continue

    para_text = para.text
    print(para_text)
    print('*'*100)
    ansList.append(para_text)

For parents, whose child just got diagnosed with autism, it might be hard to deal with the overwhelming feeling the questions in their minds bring. Or the diagnosis can be an explanation to some adults who have been dealing with something they did not know. Either way, with this information come a lot of questions. The best way to understand what this diagnosis is to educate yourself. Here are answers to some frequently asked questions.
****************************************************************************************************
Autism spectrum disorder is an umbrella term used for various brain development disorders. They are characterized and thus categorized by difficulties in social interaction, verbal and nonverbal communication, and repetitive behaviors. Autistic disorder, Rett Syndrome, childhood disintegrative disorder, PDD-NOS (pervasive developmental disorder-not otherwise specified) and Asperger Syndrome can be listed under this umbrella. There are many characteristic

## **Check which questions are similar**

In [30]:
# English stopword list from NLTK, held as a set for fast membership tests.
stop_words = {word for word in stopwords.words('english')}

def clean_text(sent, stopword_set=None):
    """Normalise a question so it can be compared token by token.

    Steps, in order: lowercase, strip punctuation, collapse the phrase
    "autism spectrum disorder" to "asd", drop stopwords.

    Parameters
    ----------
    sent : str
        Raw question text.
    stopword_set : collection of str, optional
        Words to remove. When omitted, the notebook-level ``stop_words`` set
        is used.

    Returns
    -------
    str
        The remaining lowercase tokens joined by single spaces (may be empty).
    """
    if stopword_set is None:
        stopword_set = stop_words

    sent = sent.lower()  # lowercase
    sent = re.sub(r'[^\w\s]', '', sent)  # remove punctuation
    # The text is already lowercase at this point, so the phrase has to be
    # matched in lowercase; a mixed-case pattern would never match. The
    # replacement is lowercase as well so it equals a literal "ASD" token
    # (which the lowercasing above turns into "asd").
    sent = re.sub(r'autism spectrum disorder', 'asd', sent)  # compress term
    return " ".join(w for w in sent.split() if w not in stopword_set)

In [31]:
def jaccard_similarity(list1, list2):
    """Return the Jaccard similarity of two token collections.

    The inputs are converted to sets, so duplicates and order are ignored.
    The score is ``|A ∩ B| / |A ∪ B|`` and lies in ``[0.0, 1.0]``.

    Parameters
    ----------
    list1, list2 : iterable of hashable
        Tokens of the two texts being compared.

    Returns
    -------
    float
        The similarity; ``0.0`` when both inputs are empty (e.g. a question
        that consisted only of stopwords) instead of dividing by zero.
    """
    s1, s2 = set(list1), set(list2)
    union = s1 | s2
    if not union:
        return 0.0
    return len(s1 & s2) / len(union)

In [32]:
# try to match websource questions with our own 100 questions list
SIM_THRESHOLD = 0.3  # minimum Jaccard similarity for a match to be reported

# Tokenise the sheet's questions once up front; they do not change between
# website questions, so there is no need to re-clean them in the inner loop.
sheet_questions = list(df['Question'])
sheet_tokens = [clean_text(question).split() for question in sheet_questions]

for web_idx, web_question in enumerate(quesList):
    web_tokens = clean_text(web_question).split()

    # Track the best-scoring sheet question; the strict '>' keeps the first
    # one encountered when several share the same score.
    best_sim = 0.0
    best_question = ''
    best_pos = 0

    for pos, tokens in enumerate(sheet_tokens):
        sim = jaccard_similarity(web_tokens, tokens)
        if sim > best_sim:
            best_sim = sim
            best_question = sheet_questions[pos]
            best_pos = pos

    if best_sim >= SIM_THRESHOLD:
        print('Website --> ', web_question, '(Index {})'.format(web_idx))
        print('100 questions list --> ', best_question, '(Index {})'.format(best_pos))
        print('similarity:', best_sim)
        print('*'*100)

Website -->  What is Autism? (Index 0)
100 questions list -->  What is Autism? (Index 28)
similarity: 1.0
****************************************************************************************************
Website -->  What Causes Autism? (Index 1)
100 questions list -->  What causes autism? Can it be cured? (Index 2)
similarity: 0.6666666666666666
****************************************************************************************************
Website -->  How Common is Autism? (Index 2)
100 questions list -->  How common is autism? (Index 1)
similarity: 1.0
****************************************************************************************************
Website -->  Does My Child Have Autism? (Index 4)
100 questions list -->  What is Autism? (Index 28)
similarity: 0.5
****************************************************************************************************
Website -->  Why My Child Has Autism? (Index 5)
100 questions list -->  What is Autism? (Index 28)
similarity: 

In [33]:
# List every scraped question next to its position in quesList.
for idx, question in enumerate(quesList):
    print("Index #{0:d}: {1:s}".format(idx, question))

Index #0: What is Autism?
Index #1: What Causes Autism?
Index #2: How Common is Autism?
Index #3: What is “the Spectrum”?
Index #4: Does My Child Have Autism?
Index #5: Why My Child Has Autism?
Index #6: What if I Have Autism?
Index #7: What to do After Diagnosis of Autism?
Index #8: Can My Child with Autism Attend School?


In [34]:
# List every scraped paragraph next to its position in ansList.
for idx, answer in enumerate(ansList):
    print("Index #{0:d}: {1:s}".format(idx, answer))

Index #0: For parents, whose child just got diagnosed with autism, it might be hard to deal with the overwhelming feeling the questions in their minds bring. Or the diagnosis can be an explanation to some adults who have been dealing with something they did not know. Either way, with this information come a lot of questions. The best way to understand what this diagnosis is to educate yourself. Here are answers to some frequently asked questions.
Index #1: Autism spectrum disorder is an umbrella term used for various brain development disorders. They are characterized and thus categorized by difficulties in social interaction, verbal and nonverbal communication, and repetitive behaviors. Autistic disorder, Rett Syndrome, childhood disintegrative disorder, PDD-NOS (pervasive developmental disorder-not otherwise specified) and Asperger Syndrome can be listed under this umbrella. There are many characteristics that define autism spectrum disorder (ASD) as mental inabilities, struggles wit

In [47]:
# add websource answer to matched question in existing dataframe
# Mapping: row label in df -> positions in ansList of the paragraphs that
# together form the answer (chosen by hand from the listings above).
matched_answers = {
    28: (1, 2),     # website question 0
    2: (3, 4, 5),   # website question 1
    1: (6, 7),      # website question 2
}

# The column was created as all-NaN (float); make it object dtype before
# storing text so pandas does not have to upcast on assignment.
df[link] = df[link].astype(object)

for row_label, paragraph_ids in matched_answers.items():
    # A single df.loc[row, col] write updates df itself; the chained form
    # df[link].loc[row] = ... may write to a temporary copy and be lost.
    # Paragraphs are joined with a space so sentences do not run together.
    df.loc[row_label, link] = ' '.join(ansList[k] for k in paragraph_ids)

In [48]:
# Inspect the answer stored for the sheet question with row label 28.
df.loc[28, link]

'Autism spectrum disorder is an umbrella term used for various brain development disorders. They are characterized and thus categorized by difficulties in social interaction, verbal and nonverbal communication, and repetitive behaviors. Autistic disorder, Rett Syndrome, childhood disintegrative disorder, PDD-NOS (pervasive developmental disorder-not otherwise specified) and Asperger Syndrome can be listed under this umbrella. There are many characteristics that define autism spectrum disorder (ASD) as mental inabilities, struggles with motor functions and other mental and health issues that come along. Besides these, some individuals with autism may show extraordinary skills in math, art, music.Autism is thought to take its roots from early brain development stages. Its symptoms generally occur between the age of 12 and 18 months. Some children may develop normally and don’t show the signs until they reach the age 2. They start to develop autism by losing skills, which is called “regre

In [49]:
# update with new valid questions list
# Positions in quesList of the website questions to keep as new entries.
new_question_ids = (3, 5, 6, 8)
quesListUpd = [quesList[k] for k in new_question_ids]
quesListUpd

['What is “the Spectrum”?',
 'Why My Child Has Autism?',
 'What if I Have Autism?',
 'Can My Child with Autism Attend School?']

In [50]:
# manually select answers to updated questions list
# Positions in ansList, in the same order as the entries of quesListUpd.
new_answer_ids = (8, 9, 11, 12)
ansListUpd = [ansList[k] for k in new_answer_ids]

ansListUpd

['Autism is a term that is constantly redefined the more it’s studied and it is an umbrella term. Being “on the spectrum” means having some sort of autism. Subsets of autism ranges from mild to severe. We can say that every case in autism is unique. While some individuals on the autism spectrum have great art skills and above-average intelligence, some have above below-average to average intelligence. Most people on the spectrum can have a distinctive ability resulting from their peculiar way of viewing the world, while others suffering from autism are unable to live without aid. About 25 percent of individuals with ASD can develop communication skills by using other means than verbal. The way and quality of life can be greatly improved with effective treatment and education method, while raising the awareness on accepting, respecting, and supporting those in need.',
 'Although mostly there is not one cause that can be determined, it is suggested that autism development can be traced t

In [51]:
# Create new dataframe with QA pairs: one row per (question, answer) tuple,
# with the answer column named after the source URL.
qa_pairs = list(zip(quesListUpd, ansListUpd))
df2 = pd.DataFrame(qa_pairs, columns=['Question', link])
df2

Unnamed: 0,Question,https://otsimo.com/en/frequently-asked-questions-autism/
0,What is “the Spectrum”?,Autism is a term that is constantly redefined ...
1,Why My Child Has Autism?,Although mostly there is not one cause that ca...
2,What if I Have Autism?,You should consider consulting your doctor for...
3,Can My Child with Autism Attend School?,Every child has the right to get appropriate e...


In [52]:
# Concatenate existing and new dataframes, then order the rows by every
# column after the first one and renumber the index from zero.
stacked = pd.concat([df, df2], axis=0)
sort_columns = stacked.columns[1:].tolist()
df3 = stacked.sort_values(by=sort_columns).reset_index(drop=True)
df3.head(40)

Unnamed: 0,Question,https://birchtreecenter.org/learn/autism,https://www.myautismteam.com/resources/autism-an-overview,https://www.autism.org.uk/advice-and-guidance/what-is-autism/asperger-syndrome,https://iancommunity.org/autism-faq,https://icahn.mssm.edu/research/seaver/resources/autism-faqs,https://otsimo.com/en/frequently-asked-questions-autism/
0,What are the Autism Spectrum Disorders (ASD)?,ASD refers to a wide spectrum of neurodevelopm...,,,,,
1,How common is autism?,According to a 2020 report commissioned by the...,It is estimated that in the United States 1.6 ...,,,,According to the U.S. Centers for Disease Cont...
2,What causes autism? Can it be cured?,The causes of this complex disorder remain unc...,,,,,You may think that something you did or ate or...
3,Is autism contagious?,,Autism is not a contagious condition. Autism i...,,,,
4,Are rates of autism increasing?,,Estimates released by the Centers for Disease ...,,,,
5,Is autism a new condition?,,"It is likely that autism has always existed, b...",,,,
6,Is there a cure for autism?,,"There is no cure for autism. However, early in...",,,,
7,How Is Autism Diagnosed?,,There is no one single conclusive test for aut...,,There is no blood test to diagnose autism spec...,"While there is no medical test for autism, an ...",
8,Is autism permanent?,,There is some controversy on the topic of whet...,,,,
9,How Is Autism Treated?,,Treatment for autism depends largely on the in...,,,,


## **Save Output**

In [53]:
# Write the merged question/answer table to a new workbook, without the index.
output_path = 'ASDquestions7.xlsx'
df3.to_excel(output_path, index=False)