## **Import Libraries**

In [1]:
#!pip install selenium
#!pip install webdriver-manager
#!pip install pyyaml ua-parser user-agents fake-useragent

################ WEB SCRAPING MODULES ############
from selenium import webdriver
from webdriver_manager.firefox import GeckoDriverManager
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.utils import ChromeType
from selenium.webdriver.common.by import By
import bs4
from fake_useragent import UserAgent
import requests
################ TIME MODLULES ###################
import time
from datetime import date 
import datetime
############## DATA MANIPULATION MODULES #########
import os
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords

In [8]:
#from google.colab import drive
#drive.mount('/content/gdrive')

# Change working directory to be current folder
# os.chdir('/content/gdrive/My Drive/Your Folder Name/Your sub Folder Name')
#os.chdir('/content/gdrive/My Drive/iss/irs_proj/')
#!ls

## **Define web source**

In [2]:
link = 'https://birchtreecenter.org/learn/autism'

## **Read 100++ questions list**

In [3]:
df = pd.read_excel('ASDquestions.xlsx',engine='openpyxl')

df[link]=np.nan
df

Unnamed: 0,Question,https://birchtreecenter.org/learn/autism
0,What is Autism?,
1,What are the Autism Spectrum Disorders (ASD)?,
2,What is Asperger’s Syndrome?,
3,How can you tell Autism from Asperger’s Syndrome?,
4,What is Pervasive Developmental Disorder – Not...,
...,...,...
95,What are some ways that parents can reduce the...,
96,Do some families deal with stress better than ...,
97,Do siblings suffer increased stress as a resul...,
98,What can I do about my children’s stress?,


## **Scrape QA pairs from website**

In [4]:
# Open webpage in a new window for scraping
#driver = webdriver.Chrome(ChromeDriverManager(chrome_type=ChromeType.GOOGLE).install())   #cannot fix in colab

driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get(link)



Current google-chrome version is 92.0.4515
Get LATEST driver version for 92.0.4515
Driver [/home/aceirus/.wdm/drivers/chromedriver/linux64/92.0.4515.107/chromedriver] found in cache


In [8]:
# Parse text in webpage
source = driver.page_source
soup = bs4.BeautifulSoup(source, 'html.parser')

In [9]:
# Search the questions mentioned in webpage
quesList = []
for ques in soup.find_all('h3'):
    print(ques.text)
    quesList.append(ques.text)

What are Autism Spectrum Disorders (ASD)?
How common is autism?
What causes autism? Can it be cured?
Why doesn’t The Birchtree Center refer to its students as “autistic”?


In [18]:
# Search the answers for questions
ansList = []
for ans in soup.find_all('p'):

    print(ans.text)
    print('*'*100)
    ansList.append(ans.text)

ASD refers to a wide spectrum of neurodevelopmental disorders that affect communication, behavior, and social interaction.
****************************************************************************************************
Children with ASD have delayed or absent communication skills and may demonstrate repetitive or idiosyncratic use of language.  They often have underdeveloped play skills and may engage in repetitive behaviors or adhere to rigid routines.
****************************************************************************************************
****************************************************************************************************
According to a 2020 report commissioned by the U.S. Centers for Disease Control and Prevention, approximately one in 54 American children has been diagnosed with autism or a closely related neurodevelopmental disorder—a dramatic increase over the past decade.
***************************************************************************

## **Check which questions are similar**

In [10]:
stop_words = set(stopwords.words('english'))

def clean_text(sent):
    sent = sent.lower() # lowercase
    sent = re.sub(r'[^\w\s]', '', sent) # remove punctuations
    sent = re.sub('Autism Spectrum Disorder','ASD',sent) # Compress term
    sent = [w for w in sent.split() if not w.lower() in stop_words] # Remove stopwords
    sent = " ".join(sent)
    return sent

In [11]:
def jaccard_similarity(list1, list2):
    s1 = set(list1)
    s2 = set(list2)
    return float(len(s1.intersection(s2)) / len(s1.union(s2)))

In [12]:
# try to match websource questions with our own 100 questions list
for c1,i in enumerate(quesList):
    i2 = clean_text(i)
    list1 = i2.split()
    
    temp1 = 0.0
    temp2 = ''
    temp3 = ''
    temp4 = 0
    
    for c2,j in enumerate(df['Question']):
        j2 = clean_text(j)
        list2 = j2.split()
            
        sim = jaccard_similarity(list1, list2)
        
        if(sim>temp1):
            temp1 = sim
            temp2 = j
            temp3 = i
            temp4 = c2
            
    if(temp1>=0.3):  # sim threshold
        print('Website --> ',temp3,'(Index {})'.format(c1))
        print('100 questions list --> ',temp2,'(Index {})'.format(temp4))
        print('similarity:', temp1)
        print('*'*100)

Website -->  What are Autism Spectrum Disorders (ASD)? (Index 0)
100 questions list -->  What are the Autism Spectrum Disorders (ASD)? (Index 1)
similarity: 1.0
****************************************************************************************************
Website -->  How common is autism? (Index 1)
100 questions list -->  What is Autism? (Index 0)
similarity: 0.5
****************************************************************************************************
Website -->  What causes autism? Can it be cured? (Index 2)
100 questions list -->  What is Autism? (Index 0)
similarity: 0.3333333333333333
****************************************************************************************************


In [16]:
for i in range (0,len(quesList)):
    print("Index #{0:d}: {1:s}".format(i,quesList[i]))

Index #0: What are Autism Spectrum Disorders (ASD)?
Index #1: How common is autism?
Index #2: What causes autism? Can it be cured?
Index #3: Why doesn’t The Birchtree Center refer to its students as “autistic”?


In [19]:
for i in range (0,len(ansList)):
    print("Index #{0:d}: {1:s}".format(i,ansList[i]))

Index #0: ASD refers to a wide spectrum of neurodevelopmental disorders that affect communication, behavior, and social interaction.
Index #1: Children with ASD have delayed or absent communication skills and may demonstrate repetitive or idiosyncratic use of language.  They often have underdeveloped play skills and may engage in repetitive behaviors or adhere to rigid routines.
Index #3: According to a 2020 report commissioned by the U.S. Centers for Disease Control and Prevention, approximately one in 54 American children has been diagnosed with autism or a closely related neurodevelopmental disorder—a dramatic increase over the past decade.
Index #4: The causes of this complex disorder remain uncertain. Instructional methods such as Applied Behavior Analysis have proven effective in helping individuals with autism learn to overcome many of the challenges that autism presents. These interventions have proven particularly effective when started during the first several years of a chil

In [20]:
# add websource answer to matched question in existing dataframe
df[link].loc[1]=ansList[0] + " " + ansList[1] + " " + ansList[2]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [21]:
df[link].loc[1]



In [22]:
# update with new valid questions list
quesListUpd = quesList[1:3]
quesListUpd

['How common is autism?', 'What causes autism? Can it be cured?']

In [23]:
# manually select answers to updated questions list
ansListUpd = [ansList[3],
              ansList[4]]
ansListUpd

['According to a 2020 report commissioned by the U.S. Centers for Disease Control and Prevention, approximately one in 54 American children has been diagnosed with autism or a closely related neurodevelopmental disorder—a dramatic increase over the past decade.',
 'The causes of this complex disorder remain uncertain. Instructional methods such as Applied Behavior Analysis have proven effective in helping individuals with autism learn to overcome many of the challenges that autism presents. These interventions have proven particularly effective when started during the first several years of a child’s life. For more information about recent research into the causes of and treatments for autism, visit the Autism Speaks website at http://www.autismspeaks.org/science.']

In [24]:
# Create new dataframe with QA pairs
df2 = pd.DataFrame(zip(quesListUpd,ansListUpd),columns=['Question',link])
df2

Unnamed: 0,Question,https://birchtreecenter.org/learn/autism
0,How common is autism?,According to a 2020 report commissioned by the...
1,What causes autism? Can it be cured?,The causes of this complex disorder remain unc...


In [28]:
# Concatenate existing and new dataframes
df3 = pd.concat([df,df2],axis=0)
df3 = df3.sort_values(by=list(df3.columns[1:])).reset_index(drop=True)
df3

Unnamed: 0,Question,https://birchtreecenter.org/learn/autism
0,What are the Autism Spectrum Disorders (ASD)?,ASD refers to a wide spectrum of neurodevelopm...
1,How common is autism?,According to a 2020 report commissioned by the...
2,What causes autism? Can it be cured?,The causes of this complex disorder remain unc...
3,What is Autism?,
4,What is Asperger’s Syndrome?,
...,...,...
97,What are some ways that parents can reduce the...,
98,Do some families deal with stress better than ...,
99,Do siblings suffer increased stress as a resul...,
100,What can I do about my children’s stress?,


## **Save Output**

In [27]:
df3.to_excel('ASDquestions1.xlsx',index=False)