# Import Libraries

In [1]:
################ WEB SCRAPING MODULES ############
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import bs4
from fake_useragent import UserAgent
import requests
################ TIME MODLULES ###################
import time
from datetime import date 
import datetime
############## DATA MANIPULATION MODULES #########
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords

# Define Source

In [2]:
link = 'https://www.autism.org/is-it-autism/'

# Read 100 questions list

In [3]:
df = pd.read_excel('Excel/Source1.xlsx',engine='openpyxl')

df[link]=np.nan
df

Unnamed: 0,Question,https://autismnavigator.com/what-is-autism/,https://www.autism.org/is-it-autism/
0,What are the Autism Spectrum Disorders (ASD)?,Autism spectrum disorder (ASD) is a neurodevel...,
1,How is ASD diagnosed?,Diagnosing ASD can be difficult because there ...,
2,What are early red flags of ASD in toddlers?,The diagnostic features of ASD can be easy to ...,
3,What are 16 early signs of autism by 16 months?,The early signs of autism are easy to miss. Au...,
4,What is Autism?,,
...,...,...,...
98,What are some ways that parents can reduce the...,,
99,Do some families deal with stress better than ...,,
100,Do siblings suffer increased stress as a resul...,,
101,What can I do about my children’s stress?,,


# Scrape QA pairs from website

In [4]:
# Open webpage in a new window
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get(link)



Current google-chrome version is 92.0.4515
Get LATEST driver version for 92.0.4515
Driver [C:\Users\aparg\.wdm\drivers\chromedriver\win32\92.0.4515.107\chromedriver.exe] found in cache


In [5]:
# Parse text in webpage
source = driver.page_source
soup = bs4.BeautifulSoup(source, 'html.parser')

In [6]:
# Search the questions mentioned in webpage
l = []
for ques in soup.find_all('h2'):
    print(ques.text)
    l.append(ques.text)

What Is Autism?
How Common Is It?
What are the Signs of Autism?
Where Can I Get Autism Screening?
What should I do next?


In [7]:
# Search the answers for questions
k = []
for j in range(3,9):
    ans = soup.find('div',{'class':'fusion-text fusion-text-{} nitro-offscreen'.format(j)})
    print(ans.text)
    print('*'*100)
    k.append(ans.text)

Autism Spectrum Disorder (ASD) is a developmental disorder with symptoms that appear within the first three years of life. Most children with autism look like other kids but they act and interact in ways that feel different from the behaviors of other children. When interacting with others, they may respond in unexpected ways, or they may not interact at all. Autism is a spectrum disorder, which means that it appears in a range of forms and levels of severity. Some individuals develop typical capabilities in terms of speech and language – and develop exceptional skills – but struggle with lifelong social and behavioral differences. Others may have challenges in communication, sensory sensitivities, and behavioral issues, such as excessive tantrums, repetitive behaviors, aggression, and self-harm. The good news is that appropriate treatments can improve outcomes for many, if not most, people diagnosed with ASD. 
***************************************************************************

# Check which questions are similar

In [8]:
stop_words = set(stopwords.words('english'))

def clean_text(sent):
    sent = sent.lower() # lowercase
    sent = re.sub(r'[^\w\s]', '', sent) # remove punctuations
    sent = re.sub('autism spectrum disorder','asd',sent) # Compress term
    sent = re.sub('autism','asd',sent) # Compress term
    sent = [w for w in sent.split() if not w.lower() in stop_words] # Remove stopwords
    sent = " ".join(sent)
    return sent

In [9]:
def jaccard_similarity(list1, list2):
    s1 = set(list1)
    s2 = set(list2)
    return float(len(s1.intersection(s2)) / len(s1.union(s2)))

In [10]:
for c1,i in enumerate(l):
    i2 = clean_text(i)
    list1 = i2.split()
    
    temp1 = 0.0
    temp2 = ''
    temp3 = ''
    temp4 = 0
    
    for c2,j in enumerate(df['Question']):
        j2 = clean_text(j)
        list2 = j2.split()
            
        sim = jaccard_similarity(list1, list2)
        
        if(sim>temp1):
            temp1 = sim
            temp2 = j
            temp3 = i
            temp4 = c2
            
    if(temp1>=0.3):
        print('Website --> ',temp3)
        print('100 questions list --> ',temp2,'(Index {})'.format(temp4))
        print(temp1)
        print('*'*100)

Website -->  What Is Autism?
100 questions list -->  What is Autism? (Index 4)
1.0
****************************************************************************************************
Website -->  What are the Signs of Autism?
100 questions list -->  What is Autism? (Index 4)
0.5
****************************************************************************************************
Website -->  Where Can I Get Autism Screening?
100 questions list -->  What are Screening Tools for Autism? (Index 18)
0.5
****************************************************************************************************


In [11]:
# Directly add answer to existing dataframe
df[link].loc[4]=k[0]
df[link].loc[18]=k[3]+'\n Common autism screening tools include: \n '+k[4]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [12]:
l

['What Is Autism?',
 'How Common Is It?',
 'What are the Signs of Autism?',
 'Where Can I Get Autism Screening?',
 'What should I do next?']

In [13]:
# update questions list
l=[l[i] for i in [1,2,4]]
l

['How Common Is It?',
 'What are the Signs of Autism?',
 'What should I do next?']

In [14]:
# manually select answers to questions in list 'l'
l2 = [k[1],
      k[2],
     k[-1]]
l2

['For many years a diagnosis of autism was rare, occurring in just 1 child out of 2,000. However, since the mid-1980s, the rate of autism has increased dramatically around the world. In March 2020, the US Federal Centers for Disease Control announced that 1 in every 54 children in the United States is affected by autism. Autism is more likely to affect boys than girls, but children of all genders have been diagnosed with ASD. ',
 'Many signs can indicate that an individual may be affected by an ASD. Keep in mind that every person diagnosed with autism is different. Some have several signs and symptoms, while others experience only a few.   Common early signs include:  Delayed speech or difficulty communicating Poor eye contact Little or no imaginative play No joint attention – not looking in the same direction as others Showing limited interest in other people Highly emotional responses to changes in routine  The DSM-5, the standard reference used by most healthcare providers, includes

In [15]:
# Create new dataframe with QA pairs
df2 = pd.DataFrame(zip(l,l2),columns=['Question',link])
df2

Unnamed: 0,Question,https://www.autism.org/is-it-autism/
0,How Common Is It?,"For many years a diagnosis of autism was rare,..."
1,What are the Signs of Autism?,Many signs can indicate that an individual may...
2,What should I do next?,When preparing to talk to your medical provide...


In [16]:
# Concatenate existing and new dataframes
df3 = pd.concat([df,df2],axis=0)
df3 = df3.sort_values(by=list(df3.columns[1:])).reset_index(drop=True)
df3.head(10)

Unnamed: 0,Question,https://autismnavigator.com/what-is-autism/,https://www.autism.org/is-it-autism/
0,What are the Autism Spectrum Disorders (ASD)?,Autism spectrum disorder (ASD) is a neurodevel...,
1,How is ASD diagnosed?,Diagnosing ASD can be difficult because there ...,
2,What are early red flags of ASD in toddlers?,The diagnostic features of ASD can be easy to ...,
3,What are 16 early signs of autism by 16 months?,The early signs of autism are easy to miss. Au...,
4,What is Autism?,,Autism Spectrum Disorder (ASD) is a developmen...
5,What are Screening Tools for Autism?,,Early intervention can make a big difference f...
6,How Common Is It?,,"For many years a diagnosis of autism was rare,..."
7,What are the Signs of Autism?,,Many signs can indicate that an individual may...
8,What should I do next?,,When preparing to talk to your medical provide...
9,What is Asperger’s Syndrome?,,


# Save Output

In [17]:
df3.to_excel('Excel/Source2_1.xlsx',index=False)