# Import Libraries

In [1]:
################ WEB SCRAPING MODULES ############
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import bs4
from fake_useragent import UserAgent
import requests
################ TIME MODLULES ###################
import time
from datetime import date 
import datetime
############## DATA MANIPULATION MODULES #########
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords

# Define Source

In [2]:
link = 'https://autismnavigator.com/what-is-autism/'

# Read 100 questions list

In [3]:
df = pd.read_excel('Excel/questions.xlsx',engine='openpyxl')

df[link]=np.nan
df

Unnamed: 0,Question,https://autismnavigator.com/what-is-autism/
0,What is Autism?,
1,What are the Autism Spectrum Disorders (ASD)?,
2,What is Asperger’s Syndrome?,
3,How can you tell Autism from Asperger’s Syndrome?,
4,What is Pervasive Developmental Disorder – Not...,
...,...,...
95,What are some ways that parents can reduce the...,
96,Do some families deal with stress better than ...,
97,Do siblings suffer increased stress as a resul...,
98,What can I do about my children’s stress?,


# Scrape QA pairs from website

In [4]:
# Open webpage in a new window
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get(link)



Current google-chrome version is 92.0.4515
Get LATEST driver version for 92.0.4515
Driver [C:\Users\aparg\.wdm\drivers\chromedriver\win32\92.0.4515.107\chromedriver.exe] found in cache


In [5]:
# Parse text in webpage
source = driver.page_source
soup = bs4.BeautifulSoup(source, 'html.parser')

In [6]:
# Search the questions mentioned in webpage
l = []
for i in soup.find_all('span',{'style':'color: #91a552; font-size: 22px; font-weight: 600;'}):
    if(i.text[-1]=='?'):
        print(i.text)
        l.append(i.text)

What is Autism Spectrum Disorder?
What are early red flags of ASD in toddlers?
How is ASD diagnosed?
What are 16 early signs of autism by 16 months?


In [7]:
# Tried to search for answers of questions
k=[]
c = 0
for i in soup.find_all('div',{'class':'wpb_text_column wpb_content_element'}):
    txt = i.text.strip()
    if(txt in l):
        print('*'*100)
    if(txt in l or len(txt.split())>50):
        print(c,'-->', txt)
    c = c+1
    
    k.append(txt)

1 --> Learn about what autism spectrum disorder (ASD) is, the early red flags of ASD in toddlers, and how ASD is diagnosed. View our online 16 Early Signs of Autism by 16 Months Lookbook and select a print version—8 pages, 1 page, or fillable Checklist of the 16 Early Signs. Find printables with Autism Navigator messages in letter and tabloid size to download, print, and share.
****************************************************************************************************
2 --> What is Autism Spectrum Disorder?
3 --> Autism spectrum disorder (ASD) is a neurodevelopmental disorder defined by persistent deficits in social communication and social interaction, accompanied by restricted, repetitive patterns of behavior, interests, or activities.
The signs of ASD are usually evident in early childhood. Though it is still considered a lifelong diagnosis, with appropriate early intervention, individuals with ASD can lead productive, inclusive, and fulfilling lives. Many children with ASD

# Check which questions are similar

In [8]:
stop_words = set(stopwords.words('english'))

def clean_text(sent):
    sent = sent.lower() # lowercase
    sent = re.sub(r'[^\w\s]', '', sent) # remove punctuations
    sent = re.sub('autism spectrum disorder','asd',sent) # Compress term
    sent = re.sub('autism','asd',sent) # Compress term
    sent = [w for w in sent.split() if not w.lower() in stop_words] # Remove stopwords
    sent = " ".join(sent)
    return sent

In [9]:
def jaccard_similarity(list1, list2):
    s1 = set(list1)
    s2 = set(list2)
    return float(len(s1.intersection(s2)) / len(s1.union(s2)))

In [10]:
for i in l:
    i2 = clean_text(i)
    list1 = i2.split()
    
    temp1 = 0.0
    temp2 = ''
    temp3 = ''
    
    for j in df['Question']:
        j2 = clean_text(j)
        list2 = j2.split()
            
        sim = jaccard_similarity(list1, list2)
        
        if(sim>temp1):
            temp1 = sim
            temp2 = j
            temp3 = i
            
    if(temp1>=0.3):
        print('Website --> ',temp3)
        print('100 questions list --> ',temp2)
        print(temp1)
        print('*'*100)

Website -->  What is Autism Spectrum Disorder?
100 questions list -->  What are the Autism Spectrum Disorders (ASD)?
0.4
****************************************************************************************************


In [11]:
df

Unnamed: 0,Question,https://autismnavigator.com/what-is-autism/
0,What is Autism?,
1,What are the Autism Spectrum Disorders (ASD)?,
2,What is Asperger’s Syndrome?,
3,How can you tell Autism from Asperger’s Syndrome?,
4,What is Pervasive Developmental Disorder – Not...,
...,...,...
95,What are some ways that parents can reduce the...,
96,Do some families deal with stress better than ...,
97,Do siblings suffer increased stress as a resul...,
98,What can I do about my children’s stress?,


In [12]:
# Directly add answer to 'What are the Autism Spectrum Disorders (ASD)?' in existing dataframe
df[link].loc[1]=k[3]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [13]:
l

['What is Autism Spectrum Disorder?',
 'What are early red flags of ASD in toddlers?',
 'How is ASD diagnosed?',
 'What are 16 early signs of autism by 16 months?']

In [14]:
# update questions list
l=l[1:]
l

['What are early red flags of ASD in toddlers?',
 'How is ASD diagnosed?',
 'What are 16 early signs of autism by 16 months?']

In [15]:
# manually select answers to questions in list 'l'

l2 = [k[8]+'\n'+k[12]+'\n'+k[13]+'\n'+k[14],
      k[16]+'\n'+k[17],
     k[19]+'\n'+k[21]]

In [16]:
# Create new dataframe with QA pairs
df2 = pd.DataFrame(zip(l,l2),columns=['Question',link])
df2

Unnamed: 0,Question,https://autismnavigator.com/what-is-autism/
0,What are early red flags of ASD in toddlers?,The diagnostic features of ASD can be easy to ...
1,How is ASD diagnosed?,Diagnosing ASD can be difficult because there ...
2,What are 16 early signs of autism by 16 months?,The early signs of autism are easy to miss. Au...


In [17]:
# Concatenate existing and new dataframes
df3 = pd.concat([df,df2],axis=0)
df3 = df3.sort_values(by=list(df3.columns[1:])).reset_index(drop=True)
df3

Unnamed: 0,Question,https://autismnavigator.com/what-is-autism/
0,What are the Autism Spectrum Disorders (ASD)?,Autism spectrum disorder (ASD) is a neurodevel...
1,How is ASD diagnosed?,Diagnosing ASD can be difficult because there ...
2,What are early red flags of ASD in toddlers?,The diagnostic features of ASD can be easy to ...
3,What are 16 early signs of autism by 16 months?,The early signs of autism are easy to miss. Au...
4,What is Autism?,
...,...,...
98,What are some ways that parents can reduce the...,
99,Do some families deal with stress better than ...,
100,Do siblings suffer increased stress as a resul...,
101,What can I do about my children’s stress?,


# Save Output

In [18]:
df3.to_excel('Excel/Source1.xlsx',index=False)