In [1]:
#!pip install nltk
#!python -m nltk.downloader stopwords
#!python -m nltk.downloader
#!python -m nltk.downloader wordnet
#!pip install flashtext 

# Import Libraries

In [2]:
import pandas as pd
import re
from nltk.corpus import stopwords
from flashtext import KeywordProcessor
from nltk.stem import WordNetLemmatizer

# Combine Excel sheets

In [3]:
df1 = pd.read_excel('Apar.xlsx',engine='openpyxl')
df2 = pd.read_excel('Gopan.xlsx',engine='openpyxl')
df3 = pd.read_excel('WeePing.xlsx',engine='openpyxl')

In [4]:
df = pd.concat([df1,df2,df3],axis=0).dropna(how='all')
df['Question'] = df['Question'].apply(lambda x: x.strip())

In [5]:
df = df.sort_values(by=list(df.columns)).reset_index(drop=True)
df.head()

Unnamed: 0,Question,https://autismnavigator.com/what-is-autism/,https://www.autism.org/is-it-autism/,https://www.autism.org/advice-for-parents/,https://www.autism.org/symptoms-of-autism/,https://www.autism.org/autism-prognosis/,https://autismcanada.org/autism-explained/,https://autismcanada.org/autism-explained/cooccurring-conditions/,https://autismcanada.org/autism-explained/screening-tools/,https://www.who.int/news-room/q-a-detail/autism-spectrum-disorders-(asd),...,https://birchtreecenter.org/learn/autism,https://www.myautismteam.com/resources/autism-an-overview,https://www.autism.org.uk/advice-and-guidance/what-is-autism/asperger-syndrome,https://iancommunity.org/autism-faq,https://icahn.mssm.edu/research/seaver/resources/autism-faqs,https://otsimo.com/en/frequently-asked-questions-autism/,https://rockmelon.com/about-autism/,https://www.amaze.org.au/understand-autism/about-autism/,https://autismrecovery.sg/autism/what-is-aspergers/,https://www.autism.org.sg/living-with-autism/what-is-autism
0,Adams,,,Psychiatric medications are not well-tested in...,,,,,,,...,,,,,,,,,,
1,Are antipsychotic medications helpful in the t...,,,,,,,,,,...,,,,,,,,,,
2,Are antipsychotic medications helpful in the t...,,,,,,,,,,...,,,,,,,,,,
3,Are antipsychotic medications helpful in the t...,,,,,,,,,,...,,,,,,,,,,
4,Are childhood vaccines responsible for autism ...,,,,,,,,,Available epidemiological data show that there...,...,,,,,,,,,,


# Group Questions

In [6]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(sent):
    try:
        sent = sent.lower() # lowercase
        sent = re.sub(r'[^\w\s]', '', sent) # remove punctuations
        sent = re.sub('autism spectrum disorder','asd',sent) # Compress term
        sent = re.sub('autism','asd',sent) # Compress term
        sent = [w.lower() for w in sent.split() if not w.lower() in stop_words] # Remove stopwords
        sent = [lemmatizer.lemmatize(w) for w in sent]#lemantize words
        sent = " ".join(sent)
        return sent
    except:
        pass

In [7]:
df2 = df[['Question']].copy()

df2 = df2.drop_duplicates(subset ="Question")

ques = df2['Question'].apply(clean_text)

In [8]:
# KeyWords for Each Groups
oKeyWordDict = {
                'Treatment': ["vaccine ","disorder","treated","treatment ","treatments","therapy ","diagnosed ","syndrome","diagnosis ","medication ","respite","asperger ","mitochondrid ","diagnosed","treating","autistic","mitochondrial","vaccine"],
                'CausesAndSymptoms' : ["cause ","symptom ","symptoms","disability","behaviour","sign","early","cause","stress"],
                "RelationAndFamily" : ["people ","parent","education","educational","relationship","family","sibling","life","learning","parenting","home","future","help","need","guardianship","benefit","next"]
                }

# adding keywords
kp = KeywordProcessor()
kp.add_keywords_from_dict(oKeyWordDict)

df2['Group'] = ques.apply(lambda x: kp.extract_keywords(x, span_info = False))

In [9]:
df2

Unnamed: 0,Question,Group
0,Adams,[]
1,Are antipsychotic medications helpful in the t...,[]
4,Are childhood vaccines responsible for autism ...,[Treatment]
5,Are computers useful in the education of an au...,"[RelationAndFamily, Treatment]"
8,Are dietary interventions helpful to autistic ...,[Treatment]
...,...,...
442,Why do concerns about mercury in vaccines pers...,[Treatment]
443,Why do people diagnosed with autism spectrum d...,[Treatment]
444,Why does a parent with an autistic child need ...,"[RelationAndFamily, Treatment, RelationAndFamily]"
447,Will My Child Ever Be Able To Talk?,[]


# Divide Groups among us

In [10]:
# Filter answered questions from df
answered_ques = df.dropna(subset=list(df.columns[1:]),how='all')

In [11]:
df2['# Groups'] = df2['Group'].apply(lambda x: len(set(x)))
df2['Answered'] = df2['Question'].apply(lambda x: x in list(answered_ques['Question']))

In [12]:
df2

Unnamed: 0,Question,Group,# Groups,Answered
0,Adams,[],0,True
1,Are antipsychotic medications helpful in the t...,[],0,False
4,Are childhood vaccines responsible for autism ...,[Treatment],1,True
5,Are computers useful in the education of an au...,"[RelationAndFamily, Treatment]",2,False
8,Are dietary interventions helpful to autistic ...,[Treatment],1,False
...,...,...,...,...
442,Why do concerns about mercury in vaccines pers...,[Treatment],1,True
443,Why do people diagnosed with autism spectrum d...,[Treatment],1,True
444,Why does a parent with an autistic child need ...,"[RelationAndFamily, Treatment, RelationAndFamily]",2,False
447,Will My Child Ever Be Able To Talk?,[],0,True


In [13]:
df2.to_excel("Apar_Grouped_Questions.xlsx",index=False)

In [14]:
df2['# Groups'].unique()

array([0, 1, 2, 3], dtype=int64)

In [15]:
df3 = df2.loc[df2['# Groups']==1]
df4 = df2.loc[df2['# Groups']!=1]

In [16]:
df3

Unnamed: 0,Question,Group,# Groups,Answered
4,Are childhood vaccines responsible for autism ...,[Treatment],1,True
8,Are dietary interventions helpful to autistic ...,[Treatment],1,False
11,Are medications useful in treating the behavio...,"[Treatment, Treatment]",1,False
22,Are there other therapies that are useful in t...,[Treatment],1,True
29,Asperger syndrome is also on the spectrum of a...,"[Treatment, Treatment]",1,True
...,...,...,...,...
428,Where do Autistic People Usually Live?,[Treatment],1,False
438,Why do I need to establish guardianship for my...,"[RelationAndFamily, RelationAndFamily]",1,False
441,Why do children with autism have difficulty le...,[RelationAndFamily],1,True
442,Why do concerns about mercury in vaccines pers...,[Treatment],1,True


In [17]:
df3['Group'] = df3['Group'].apply(lambda x: x[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['Group'] = df3['Group'].apply(lambda x: x[0])


In [18]:
x=list(df3['Group'].unique())
x

['Treatment', 'RelationAndFamily', 'CausesAndSymptoms']

In [19]:
y=list(df3['Answered'].unique())
y

[True, False]

In [20]:
df3.groupby(['Group']).size().reset_index(name='counts')

Unnamed: 0,Group,counts
0,CausesAndSymptoms,24
1,RelationAndFamily,34
2,Treatment,53


In [21]:
df3.groupby(['Group','Answered']).size().reset_index(name='counts')

Unnamed: 0,Group,Answered,counts
0,CausesAndSymptoms,False,2
1,CausesAndSymptoms,True,22
2,RelationAndFamily,False,14
3,RelationAndFamily,True,20
4,Treatment,False,20
5,Treatment,True,33


In [22]:
for i in x:
    for j in y:
        print(i,j)
        k = df3.loc[(df3['Group']==i) & (df3['Answered']==j)]
        k.to_excel("Divide_Groups/"+i+"_"+str(j)+".xlsx",index=False)
        print('*'*100)

Treatment True
****************************************************************************************************
Treatment False
****************************************************************************************************
RelationAndFamily True
****************************************************************************************************
RelationAndFamily False
****************************************************************************************************
CausesAndSymptoms True
****************************************************************************************************
CausesAndSymptoms False
****************************************************************************************************


In [23]:
df4.to_excel("Divide_Groups/Others.xlsx",index=False)