# 0. Installation (one time job)

In [1]:
!pip install scikit-learn



In [2]:
# xlrd no longer support xlsx - https://stackoverflow.com/questions/65254535/xlrd-biffh-xlrderror-excel-xlsx-file-not-supported
!pip install openpyxl



In [7]:
#!pip install text-preprocessing

Collecting text-preprocessing
  Downloading text_preprocessing-0.1.1-py2.py3-none-any.whl.metadata (3.3 kB)
Collecting pyspellchecker (from text-preprocessing)
  Downloading pyspellchecker-0.8.1-py3-none-any.whl.metadata (9.4 kB)
Collecting contractions (from text-preprocessing)
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting names-dataset==2.1 (from text-preprocessing)
  Downloading names_dataset-2.1.0-py3-none-any.whl.metadata (219 bytes)
Collecting unittest-xml-reporting (from text-preprocessing)
  Downloading unittest_xml_reporting-3.2.0-py2.py3-none-any.whl.metadata (11 kB)
Collecting textsearch>=0.0.21 (from contractions->text-preprocessing)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions->text-preprocessing)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions->text-preprocessing)
  Downlo

# 1. Import Library

In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from TextPreprocessing import text_preprocessing

# 2. Check Data

In [12]:
import nltk

# Fetch the individual NLTK resources first; their return values are not
# displayed because these calls are not the last expression of the cell.
for resource_id in ('punkt', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(resource_id)

# Finally fetch the 'popular' collection; as the cell's last expression, its
# return value is what the notebook displays.
nltk.download('popular')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_dat

True

In [13]:
# Show every column when a DataFrame is rendered
pd.set_option('display.max_columns', None)

# Load the 'FAQ' sheet of the workbook; the openpyxl engine is requested
# explicitly for the .xlsx file
data = pd.read_excel(
    './ASD FAQ KB v001.xlsx',
    engine='openpyxl',
    sheet_name='FAQ',
)

# Preview the first rows
data.head()

Unnamed: 0,sn,Question,Long_Answer,Short_Answer,Source,Remarks
0,1,What are Autism Spectrum Disorders (ASD)?,ASD refers to a wide spectrum of neurodevelopm...,,http://birchtreecenter.org/learn/autism,
1,2,How common is autism?,According to a 2020 report commissioned by the...,,http://birchtreecenter.org/learn/autism,
2,3,What causes autism? Can it be cured?,The causes of this complex disorder remain unc...,,http://birchtreecenter.org/learn/autism,
3,4,Why doesn’t intervention center refer to its s...,Our students are children or youth who are cha...,,http://birchtreecenter.org/learn/autism,
4,5,What are the types of Autism Spectrum Disorders?,Autistic Disorder; Asperger Syndrome; Pervasiv...,,http://dhss.alaska.gov/dph/wcfh/Pages/autism/s...,


In [14]:
# Report the (rows, columns) dimensions of the loaded sheet
print(data.shape)

(226, 6)


# 3. Data Preprocessing

In [15]:
# Select the Long_Answer column. Indexing with [] (rather than .get) raises a
# KeyError naming the column if it is missing, instead of returning None and
# failing later with an unrelated AttributeError on .map
long_answer = data['Long_Answer']

# Preprocess each answer; the result of text_preprocessing is joined with
# spaces, so it is expected to yield string tokens
long_answer = long_answer.map(lambda x: ' '.join(text_preprocessing(x)))

# Vectorize the answers as a sparse document-term matrix of token counts
# (bag-of-words: CountVectorizer stores counts, not one-hot indicators)
sparse_vectorizer = CountVectorizer(strip_accents = 'unicode')
sparse_vectors = sparse_vectorizer.fit_transform(long_answer)

# Shape is (number of answers, vocabulary size); (226, 2753) for this workbook
print(sparse_vectors.shape)

(226, 2753)


# 4. Build Topic Model using LDA

In [16]:
# Number of topics to extract (chosen by the analyst)
n_topics = 4

# Configure LDA with online learning and a fixed random seed
lda = LatentDirichletAllocation(
    n_components=n_topics,
    learning_method='online',
    max_iter=1000,
    random_state=0,
)

# Fit the topic model on the document-term count matrix
lda.fit(sparse_vectors)

# 5. Display the resulting topics/clusters of ASD FAQ's Long_Answer field

In [17]:
# Print the top-n key words
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    For each row of ``model.components_`` a ``Topic #<index>:`` header
    (zero-based) is printed, followed by a line with the top words joined by
    spaces. A single blank line is printed after the last topic.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays that support ``argsort()``.
        feature_names: Sequence mapping a feature index to its word.
        n_top_words: How many words to print per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # argsort() is ascending, so the reversed tail slice yields the
        # indices of the n_top_words largest weights, largest first.
        ranked = weights.argsort()[:-n_top_words - 1:-1]
        words = [feature_names[pos] for pos in ranked]
        print(" ".join(words))
    print()

In [18]:
# Number of key words to list per topic, and the vocabulary to look them up in
n_top_words = 10
feature_names = sparse_vectorizer.get_feature_names_out()

# List the top words of each topic, numbering topics from 1
for topic_number, topic_weights in enumerate(lda.components_, start=1):
    # argsort() is ascending; the reversed tail slice gives the indices of
    # the n_top_words largest weights, largest first
    top_term_ids = topic_weights.argsort()[:-n_top_words - 1:-1]
    print('Topic {num}'.format(num=topic_number))
    print(" ".join(feature_names[term_id] for term_id in top_term_ids))
    print()

Topic 1
autism disorder asd child may spectrum people behavior social cause

Topic 2
child autism may treatment intervention diagnosis early help parent therapy

Topic 3
institute national autism tel information health disorder fax behavior md

Topic 4
ability assessment behaviour concern specific characteristic academic ot memory diet



In [19]:
# Infer the topic distribution of the first document and report its most
# probable topic (argmax is zero-based, so +1 gives a 1-based topic number)
print("1st document(long FAQ answer) belongs to Topic",lda.transform(sparse_vectors[0]).argmax()+1)

1st document(long FAQ answer) belongs to Topic 1


In [20]:
# Display topics of the first 20 long FAQ answers (fewer if the corpus is smaller).
# Slicing never runs past the end of the matrix, and a single transform() call
# infers all rows at once instead of re-running inference once per document.
doc_topics = lda.transform(sparse_vectors[:20]).argmax(axis=1)
for i, topic_id in enumerate(doc_topics):
    # argmax is zero-based, so +1 gives the 1-based topic number
    print("Document (long FAQ answer)", i+1, "belongs to Topic", topic_id+1)

Document (long FAQ answer) 1 belongs to Topic 1
Document (long FAQ answer) 2 belongs to Topic 1
Document (long FAQ answer) 3 belongs to Topic 1
Document (long FAQ answer) 4 belongs to Topic 1
Document (long FAQ answer) 5 belongs to Topic 1
Document (long FAQ answer) 6 belongs to Topic 1
Document (long FAQ answer) 7 belongs to Topic 3
Document (long FAQ answer) 8 belongs to Topic 1
Document (long FAQ answer) 9 belongs to Topic 1
Document (long FAQ answer) 10 belongs to Topic 1
Document (long FAQ answer) 11 belongs to Topic 1
Document (long FAQ answer) 12 belongs to Topic 1
Document (long FAQ answer) 13 belongs to Topic 1
Document (long FAQ answer) 14 belongs to Topic 1
Document (long FAQ answer) 15 belongs to Topic 1
Document (long FAQ answer) 16 belongs to Topic 2
Document (long FAQ answer) 17 belongs to Topic 2
Document (long FAQ answer) 18 belongs to Topic 2
Document (long FAQ answer) 19 belongs to Topic 2
Document (long FAQ answer) 20 belongs to Topic 2


# 6. Interpret the identified topics (using top 10 words)

### Topic 1 is about: It is a review of autism in the ASD children dataset and its spectrum among people, based on behaviour


### Topic 2 is about: Autism in children can be treated from an early intervention by the parents to start therapy


### Topic 3 is about: The National Institute of Autism can give information about the disorder if the behaviour is sent to it by fax


### Topic 4 is about: The ability or behaviour associated with autism concerns certain characteristics like memory and diet



# 7. Food for thought: What about the clusters/topics of "Questions"?

Solution: The clusters created from questions define the topics we will be obtaining

---
`The end is called the new start.` --- ISS : **I** **S**(elf) **S**(tudy)