# Topic Modelling - LDA

## Data processing

In [2]:
import pandas as pd

In [9]:
df = pd.read_excel('dataNLU.xlsx')
df.head()

Unnamed: 0,Utterances,Intents,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,"POSextracts(NN,VBG) [not included in data ]"
0,how to access google,offer,,,,,,,access google
1,how to access missed call alert,ideafiber_usage_alert,,,,,,,access call alert
2,how to access my sd card on myideafi device,ideacinema_download_externalmemory,,,,,,,access card myjiofi device
3,how to access myidea fi device for changing pa...,use_ideafi_device,,,,,,,access myjio fi device changing password
4,how to access that one video is taking how muc...,ideaNewsPaper_devices,,,,,,,access video taking gb


In [217]:
data = df['Utterances']
data_1 = data[0:4000]
test_data = data[4000:]

In [13]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np

import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/krishna/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [123]:
# Data preprcessing
# Stemming the words and splitting each sentence into relevant tokens

def lemmatize_stemming(text):
    stemmer = SnowballStemmer("english")
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token)>3:
            result.append(lemmatize_stemming(token))
    return result

In [124]:
# Verify preprocessing

doc_sample = data[4000]
print('original document: ')
print(doc_sample.split())
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['how', 'to', 'use', 'data', 'booster']


 tokenized and lemmatized document: 
['data', 'booster']


In [218]:
# Stem and tokenize each training example

processed_data = data_1.map(preprocess)
processed_test = test_data.map(preprocess)
processed_data[:10]

0                             [access, googl]
1                       [access, miss, alert]
2             [access, card, myideafi, devic]
3    [access, myidea, devic, chang, password]
4                       [access, video, take]
5                              [account, set]
6                                   [account]
7                                   [account]
8                             [account, idea]
9               [achiev, idea, celebr, offer]
Name: Utterances, dtype: object

In [221]:
# Create a dictionary from processed_data containing number of times a word appears in training set
dictionary = gensim.corpora.Dictionary(processed_data)
test_dict = gensim.corpora.Dictionary(processed_test)

# Filter out tokens 
# Keep only tokens that occur more than 3 times and less than in 85 percent of the dataset
# Keep only the 4000 most frequent tokens
dictionary.filter_extremes(no_below = 3, no_above = 0.85, keep_n = 4000)
test_dict.filter_extremes(no_below = 3, no_above = 0.95, keep_n = 4000)

#For each data, create a dictionary to store how many words and how many times does a word appear in a sentence
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_data]
test_corpus = [test_dict.doc2bow(doc) for doc in processed_test]

bow_corpus[-1]

[(17, 1)]

In [222]:
# Preview bag of words for our sample preprocessed document

bow_doc_sample = bow_corpus[-1]

for i in range(len(bow_doc_sample)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_sample[i][0], 
                                               dictionary[bow_doc_sample[i][0]], 
bow_doc_sample[i][1]))

Word 17 ("data") appears 1 time.


## Traning using LDA bag of words and TF-IDF Model

In [223]:
# TF-IDF Model

from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.7272418883021802), (1, 0.6863812613254235)]


In [224]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=15, id2word=dictionary, passes=25, workers=2)
lda_model_topics = []

# Printing the different topics recognised by bag of words model
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \n Words: {}'.format(idx, topic))
    lda_model_topics.append(topic)

Topic: 0 
 Words: 0.165*"idea" + 0.094*"prime" + 0.069*"stop" + 0.068*"tone" + 0.055*"membership" + 0.053*"servic" + 0.039*"appli" + 0.039*"alert" + 0.037*"miss" + 0.035*"logout"
Topic: 1 
 Words: 0.190*"offer" + 0.157*"voucher" + 0.146*"redeem" + 0.126*"idea" + 0.044*"paytm" + 0.044*"unlock" + 0.034*"code" + 0.028*"coupon" + 0.027*"rupe" + 0.024*"celebr"
Topic: 2 
 Words: 0.289*"chang" + 0.192*"plan" + 0.068*"idea" + 0.065*"password" + 0.057*"messag" + 0.038*"avail" + 0.021*"usernam" + 0.018*"current" + 0.018*"reach" + 0.018*"loan"
Topic: 3 
 Words: 0.290*"activ" + 0.158*"pack" + 0.105*"ideasim" + 0.101*"idea" + 0.101*"celebr" + 0.083*"ideaphon" + 0.014*"gvoic" + 0.014*"purchas" + 0.011*"ring" + 0.010*"exchang"
Topic: 4 
 Words: 0.416*"data" + 0.156*"free" + 0.063*"idea" + 0.049*"ideaapp" + 0.040*"money" + 0.026*"share" + 0.020*"booster" + 0.020*"cashback" + 0.017*"extra" + 0.017*"earn"
Topic: 5 
 Words: 0.163*"milk" + 0.148*"dairi" + 0.132*"call" + 0.078*"cadburi" + 0.060*"data" + 0.

In [225]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=15, id2word=dictionary, passes=25, workers=4)
lda_model_tfidf_topics = []

# Printing the different topics recognised by tf-idf model
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} \n Word: {}'.format(idx, topic))
    lda_model_tfidf_topics.append(topic)

Topic: 0 
 Word: 0.137*"know" + 0.076*"call" + 0.065*"unlock" + 0.063*"myideanumb" + 0.054*"regist" + 0.053*"money" + 0.050*"appli" + 0.041*"ideamus" + 0.038*"set" + 0.032*"save"
Topic: 1 
 Word: 0.243*"recharg" + 0.090*"remov" + 0.079*"number" + 0.073*"delet" + 0.045*"phone" + 0.044*"card" + 0.036*"tone" + 0.035*"histori" + 0.033*"idea" + 0.032*"cancel"
Topic: 2 
 Word: 0.137*"tune" + 0.097*"play" + 0.089*"download" + 0.065*"caller" + 0.050*"idea" + 0.046*"video" + 0.045*"instal" + 0.026*"track" + 0.025*"custom" + 0.022*"whatsapp"
Topic: 3 
 Word: 0.134*"mobil" + 0.082*"ideanumb" + 0.079*"milk" + 0.074*"dairi" + 0.055*"creat" + 0.045*"recharg" + 0.043*"cadburi" + 0.041*"link" + 0.038*"data" + 0.030*"booster"
Topic: 4 
 Word: 0.157*"account" + 0.089*"ideaapp" + 0.060*"sign" + 0.053*"open" + 0.051*"network" + 0.048*"devic" + 0.048*"logout" + 0.032*"cashback" + 0.028*"roam" + 0.027*"setup"
Topic: 5 
 Word: 0.207*"data" + 0.142*"activ" + 0.109*"plan" + 0.097*"ideasim" + 0.061*"phone" + 0.

In [226]:
print(df["Intents"][2000])
print(data_1[2000])
print(processed_data[2000])

unable_to_download_app
how to increase idea downloading speed
['increas', 'idea', 'download', 'speed']


In [227]:
for index, score in sorted(lda_model_tfidf[bow_corpus[2000]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))
    


Score: 0.45875915819628926	 
Topic: 0.120*"increas" + 0.111*"speed" + 0.062*"balanc" + 0.061*"extra" + 0.045*"switch"

Score: 0.3679072623051828	 
Topic: 0.137*"tune" + 0.097*"play" + 0.089*"download" + 0.065*"caller" + 0.050*"idea"

Score: 0.013333406128404108	 
Topic: 0.167*"idea" + 0.098*"deactiv" + 0.096*"internet" + 0.068*"password" + 0.033*"record"

Score: 0.013333363417403968	 
Topic: 0.152*"pack" + 0.149*"offer" + 0.126*"celebr" + 0.069*"idea" + 0.064*"activ"

Score: 0.013333354848762843	 
Topic: 0.232*"free" + 0.107*"connect" + 0.100*"myidea" + 0.068*"data" + 0.050*"idea"

Score: 0.01333335254714329	 
Topic: 0.088*"messag" + 0.086*"updat" + 0.064*"youtub" + 0.063*"onlin" + 0.051*"wifi"

Score: 0.013333351098967895	 
Topic: 0.255*"chang" + 0.125*"number" + 0.054*"share" + 0.053*"membership" + 0.043*"contact"

Score: 0.01333334926875857	 
Topic: 0.207*"data" + 0.142*"activ" + 0.109*"plan" + 0.097*"ideasim" + 0.061*"phone"

Score: 0.013333347470686041	 
Topic: 0.243*"recharg" + 

## Storing the results

In [239]:
lda_topics_data = []
lda_tfidf_data_topics = []

# test_data = data[4000:]

for i in np.arange(len(test_data)):
    sentence = test_data[i+4000]
    lda_index = sorted(lda_model[test_corpus[i]], key=lambda tup: -1*tup[1])[0][0]
    tfidf_index = sorted(lda_model_tfidf[test_corpus[i]], key=lambda tup: -1*tup[1])[0][0]
    lda_topics_data.append(lda_model.print_topic(lda_index, 4))
    lda_tfidf_data_topics.append(lda_model_tfidf.print_topic(tfidf_index, 4))
#     print(lda_index, tfidf_index)
#     print("Sentence:", sentence)
#     print("LDA Model Topic:",  lda_model.print_topic(topic_index, 4))
#     print("LDA-TFIDF Model Topic:",  lda_model_tfidf.print_topic(topic_index, 4))
#     print('\n')
    

dataframe = {
    "Data" : test_data, 
    "LDA_topic" : lda_topics_data,
    "LDA_topic_tfidf" : lda_tfidf_data_topics
}

labelled_dataframe = pd.DataFrame(dataframe)
labelled_dataframe.to_csv("LDA_test.csv")

In [237]:
len(test_data)

565

## Result Analysis

Topic Modelling works well for a lot of examples topics that are represented well in the dataset. For example: 

In [203]:
for i in np.arange(12,18):
    print(labelled_dataframe.iloc[i], "\n")
# print(labelled_dataframe.loc[13])

Data                                  how to activate 10gb data free
LDA_topic          0.475*"data" + 0.158*"free" + 0.120*"chang" + ...
LDA_topic_tfidf    0.260*"data" + 0.180*"free" + 0.054*"regist" +...
Name: 12, dtype: object 

Data                                 how to activate 16 gb free data
LDA_topic          0.475*"data" + 0.158*"free" + 0.120*"chang" + ...
LDA_topic_tfidf    0.260*"data" + 0.180*"free" + 0.054*"regist" +...
Name: 13, dtype: object 

Data                    how to activate 1gp cadbury dairy milk offer
LDA_topic          0.259*"plan" + 0.108*"milk" + 0.102*"dairi" + ...
LDA_topic_tfidf    0.166*"check" + 0.082*"milk" + 0.076*"dairi" +...
Name: 14, dtype: object 

Data                  how to activate 2 years celebrate of idea pack
LDA_topic          0.211*"pack" + 0.133*"idea" + 0.122*"celebr" +...
LDA_topic_tfidf    0.161*"idea" + 0.124*"pack" + 0.118*"plan" + 0...
Name: 15, dtype: object 

Data                                   how to activate 2gb data free

In general, there are a lot of examples similar to these and LDA does a reasonably good job on these examples. In the above examples, we can clearly see that the topic words directly relate to the data example.  

LDA struggles to assign topics to examples which are not represented very well in the dataset. For example: 

In [206]:
for i in [0,1,528]:
    print(labelled_dataframe.iloc[i], "\n")
# print(labelled_dataframe.loc[13])

Data                                            how to access google
LDA_topic          0.160*"account" + 0.159*"ideaphon" + 0.130*"ac...
LDA_topic_tfidf    0.086*"voic" + 0.071*"sign" + 0.071*"password"...
Name: 0, dtype: object 

Data                                 how to access missed call alert
LDA_topic          0.259*"plan" + 0.108*"milk" + 0.102*"dairi" + ...
LDA_topic_tfidf    0.089*"transfer" + 0.077*"link" + 0.068*"stop"...
Name: 1, dtype: object 

Data                                            how to change gender
LDA_topic          0.475*"data" + 0.158*"free" + 0.120*"chang" + ...
LDA_topic_tfidf    0.203*"chang" + 0.076*"login" + 0.059*"ideamus...
Name: 528, dtype: object 



The key tokens in the above examples are very specific to those examples and not found in other examples of the data. Hence, LDA does not properly classify these examples. Also expectedly, LDA does not properly classfiy examples that were labelled garbage in the original datas