In [73]:
import pandas as pd
import numpy as np
import guidedlda
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
import pickle

In [74]:
df = pd.read_excel('dataNLU.xlsx')
data = df['Utterances']
data.head()

0                                 how to access google
1                      how to access missed call alert
2          how to access my sd card on myideafi device
3    how to access myidea fi device for changing pa...
4    how to access that one video is taking how muc...
Name: Utterances, dtype: object

In [None]:
guidedlda.datasets.load_data

In [75]:
nltk.download('stopwords')
stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/krishna/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [150]:
# Adding some domain specific stopwords
domain_words = ["access", "account", "activate", "remedy", "add on",
               "usage alert", "celebration", "data booster", "idea data booster", 
                "idea data plan", "idea plan", "data plan", "idea tune subscription",
               "idea tune", "slow internet", "internet", "idea subscription", 
               "message", "ideaphone", "ideanet", "ideatv", "idea phone",
               "idea net", "idea tv", "idea sim", "ideasim", "boost", "idea data", 
               "myidea", "postpaid", "special pack", "voucher", "password", "call",
               "account", "application", "delete", "download", "aadhar", 
               "idea app", "recharge", "photos", "messages", "msg", "sms",
               "idea", "caller tune", "customer", "hello tune", "idea",
               "idea tune", "alert", "missed call", "ideamusic", "idea music",
               "hotspot", "network", "stop", "take", "unblock", "how to use", 
               ]
stopwords.extend(domain_words)
stopwords = set(stopwords)

AttributeError: 'set' object has no attribute 'extend'

In [77]:
Y = CountVectorizer(ngram_range=(1,4), min_df=1, max_df=.80, stop_words=stopwords)

In [78]:
X = Y.fit_transform(data)

In [79]:
X.shape

(4565, 10227)

In [80]:
vocab = Y.vocabulary_

In [81]:
tf_feature_names = Y.get_feature_names()
word2id = dict((v, idx) for idx, v in enumerate(vocab))

In [123]:
seed_topic_list = [
    ["data booster", "data pack"],
    ["dairy milk", "myideaapp", "offer"],
    ["ideatune", "subscription", "recharges"],
    ["sim card", "contact number", "number"],
    ["phone", "booking"],
    ["change", "accounts", "number"],
    ["check", "data", "information"],
    ["myideaapp", "connect", "myideafi device"]
]

In [159]:
model = guidedlda.GuidedLDA(n_topics = 15, n_iter = 500, random_state = 4 ,refresh = 20)

In [160]:
seed_topics = {}
for t_id, st in enumerate(seed_topic_list):
    for word in st:
        seed_topics[word2id[word]] = t_id

In [161]:
model.fit(X, seed_topics=seed_topics, seed_confidence=0.25)
# pickle.dump(model, open('model','wb'))

INFO:guidedlda:n_documents: 4565
INFO:guidedlda:vocab_size: 10227
INFO:guidedlda:n_words: 22675
INFO:guidedlda:n_topics: 15
INFO:guidedlda:n_iter: 500
INFO:guidedlda:<0> log likelihood: -330363
INFO:guidedlda:<20> log likelihood: -208892
INFO:guidedlda:<40> log likelihood: -206168
INFO:guidedlda:<60> log likelihood: -204165
INFO:guidedlda:<80> log likelihood: -203104
INFO:guidedlda:<100> log likelihood: -202328
INFO:guidedlda:<120> log likelihood: -202150
INFO:guidedlda:<140> log likelihood: -201612
INFO:guidedlda:<160> log likelihood: -201137
INFO:guidedlda:<180> log likelihood: -201014
INFO:guidedlda:<200> log likelihood: -200730
INFO:guidedlda:<220> log likelihood: -200543
INFO:guidedlda:<240> log likelihood: -200307
INFO:guidedlda:<260> log likelihood: -200206
INFO:guidedlda:<280> log likelihood: -200165
INFO:guidedlda:<300> log likelihood: -199979
INFO:guidedlda:<320> log likelihood: -200023
INFO:guidedlda:<340> log likelihood: -199981
INFO:guidedlda:<360> log likelihood: -199935


<guidedlda.guidedlda.GuidedLDA at 0x11c5a1208>

In [162]:
n_top_words = 1
topic_word = model.topic_word_

no_top_words = 4

def display_topics(feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(topic_idx)
        print("/".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
        
display_topics(tf_feature_names, no_top_words)

0
data/add/pack/use
1
tune/change/set/caller
2
set/new/phone/ringtone
3
use/play/coupon/kbc
4
use/redeem/coupons/get
5
voice/phone/set/connect
6
make/change/mobile/device
7
get/free/offer/pack
8
use/vouchers/data/get
9
number/mobile/change/sim
10
data/know/check/balance
11
milk/dairy/dairy milk/data
12
data/get/gb/plan
13
data/speed/increase/check
14
number/another/ideanumber/vouchers


## Result Analysis

Guided LDA Model recognises these 15 topics. The topics are similar to the ones recognised by the LDA Model and once again, garbage data and data with low representation are not included in these sentences. Despite seeding some domain words and topics, the results are no better than provided by LDA