In [4]:
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer

from nltk.stem.wordnet import WordNetLemmatizer

from gensim.corpora import Dictionary
import pickle as pkl

import guidedlda
from sklearn.feature_extraction.text import CountVectorizer


In [2]:
# get the clearned reviews 
with open("clean_reviews.pkl", "rb") as file:
    clean_reviews = pkl.load(file)
    print(clean_reviews[0])

['dr', 'goldberg', 'offer', 'everything', 'look', 'for', 'in', 'general', 'practitioner', 'he', 'nice', 'and', 'easy', 'to', 'talk', 'to', 'without', 'being', 'patronizing', 'he', 'always', 'on', 'time', 'in', 'seeing', 'his', 'patient', 'he', 'affiliated', 'with', 'top', 'notch', 'hospital', 'nyu', 'which', 'my', 'parent', 'have', 'explained', 'to', 'me', 'is', 'very', 'important', 'in', 'case', 'something', 'happens', 'and', 'you', 'need', 'surgery', 'and', 'you', 'can', 'get', 'referral', 'to', 'see', 'specialist', 'without', 'having', 'to', 'see', 'him', 'first', 'really', 'what', 'more', 'do', 'you', 'need', 'sitting', 'here', 'trying', 'to', 'think', 'of', 'any', 'complaint', 'have', 'about', 'him', 'but', 'really', 'drawing', 'blank']


In [3]:
with open("clean_review_corpus.pkl", "rb") as file:
    corpus = pkl.load(file)
    print(corpus[0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 2), (20, 1), (21, 3), (22, 1), (23, 2), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 2), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 2), (40, 1), (41, 2), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 2), (56, 3)]


In [5]:
# get the vocab_list
dictionary = Dictionary(clean_reviews)
dictionary.filter_extremes(no_below=20, no_above=0.5)
vocab_list = list(dictionary.token2id.keys())

In [18]:
with open("vocabulary.pkl", "wb") as file:
    pkl.dump(vocab_list, file)

In [6]:
def create_matrix(processed_text, vocabulary=vocab_list):
    vectorizer = CountVectorizer(vocabulary=vocabulary)
    dtm = vectorizer.fit_transform(processed_text)
    return dtm, vectorizer.vocabulary_

In [None]:
# getting the seed topics in dict and in matrix
# seed_topics = {
#     "doctor": ["medicine", "office", "hurt"],
#     "food": ["coffee", "pizza", "delicious"]
# }   

# processed_seeds = [" ".join(topic) for topic in list(seed_topics.values())]

# seed_topic_list, vocab_dict = create_matrix(processed_seeds)

In [13]:
processed_corpus = [" ".join(tokens) for tokens in clean_reviews]
dtm, vocab_dict = create_matrix(processed_corpus)


In [None]:
seed_topic_list = [["medicine", "office", "hurt"],["coffee", "pizza", "delicious"]]

seed_topics = {}
for topic_id, topic_words in enumerate(seed_topic_list):
    for word in topic_words:
        seed_topics[vocab_dict[word]] = topic_id


In [9]:
# building the model
model = guidedlda.GuidedLDA(n_topics=2, n_iter=10, random_state=7, refresh=20)

In [14]:
# fitting the model
model.fit(dtm, seed_topics=seed_topics, seed_confidence=0.15)

INFO:guidedlda:n_documents: 700000
INFO:guidedlda:vocab_size: 31595
INFO:guidedlda:n_words: 65988636
INFO:guidedlda:n_topics: 2
INFO:guidedlda:n_iter: 10
INFO:guidedlda:<0> log likelihood: -535905597
INFO:guidedlda:<9> log likelihood: -521357147


<guidedlda.guidedlda.GuidedLDA at 0x15ac6fd90>

In [17]:
topic_word = model.topic_word_
n_top_words = 8
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab_list)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

Topic 0: you we they on not have at had
Topic 1: we you they on not have had were


In [16]:
doc_topic = model.transform(dtm)
for i in range(9):
    print("top topic: {} Document: {}".format(doc_topic[i].argmax(),
                                                  ', '.join(np.array(vocab_list)[list(reversed(dtm[i,:].argsort()))[0:5]])))



KeyboardInterrupt: 