In [4]:
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer

from nltk.stem.wordnet import WordNetLemmatizer

from gensim.corpora import Dictionary
import pickle as pkl

import guidedlda
from sklearn.feature_extraction.text import CountVectorizer


In [2]:
# get the clearned reviews 
with open("clean_reviews.pkl", "rb") as file:
    clean_reviews = pkl.load(file)
    print(clean_reviews[0])

['dr', 'goldberg', 'offer', 'everything', 'look', 'for', 'in', 'general', 'practitioner', 'he', 'nice', 'and', 'easy', 'to', 'talk', 'to', 'without', 'being', 'patronizing', 'he', 'always', 'on', 'time', 'in', 'seeing', 'his', 'patient', 'he', 'affiliated', 'with', 'top', 'notch', 'hospital', 'nyu', 'which', 'my', 'parent', 'have', 'explained', 'to', 'me', 'is', 'very', 'important', 'in', 'case', 'something', 'happens', 'and', 'you', 'need', 'surgery', 'and', 'you', 'can', 'get', 'referral', 'to', 'see', 'specialist', 'without', 'having', 'to', 'see', 'him', 'first', 'really', 'what', 'more', 'do', 'you', 'need', 'sitting', 'here', 'trying', 'to', 'think', 'of', 'any', 'complaint', 'have', 'about', 'him', 'but', 'really', 'drawing', 'blank']


In [3]:
with open("clean_review_corpus.pkl", "rb") as file:
    corpus = pkl.load(file)
    print(corpus[0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 2), (20, 1), (21, 3), (22, 1), (23, 2), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 2), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 2), (40, 1), (41, 2), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 2), (56, 3)]


In [5]:
# get the vocab_list
dictionary = Dictionary(clean_reviews)
dictionary.filter_extremes(no_below=20, no_above=0.5)
vocab_list = list(dictionary.token2id.keys())

In [18]:
with open("vocabulary.pkl", "wb") as file:
    pkl.dump(vocab_list, file)

In [6]:
def create_matrix(processed_text, vocabulary=vocab_list):
    vectorizer = CountVectorizer(vocabulary=vocabulary)
    dtm = vectorizer.fit_transform(processed_text)
    return dtm, vectorizer.vocabulary_

In [None]:
# getting the seed topics in dict and in matrix
# seed_topics = {
#     "doctor": ["medicine", "office", "hurt"],
#     "food": ["coffee", "pizza", "delicious"]
# }   

# processed_seeds = [" ".join(topic) for topic in list(seed_topics.values())]

# seed_topic_list, vocab_dict = create_matrix(processed_seeds)

In [13]:
processed_corpus = [" ".join(tokens) for tokens in clean_reviews]
dtm, vocab_dict = create_matrix(processed_corpus)


In [None]:
# create topic seeds
seed_topic_list = [
    ["medicine", "office", "hurt"],
    ["coffee", "pizza", "delicious"]
]

seed_topics = {}
for topic_id, topic_words in enumerate(seed_topic_list):
    for word in topic_words:
        seed_topics[vocab_dict[word]] = topic_id


In [27]:
# building the model
model = guidedlda.GuidedLDA(n_topics=25, n_iter=150, random_state=7, refresh=20)

In [28]:
# fitting the model
# model.fit(dtm, seed_topics=seed_topics, seed_confidence=0.15)
model.fit(dtm, seed_confidence=0.15)

INFO:guidedlda:n_documents: 700000
INFO:guidedlda:vocab_size: 31595
INFO:guidedlda:n_words: 65988636
INFO:guidedlda:n_topics: 25
INFO:guidedlda:n_iter: 150
INFO:guidedlda:<0> log likelihood: -768290932
INFO:guidedlda:<20> log likelihood: -544845646
INFO:guidedlda:<40> log likelihood: -523167504
INFO:guidedlda:<60> log likelihood: -517464505
INFO:guidedlda:<80> log likelihood: -515242154
INFO:guidedlda:<100> log likelihood: -514171247
INFO:guidedlda:<120> log likelihood: -513573261
INFO:guidedlda:<140> log likelihood: -513116436
INFO:guidedlda:<149> log likelihood: -512954448


<guidedlda.guidedlda.GuidedLDA at 0x30f16e200>

In [51]:
topic_word = model.topic_word_
n_top_words = 25
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab_list)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

Topic 0: food good not place they chicken had you have on so like were are very here just be rice thai all sauce restaurant dish at
Topic 1: you food place they not have good are service here time if at there be go get so on great just can like been or
Topic 2: we food were our had place great service good on time not at very have they there so here drink be back bar out night
Topic 3: we were our food had not on they at so there service out table good time be have drink place ordered order came minute back
Topic 4: you on at there are have so they not we get room place like all were if be out just up had here your time
Topic 5: had were on not good we dish food very so sauce you like they chicken have restaurant which at be all just menu some nthe
Topic 6: they me on have not he you had at car be would so them time service when out up an get from back if no
Topic 7: we were had our on not at good very so restaurant food which ordered steak they have all be great dinner menu meal servi

In [73]:
item_term_matrix, _ = create_matrix(processed_corpus[:100:5])
item_term_matrix

<20x31595 sparse matrix of type '<class 'numpy.int64'>'
	with 1288 stored elements in Compressed Sparse Row format>

In [74]:
doc_topic = model.transform(item_term_matrix)

for i in range(len(doc_topic)):  # Use the correct range based on your data
    # Get the top topic for the document
    top_topic = doc_topic[i].argmax()
    
    # Get the top terms more frequent than 1 in the document from the dtm
    row = dtm[i, :].toarray().flatten()  # Convert sparse row to dense
    item_count = np.count_nonzero(row > 1)
    top_terms_indices = row.argsort()[::-1][:15]  # Indices of the top 10 terms
    top_terms = [vocab_list[idx] for idx in top_terms_indices]
    
    # Print the result
    print(f"Top topic: {top_topic}, Document terms: {', '.join(top_terms)}")



Top topic: 20, Document terms: he, you, him, need, really, have, see, without, parent, referral, patient, on, patronizing, offer, practitioner
Top topic: 20, Document terms: have, doctor, many, staff, patient, dr, phone, other, you, answer, get, had, into, incomprehensible, isn
Top topic: 8, Document terms: he, all, very, been, over, your, dr, patient, year, out, think, health, option, question, one
Top topic: 18, Document terms: be, new, almost, will, position, said, he, there, week, arizona, trying, nyc, doctor, you, think
Top topic: 14, Document terms: you, office, he, doctor, not, when, about, away, they, me, call, johnson, will, before, practice
Top topic: 9, Document terms: notch, doctor, top, be, him, his, country, one, minimal, nit, referred, manner, he, because, wonderful
Top topic: 8, Document terms: have, dr, doctor, his, very, we, him, many, who, come, been, ha, diagnosed, every, had
Top topic: 19, Document terms: me, bill, cover, insurance, work, office, visit, can, make, 

In [None]:
# TODO: Try making code to extract 5 documents per topic