In [2]:
import os
import pickle as pkl
import random

import guidedlda
import numpy as np
import pandas as pd
from gensim.corpora import Dictionary
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Load the preprocessed reviews (each review is a list of tokens).
# NOTE(review): pickle can execute arbitrary code on load -- only open trusted files.
with open("clean_reviews.pkl", "rb") as reviews_fh:
    clean_reviews = pkl.load(reviews_fh)

# Peek at the first review as a sanity check.
print(clean_reviews[0])

['dr', 'goldberg', 'offer', 'everything', 'look', 'general', 'practitioner', 'nice', 'easy', 'talk', 'without', 'patronizing', 'always', 'time', 'seeing', 'patient', 'affiliated', 'top', 'notch', 'hospital', 'nyu', 'parent', 'explained', 'important', 'case', 'something', 'happens', 'need', 'surgery', 'get', 'referral', 'see', 'specialist', 'without', 'see', 'first', 'really', 'need', 'sitting', 'trying', 'think', 'complaint', 'really', 'drawing', 'blank']


In [32]:
# with open("clean_review_corpus.pkl", "rb") as file:
#     corpus = pkl.load(file)
#     print(corpus[0])

#### Downsample Reviews for Checkpoint

In [None]:
# Seed the RNG first so the checkpoint subset is reproducible.
random.seed(10701)

print(len(clean_reviews))
# Draw 10,000 reviews without replacement.
clean_reviews_down = random.sample(clean_reviews, k=10000)
print(len(clean_reviews_down))

700000
10000


In [5]:
# Load the original (raw text) reviews.
# NOTE(review): pickle can execute arbitrary code on load -- only open trusted files.
with open("reviews.pkl", "rb") as raw_fh:
    reviews = pkl.load(raw_fh)

# Peek at the first raw review as a sanity check.
print(reviews[0])

This was the first place in Vegas where the Yelpers let me down. The salt and pepper shrimp appetizer was inedible: all of the shell was left on underneath the crispy fried coating. As a result, the sauce used to marinate the shrimp never penetrated the meat, it coagulated in the head, and EXPLODED on you upon taking a bite. GROSS! The potstickers and the beef chow fun lacked any flavor whatsoever. The only redeeming quality (the service is pretty awful) is the roast duck: crispy and tasty skin with relatively moist meat and a relative bargain compared to places in NY and L.A.


In [6]:
# Build the vocabulary from the downsampled reviews, then prune it with
# gensim's filter_extremes using the no_below / no_above thresholds below.
dictionary = Dictionary(clean_reviews_down)
dictionary.filter_extremes(no_below=20, no_above=0.5)

# Position in this list is what later cells use as the term's column index.
vocab_list = [token for token in dictionary.token2id]

In [None]:
# with open("vocabulary.pkl", "wb") as file:
#     pkl.dump(vocab_list, file)

In [None]:
# with open("vocabulary.pkl", "rb") as file:
#     vocab_list = pkl.load(file)

In [7]:
def create_matrix(processed_text, vocabulary=None):
    """Build a document-term count matrix over a fixed vocabulary.

    Args:
        processed_text: iterable of documents, each one a single string of
            space-separated tokens.
        vocabulary: list of terms that define the matrix columns. When omitted,
            the notebook-level ``vocab_list`` is used *as it exists at call
            time*.

    Returns:
        A ``(dtm, vocabulary_)`` tuple: the sparse count matrix produced by
        ``CountVectorizer`` and its term -> column-index mapping.
    """
    if vocabulary is None:
        # Resolve the default here rather than in the signature: a default of
        # ``vocabulary=vocab_list`` is frozen when the ``def`` cell runs, so
        # rebuilding ``vocab_list`` later would silently keep the stale
        # vocabulary and yield a matrix whose width does not match the model.
        vocabulary = vocab_list
    vectorizer = CountVectorizer(vocabulary=vocabulary)
    dtm = vectorizer.fit_transform(processed_text)
    return dtm, vectorizer.vocabulary_

In [8]:
# CountVectorizer expects one string per document, so glue each token list
# back together with single spaces.
processed_corpus = list(map(" ".join, clean_reviews_down))

# Document-term matrix plus the term -> column-index mapping.
dtm, vocab_dict = create_matrix(processed_corpus)


# Hyperparameters and Seeds

In [21]:
# Hyperparameters
N_TOPIC = 7             # number of clusters
N_ITER = 1000           # number of iterations run on dataset
RANDOM_STATE = 7        # initial random seed
REFRESH = 20            # how often you print log-likelihood
SEED_CONFIDENCE=0.5     # probability of using this word as a seed

# Seed words per topic: the position in this list is the topic id.
seed_topic_list = [
    ["pizza", "food", "chicken", "burger", "taco", "salsa", "mexican", "chip", "bean", "burrito", "enchilada", "rice", "tortilla", "guacamole", "carne", "asada", "shrimp", "lobster", "sushi", "roll", "fish", "sashimi", "tuna", "tempura"],
    ["doctor", "office", "medical", "service", "patient", "health", "insurance", "client", "car", "vehicle"], 
    ["hair", "nail", "salon", "gel", "polish", "manicure"], #maybe counts as service? 
    "de le la u00e9 et est u00e0 un com pa www que da en die yelp u00e9e du select une pour service au mais plus".split(" "), #french
    "bar drink beer night bartender table music atmosphere".split(" "), #bars
    "hotel bed bath shower spa pool casino strip night desk check club show sexy men girl".split(" "),#vegas
    "store shop price item shopping buy need sale mall product boutique outlet walmart".split(" "),#shopping malls
]

# Map vocabulary column index -> seeded topic id.
# Two things to be aware of:
#   * A seed word that was pruned from the vocabulary has no column index.
#     Indexing vocab_dict directly would raise KeyError, so such words are
#     skipped and reported instead.
#   * The mapping holds one topic per word. A word listed under several topics
#     (e.g. "service", "night") ends up assigned to the LAST topic listing it.
seed_topics = {}
missing_seed_words = []
for topic_id, topic_words in enumerate(seed_topic_list):
    for word in topic_words:
        word_id = vocab_dict.get(word)
        if word_id is None:
            missing_seed_words.append(word)
            continue
        seed_topics[word_id] = topic_id

if missing_seed_words:
    print(f"Skipped {len(missing_seed_words)} seed word(s) not in the vocabulary: {missing_seed_words}")


# Training/Fitting Model

In [23]:
# Instantiate the guided LDA model from the hyperparameter constants.
model = guidedlda.GuidedLDA(
    n_topics=N_TOPIC,
    n_iter=N_ITER,
    random_state=RANDOM_STATE,
    refresh=REFRESH,
)

In [24]:
# Fit on the document-term matrix, passing the seed-word -> topic mapping.
# (Unseeded alternative: model.fit(dtm, seed_confidence=SEED_CONFIDENCE))
model.fit(
    dtm,
    seed_topics=seed_topics,
    seed_confidence=SEED_CONFIDENCE,
)

INFO:guidedlda:n_documents: 10000


INFO:guidedlda:vocab_size: 3266
INFO:guidedlda:n_words: 581744
INFO:guidedlda:n_topics: 7
INFO:guidedlda:n_iter: 1000
INFO:guidedlda:<0> log likelihood: -5641643
INFO:guidedlda:<20> log likelihood: -4513566
INFO:guidedlda:<40> log likelihood: -4450729
INFO:guidedlda:<60> log likelihood: -4423268
INFO:guidedlda:<80> log likelihood: -4407645
INFO:guidedlda:<100> log likelihood: -4398073
INFO:guidedlda:<120> log likelihood: -4389204
INFO:guidedlda:<140> log likelihood: -4383532
INFO:guidedlda:<160> log likelihood: -4380337
INFO:guidedlda:<180> log likelihood: -4378016
INFO:guidedlda:<200> log likelihood: -4372005
INFO:guidedlda:<220> log likelihood: -4370342
INFO:guidedlda:<240> log likelihood: -4367901
INFO:guidedlda:<260> log likelihood: -4365128
INFO:guidedlda:<280> log likelihood: -4364856
INFO:guidedlda:<300> log likelihood: -4361293
INFO:guidedlda:<320> log likelihood: -4360134
INFO:guidedlda:<340> log likelihood: -4357725
INFO:guidedlda:<360> log likelihood: -4355782
INFO:guidedlda

<guidedlda.guidedlda.GuidedLDA at 0x3a810bf10>

## Top Words per Topic

In [26]:
# One row of term weights per topic.
topic_word = model.topic_word_
print(len(topic_word))

n_top_words = 25
# Convert the vocabulary once so it can be fancy-indexed inside the loop.
vocab_array = np.array(vocab_list)
for i, topic_dist in enumerate(topic_word):
    # Indices of the n_top_words largest weights, heaviest first.
    heaviest_first = np.argsort(topic_dist)[::-1][:n_top_words]
    topic_words = vocab_array[heaviest_first]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

7
Topic 0: good food place like great really restaurant chicken one time would go ordered service get sauce menu back also ni nthe cheese try salad pizza
Topic 1: would back time get said told service customer one never call even day called go could asked got like know went ni manager phone car
Topic 2: great nail time place hair massage salon service like get good back staff always done appointment go nice job friendly cut recommend would went really
Topic 3: breakfast coffee cream egg de cake ice le chocolate tea cupcake la sweet bacon pancake brunch bagel u00e9 et toast flavor morning fruit waffle delicious
Topic 4: food service table time drink minute order get place server one back came ordered good restaurant would got asked even took bar wait go went
Topic 5: room hotel vega night like place get one nice show stay good time nthe would strip great casino floor pool really people go got club
Topic 6: place one like time great good get go really ha food store always love lot price 

## Find Topic given Review

In [None]:
# Per-document topic distribution (one row per document).
dtm_transform = model.transform(dtm)

# Dominant topic of each document = column with the largest value in its row.
dtm_topics = dtm_transform.argmax(axis=1)



(array([   7,   14,   26,   55,   58,   66,   86,  143,  149,  167,  176,
        177,  184,  190,  210,  219,  240,  242,  279,  301,  311,  320,
        321,  329,  331,  333,  339,  341,  376,  382,  388,  395,  401,
        404,  414,  428,  429,  456,  511,  512,  516,  527,  571,  572,
        580,  590,  609,  614,  631,  648,  661,  683,  689,  691,  697,
        700,  721,  736,  760,  783,  787,  798,  801,  834,  872,  883,
        909,  957,  979,  988, 1002, 1019, 1030, 1096, 1102, 1105, 1124,
       1143, 1155, 1156, 1158, 1159, 1185, 1192, 1210, 1219, 1236, 1243,
       1250, 1253, 1255, 1259, 1272, 1289, 1291, 1303, 1310, 1319, 1323,
       1350, 1358, 1359, 1381, 1411, 1418, 1439, 1489, 1495, 1497, 1515,
       1522, 1555, 1614, 1636, 1638, 1641, 1652, 1745, 1831, 1838, 1856,
       1861, 1865, 1882, 1896, 1897, 1927, 1931, 1945, 1946, 1962, 1968,
       1969, 1977, 1989, 2000, 2006, 2015, 2034, 2035, 2056, 2068, 2090,
       2092, 2105, 2114, 2118, 2140, 2154, 2177, 2

In [71]:
# Element-wise lookup: column index -> vocabulary term.
vocab_map = np.vectorize(lambda term_idx: vocab_list[term_idx])

for topic_idx in range(N_TOPIC):
    # First 5 documents whose dominant topic is topic_idx.
    topic_idx_docs = np.where(dtm_topics == topic_idx)[0][:5]
    print(topic_idx_docs)

    # Dense term counts for just those documents.
    doc_dtm = dtm[topic_idx_docs].toarray()
    print(doc_dtm)

    # Column indices of the 15 highest counts per document, highest first.
    # NOTE(review): a document with fewer than 15 distinct terms gets padded
    # with zero-count columns here.
    common_terms = np.argsort(doc_dtm, axis=1)[:, :-16:-1]
    print(common_terms)

    document_words = vocab_map(common_terms)
    print(document_words)
    for doc in document_words:
        print(" ".join(doc))

    # Debug aid: only the first topic is inspected.
    break

[ 7 14 26 55 58]
[[0 0 0 ... 0 0 0]
 [0 2 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[[ 220  395  399  407  406  423  421  110  396  412  400  415  411  413
   414]
 [ 372    1  142  683  140   28  685  392  684  119  678  199  682  681
    89]
 [  52  411  963  904  409  966  603  962  964  965  967  266  968  969
   902]
 [ 142  372   89  834   52  397 1408  362  268   29  202   98  413 1193
   574]
 [ 220   52  834  372   92  351   28  143 1430  912   27 1244 1295 1180
  1432]]
[['good' 'beer' 'brunch' 'flatbread' 'dog' 'patio' 'opted' 'my' 'biscuit'
  'gravy' 'calling' 'grilled' 'fry' 'great' 'greatest']
 ['pizza' 'all' 'we' 'own' 'very' 'on' 'saw' 'than' 'premium' 'one'
  'agreed' 'ever' 'mixed' 'ingredient' 'had']
 ['with' 'fry' 'lunch' 'sub' 'french' 'reasonable' 'nothing' 'free'
  'mention' 'presented' 'recently' 'or' 'soda' 'turkey' 'run']
 ['we' 'pizza' 'had' 'crust' 'with' 'bread' 'delivered' 'cheese'
  'ordered' 'only' 'extra' 'into' 'great' 'rec

In [25]:
# Document-term matrix for the first 1000 reviews only; the vocabulary
# mapping returned alongside it is not needed here.
item_term_matrix = create_matrix(processed_corpus[:1000])[0]
item_term_matrix

<1000x31595 sparse matrix of type '<class 'numpy.int64'>'
	with 75714 stored elements in Compressed Sparse Row format>

In [None]:
# Topic distribution for each of the selected documents.
doc_topic = model.transform(item_term_matrix)

# Dominant topic per document (row-wise argmax).
top_topic = np.argmax(doc_topic, axis=1)
print(top_topic)

# doc_topics[i] is the array of document row indices whose dominant topic is i.
# np.where returns a 1-tuple, hence the [0]; N_TOPIC is an int, hence range().
doc_topics = [np.where(top_topic == topic_idx)[0] for topic_idx in range(N_TOPIC)]

# Walk through the first 5 documents of topic 0 as a worked example.
topic_doc_indices = doc_topics[0][:5]
print(topic_doc_indices)

topic_doc_counts = item_term_matrix[topic_doc_indices].toarray()
print(topic_doc_counts)  # document-term counts for those documents

# Column indices sorted by count, most frequent first (computed once, reused).
ranked_terms = np.argsort(topic_doc_counts, axis=1)[:, ::-1]
print(ranked_terms)
print(ranked_terms[:, :5])  # same thing, top 5 only

vectorized_vocab_map = np.vectorize(lambda x: vocab_list[x])

# np.vectorize refuses size-0 inputs, so skip the lookup if topic 0 is empty.
if topic_doc_indices.size:
    print(vectorized_vocab_map(ranked_terms[:, :5]))  # words for those indices

for i in range(len(doc_topic)):
    # Use a separate name: rebinding ``top_topic`` to a scalar here would break
    # the array-based lookups above when the cell is re-run.
    doc_top_topic = top_topic[i]

    # Dense term counts for this document.
    row = item_term_matrix[i, :].toarray().flatten()
    top_terms_indices = row.argsort()[::-1][:15]  # indices of the 15 highest counts
    top_terms = [vocab_list[idx] for idx in top_terms_indices]

    print(f"Top topic: {doc_top_topic}, Document terms: {', '.join(top_terms)}")

[20 20  6 20  0  8  8 24  1 24 14 24  6  6  2  8 24 19  1 19 19 19 19 23
 13  6 17 18 20 20  1  0  3  4 17 15 15 15 15  2 15 17 18  9 13 15  0 15
 20  1]
[ 4 31 46]
[[1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[[  323   332   169 ... 21069 21070 15797]
 [ 1666   554   190 ... 21069 21070     0]
 [ 1137   243   612 ... 21069 21070     0]]
[[ 323  332  169  338   70]
 [1666  554  190  171  586]
 [1137  243  612  169 2057]]
[['hot' 'sauce' 'are' 'wing' 'good']
 ['chinese' 'food' 'even' 'away' 'taste']
 ['soup' 'best' 'around' 'are' 'chow']]
Top topic: 20, Document terms: he, you, him, need, really, have, see, without, parent, referral, patient, on, patronizing, offer, practitioner
Top topic: 20, Document terms: he, all, very, been, over, your, dr, patient, year, out, think, health, option, question, one
Top topic: 6, Document terms: you, office, he, doctor, not, when, about, away, they, me, call, johnson, will, before, practice
Top topic: 20, Document terms: have, dr, doctor,

In [None]:
# Texts to pull examples from. doc_topics indexes rows of item_term_matrix,
# which was built from processed_corpus[:1000], so processed_corpus is the
# collection whose positions line up with those indices.
# NOTE(review): the raw `reviews` list is not known to be index-aligned with
# the random downsample -- swap it in only once that mapping is established.
og_dataset = processed_corpus

output_review_num = 5  # how many example reviews to keep per topic

# One (initially empty) bucket per topic, so .append has something to hit.
topic_review = [[] for _ in range(N_TOPIC)]

for topic in range(N_TOPIC):
    topic_doc_ids = doc_topics[topic]
    if isinstance(topic_doc_ids, tuple):
        # np.where returns a 1-tuple wrapping the index array; unwrap it.
        topic_doc_ids = topic_doc_ids[0]
    # Slicing already copes with topics holding fewer documents than requested.
    for review_idx in topic_doc_ids[:output_review_num]:
        topic_review[topic].append(og_dataset[review_idx])

In [None]:
# TODO: Try making code to extract 5 documents per topic

## Saving the Model

In [None]:
# prunes matrices, saves weights but locks model
# model.purge_extra_matrices()

In [27]:
# The file name encodes the hyperparameters so different runs do not clobber
# each other.
modelname = f"model_{N_TOPIC}_{N_ITER}_{RANDOM_STATE}_{REFRESH}_{SEED_CONFIDENCE}.pkl"

# Create the output directory if needed; open() would otherwise raise
# FileNotFoundError on a fresh checkout.
os.makedirs("results", exist_ok=True)
with open(f"results/{modelname}", 'wb') as file:
    pkl.dump(model, file)

In [29]:
# Loading the model
# Rebuild the path from the same hyperparameters (and the same ".pkl"
# extension) the save cell uses, so this reads back exactly the file that
# was written for the current configuration.
# NOTE(review): pickle can execute arbitrary code on load -- only open trusted files.
model_path = f"results/model_{N_TOPIC}_{N_ITER}_{RANDOM_STATE}_{REFRESH}_{SEED_CONFIDENCE}.pkl"
with open(model_path, 'rb') as file:
    model = pkl.load(file)

# Smoke test: topic distribution of the first document.
model.transform(create_matrix(processed_corpus[:1])[0])



array([[9.91332837e-01, 6.84035745e-04, 6.00474419e-03, 1.73764585e-04,
        3.63391083e-04, 9.31279092e-04, 5.09947976e-04]])

In [24]:
# Display the log-likelihood values stored on the fitted model
# (presumably one entry per logged refresh step -- confirm against guidedlda).
model.loglikelihoods_

[-768290932.3669106,
 -544845646.4478563,
 -523167504.4543283,
 -517464505.4454495,
 -515242153.73157394,
 -514171247.29473853,
 -513573261.2712008,
 -513116435.8723612]

# Save Results

Make sure to run this after each experiment (after you've created/fitted the model)

In [56]:
# The report name encodes the hyperparameters of the current experiment.
filename = f"result_{N_TOPIC}_{N_ITER}_{RANDOM_STATE}_{REFRESH}_{SEED_CONFIDENCE}.txt"

# Do the model work before opening the file, so a failure here does not leave
# a half-written report behind.
dtm_transform = model.transform(dtm)
dtm_topics = np.argmax(dtm_transform, axis=1)   # dominant topic per document
topic_word = model.topic_word_
n_top_words = 25
vocab_array = np.array(vocab_list)

# Create the output directory if needed; open() would otherwise raise
# FileNotFoundError on a fresh checkout.
os.makedirs("results", exist_ok=True)
with open(f"results/{filename}", "w", encoding="utf-8") as f:
    # Header: the seed words used for this run, one topic per line.
    if seed_topic_list:
        for topic_list in seed_topic_list:
            print(f"{topic_list}", file=f)
        print(file=f)
    else:
        print("No Seeds\n", file=f)

    for idx, topic_dist in enumerate(topic_word):
        # Top 25 words of the topic, heaviest first.
        topic_words = vocab_array[np.argsort(topic_dist)][:-(n_top_words+1):-1]
        print('Topic {}: {}'.format(idx, ' '.join(topic_words)), file=f)
        print("Example Documents:", file=f)

        # Up to 5 example documents whose dominant topic is this one.
        topic_idx_docs = np.where(dtm_topics == idx)[0][:5]
        doc_dtm = dtm[topic_idx_docs].toarray()

        # Up to 15 most frequent terms of each example document. Rows are
        # handled one at a time for two reasons:
        #   * a topic with no documents yields an empty array, which
        #     np.vectorize rejects;
        #   * zero-count columns are dropped, so short documents are not
        #     padded with words they never contain.
        for term_counts in doc_dtm:
            ranked = np.argsort(term_counts)[::-1][:15]
            present = ranked[term_counts[ranked] > 0]
            print(" ".join(vocab_list[term] for term in present), file=f)

        # Newline as separator
        print(file=f)

print("Saved in", filename)





Saved in result_15_500_7_20_0.5.txt
