In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer

from nltk.stem.wordnet import WordNetLemmatizer

from gensim.corpora import Dictionary
import pickle as pkl

import guidedlda

In [2]:
splits = {'train': 'yelp_review_full/train-00000-of-00001.parquet', 'test': 'yelp_review_full/test-00000-of-00001.parquet'}
df_train = pd.read_parquet("hf://datasets/Yelp/yelp_review_full/" + splits["train"])
df_test = pd.read_parquet("hf://datasets/Yelp/yelp_review_full/" + splits["test"])

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df_combined = pd.concat([df_train, df_test], axis=0, ignore_index=True)
df_combined.head()

Unnamed: 0,label,text
0,4,dr. goldberg offers everything i look for in a...
1,1,"Unfortunately, the frustration of being Dr. Go..."
2,3,Been going to Dr. Goldberg for over 10 years. ...
3,3,Got a letter in the mail last week that said D...
4,0,I don't know what Dr. Goldberg was like before...


In [6]:
tokenizer = RegexpTokenizer(r'\w+')
reviews = df_combined["text"].tolist()
for idx in range(len(reviews)):
    reviews[idx] = reviews[idx].lower()  # Convert to lowercase.
    reviews[idx] = tokenizer.tokenize(reviews[idx])  # Split into words.

In [8]:
# remove numbers
reviews = [[token for token in doc if not token.isnumeric()] for doc in reviews]

# remove single characters
reviews = [[token for token in doc if len(token) > 1] for doc in reviews]


In [9]:
lemmatizer = WordNetLemmatizer()
reviews = [[lemmatizer.lemmatize(token) for token in doc] for doc in reviews]


In [11]:
with open("clean_reviews.pkl", "wb") as file:
    pkl.dump(reviews, file)


In [13]:
dictionary = Dictionary(reviews)
dictionary.filter_extremes(no_below=20, no_above=0.5)


In [14]:
corpus = [dictionary.doc2bow(doc) for doc in reviews]


In [19]:
with open("clean_review_corpus.pkl", "wb") as file:
    pkl.dump(corpus, file)

In [4]:
with open("clean_reviews.pkl", "rb") as file:
    clean_reviews = pkl.load(file)
    print(clean_reviews[0])

['dr', 'goldberg', 'offer', 'everything', 'look', 'for', 'in', 'general', 'practitioner', 'he', 'nice', 'and', 'easy', 'to', 'talk', 'to', 'without', 'being', 'patronizing', 'he', 'always', 'on', 'time', 'in', 'seeing', 'his', 'patient', 'he', 'affiliated', 'with', 'top', 'notch', 'hospital', 'nyu', 'which', 'my', 'parent', 'have', 'explained', 'to', 'me', 'is', 'very', 'important', 'in', 'case', 'something', 'happens', 'and', 'you', 'need', 'surgery', 'and', 'you', 'can', 'get', 'referral', 'to', 'see', 'specialist', 'without', 'having', 'to', 'see', 'him', 'first', 'really', 'what', 'more', 'do', 'you', 'need', 'sitting', 'here', 'trying', 'to', 'think', 'of', 'any', 'complaint', 'have', 'about', 'him', 'but', 'really', 'drawing', 'blank']


In [5]:
with open("clean_review_corpus.pkl", "rb") as file:
    corpus = pkl.load(file)
    print(corpus[0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 2), (20, 1), (21, 3), (22, 1), (23, 2), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 2), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 2), (40, 1), (41, 2), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 2), (56, 3)]


In [13]:
print(len(clean_reviews[0]))
print(clean_reviews[0])
print(len(corpus[0]))
print(corpus[0])

87
['dr', 'goldberg', 'offer', 'everything', 'look', 'for', 'in', 'general', 'practitioner', 'he', 'nice', 'and', 'easy', 'to', 'talk', 'to', 'without', 'being', 'patronizing', 'he', 'always', 'on', 'time', 'in', 'seeing', 'his', 'patient', 'he', 'affiliated', 'with', 'top', 'notch', 'hospital', 'nyu', 'which', 'my', 'parent', 'have', 'explained', 'to', 'me', 'is', 'very', 'important', 'in', 'case', 'something', 'happens', 'and', 'you', 'need', 'surgery', 'and', 'you', 'can', 'get', 'referral', 'to', 'see', 'specialist', 'without', 'having', 'to', 'see', 'him', 'first', 'really', 'what', 'more', 'do', 'you', 'need', 'sitting', 'here', 'trying', 'to', 'think', 'of', 'any', 'complaint', 'have', 'about', 'him', 'but', 'really', 'drawing', 'blank']
57
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 2), (20, 1), (21, 3), (22, 1), (23, 2), (24, 1), (25, 1), (26, 1), (27, 1),

In [22]:
# create vocab from dictionary

vocab_list = list(dictionary.token2id.keys())

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

processed_corpus = [" ".join(tokens) for tokens in clean_reviews]

def create_dim(processed_corpus, vocabulary=vocab_list):
    vectorizer = CountVectorizer(vocabulary=vocabulary)
    dtm = vectorizer.fit_transform(processed_corpus)
    return dtm, vectorizer.vocabulary_

dtm, vocab_dict = create_dim(processed_corpus)

print(dtm.shape)

(700000, 31595)


In [6]:
dictionary = Dictionary(clean_reviews)
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [9]:
dictionary.doc2bow(["coffee"]) 

[(1285, 1)]

In [28]:
seed_topics = {
    "doctor": ["medicine", "office", "hurt"],
    "food": ["coffee", "pizza", "delicious"]
}   

processed_seeds = [" ".join(topic) for topic in list(seed_topics.values())]

seed_topic_list, vocab_dict = create_dim(processed_seeds)

# seed_topic_list = [[] for _ in range(len(seed_topics))]
# for topic_id, words in enumerate(seed_topics.values()):
#     seed_topic_list[topic_id].append(dictionary.doc2bow(words))
    
print(seed_topic_list)
print(seed_topic_list.shape)

  (0, 83)	1
  (0, 679)	1
  (0, 9090)	1
  (1, 503)	1
  (1, 826)	1
  (1, 1285)	1
(2, 31595)


In [8]:
model = guidedlda.GuidedLDA(n_topics=2, n_iter=10, random_state=7, refresh=20)


In [29]:
model.fit(dtm, seed_topics=seed_topic_list, seed_confidence=0.15)


INFO:guidedlda:n_documents: 700000
INFO:guidedlda:vocab_size: 31595
INFO:guidedlda:n_words: 65988636
INFO:guidedlda:n_topics: 2
INFO:guidedlda:n_iter: 10
  self._initialize(X, seed_topics, seed_confidence)


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all().