In [None]:
from bertopic import BERTopic
import pickle as pkl
import pandas as pd
import numpy as np
import random
from bertopic.vectorizers import ClassTfidfTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the raw review texts from the pickle one directory up and preview the first one.
# NOTE: unpickling can execute arbitrary code -- only load pickle files you trust.
with open("../reviews.pkl", "rb") as reviews_file:
    reviews = pkl.load(reviews_file)

# The data is fully in memory at this point, so the preview does not need the open file.
print(reviews[0])

This was the first place in Vegas where the Yelpers let me down. The salt and pepper shrimp appetizer was inedible: all of the shell was left on underneath the crispy fried coating. As a result, the sauce used to marinate the shrimp never penetrated the meat, it coagulated in the head, and EXPLODED on you upon taking a bite. GROSS! The potstickers and the beef chow fun lacked any flavor whatsoever. The only redeeming quality (the service is pretty awful) is the roast duck: crispy and tasty skin with relatively moist meat and a relative bargain compared to places in NY and L.A.


In [8]:
# Import the project's NER helper class into THIS kernel.
#
# A `%%python` cell hands its body to a separate Python process, so an import
# written inside such a cell never reaches the kernel namespace and later cells
# that call `NER()` fail with NameError after a kernel restart. Instead, put the
# directory that holds NER.py on the import path and import it normally.
import sys

NER_MODULE_DIR = "../../NER"  # directory containing NER.py, relative to this notebook
if NER_MODULE_DIR not in sys.path:  # keep the cell idempotent across re-runs
    sys.path.append(NER_MODULE_DIR)

from NER import NER

In [None]:
import tqdm

# Run the NER preprocessing over every raw review and tally how many come back flagged.
ner = NER()
progress = tqdm.tqdm(reviews, leave=False)

# preprocess_text yields a (flag, output) pair; only the flag matters for the tally.
counter = sum(1 for review in progress if ner.preprocess_text(review)[0])

# Flagged count first, then the corpus size for comparison.
print(counter)
print(len(reviews))

KeyboardInterrupt: 

In [5]:
# Load the pre-cleaned reviews (each review is stored as a list of tokens) and preview one.
# NOTE: unpickling can execute arbitrary code -- only load pickle files you trust.
with open("clean_reviews.pkl", "rb") as clean_file:
    clean_reviews = pkl.load(clean_file)

# The data is fully in memory at this point, so the preview does not need the open file.
print(clean_reviews[0])

['dr', 'goldberg', 'offer', 'everything', 'look', 'for', 'in', 'general', 'practitioner', 'he', 'nice', 'and', 'easy', 'to', 'talk', 'to', 'without', 'being', 'patronizing', 'he', 'always', 'on', 'time', 'in', 'seeing', 'his', 'patient', 'he', 'affiliated', 'with', 'top', 'notch', 'hospital', 'nyu', 'which', 'my', 'parent', 'have', 'explained', 'to', 'me', 'is', 'very', 'important', 'in', 'case', 'something', 'happens', 'and', 'you', 'need', 'surgery', 'and', 'you', 'can', 'get', 'referral', 'to', 'see', 'specialist', 'without', 'having', 'to', 'see', 'him', 'first', 'really', 'what', 'more', 'do', 'you', 'need', 'sitting', 'here', 'trying', 'to', 'think', 'of', 'any', 'complaint', 'have', 'about', 'him', 'but', 'really', 'drawing', 'blank']


In [6]:
# Draw a reproducible random subset of the cleaned reviews for the cells below.
SAMPLE_SEED = 10701   # fixed seed so the same subset is drawn on every run
SAMPLE_SIZE = 10000   # number of reviews to keep

print(len(clean_reviews))
random.seed(SAMPLE_SEED)
# random.sample raises ValueError when k exceeds the population size, so cap
# the request at the number of reviews actually available.
clean_reviews_down = random.sample(clean_reviews, min(SAMPLE_SIZE, len(clean_reviews)))
print(len(clean_reviews_down))

700000
10000


In [8]:
# Turn each sampled token list back into one space-separated string per review.
processed_corpus = list(map(" ".join, clean_reviews_down))

In [9]:
# Method 1: guided topic modeling.
# Each seed list below is a group of words that nudges BERTopic towards one theme.

food_seeds = ["pizza", "food", "chicken", "burger", "taco", "salsa", "mexican", "chip", "bean", "burrito", "enchilada", "rice", "tortilla", "guacamole", "carne", "asada", "shrimp", "lobster", "sushi", "roll", "fish", "sashimi", "tuna", "tempura"]
service_seeds = ["doctor", "office", "medical", "service", "patient", "health", "insurance", "client", "car", "vehicle"]
# Open question from the original author: maybe this group counts as "service" too?
beauty_seeds = ["hair", "nail", "salon", "gel", "polish", "manicure"]
# French-language reviews. NOTE(review): the u00e9 / u00e0 tokens look like residue of
# escaped accented characters left in the cleaned text -- confirm against the cleaning step.
french_seeds = "de le la u00e9 et est u00e0 un com pa www que da en die yelp u00e9e du select une pour service au mais plus".split()
# Bars.
bar_seeds = "bar drink beer night bartender table music atmosphere".split()
# Vegas.
vegas_seeds = "hotel bed bath shower spa pool casino strip night desk check club show sexy men girl".split()
# Shopping malls.
shopping_seeds = "store shop price item shopping buy need sale mall product boutique outlet walmart".split()

seed_topic_list = [
    food_seeds,
    service_seeds,
    beauty_seeds,
    french_seeds,
    bar_seeds,
    vegas_seeds,
    shopping_seeds,
]

# Build the guided model, then fit it on the down-sampled corpus; fit_transform
# hands back the per-document topic assignments and probabilities.
topic_model = BERTopic(
    embedding_model="all-MiniLM-L6-v2",
    seed_topic_list=seed_topic_list,
)
topics, probs = topic_model.fit_transform(processed_corpus)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [11]:
# Show the (word, score) pairs that describe topic 0 of the guided model.
topic_model.get_topic(0)

[('hotel', 0.02655898844111409),
 ('room', 0.02601600934295312),
 ('casino', 0.011447271056643988),
 ('stay', 0.011243704347691839),
 ('the', 0.01119048736400992),
 ('pool', 0.010993659611008044),
 ('bed', 0.010445518928920877),
 ('in', 0.00980066241962438),
 ('to', 0.009528182724897428),
 ('at', 0.008842415827258124)]

In [13]:
# Render the topic visualization for the guided model.
# NOTE: displaying the result in the notebook needs nbformat>=4.2.0 installed in the
# kernel environment; without it the cell ends in the "Mime type rendering" ValueError.
topic_model.visualize_topics()


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [19]:
# Method 2: seed words.
# Reference: https://maartengr.github.io/BERTopic/getting_started/seed_words/seed_words.html
#
# The number of topics could also be steered with a customized UMAP model, e.g.
# umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

SEED_WORDS = ["pizza", "doctor", "salon", "casino", "car"]
SEED_MULTIPLIER = 2

# c-TF-IDF transformer configured with the seed words and their multiplier.
ctfidf_model = ClassTfidfTransformer(seed_words=SEED_WORDS, seed_multiplier=SEED_MULTIPLIER)

# Build the seeded model first, then fit it on the down-sampled corpus.
unfitted_seed_model = BERTopic(ctfidf_model=ctfidf_model, min_topic_size=10)
seed_topic_model = unfitted_seed_model.fit(processed_corpus)


In [20]:
# Show the (word, score) pairs that describe topic 0 of the seed-word model.
seed_topic_model.get_topic(0)

[('taco', 0.022344823668546638),
 ('mexican', 0.018764607428293764),
 ('salsa', 0.016875962108340734),
 ('chip', 0.011354993455712323),
 ('burrito', 0.011107642597563498),
 ('food', 0.009275184680470875),
 ('the', 0.009235481312870844),
 ('bean', 0.009219418105274624),
 ('and', 0.009191142693618586),
 ('good', 0.009116329264203183)]

In [21]:
# Render the topic visualization for the seed-word model.
# NOTE: displaying the result in the notebook needs nbformat>=4.2.0 installed in the
# kernel environment; without it the cell ends in the "Mime type rendering" ValueError.
seed_topic_model.visualize_topics()


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed