In [22]:
from bertopic import BERTopic
import pickle as pkl
import pandas as pd
import numpy as np
import random
from bertopic.vectorizers import ClassTfidfTransformer

In [23]:
# Load the original review texts from disk and preview the first one.
# NOTE: unpickling can execute arbitrary code; only load pickles from a trusted source.
with open("../reviews.pkl", "rb") as reviews_fh:
    reviews = pkl.load(reviews_fh)
print(reviews[0])

This was the first place in Vegas where the Yelpers let me down. The salt and pepper shrimp appetizer was inedible: all of the shell was left on underneath the crispy fried coating. As a result, the sauce used to marinate the shrimp never penetrated the meat, it coagulated in the head, and EXPLODED on you upon taking a bite. GROSS! The potstickers and the beef chow fun lacked any flavor whatsoever. The only redeeming quality (the service is pretty awful) is the roast duck: crispy and tasty skin with relatively moist meat and a relative bargain compared to places in NY and L.A.


In [36]:
# Execute the NER script inside this kernel with IPython's %run magic.
# NOTE(review): presumably this is what makes `NER` available to later cells — confirm.
%run ../../NER/NER.py 

In [37]:
# Import the NER class into this kernel.
# A plain import is used instead of the `%%python` cell magic: that magic runs the
# cell in a separate Python process, which would leave the kernel namespace untouched.
import sys

NER_DIR = "../../NER"  # directory that contains NER.py, relative to this notebook
if NER_DIR not in sys.path:
    sys.path.insert(0, NER_DIR)

from NER import NER

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [38]:
import tqdm

# Run every raw review through the NER preprocessor.
# `processed_text_all` keeps every output; `processed_text` keeps only the outputs
# whose returned flag is truthy (the flag's meaning is defined in NER.preprocess_text).
processed_text = []
processed_text_all = []
ner = NER()
counter = 0
for raw_review in tqdm.tqdm(reviews, leave=False):
    flag, output = ner.preprocess_text(raw_review)
    processed_text_all.append(output)
    if not flag:
        continue
    processed_text.append(output)

# Number of flagged reviews, followed by the total number of reviews
counter = len(processed_text)
print(counter)
print(len(reviews))

OSError: [E050] Can't find model 'en_core_web_trf'. It doesn't seem to be a Python package or a valid path to a data directory.

In [5]:
# Load the cleaned, tokenized reviews from disk and preview the first one.
# NOTE: unpickling can execute arbitrary code; only load pickles from a trusted source.
with open("clean_reviews.pkl", "rb") as clean_fh:
    clean_reviews = pkl.load(clean_fh)
print(clean_reviews[0])

['dr', 'goldberg', 'offer', 'everything', 'look', 'for', 'in', 'general', 'practitioner', 'he', 'nice', 'and', 'easy', 'to', 'talk', 'to', 'without', 'being', 'patronizing', 'he', 'always', 'on', 'time', 'in', 'seeing', 'his', 'patient', 'he', 'affiliated', 'with', 'top', 'notch', 'hospital', 'nyu', 'which', 'my', 'parent', 'have', 'explained', 'to', 'me', 'is', 'very', 'important', 'in', 'case', 'something', 'happens', 'and', 'you', 'need', 'surgery', 'and', 'you', 'can', 'get', 'referral', 'to', 'see', 'specialist', 'without', 'having', 'to', 'see', 'him', 'first', 'really', 'what', 'more', 'do', 'you', 'need', 'sitting', 'here', 'trying', 'to', 'think', 'of', 'any', 'complaint', 'have', 'about', 'him', 'but', 'really', 'drawing', 'blank']


In [6]:
# Draw a repeatable random subsample of the cleaned reviews.
SAMPLE_SEED = 10701
SAMPLE_SIZE = 10000

print(len(clean_reviews))
random.seed(SAMPLE_SEED)  # seed the module-level RNG so the draw below is repeatable
clean_reviews_down = random.sample(clean_reviews, SAMPLE_SIZE)
print(len(clean_reviews_down))

700000
10000


In [59]:
# Rebuild each sampled review as a single space-separated string
processed_corpus_1 = list(map(" ".join, clean_reviews_down))

In [40]:
# Load the tagged, tokenized reviews from disk and preview the first one
# (the preview shows placeholder tokens such as <person>).
# NOTE: unpickling can execute arbitrary code; only load pickles from a trusted source.
with open("base.tag", "rb") as tagged_fh:
    tagged_reviews = pkl.load(tagged_fh)
print(tagged_reviews[0])

['first', 'place', 'vega', '<person>', 'let', 'salt', 'pepper', 'shrimp', 'appetizer', 'inedible', 'shell', 'leave', 'underneath', 'crispy', 'fry', 'coating', 'result', 'sauce', 'use', 'marinate', 'shrimp', 'never', 'penetrate', 'meat', 'coagulate', 'head', 'explode', 'upon', 'take', 'bite', 'gross', 'potstickers', 'beef', 'chow', 'fun', 'lack', 'flavor', 'whatsoever', 'redeeming', 'quality', 'service', 'pretty', 'awful', 'roast', 'duck', 'crispy', 'tasty', 'skin', 'relatively', 'moist', 'meat', 'relative', 'bargain', 'compare', 'place', 'ny', 'la']


In [60]:
# Rebuild each tagged review as a single space-separated string
processed_corpus_2 = list(map(" ".join, tagged_reviews))

In [67]:
# unguided topic modeling
# Fit a BERTopic model with default settings on the tagged-review strings;
# `topics` and `probs` are the two values returned by fit_transform.
# NOTE(review): no random seed is passed to the model in the visible cells, so
# topic ids may differ between runs — confirm if reproducibility matters.

unguided_topic_model = BERTopic()
topics, probs = unguided_topic_model.fit_transform(processed_corpus_2)

In [68]:
# Top terms of topic 0 from the unguided model, shown as (term, score) pairs
unguided_topic_model.get_topic(0)

[('order', 0.009495104222183918),
 ('table', 0.008440899712858524),
 ('restaurant', 0.008034488095463569),
 ('food', 0.007607394272051359),
 ('ask', 0.007509774826434069),
 ('come', 0.0069323314578481985),
 ('menu', 0.006812663321105426),
 ('server', 0.006792388385898539),
 ('eat', 0.006438739360711522),
 ('one', 0.00637333992700273)]

In [70]:
# Export the unguided model's topic overview table to CSV, without the index column
unguided_topic_model.get_topic_info().to_csv('unguided_topics.csv', index=False)

In [79]:
# method 1: guided topic modeling
# Each inner list below is one group of seed words handed to BERTopic through
# `seed_topic_list`; the trailing comments name the theme each group targets.

seed_topic_list = [
    # food / restaurants
    ["pizza", "food", "chicken", "burger", "taco", "salsa", "mexican", "chip", "bean", "burrito", "enchilada", "rice", "tortilla", "guacamole", "carne", "asada", "shrimp", "lobster", "sushi", "roll", "fish", "sashimi", "tuna", "tempura"],
    # medical and automotive services
    ["doctor", "office", "medical", "service", "patient", "health", "insurance", "client", "car", "vehicle"], 
    ["hair", "nail", "salon", "gel", "polish", "manicure"],  # beauty salons; maybe counts as service?
    # NOTE(review): tokens such as u00e9 / u00e0 look like escaped Unicode letters with the
    # backslash stripped — confirm they match how these tokens appear in the corpus.
    "de le la u00e9 et est u00e0 un com pa www que da en die yelp u00e9e du select une pour service au mais plus".split(" "),  # french
    "bar drink beer night bartender table music atmosphere".split(" "),  # bars
    "hotel bed bath shower spa pool casino strip night desk check club show sexy men girl".split(" "),  # vegas
    "store shop price item shopping buy need sale mall product boutique outlet walmart".split(" "),  # shopping malls
]


# Fit the guided model on the tagged-review strings with the named sentence-embedding model.
# `topics` and `probs` are reassigned here, replacing the values from any earlier fit.
topic_model = BERTopic(seed_topic_list=seed_topic_list, embedding_model="all-MiniLM-L6-v2")
topics, probs = topic_model.fit_transform(processed_corpus_2)

In [80]:
# Top terms of topic 0 from the guided model, shown as (term, score) pairs
topic_model.get_topic(0)

[('store', 0.02396121600159621),
 ('car', 0.019250964345364538),
 ('org', 0.01401766080860879),
 ('shop', 0.013073270195769284),
 ('call', 0.01262408014445486),
 ('buy', 0.01222551386951358),
 ('customer', 0.012205159147548847),
 ('need', 0.011820163282104043),
 ('person', 0.01164278172858635),
 ('service', 0.011484148602410716)]

In [81]:
# Overview table of the guided model's topics (topic id, count, name, representation, representative docs)
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3991,-1_food_org_place_good,"[food, org, place, good, go, get, like, order,...",[locate inside <fac> downtown la vega <org> pl...
1,0,1249,0_store_car_org_shop,"[store, car, org, shop, call, buy, customer, n...",[bad place go nothing particular staff managem...
2,1,1109,1_food_order_table_come,"[food, order, table, come, get, restaurant, se...",[put one line <org> 3star restaurant posing 5s...
3,2,707,2_room_hotel_stay_casino,"[room, hotel, stay, casino, bed, pool, desk, o...",[boyfriend first time vega without family nnwe...
4,3,370,3_mexican_salsa_taco_burrito,"[mexican, salsa, taco, burrito, bean, chip, fo...",[wow im begin think great mexican food az move...
...,...,...,...,...,...
65,64,12,64_ramen_noodle_broth_rice,"[ramen, noodle, broth, rice, pork, shio, chash...",[like clean modern feel <org> huge communal ta...
66,65,12,65_dance_club_vip_security,"[dance, club, vip, security, drink, dj, kick, ...",[<org> hat fun midst identity crisis specifica...
67,66,11,66_clean_smell_teriyaki_table,"[clean, smell, teriyaki, table, remodel, cloth...",[food okay service horrible table clean seat g...
68,67,10,67_sandwich_meat_turkey_sub,"[sandwich, meat, turkey, sub, italian, bread, ...",[<org> little atmosphere customer service grea...


In [94]:
# Export the guided model's topic overview table to CSV, without the index column
topic_model.get_topic_info().to_csv('tag_review_guided.csv', index=False)

In [83]:
# Display the guided model's topic visualization.
# NOTE(review): the recorded run failed with "Mime type rendering requires nbformat>=4.2.0";
# presumably nbformat must be installed in the kernel environment — confirm.
topic_model.visualize_topics()


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [95]:
# Save the reviews that the guided model assigned to topic -1 (the "unclustered" ones).

# Per-document topic assignments for the corpus the guided model was fitted on
review_info = topic_model.get_document_info(processed_corpus_2)

# Keep only the rows whose assigned topic is -1
in_topic_minus_one = review_info.Topic == -1
topic_documents = pd.DataFrame(review_info[in_topic_minus_one])

# Document text of those rows; written out one review per line, no header or index
unclustered_review = topic_documents['Document']
unclustered_review.to_csv('uncluster_review_guided.csv', index=False, header=False)

In [96]:
# Perform unguided clustering on the reviews the guided model left in topic -1.
# The Series is converted to a plain list because BERTopic documents its input as a
# list of strings; the filtered Series also carries a non-contiguous index.
# NOTE(review): this refits `unguided_topic_model` itself, so the topics from its
# earlier fit on the full corpus are presumably no longer available afterwards —
# confirm that is intended.
topics, probs = unguided_topic_model.fit_transform(unclustered_review.tolist())


In [98]:
# Export the topic overview table of `unguided_topic_model` (as currently fitted) to CSV, without the index column
unguided_topic_model.get_topic_info().to_csv('unguided_unclustered_clusters.csv', index=False)

In [46]:
# method 2: seeded words
# https://maartengr.github.io/BERTopic/getting_started/seed_words/seed_words.html

# Optional: a customized UMAP model could be supplied as well, e.g.
# umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

# c-TF-IDF transformer configured with the seed words and a multiplier of 2
SEED_WORDS = ["pizza", "doctor", "salon", "casino", "car"]
ctfidf_model = ClassTfidfTransformer(seed_words=SEED_WORDS, seed_multiplier=2)

# Build the seeded model, then fit it on the tagged-review strings
seed_topic_model = BERTopic(min_topic_size=10, ctfidf_model=ctfidf_model)
seed_topic_model = seed_topic_model.fit(processed_corpus_2)


In [50]:
# Top terms of topic 3 from the seeded model, shown as (term, score) pairs
seed_topic_model.get_topic(3)

[('mexican', 0.03583264237461713),
 ('salsa', 0.03151384238804967),
 ('taco', 0.029816057285831477),
 ('burrito', 0.025546936126877286),
 ('chip', 0.018284862435536182),
 ('bean', 0.017794452142008216),
 ('food', 0.014786150846995727),
 ('tortilla', 0.014093107649477557),
 ('margarita', 0.012944764061113483),
 ('good', 0.012157636122235497)]

In [53]:
# Overview table of the seeded model's topics (topic id, count, name, representation, representative docs)
seed_topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4467,-1_org_go_get_good,"[org, go, get, good, place, food, like, time, ...",[second last time visit <org> food always dece...
1,0,647,0_room_hotel_casino_stay,"[room, hotel, casino, stay, bed, org, desk, po...",[get back stay <org> last night never stay ive...
2,1,596,1_order_table_food_restaurant,"[order, table, food, restaurant, come, good, m...",[place okay menu seem little generic lack anyt...
3,2,511,2_bar_club_drink_beer,"[bar, club, drink, beer, dance, music, night, ...",[place favorite mine since start go bar defini...
4,3,339,3_mexican_salsa_taco_burrito,"[mexican, salsa, taco, burrito, chip, bean, fo...",[wow im begin think great mexican food az move...
...,...,...,...,...,...
72,71,11,71_shake_wife_steak_taste,"[shake, wife, steak, taste, pudding, naan, lam...",[boyfriend ate <org> anniversary dinner young ...
73,72,11,72_happy_hour_wing_slider,"[happy, hour, wing, slider, yeah, poki, spicy,...",[happy hour actual joke rule fair happy hour p...
74,73,11,73_fly_flight_airline_free,"[fly, flight, airline, free, attendant, portla...",[ive always love <org> since little girl fly f...
75,74,11,74_close_business_sign_830pm,"[close, business, sign, 830pm, reopen, 2011, d...",[want try since reopen sadly disappointed walk...


In [58]:
# Export the seeded model's topic overview table to CSV, without the index column
seed_topic_model.get_topic_info().to_csv('seedtopics.csv', index=False)

In [54]:
# Display the seeded model's topic visualization.
# NOTE(review): the recorded run failed with "Mime type rendering requires nbformat>=4.2.0";
# presumably nbformat must be installed in the kernel environment — confirm.
seed_topic_model.visualize_topics()


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed