# Import Libraries

In [4]:
import pandas as pd
from tqdm import tqdm
from topicutils import sample_polarity, get_topics_bert
import os

# Allow multiple outputs to be displayed for each cell
# ("all" makes every top-level expression in a cell echo its value, not only the last one)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Set the tqdm progress bar
# (registers tqdm's pandas integration, e.g. the progress_apply methods)
tqdm.pandas()

os.environ["TOKENIZERS_PARALLELISM"] = "false" # Disable tokenizers parallelism to avoid deadlocks
# Relative path of the directory holding the preprocessed review CSVs read below
DATASET_DIR = '../dataset/'

2024-08-02 08:07:30.808025: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-02 08:07:30.808120: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-02 08:07:30.953679: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Load and Sample Data

In [22]:
# Read the preprocessed reviews and drop rows with missing values.
# Chaining .dropna() returns a new frame instead of mutating in place.
london = pd.read_csv(os.path.join(DATASET_DIR, 'london_preprocessed.csv')).dropna()
nyc = pd.read_csv(os.path.join(DATASET_DIR, 'nyc_preprocessed.csv')).dropna()

# Pooled frame (London rows followed by NYC rows) used for the cross-city samples
data = pd.concat([london, nyc], axis=0)

# Sample data due to compute resource limitations
sample_size = 50000

# DataFrame.sample raises ValueError when n exceeds the number of rows
# (replace=False), so cap the request at the size of each city's frame.
sample_london = london.sample(min(sample_size, len(london)), random_state=42)
sample_nyc = nyc.sample(min(sample_size, len(nyc)), random_state=42)

# Polarity-specific samples.
# NOTE(review): presumably 1.0 selects positive and 0.0 negative reviews, matching
# the variable names and the labels used when fitting the models -- confirm in
# topicutils.sample_polarity. The helper is assumed to cope with fewer than
# `sample_size` matching rows (the negative models are fitted on far fewer
# documents than the positive ones).
sample_positives = sample_polarity(data, 1.0, sample_size)
sample_negatives = sample_polarity(data, 0.0, sample_size)
sample_london_positives = sample_polarity(london, 1.0, sample_size)
sample_london_negatives = sample_polarity(london, 0.0, sample_size)
sample_nyc_positives = sample_polarity(nyc, 1.0, sample_size)
sample_nyc_negatives = sample_polarity(nyc, 0.0, sample_size)

# Run BERTopic

In [25]:
# Number of topics requested from every model; also read by the display code below.
num_topics = 20


def _fit_topics(reviews, label):
    """Call get_topics_bert on `reviews` with `label` and the shared `num_topics`."""
    return get_topics_bert(reviews, label, num_topics)


# One model per city (no polarity filter)
tm_london = _fit_topics(sample_london, 'London')
tm_nyc = _fit_topics(sample_nyc, 'NYC')

# One model per polarity on the pooled data
tm_pos = _fit_topics(sample_positives, 'Positive Reviews')
tm_neg = _fit_topics(sample_negatives, 'Negative Reviews')

# One model per city/polarity combination
tm_london_pos = _fit_topics(sample_london_positives, 'London Positive Reviews')
tm_london_neg = _fit_topics(sample_london_negatives, 'London Negative Reviews')
tm_nyc_pos = _fit_topics(sample_nyc_positives, 'NYC Positive Reviews')
tm_nyc_neg = _fit_topics(sample_nyc_negatives, 'NYC Negative Reviews')

2024-08-02 09:21:43,587 - BERTopic - Embedding - Transforming documents to embeddings.


Getting topics for London...


Batches:   0%|          | 0/1563 [00:00<?, ?it/s]

2024-08-02 09:22:00,149 - BERTopic - Embedding - Completed ✓
2024-08-02 09:22:00,150 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-08-02 09:22:42,391 - BERTopic - Dimensionality - Completed ✓
2024-08-02 09:22:42,393 - BERTopic - Cluster - Start clustering the reduced embeddings

os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.

2024-08-02 09:22:46,740 - BERTopic - Cluster - Completed ✓
2024-08-02 09:22:46,742 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-08-02 09:23:25,758 - BERTopic - Representation - Completed ✓
2024-08-02 09:23:25,764 - BERTopic - Topic reduction - Reducing number of topics
2024-08-02 09:23:28,986 - BERTopic - Topic reduction - Reduced number of topics from 565 to 20
2024-08-02 09:23:29,391 - BERTopic - Embedding - Transforming documents to embeddings.


Getting topics for NYC...


Batches:   0%|          | 0/1563 [00:00<?, ?it/s]

2024-08-02 09:23:45,171 - BERTopic - Embedding - Completed ✓
2024-08-02 09:23:45,172 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-08-02 09:24:26,698 - BERTopic - Dimensionality - Completed ✓
2024-08-02 09:24:26,700 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-08-02 09:24:29,831 - BERTopic - Cluster - Completed ✓
2024-08-02 09:24:29,832 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-08-02 09:25:07,216 - BERTopic - Representation - Completed ✓
2024-08-02 09:25:07,222 - BERTopic - Topic reduction - Reducing number of topics
2024-08-02 09:25:10,545 - BERTopic - Topic reduction - Reduced number of topics from 544 to 20
2024-08-02 09:25:10,896 - BERTopic - Embedding - Transforming documents to embeddings.


Getting topics for Positive Reviews...


Batches:   0%|          | 0/1563 [00:00<?, ?it/s]

2024-08-02 09:25:26,672 - BERTopic - Embedding - Completed ✓
2024-08-02 09:25:26,672 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-08-02 09:26:09,207 - BERTopic - Dimensionality - Completed ✓
2024-08-02 09:26:09,210 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-08-02 09:26:12,268 - BERTopic - Cluster - Completed ✓
2024-08-02 09:26:12,269 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-08-02 09:26:48,914 - BERTopic - Representation - Completed ✓
2024-08-02 09:26:48,920 - BERTopic - Topic reduction - Reducing number of topics
2024-08-02 09:26:52,203 - BERTopic - Topic reduction - Reduced number of topics from 543 to 20
2024-08-02 09:26:52,549 - BERTopic - Embedding - Transforming documents to embeddings.


Getting topics for Negative Reviews...


Batches:   0%|          | 0/117 [00:00<?, ?it/s]

2024-08-02 09:26:54,773 - BERTopic - Embedding - Completed ✓
2024-08-02 09:26:54,773 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-08-02 09:27:18,551 - BERTopic - Dimensionality - Completed ✓
2024-08-02 09:27:18,552 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-08-02 09:27:18,708 - BERTopic - Cluster - Completed ✓
2024-08-02 09:27:18,709 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-08-02 09:27:21,102 - BERTopic - Representation - Completed ✓
2024-08-02 09:27:21,104 - BERTopic - Topic reduction - Reducing number of topics
2024-08-02 09:27:22,040 - BERTopic - Topic reduction - Reduced number of topics from 69 to 20
2024-08-02 09:27:22,238 - BERTopic - Embedding - Transforming documents to embeddings.


Getting topics for London Positive Reviews...


Batches:   0%|          | 0/1563 [00:00<?, ?it/s]

2024-08-02 09:27:39,113 - BERTopic - Embedding - Completed ✓
2024-08-02 09:27:39,115 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-08-02 09:28:23,479 - BERTopic - Dimensionality - Completed ✓
2024-08-02 09:28:23,481 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-08-02 09:28:26,621 - BERTopic - Cluster - Completed ✓
2024-08-02 09:28:26,622 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-08-02 09:29:08,434 - BERTopic - Representation - Completed ✓
2024-08-02 09:29:08,440 - BERTopic - Topic reduction - Reducing number of topics
2024-08-02 09:29:11,767 - BERTopic - Topic reduction - Reduced number of topics from 595 to 20
2024-08-02 09:29:12,084 - BERTopic - Embedding - Transforming documents to embeddings.


Getting topics for London Negative Reviews...


Batches:   0%|          | 0/68 [00:00<?, ?it/s]

2024-08-02 09:29:13,817 - BERTopic - Embedding - Completed ✓
2024-08-02 09:29:13,818 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-08-02 09:29:24,516 - BERTopic - Dimensionality - Completed ✓
2024-08-02 09:29:24,518 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-08-02 09:29:24,612 - BERTopic - Cluster - Completed ✓
2024-08-02 09:29:24,613 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-08-02 09:29:26,005 - BERTopic - Representation - Completed ✓
2024-08-02 09:29:26,007 - BERTopic - Topic reduction - Reducing number of topics
2024-08-02 09:29:26,780 - BERTopic - Topic reduction - Reduced number of topics from 39 to 20
2024-08-02 09:29:26,957 - BERTopic - Embedding - Transforming documents to embeddings.


Getting topics for NYC Positive Reviews...


Batches:   0%|          | 0/1563 [00:00<?, ?it/s]

2024-08-02 09:29:42,545 - BERTopic - Embedding - Completed ✓
2024-08-02 09:29:42,546 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-08-02 09:30:29,726 - BERTopic - Dimensionality - Completed ✓
2024-08-02 09:30:29,728 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-08-02 09:30:32,937 - BERTopic - Cluster - Completed ✓
2024-08-02 09:30:32,938 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-08-02 09:31:05,245 - BERTopic - Representation - Completed ✓
2024-08-02 09:31:05,251 - BERTopic - Topic reduction - Reducing number of topics
2024-08-02 09:31:08,531 - BERTopic - Topic reduction - Reduced number of topics from 518 to 20
2024-08-02 09:31:08,874 - BERTopic - Embedding - Transforming documents to embeddings.


Getting topics for NYC Negative Reviews...


Batches:   0%|          | 0/50 [00:00<?, ?it/s]

2024-08-02 09:31:10,385 - BERTopic - Embedding - Completed ✓
2024-08-02 09:31:10,386 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-08-02 09:31:17,227 - BERTopic - Dimensionality - Completed ✓
2024-08-02 09:31:17,228 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-08-02 09:31:17,290 - BERTopic - Cluster - Completed ✓
2024-08-02 09:31:17,291 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-08-02 09:31:17,481 - BERTopic - Representation - Completed ✓
2024-08-02 09:31:17,482 - BERTopic - Topic reduction - Reducing number of topics
2024-08-02 09:31:17,484 - BERTopic - Topic reduction - Reduced number of topics from 2 to 2


## Analysis of Polar Reviews (Both Cities)

In [50]:
def visualize(tm, top_n=None, min_topics_for_map=10):
    """Show a topic model's summary table, bar chart and, if large enough, topic map.

    Parameters
    ----------
    tm :
        Fitted topic model exposing ``get_topic_info``, ``get_topics``,
        ``visualize_barchart`` and ``visualize_topics`` (in this notebook, the
        objects returned by ``get_topics_bert``).
    top_n : int, optional
        Number of topics to show in the table and the bar chart. When omitted,
        the notebook-level ``num_topics`` is used.
    min_topics_for_map : int, default 10
        The inter-topic map is skipped when ``tm.get_topics()`` holds fewer
        entries than this.

    Returns
    -------
    None
        Everything is rendered as a side effect.
    """
    # Imported explicitly so the function does not depend on the `display`
    # name that IPython injects into interactive sessions.
    from IPython.display import display

    if top_n is None:
        top_n = num_topics  # fall back to the notebook-wide setting

    # One extra row -- presumably to leave room for the outlier topic (-1); confirm.
    display(tm.get_topic_info().head(top_n + 1))
    tm.visualize_barchart(top_n_topics=top_n).show()

    # NOTE(review): the map is presumably skipped for small models because it
    # cannot be drawn reliably with very few topics -- confirm against BERTopic.
    if len(tm.get_topics()) < min_topics_for_map:
        return
    tm.visualize_topics().show()
# Model fitted on the positive-review sample drawn from both cities combined
visualize(tm_pos)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,21221,-1_great place_great location_great stay_great...,"[great place, great location, great stay, grea...",[Great location perfect comfortable room Would...
1,0,23832,0_great location_location great_great place_gr...,"[great location, location great, great place, ...","[Great host great location, Great location gre..."
2,1,1490,1_great stay_excellent stay_wonderful stay_goo...,"[great stay, excellent stay, wonderful stay, g...","[Great stay, Great stay, Great stay]"
3,2,1073,2_comfortable bed_bed comfortable_comfy bed_be...,"[comfortable bed, bed comfortable, comfy bed, ...","[comfortable bed, comfortable bed, comfortable..."
4,3,654,3_great_wonderful_excellent_great great,"[great, wonderful, excellent, great great, goo...","[Great, Great, Great]"
5,4,531,4_everything perfect_everythings perfect_perfe...,"[everything perfect, everythings perfect, perf...","[Everything perfect, Everything perfect, Every..."
6,5,342,5_great location_location great_location excel...,"[great location, location great, location exce...","[Great location Great value money, Great locat..."
7,6,223,6_great location_location great_good location_...,"[great location, location great, good location...","[Great location easy check, Great location eas..."
8,7,189,7_great staff_staff great_wonderful staff_awes...,"[great staff, staff great, wonderful staff, aw...","[great staff, Great location great staff, Grea..."
9,8,87,8_clean nice_nice clean_kind clean_good clean,"[clean nice, nice clean, kind clean, good clea...","[clean nice, clean nice, clean nice]"


In [42]:
# Model fitted on the negative-review sample drawn from both cities combined.
# NOTE(review): the table below also lists positive-sounding topics (e.g. "cozy place",
# "host helpful") -- worth checking how sample_polarity labels reviews.
visualize(tm_neg)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1444,-1_airbnb_bedroom_hotel_apartment,"[airbnb, bedroom, hotel, apartment, room, bed,...",[host responsive helpful kind well issue place...
1,0,458,0_exploring london_base london_explore london_...,"[exploring london, base london, explore london...","[good base exploring London, good base explori..."
2,1,350,1_cozy place_place cozy_cozy_cozy cozy,"[cozy place, place cozy, cozy, cozy cozy, cozy...","[cozy place, cozy, cozy]"
3,2,330,2_dirty place_place dirty_dirty_room dirty,"[dirty place, place dirty, dirty, room dirty, ...","[Dirty place, DIRTY PLACE Dirty sheet dirty ro..."
4,3,220,3_worst airbnb_airbnb experience_airbnb_hotel,"[worst airbnb, airbnb experience, airbnb, hote...",[Nicely decorated cute apartment unfortunately...
5,4,189,4_cold_hot cold_cold cold_cold heat,"[cold, hot cold, cold cold, cold heat, cold wa...","[Cold, cold, Cold]"
6,5,164,5_uncomfortable bed_bed uncomfortable_bed comf...,"[uncomfortable bed, bed uncomfortable, bed com...","[bed uncomfortable Nice location, Really nice ..."
7,6,134,6_host helpful_helpful host_responsive host_he...,"[host helpful, helpful host, responsive host, ...","[Extremely responsive helpful, Host extremely ..."
8,7,102,7_elevator working_lock box_elevator_check key,"[elevator working, lock box, elevator, check k...",[Checkin difficult info sent phone receiving s...
9,8,76,8_horrible service_service horrible_terrible c...,"[horrible service, service horrible, terrible ...","[bad customer service, terrible customer servi..."


## Analysis of City Reviews

In [43]:
# Model fitted on the London sample (no polarity filter)
visualize(tm_london)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,21959,-1_great location_great place_perfect location...,"[great location, great place, perfect location...","[great location great host everything need, Wo..."
1,0,23191,0_host great_great host_great place_place great,"[host great, great host, great place, place gr...","[Great place great host, Great place great loc..."
2,1,1320,1_great stay_excellent stay_wonderful stay_goo...,"[great stay, excellent stay, wonderful stay, g...","[Great stay, Great stay, great stay]"
3,2,820,2_everything perfect_everythings perfect_perfe...,"[everything perfect, everythings perfect, perf...","[Everything perfect, Everything perfect, every..."
4,3,762,3_comfortable_comfortable comfortable_comforta...,"[comfortable, comfortable comfortable, comfort...","[comfortable, comfortable, comfortable]"
5,4,493,4_great value_fantastic value_excellent value_...,"[great value, fantastic value, excellent value...","[Great location great value money, Great locat..."
6,5,421,5_great_excellent_great great_good great,"[great, excellent, great great, good great, aw...","[Great, Great, Great]"
7,6,357,6_welcoming host_host welcoming_welcoming plac...,"[welcoming host, host welcoming, welcoming pla...","[warm welcoming host wonderful stay, Nice plac..."
8,7,110,7_clean wifi_wifi place_good wifi_great wifi,"[clean wifi, wifi place, good wifi, great wifi...","[great room clean good wifi Tibor great host, ..."
9,8,103,8_great experience_excellent experience_wonder...,"[great experience, excellent experience, wonde...","[great experience, great experience, great exp..."


In [45]:
# Model fitted on the London positive-review sample
visualize(tm_london_pos)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,22900,-1_great location_great place_perfect location...,"[great location, great place, perfect location...",[Great place excellent location wonderful host...
1,0,21766,0_great location_location great_great place_ex...,"[great location, location great, great place, ...","[GREAT LOCATION GREAT HOST, Great location gre..."
2,1,2694,1_great stay_wonderful stay_excellent stay_fan...,"[great stay, wonderful stay, excellent stay, f...","[Great stay great location, Great stay great l..."
3,2,673,2_everything perfect_perfect everything_perfec...,"[everything perfect, perfect everything, perfe...","[Everything perfect 5, Everything perfect, per..."
4,3,562,3_great_great great_good great_wonderful,"[great, great great, good great, wonderful, gr...","[great, great, Great Great Great]"
5,4,427,4_great location_location great_excellent loca...,"[great location, location great, excellent loc...","[Great location great value money, Great locat..."
6,5,253,5_comfortable stay_stay comfortable_clean comf...,"[comfortable stay, stay comfortable, clean com...","[comfortable stay, Clean comfortable stay, cle..."
7,6,115,6_clean tidy_tidy clean_tidy_tidy easy,"[clean tidy, tidy clean, tidy, tidy easy, neat...","[clean tidy, Clean tidy, Clean tidy]"
8,7,93,7_nice appartment_great appartment_nice appart...,"[nice appartment, great appartment, nice appar...","[nice appartment, Nice appartment great locati..."
9,8,81,8_good experience_great experience_wonderful e...,"[good experience, great experience, wonderful ...","[good experience, good experience, good experi..."


In [46]:
# Model fitted on the London negative-review sample
visualize(tm_london_neg)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,788,-1_bedroom_hotel_airbnb_apartment,"[bedroom, hotel, airbnb, apartment, room, bath...",[worst host Ive ever encountered stay trip Mia...
1,0,252,0_exploring london_base london_london base_exp...,"[exploring london, base london, london base, e...","[good base exploring London, Good base explori..."
2,1,162,1_worst airbnb_airbnb experience_airbnb_airbnb...,"[worst airbnb, airbnb experience, airbnb, airb...",[horrible experience messaged day never respon...
3,2,158,2_cozy_cozy cozy_place cozy_cozy stay,"[cozy, cozy cozy, place cozy, cozy stay, cozy ...","[cozy, cozy, cozy]"
4,3,119,3_dirty bathroom_room dirty_bathroom dirty_dir...,"[dirty bathroom, room dirty, bathroom dirty, d...",[Bad experience Dirty apartment especially kit...
5,4,107,4_bed uncomfortable_uncomfortable bed_bed comf...,"[bed uncomfortable, uncomfortable bed, bed com...","[Bed sheet uncomfortable, Really nice sofa bed..."
6,5,103,5_wont disappointed_disappointed_disappointed ...,"[wont disappointed, disappointed, disappointed...","[wont disappointed, wont disappointed, wont di..."
7,6,67,6_sleep room_noisy night_extremely noisy_apart...,"[sleep room, noisy night, extremely noisy, apa...",[Clean linen generous room size spotless kitch...
8,7,63,7_cold_bit cold_room cold_cold heating,"[cold, bit cold, room cold, cold heating, cold...",[place meet expectation disappointed heating s...
9,8,51,8_hot water_shower hot_cold shower_shower cold,"[hot water, shower hot, cold shower, shower co...",[place great location met need especially prov...


In [47]:
# Model fitted on the NYC sample (no polarity filter)
visualize(tm_nyc)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,21201,-1_great place_great location_place great_plac...,"[great place, great location, place great, pla...",[Wonderful location great room Would definitel...
1,0,24786,0_great location_location great_great place_pl...,"[great location, location great, great place, ...","[Great place great location, Great place great..."
2,1,1786,1_great stay_excellent stay_wonderful stay_ama...,"[great stay, excellent stay, wonderful stay, a...","[Great stay, Great stay, Great stay]"
3,2,979,2_everything great_great everything_everything...,"[everything great, great everything, everythin...","[Everything great, great Everything perfect, E..."
4,3,231,3_easy check_check easy_easy checkin_checkin easy,"[easy check, check easy, easy checkin, checkin...","[Great location Easy check check, Easy check, ..."
5,4,188,4_nice clean_clean nice_beautiful clean_pretty...,"[nice clean, clean nice, beautiful clean, pret...","[nice clean, nice clean, Nice clean]"
6,5,185,5_great communication_communication great_exce...,"[great communication, communication great, exc...","[Great communication, Great communication, Gre..."
7,6,148,6_great time_amazing time_wonderful time_great...,"[great time, amazing time, wonderful time, gre...","[great time Great place, great time great loca..."
8,7,123,7_nice guy_great guy_guy nice_guy great,"[nice guy, great guy, guy nice, guy great, awe...","[friendly nice guy, Nice guy, nice guy]"
9,8,96,8_great experience_wonderful experience_amazin...,"[great experience, wonderful experience, amazi...","[great experience, great experience, great exp..."


In [48]:
# Model fitted on the NYC positive-review sample
visualize(tm_nyc_pos)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,21433,-1_great place_great location_place great_place,"[great place, great location, place great, pla...","[Great place, Great place, Great place]"
1,0,22982,0_great place_great location_location great_pl...,"[great place, great location, location great, ...","[Great place Great host, Great place great loc..."
2,1,1798,1_great stay_excellent stay_wonderful stay_ama...,"[great stay, excellent stay, wonderful stay, a...","[Great stay, great stay, Great stay]"
3,2,1551,2_clean_clean clean_clean good_clean easy,"[clean, clean clean, clean good, clean easy, c...","[clean, clean, clean]"
4,3,619,3_great_wonderful_excellent_great great,"[great, wonderful, excellent, great great, goo...","[great, great, Great]"
5,4,521,4_great location_location great_excellent loca...,"[great location, location great, excellent loc...","[Great Place Great Location great price, Great..."
6,5,389,5_everything perfect_everythings perfect_perfe...,"[everything perfect, everythings perfect, perf...","[Everything perfect, everything perfect, Every..."
7,6,171,6_great time_wonderful time_good time_amazing ...,"[great time, wonderful time, good time, amazin...","[great time, great time great location, great ..."
8,7,127,7_great experience_excellent experience_wonder...,"[great experience, excellent experience, wonde...","[great experience, great experience, Great exp..."
9,8,122,8_nice helpful_helpful nice_helpful_kind helpful,"[nice helpful, helpful nice, helpful, kind hel...","[nice helpful, nice helpful, nice helpful]"


In [51]:
# Model fitted on the NYC negative-review sample.
# NOTE(review): the fitting log reports only 2 topics for this model, so visualize()
# returns before drawing the inter-topic map; the larger topic is "cozy place" --
# worth checking the polarity labelling and the sample size.
visualize(tm_nyc_neg)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,24,0_horrible_terrible_horrible horrible_horrible...,"[horrible, terrible, horrible horrible, horrib...","[Horrible, Horrible, Horrible]"
1,1,1545,1_cozy place_cozy_bedroom_hotel,"[cozy place, cozy, bedroom, hotel, place, apar...","[cozy, would recommend airbnb stay 5 night gre..."
