In [1]:
# Importing Libraries
import csv
import re
import string

import pandas as pd
from bertopic import BERTopic

### Data Cleaning

In [2]:
def clean_text(text):
    """Lowercase ``text`` and keep only its alphabetic, non-stop-word tokens.

    The previous body called ``word_tokenize``, which is never imported in this
    notebook, so any call raised ``NameError``. Tokenization now uses a standard
    library regular expression, the same rule as the ``clean_text`` defined in a
    later cell, so the result does not depend on which definition ran last.

    Args:
        text: String to normalize.

    Returns:
        The kept tokens joined by single spaces ("" when nothing is kept).

    Note:
        Reads the notebook-level ``stop_words`` collection at call time, so the
        cell defining it must have run before this function is called.
    """
    text = text.lower()
    # Runs of two or more ASCII letters; single letters and anything containing
    # digits or punctuation are not matched.
    tokens = re.findall(r"\b[a-z]{2,}\b", text)
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)

In [3]:
# Load the labeled training dataset from CSV, using pandas' default parsing options
df_train = pd.read_csv(filepath_or_buffer="dataset_llm_judge.csv")

In [4]:
# Reading the validation dataset: pipe-delimited, no header row, two columns.
# The python engine is used together with explicit column names.
df_valid = pd.read_csv(
    "dataset_valid.csv",
    sep="|",
    engine="python",
    header=None,                # first line of the file is data, not column names
    names=["id", "text"],
    quoting=csv.QUOTE_NONE,     # treat quote characters as ordinary text (same value as the former literal 3)
    on_bad_lines="warn",        # malformed lines are reported as warnings instead of raising
    encoding="utf-8",           # decoded as UTF-8; this does not fall back automatically,
                                # so change it by hand (e.g. "latin1") if decoding fails
)


In [6]:
import re

# Because of an unexpected nltk bug, stop words come from this small hand-written
# set and tokenization is done with `re` instead.
stop_words = {
    "the", "and", "is", "to", "of", "in", "that",
    "for", "it", "on", "with", "as", "was", "at",
    "by", "an", "be", "this", "are", "from", "had",
}

In [7]:
def clean_text(text):
    """Normalize ``text`` into a space-separated string of kept words.

    The text is lowercased, words made of two or more ASCII letters are
    extracted, and any word found in the notebook-level ``stop_words``
    collection is removed.

    Args:
        text: String to normalize.

    Returns:
        The remaining words joined by single spaces ("" when none remain).
    """
    # Pick up words with 2+ letters from the lowercased text
    candidates = re.findall(r'\b[a-z]{2,}\b', text.lower())
    kept = filter(lambda candidate: candidate not in stop_words, candidates)
    return " ".join(kept)

In [8]:
# Add a "clean_text" column to both frames, derived from their "text" column.
for frame in (df_train, df_valid):
    # Cast to str first so clean_text always receives a string.
    frame["clean_text"] = frame["text"].astype(str).apply(clean_text)




In [9]:
# Plain Python lists of the cleaned documents, in row order, for the topic model
train_texts, valid_texts = (
    frame["clean_text"].astype(str).tolist()
    for frame in (df_train, df_valid)
)

### Training Model with BERTopic

In [10]:
# Training and applying the BERTopic model (default settings)
# NOTE(review): no random seed is set here, so topics may differ between runs — confirm.
topic_model = BERTopic()
# Fit on the training texts; returns the topic assignment per training document
# plus a second value kept as train_probs
train_topics, train_probs = topic_model.fit_transform(documents=train_texts)
# Apply the already-fitted model to the validation texts
valid_topics, valid_probs = topic_model.transform(documents=valid_texts)



In [11]:
# Show the per-topic summary table (topic id, document count, name,
# representation keywords and representative documents).
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,194,-1_food_but_they_my,"[food, but, they, my, not, all, we, you, were,...",[have reservations about all you can eat deal ...
1,0,68,0_lobster_fish_fresh_salads,"[lobster, fish, fresh, salads, food, delicious...","[fish so very fresh, only things could really ..."
2,1,57,1_service_food_waiter_rude,"[service, food, waiter, rude, me, waitress, we...",[waitress moved our table practically into bat...
3,2,51,2_place_loved_spot_have,"[place, loved, spot, have, fun, friends, there...",[place small intimate you may feel little crow...
4,3,32,3_service_slow_wait_friendly,"[service, slow, wait, friendly, prompt, staff,...","[but service bit slow, service prompt friendly..."
5,4,31,4_restaurant_decor_night_looking,"[restaurant, decor, night, looking, small, pla...","[small cute restaurant, decor night tho but th..."
6,5,28,5_back_again_will_go,"[back, again, will, go, going, would, ll, defi...","[will back, we go back again, we will never go..."
7,6,24,6_sushi_japanese_rolls_mizu,"[sushi, japanese, rolls, mizu, all, sashimi, e...",[boring inside our sushi pretty below average ...
8,7,24,7_gem_we_wrong_pleasantly,"[gem, we, wrong, pleasantly, surprised, red, w...","[you can not go wrong red eye grill, when we s..."
9,8,24,8_food_good_great_excellent,"[food, good, great, excellent, nothing, minnow...","[food great, food very good well considering w..."


In [12]:
# Print the first five keywords of every topic found by the model

for topic_id in topic_model.get_topics():
    # Skip the outlier bucket (topic -1)
    if topic_id == -1:
        continue

    topic_words = topic_model.get_topic(topic_id)

    # Anything other than a list means the keywords could not be retrieved
    if not isinstance(topic_words, list):
        print(f"Topic {topic_id}:error extracting keywords")
        continue

    # Each entry is a (word, score) pair; only the word is shown
    top_words = ", ".join(word for word, _ in topic_words[:5])
    print(f"Topic {topic_id}: {top_words}")

Topic 0: lobster, fish, fresh, salads, food
Topic 1: service, food, waiter, rude, me
Topic 2: place, loved, spot, have, fun
Topic 3: service, slow, wait, friendly, prompt
Topic 4: restaurant, decor, night, looking, small
Topic 5: back, again, will, go, going
Topic 6: sushi, japanese, rolls, mizu, all
Topic 7: gem, we, wrong, pleasantly, surprised
Topic 8: food, good, great, excellent, nothing
Topic 9: pizza, crust, ingredients, cheese, use
Topic 10: york, new, ny, manhattan, midtown
Topic 11: ambience, atmosphere, so, wonderful, service
Topic 12: prices, worth, felt, also, moderate
Topic 13: menu, wine, appetizer, list, great
Topic 14: wine, list, selection, glass, well
Topic 15: thai, planet, pad, sweet, ordered
