In [21]:
# --- 1. Imports ---
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

# Download stopwords if not already available
nltk.download("stopwords")
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\CLL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
# --- 2. Load Data ---
df = pd.read_csv("../data/cleaned_scraped_reviews.csv")

print("Shape:", df.shape)
print(df.columns)

# Focus only on the 'text' column for now
texts = df['text'].dropna().tolist()
print("Sample review:\n", texts[0])


Shape: (3941, 13)
Index(['Unnamed: 0', 'address', 'categories/0', 'categories/1', 'categories/2',
       'categories/3', 'categories/4', 'categoryName', 'name',
       'publishedAtDate', 'text', 'translatedLanguage', 'category'],
      dtype='object')
Sample review:
 Learn about their history, customs, and how they lived!


In [23]:
# --- 3. Preprocessing ---

stop_words = set(stopwords.words("english"))

def preprocess(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text)  # keep only letters
    tokens = [w for w in text.split() if w not in stop_words and len(w) > 2]
    return " ".join(tokens)

df['clean_text'] = df['text'].dropna().apply(preprocess)
df = df.dropna(subset=['clean_text'])
df.head()

Unnamed: 0.1,Unnamed: 0,address,categories/0,categories/1,categories/2,categories/3,categories/4,categoryName,name,publishedAtDate,text,translatedLanguage,category,clean_text
0,1,"Bosque de Chapultepec I Secc, Miguel Hidalgo, ...",Castle,Historical landmark,Historical place museum,Museum,Tourist attraction,Castle,Edgar,2025-08-27T11:31:55.424Z,"Learn about their history, customs, and how th...",en,,learn history customs lived
1,3,"Bosque de Chapultepec I Secc, Miguel Hidalgo, ...",Castle,Historical landmark,Historical place museum,Museum,Tourist attraction,Castle,Cecilia Figueroa,2025-08-27T03:40:13.279Z,If you like the history of Mexico or in genera...,en,,like history mexico general good idea come eve...
2,7,"Bosque de Chapultepec I Secc, Miguel Hidalgo, ...",Castle,Historical landmark,Historical place museum,Museum,Tourist attraction,Castle,Luis Spirit,2025-08-26T23:49:24.207Z,"Excellent place to visit with the family, enjo...",en,,excellent place visit family enjoy history sce...
3,8,"Bosque de Chapultepec I Secc, Miguel Hidalgo, ...",Castle,Historical landmark,Historical place museum,Museum,Tourist attraction,Castle,Juliana Rosenhave,2025-08-26T22:59:59.030Z,It's extremely touristy but worth it. Great vi...,,,extremely touristy worth great views city
4,9,"Bosque de Chapultepec I Secc, Miguel Hidalgo, ...",Castle,Historical landmark,Historical place museum,Museum,Tourist attraction,Castle,DANIEL,2025-08-26T21:57:10.257Z,spectacular beyond words.,en,,spectacular beyond words


In [24]:
# --- 4. TF-IDF Vectorization ---
vectorizer = TfidfVectorizer(
    max_df=0.8,   # ignore words that appear in >80% docs
    min_df=5,     # ignore words that appear in <5 docs
    stop_words='english'
)

X = vectorizer.fit_transform(df['clean_text'])
print("TF-IDF matrix shape:", X.shape)

TF-IDF matrix shape: (3941, 1542)


In [25]:
# --- 5. Topic Modeling (NMF) ---
num_topics = 20  # start with 8, we can tune later
nmf = NMF(n_components=num_topics, random_state=42)
nmf.fit(X)

feature_names = vectorizer.get_feature_names_out()

def display_topics(model, feature_names, no_top_words):
    for idx, topic in enumerate(model.components_):
        print(f"Topic {idx}:")
        print(" | ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
        print()

display_topics(nmf, feature_names, 10)

Topic 0:
good | attention | price | atmosphere | prices | seasoning | taste | quality | cheap | rich

Topic 1:
excellent | family | location | attention | view | prices | option | restaurant | atmosphere | cathedral

Topic 2:
beautiful | architecture | cathedral | view | square | views | oaxaca | free | worth | tour

Topic 3:
service | attention | attentive | flavor | customer | terrible | quick | speed | tasty | loved

Topic 4:
place | visit | family | history | walk | enjoy | wonderful | incredible | emblematic | spend

Topic 5:
fast | friendly | efficient | prices | rich | clean | attentive | tasty | quite | affordable

Topic 6:
delicious | friendly | flutes | dinner | views | cheap | flan | dishes | quick | wait

Topic 7:
nice | walk | clean | museum | worth | really | safe | tasty | pretty | experience

Topic 8:
city | mexico | visit | history | center | historic | mustsee | heart | zcalo | cathedral

Topic 9:
like | time | dont | order | house | toos | didnt | bad | really | long

In [26]:
# --- 6. Assign topics back to reviews ---
topic_values = nmf.transform(X)
df['topic'] = topic_values.argmax(axis=1)

df[['text', 'topic']].head(20)

Unnamed: 0,text,topic
0,"Learn about their history, customs, and how th...",8
1,If you like the history of Mexico or in genera...,17
2,"Excellent place to visit with the family, enjo...",1
3,It's extremely touristy but worth it. Great vi...,11
4,spectacular beyond words.,18
5,"Reasonably well maintained, despite government...",8
6,✌🏽,0
7,"A beautiful place, the views are unmissable",2
8,One of the best museums I've been to✨✨,14
9,It is a very nice place to visit with the family,7


In [27]:
# --- Reorder columns so that: address | topic | text | (rest) ---

# Make sure topic column exists (empty for now, will be filled after modeling)
if "topic" not in df.columns:
    df["topic"] = None

# Reorder logic
front_cols = ["address", "topic", "text"]
other_cols = [col for col in df.columns if col not in front_cols]

# Reordered DataFrame
df = df[front_cols + other_cols]

df.head(10)

Unnamed: 0.1,address,topic,text,Unnamed: 0,categories/0,categories/1,categories/2,categories/3,categories/4,categoryName,name,publishedAtDate,translatedLanguage,category,clean_text
0,"Bosque de Chapultepec I Secc, Miguel Hidalgo, ...",8,"Learn about their history, customs, and how th...",1,Castle,Historical landmark,Historical place museum,Museum,Tourist attraction,Castle,Edgar,2025-08-27T11:31:55.424Z,en,,learn history customs lived
1,"Bosque de Chapultepec I Secc, Miguel Hidalgo, ...",17,If you like the history of Mexico or in genera...,3,Castle,Historical landmark,Historical place museum,Museum,Tourist attraction,Castle,Cecilia Figueroa,2025-08-27T03:40:13.279Z,en,,like history mexico general good idea come eve...
2,"Bosque de Chapultepec I Secc, Miguel Hidalgo, ...",1,"Excellent place to visit with the family, enjo...",7,Castle,Historical landmark,Historical place museum,Museum,Tourist attraction,Castle,Luis Spirit,2025-08-26T23:49:24.207Z,en,,excellent place visit family enjoy history sce...
3,"Bosque de Chapultepec I Secc, Miguel Hidalgo, ...",11,It's extremely touristy but worth it. Great vi...,8,Castle,Historical landmark,Historical place museum,Museum,Tourist attraction,Castle,Juliana Rosenhave,2025-08-26T22:59:59.030Z,,,extremely touristy worth great views city
4,"Bosque de Chapultepec I Secc, Miguel Hidalgo, ...",18,spectacular beyond words.,9,Castle,Historical landmark,Historical place museum,Museum,Tourist attraction,Castle,DANIEL,2025-08-26T21:57:10.257Z,en,,spectacular beyond words
5,"Bosque de Chapultepec I Secc, Miguel Hidalgo, ...",8,"Reasonably well maintained, despite government...",11,Castle,Historical landmark,Historical place museum,Museum,Tourist attraction,Castle,Jose Luis Flores,2025-08-26T21:20:36.730Z,en,,reasonably well maintained despite government ...
6,"Bosque de Chapultepec I Secc, Miguel Hidalgo, ...",0,✌🏽,14,Castle,Historical landmark,Historical place museum,Museum,Tourist attraction,Castle,Corky Jordan,2025-08-26T17:30:22.328Z,,,
7,"Bosque de Chapultepec I Secc, Miguel Hidalgo, ...",2,"A beautiful place, the views are unmissable",17,Castle,Historical landmark,Historical place museum,Museum,Tourist attraction,Castle,Angel Aguilera,2025-08-26T06:36:08.906Z,en,,beautiful place views unmissable
8,"Bosque de Chapultepec I Secc, Miguel Hidalgo, ...",14,One of the best museums I've been to✨✨,19,Castle,Historical landmark,Historical place museum,Museum,Tourist attraction,Castle,Miriel Malvaez,2025-08-26T04:34:29.320Z,en,,one best museums ive
9,"Bosque de Chapultepec I Secc, Miguel Hidalgo, ...",7,It is a very nice place to visit with the family,20,Castle,Historical landmark,Historical place museum,Museum,Tourist attraction,Castle,Arantza Noriega,2025-08-26T03:21:58.974Z,en,,nice place visit family


In [28]:
# --- 7. Save outputs ---
df.to_csv("../data/reviews_with_topics.csv", index=False)