In [61]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF

# Download stopwords if not already done
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\CLL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [62]:
# Load your cleaned reviews
cleaned_df = pd.read_csv("../data/cleaned_scraped_reviews.csv", encoding='latin1')

# Quick preview
cleaned_df.head()

Unnamed: 0.1,Unnamed: 0,address,categories/0,categories/1,categories/2,categories/3,categories/4,categoryName,name,publishedAtDate,text,translatedLanguage,category
0,1,"Bosque de Chapultepec I Secc, Miguel Hidalgo, ...",Castle,Historical landmark,Historical place museum,Museum,Tourist attraction,Castle,Edgar,2025-08-27T11:31:55.424Z,"Learn about their history, customs, and how th...",en,
1,3,"Bosque de Chapultepec I Secc, Miguel Hidalgo, ...",Castle,Historical landmark,Historical place museum,Museum,Tourist attraction,Castle,Cecilia Figueroa,2025-08-27T03:40:13.279Z,If you like the history of Mexico or in genera...,en,
2,7,"Bosque de Chapultepec I Secc, Miguel Hidalgo, ...",Castle,Historical landmark,Historical place museum,Museum,Tourist attraction,Castle,Luis Spirit,2025-08-26T23:49:24.207Z,"Excellent place to visit with the family, enjo...",en,
3,8,"Bosque de Chapultepec I Secc, Miguel Hidalgo, ...",Castle,Historical landmark,Historical place museum,Museum,Tourist attraction,Castle,Juliana Rosenhave,2025-08-26T22:59:59.030Z,It's extremely touristy but worth it. Great vi...,,
4,9,"Bosque de Chapultepec I Secc, Miguel Hidalgo, ...",Castle,Historical landmark,Historical place museum,Museum,Tourist attraction,Castle,DANIEL,2025-08-26T21:57:10.257Z,spectacular beyond words.,en,


In [63]:
def preprocess_text(text):
    text = str(text).lower()  # lowercase
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # remove URLs
    text = re.sub(r'[^a-z\s]', '', text)  # remove punctuation/numbers
    text = ' '.join([word for word in text.split() if word not in stop_words])  # remove stopwords
    return text

cleaned_df['cleaned_text'] = cleaned_df['text'].apply(preprocess_text)

In [64]:
# Use TF-IDF or CountVectorizer
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, min_df=10, ngram_range=(1,1))
tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_df['cleaned_text'])

In [65]:
n_topics = 15  # try increasing or decreasing to refine topics
lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda_topics = lda_model.fit_transform(tfidf_matrix)

In [66]:
# Assign dominant topic to each review
topic_numbers = np.argmax(lda_topics, axis=1)
cleaned_df['topic_number'] = topic_numbers

# Optional: Get top words per topic
def get_top_words(model, feature_names, n_top_words):
    topics = {}
    for topic_idx, topic in enumerate(model.components_):
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        topics[topic_idx] = top_features
    return topics

top_words = get_top_words(lda_model, tfidf_vectorizer.get_feature_names_out(), 10)
top_words

{0: ['good',
  'beautiful',
  'place',
  'eat',
  'service',
  'walk',
  'food',
  'always',
  'seasoning',
  'dirty'],
 1: ['love',
  'recommended',
  'highly',
  'expensive',
  'fine',
  'pozole',
  'recommend',
  'everything',
  'great',
  'place'],
 2: ['family',
  'experience',
  'place',
  'best',
  'nice',
  'clean',
  'safe',
  'spend',
  'lot',
  'history'],
 3: ['beautiful',
  'cathedral',
  'history',
  'incredible',
  'many',
  'architecture',
  'mexico',
  'impressive',
  'full',
  'place'],
 4: ['parking',
  'de',
  'casa',
  'dishes',
  'good',
  'places',
  'option',
  'la',
  'delicious',
  'flautas'],
 5: ['closed',
  'interesting',
  'noisy',
  'horrible',
  'favorite',
  'make',
  'due',
  'less',
  'space',
  'open'],
 6: ['nice',
  'great',
  'amazing',
  'place',
  'food',
  'toronto',
  'views',
  'tacos',
  'good',
  'view'],
 7: ['food',
  'service',
  'bad',
  'tacos',
  'order',
  'dont',
  'even',
  'like',
  'time',
  'good'],
 8: ['place',
  'wonderful',


In [67]:
from collections import Counter

lst = []

for topic, words in top_words.items():
    for i in words:
        lst.append(i)

newLst = []
for i in set(lst):
    newLst.append((lst.count(i), i))

print(sorted(newLst, reverse=True))


[(7, 'place'), (7, 'good'), (5, 'service'), (5, 'great'), (4, 'food'), (3, 'everything'), (3, 'beautiful'), (2, 'visit'), (2, 'view'), (2, 'time'), (2, 'tacos'), (2, 'square'), (2, 'nice'), (2, 'mexico'), (2, 'like'), (2, 'history'), (2, 'excellent'), (2, 'delicious'), (2, 'always'), (1, 'wonderful'), (1, 'walk'), (1, 'wait'), (1, 'views'), (1, 'toronto'), (1, 'toos'), (1, 'super'), (1, 'stay'), (1, 'station'), (1, 'staff'), (1, 'spend'), (1, 'space'), (1, 'serve'), (1, 'seasoning'), (1, 'safe'), (1, 'rooms'), (1, 'recommended'), (1, 'recommend'), (1, 'quickly'), (1, 'problem'), (1, 'prices'), (1, 'pretty'), (1, 'pozole'), (1, 'positive'), (1, 'pleasant'), (1, 'places'), (1, 'parking'), (1, 'order'), (1, 'option'), (1, 'open'), (1, 'noisy'), (1, 'negative'), (1, 'much'), (1, 'metro'), (1, 'many'), (1, 'make'), (1, 'main'), (1, 'loved'), (1, 'love'), (1, 'lots'), (1, 'lot'), (1, 'location'), (1, 'less'), (1, 'la'), (1, 'interesting'), (1, 'incredible'), (1, 'impressive'), (1, 'houses'),

In [68]:
# Example: you can just join top words for readability
cleaned_df['topic'] = cleaned_df['topic_number'].apply(lambda x: ', '.join(top_words[x]))

# Reorder columns: address, topic, text, rest
cols = ['address', 'topic', 'text'] + [c for c in cleaned_df.columns if c not in ['address','topic','text']]
cleaned_df = cleaned_df[cols]

# Quick preview
cleaned_df.head()

Unnamed: 0.1,address,topic,text,Unnamed: 0,categories/0,categories/1,categories/2,categories/3,categories/4,categoryName,name,publishedAtDate,translatedLanguage,category,cleaned_text,topic_number
0,"Bosque de Chapultepec I Secc, Miguel Hidalgo, ...","place, wonderful, visit, castle, great, time, ...","Learn about their history, customs, and how th...",1,Castle,Historical landmark,Historical place museum,Museum,Tourist attraction,Castle,Edgar,2025-08-27T11:31:55.424Z,en,,learn history customs lived,8
1,"Bosque de Chapultepec I Secc, Miguel Hidalgo, ...","nice, great, amazing, place, food, toronto, vi...",If you like the history of Mexico or in genera...,3,Castle,Historical landmark,Historical place museum,Museum,Tourist attraction,Castle,Cecilia Figueroa,2025-08-27T03:40:13.279Z,en,,like history mexico general good idea come eve...,6
2,"Bosque de Chapultepec I Secc, Miguel Hidalgo, ...","family, experience, place, best, nice, clean, ...","Excellent place to visit with the family, enjo...",7,Castle,Historical landmark,Historical place museum,Museum,Tourist attraction,Castle,Luis Spirit,2025-08-26T23:49:24.207Z,en,,excellent place visit family enjoy history sce...,2
3,"Bosque de Chapultepec I Secc, Miguel Hidalgo, ...","nice, great, amazing, place, food, toronto, vi...",It's extremely touristy but worth it. Great vi...,8,Castle,Historical landmark,Historical place museum,Museum,Tourist attraction,Castle,Juliana Rosenhave,2025-08-26T22:59:59.030Z,,,extremely touristy worth great views city,6
4,"Bosque de Chapultepec I Secc, Miguel Hidalgo, ...","parking, de, casa, dishes, good, places, optio...",spectacular beyond words.,9,Castle,Historical landmark,Historical place museum,Museum,Tourist attraction,Castle,DANIEL,2025-08-26T21:57:10.257Z,en,,spectacular beyond words,4


In [69]:
cleaned_df.to_csv("../data/cleaned_scraped_reviews_with_topics.csv", index=False, encoding='latin1')

In [70]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# 1️⃣ Create TF-IDF matrix
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, min_df=10, ngram_range=(1,1))
tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_df['text'])

# 2️⃣ Fit NMF
n_topics = 15
nmf_model = NMF(n_components=n_topics, random_state=42)
nmf_model.fit(tfidf_matrix)

# 3️⃣ Transform reviews into topic space
topic_distributions = nmf_model.transform(tfidf_matrix)


In [71]:
threshold = 0.25
dominant_topics_list = [
    [i for i, p in enumerate(probs) if p >= threshold]
    for probs in topic_distributions
]

cleaned_df['dominant_topics'] = dominant_topics_list

In [None]:
# Suppose you have a dict like {0: ['food','restaurant',...], 1: [...], ...}
topic_words_dict = top_words

def topics_to_words(topic_nums):
    return [", ".join(topic_words_dict[i][:5]) for i in list(topic_nums)]


def topics_to_words_flat(topic_nums):
    words = []
    for i in list(topic_nums):
        words.extend(topic_words_dict[i][:5])
    return ", ".join(words)

cleaned_df['dominant_topics_words'] = cleaned_df['dominant_topics'].apply(topics_to_words_flat)

In [86]:
cleaned_df.head()


Unnamed: 0.1,address,topic,text,Unnamed: 0,categories/0,categories/1,categories/2,categories/3,categories/4,categoryName,name,publishedAtDate,translatedLanguage,category,cleaned_text,topic_number,dominant_topics,dominant_topics_words
0,"Bosque de Chapultepec I Secc, Miguel Hidalgo, ...","place, wonderful, visit, castle, great, time, ...","Learn about their history, customs, and how th...",1,Castle,Historical landmark,Historical place museum,Museum,Tourist attraction,Castle,Edgar,2025-08-27T11:31:55.424Z,en,,learn history customs lived,8,[],
1,"Bosque de Chapultepec I Secc, Miguel Hidalgo, ...","nice, great, amazing, place, food, toronto, vi...",If you like the history of Mexico or in genera...,3,Castle,Historical landmark,Historical place museum,Museum,Tourist attraction,Castle,Cecilia Figueroa,2025-08-27T03:40:13.279Z,en,,like history mexico general good idea come eve...,6,[],
2,"Bosque de Chapultepec I Secc, Miguel Hidalgo, ...","family, experience, place, best, nice, clean, ...","Excellent place to visit with the family, enjo...",7,Castle,Historical landmark,Historical place museum,Museum,Tourist attraction,Castle,Luis Spirit,2025-08-26T23:49:24.207Z,en,,excellent place visit family enjoy history sce...,2,[],
3,"Bosque de Chapultepec I Secc, Miguel Hidalgo, ...","nice, great, amazing, place, food, toronto, vi...",It's extremely touristy but worth it. Great vi...,8,Castle,Historical landmark,Historical place museum,Museum,Tourist attraction,Castle,Juliana Rosenhave,2025-08-26T22:59:59.030Z,,,extremely touristy worth great views city,6,[],
4,"Bosque de Chapultepec I Secc, Miguel Hidalgo, ...","parking, de, casa, dishes, good, places, optio...",spectacular beyond words.,9,Castle,Historical landmark,Historical place museum,Museum,Tourist attraction,Castle,DANIEL,2025-08-26T21:57:10.257Z,en,,spectacular beyond words,4,[],
