In [69]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
from sklearn.cluster import KMeans
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from umap import UMAP
import re
import os
from dotenv import load_dotenv
import openai
from openai import OpenAI

In [22]:
nltk.download("stopwords")
stop_words = set(stopwords.words("english")).union(ENGLISH_STOP_WORDS)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Galyna\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
df = pd.read_csv('data/taxi_data_clean.csv')

In [27]:
custom_stopwords = {'taxidrive', 'taxi', 'drive', 'taxiservice', 'will', 'driver'}
stop_words = set(stopwords.words("english")).union(ENGLISH_STOP_WORDS).union(custom_stopwords)

In [28]:
def preprocess(text):
    text = re.sub(r"[^\w\s]", "", text.lower())  # remove punctuation, lowercase
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)

In [29]:
df["Clean_Reviews"] = df["Reviews"].apply(preprocess)

In [57]:
kmeans = KMeans(n_clusters=6)
topic_model = BERTopic(hdbscan_model=kmeans)

In [61]:
topics, probs = topic_model.fit_transform(df['Clean_Reviews'])
topic_info = topic_model.get_topic_info()

In [80]:
def generate_prompt_for_topic(topic_row):
    topic_id = topic_row['Topic']
    keywords = topic_row['Representation']
    return f"""Give a short, clear topic name for a group of user reviews based on the following top keywords:
    
Top Keywords: {keywords}

Make it sound natural, like a dashboard label. You may include 1 emoji if it adds clarity. Return only the topic name."""

In [70]:
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [83]:
def get_topic_name(prompt):
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {
                "role": "system",
                "content": (
                    "You are a product analyst summarizing customer feedback topics for an internal dashboard. "
                    "Generate short, clear, human-readable topic names. "
                    "Use simple wording that reflects the complaint or experience. "
                    "Optionally include a relevant emoji if appropriate (but only one)."
                )
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        temperature=0.5,
        max_tokens=25,
    )
    return response.choices[0].message.content.strip()

In [84]:
# Generate new names
topic_info['Topic_Name'] = topic_info.apply(
    lambda row: get_topic_name(generate_prompt_for_topic(row)), axis=1
)

In [96]:
review_counts = df['Topic'].value_counts().reset_index()
review_counts.columns = ['Topic', 'Review_Count']

merged_df = pd.merge(
    topic_sentiment_counts_reset,  # <- replace ambiguous reference
    topic_info[["Topic", "Topic_Name"]],
    on="Topic",
    how="left"
)

# Then merge in review counts
merged_df = pd.merge(merged_df, review_counts, on="Topic", how="left")
merged_df

Unnamed: 0,Topic,Mixed,Negative,Positive,Topic_Name,Review_Count
0,0,0.022277,0.957921,0.019802,"""Complaints about Trip Cancellations and Unexpected Charges 🚖""",404
1,1,0.049275,0.924638,0.026087,"""Customer Service & Account Management Issues 📱""",345
2,2,0.032738,0.89881,0.068452,"""Complaints about Ride Charges and Pricing 💰""",336
3,3,0.042813,0.944954,0.012232,"""Driver Behavior and Car Experience 🚗""",327
4,4,0.191223,0.291536,0.517241,"""Customer Experience with Drivers and Service Quality 🚖""",319
5,5,0.038835,0.906149,0.055016,"""Gift Card Payment Issues 🎁""",309
