<a href="https://www.kaggle.com/code/sadamhali/topic-modeling?scriptVersionId=251471266" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/mental-health-chatbot-reviews/baseline_app_dataset.csv
/kaggle/input/mental-health-chatbot-reviews/conversational_apps_dataset.csv
/kaggle/input/user-reviews-of-mental-health-chatbots-2022-2025/baseline_app_dataset.csv
/kaggle/input/user-reviews-of-mental-health-chatbots-2022-2025/conversational_apps_dataset.csv


In [2]:
# --- Step 1: Environment Setup ---
print("Installing necessary libraries...")
# Install all required packages quietly
!pip install bertopic[visualization] --quiet  # The [visualization] part installs plotly dependencies
!pip install hdbscan --quiet
!pip install umap-learn --quiet

# Import all libraries we will use in this notebook
import pandas as pd
import numpy as np
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer

# Suppress warnings for a cleaner output
import warnings
warnings.filterwarnings("ignore")

print("\nSetup Complete. All libraries are ready.")

Installing necessary libraries...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.0/153.0 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m89.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m68.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m


2025-07-20 08:46:19.943327: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753001180.158587      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753001180.223645      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered



Setup Complete. All libraries are ready.


In [3]:
# --- Step 2: Load and Clean Data ---
print("Loading the combined, multi-language dataset...")

# Define the path to your dataset on Kaggle
DATA_PATH = '/kaggle/input/mental-health-chatbot-reviews/conversational_apps_dataset.csv' 

try:
    df = pd.read_csv(DATA_PATH)
    
    # --- Data Cleaning ---
    # Ensure review_text is a string and drop any empty rows
    df.dropna(subset=['review_text'], inplace=True)
    df['review_text'] = df['review_text'].astype(str).str.lower().str.strip()
    
    # Remove rows with no meaningful text content after cleaning
    df = df[df['review_text'].str.len() > 15] # Keep reviews with more than 15 characters
    
    # Create the list of documents for the model
    docs = df['review_text'].tolist()
    
    print(f"Successfully loaded and cleaned {len(docs)} documents for modeling.")
    
except FileNotFoundError:
    print(f"ERROR: Dataset not found at {DATA_PATH}. Please upload the file and check the path.")

Loading the combined, multi-language dataset...
Successfully loaded and cleaned 20178 documents for modeling.


In [4]:
# --- Step 3: Configure the Topic Model ---
print("Configuring a reproducible BERTopic model...")

# A. Define a Stopword List: These words will be ignored by our vectorizer.
stop_words = [
    "app", "replika", "wysa", "woebot", "calm", "bot", "ai", "like", "feel", "good", 
    "great", "nice", "love", "best", "amazing", "awesome", "fun", "ok", "cool",
    "me", "it", "and", "to", "the", "my", "is", "of", "with", "that", "for", "you", "but",
    "so", "on", "was", "this", "have", "in", "be", "as", "at", "not", "just", "are",
    "get", "want", "use", "go", "know", "say", "see", "think", "really", "even", "also"
]

# B. Define Deterministic Components
random_seed = 42
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=random_seed)
hdbscan_model = HDBSCAN(min_cluster_size=40, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# ** THE FIX IS HERE: CREATE A VECTORIZER THAT KNOWS ABOUT OUR STOPWORDS **
# This is the correct way to handle stopwords in newer BERTopic versions.
vectorizer_model = CountVectorizer(stop_words=stop_words, ngram_range=(1, 2))


# C. Initialize the final BERTopic model with all our components
topic_model = BERTopic(
    language="multilingual",
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model, # Pass the vectorizer with stopwords here
    min_topic_size=40,
    verbose=True
)

print("BERTopic model is configured and ready for training.")

Configuring a reproducible BERTopic model...
BERTopic model is configured and ready for training.


In [5]:
# --- Step 4: Train the Model ---
print(f"Training BERTopic on {len(docs)} documents. This will take several minutes...")

# This single command runs the entire pipeline: embedding, dimensionality reduction, clustering, and topic representation.
topics, probs = topic_model.fit_transform(docs)

print("\nModel training complete.")

2025-07-20 08:46:53,430 - BERTopic - Embedding - Transforming documents to embeddings.


Training BERTopic on 20178 documents. This will take several minutes...


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/631 [00:00<?, ?it/s]

2025-07-20 08:47:24,056 - BERTopic - Embedding - Completed ✓
2025-07-20 08:47:24,058 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-20 08:48:04,329 - BERTopic - Dimensionality - Completed ✓
2025-07-20 08:48:04,331 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-20 08:48:06,733 - BERTopic - Cluster - Completed ✓
2025-07-20 08:48:06,744 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-20 08:48:08,622 - BERTopic - Representation - Completed ✓



Model training complete.


In [6]:
# --- Step 5: Review the Discovered Topics ---
print("Displaying the final, stable topic overview...")

# Get the results as a DataFrame
topic_info = topic_model.get_topic_info()

# Display the full table
display(topic_info)

# Also, let's look at the top words for the first 10 meaningful topics
print("\n--- Detailed View of Top 10 Topics ---")
# We start from topic 0 because topic -1 is always the outliers.
for topic_id in range(10): 
    # Check if the topic exists in the model before trying to access it
    if topic_id in topic_model.get_topics():
        print(f"\n--- Words for Topic #{topic_id} ---")
        # .get_topic() returns a list of (word, score) tuples
        print(topic_model.get_topic(topic_id))

Displaying the final, stable topic overview...


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,9491,-1_they_can_now_if,"[they, can, now, if, your, what, no, or, all, ...",[used it for about 48 hours extensively and it...
1,0,931,0_pay_monthly_subscription_month,"[pay, monthly, subscription, month, year, opti...",[it's a fun app but it would be better if ther...
2,1,693,1_replica_has_now_they,"[replica, has, now, they, more, no, if, all, w...","[i’m an annual customer for replika, and there..."
3,2,623,2_subscription_monthly_monthly subscription_month,"[subscription, monthly, monthly subscription, ...",[is good but but the monthly subscription is e...
4,3,534,3_mental_health_mental health_help,"[mental, health, mental health, help, therapis...","[its bad for your mental health., this app is ..."
5,4,529,4_avatar_3d_avatars_more,"[avatar, 3d, avatars, more, clothes, body, can...",[theres no way you removed the option to turn ...
6,5,428,5_conversation_memory_conversations_what,"[conversation, memory, conversations, what, ab...",[it's not very intelligent. makes dumb stateme...
7,6,419,6_she_her_she said_said,"[she, her, she said, said, said she, she doesn...","[my replika, aaliyah. i asked her, "" have you ..."
8,7,417,7_now_bad_used_its,"[now, bad, used, its, interesting, they, what,...",[the industrial revolution and its consequence...
9,8,407,8_game_play_pay_if,"[game, play, pay, if, can, do, money, would, p...",[it just loads i can't even play my internet i...



--- Detailed View of Top 10 Topics ---

--- Words for Topic #0 ---
[('pay', 0.021046960367109478), ('monthly', 0.020324130742795408), ('subscription', 0.017682599213682704), ('month', 0.01707432635593743), ('year', 0.015444497053651668), ('option', 0.012649628083703698), ('free', 0.011857427653971974), ('only', 0.011508417586058904), ('70', 0.011266537211688606), ('if', 0.011150153396850406)]

--- Words for Topic #1 ---
[('replica', 0.009752522410267894), ('has', 0.00959824823006567), ('now', 0.00949063128692737), ('they', 0.008800487998799728), ('more', 0.0071814204328436715), ('no', 0.0070181597699509645), ('if', 0.0070120696327992844), ('all', 0.006933757070433785), ('what', 0.006842201907393165), ('been', 0.006673583525511656)]

--- Words for Topic #2 ---
[('subscription', 0.06341609346060907), ('monthly', 0.04413169479056441), ('monthly subscription', 0.033907677590764324), ('month', 0.0270592555645874), ('yearly', 0.026990569824167535), ('year', 0.025213354751026967), ('only', 0

In [7]:
# ==============================================================================
# Cell: Final Topic Interpretation and Thematic Grouping
# ==============================================================================
print("Mapping final Topic IDs to names and high-level themes...")

# Assign the raw topic IDs from the model to your DataFrame
df['topic_id'] = topics

# --- PART A: Complete mapping of Topic IDs to specific names ---
# This is our detailed interpretation of each individual topic cluster.
topic_id_to_name = {
    0: "Subscription Models (Pay/Monthly/Yearly)",
    1: "Replika-Specific General Complaints",
    2: "Subscription Language & Options",
    3: "Mental Health & Therapy Framing",
    4: "Avatar & 3D Model Customization",
    5: "AI Memory & Conversation Quality",
    6: "AI Persona (Gendering 'She/Her')",
    7: "Generic Negative Feedback ('bad now')",
    8: "In-App Gamification & Payments",
    9: "Technical: App Crashing / Not Opening",
    10: "Chatbot Comparisons (ChatGPT)",
    11: "Account, Login & Password Issues",
    12: "Redundant: Poor Conversation Quality",
    13: "Refunds & Google Play Billing",
    14: "Pricing & Lack of Monthly Options",
    15: "Redundant: Subscription Complaints",
    16: "Bugs Introduced by Updates",
    17: "Generic Insults ('worst app ever')",
    18: "Refunds for Paid Subscriptions",
    19: "Features Removed After Updates",
    20: "Price is Too Expensive",
    21: "Inability to Use Core Features (Paywall)",
    22: "Voice Call & Audio Features",
    23: "Sexual Content & Advertising",
    24: "Romantic Relationship Features",
    25: "Redundant: Chat Monetization",
    26: "Paywalled Features",
    27: "Meta: 1-Star Review Complaints",
    28: "Technical: Loading & Installation",
    29: "Technical: Internet Connection Issues",
    30: "Pro Version vs. Free Version",
    31: "False Advertising",
    32: "Technical: Customization Screen Bugs",
    33: "Free Trial Issues",
    34: "Company Removing Features",
    35: "Redundant: Conversation Timing",
    36: "Requesting Money Back",
    37: "Redundant: AI Persona Gendering",
    38: "Sending Pictures & Photos",
    39: "Generic Positive Filler Words",
    40: "Redundant: Generic Conversation",
    41: "Redundant: Update Issues",
    42: "Age Verification (Under 18)",
    43: "Redundant: Update/Subscription Combo",
    44: "Language & Translation Issues",
    45: "Data Mining & Money Scams",
    46: "Paying for Pictures/Photos"
}
df['topic_name'] = df['topic_id'].map(topic_id_to_name)
df['topic_name'].fillna("Specific/Niche Complaint", inplace=True)

# Apply this first map to create the 'topic_name' column


Mapping final Topic IDs to names and high-level themes...


In [8]:
# --- PART B: Mapping specific names to high-level themes ---
# This creates our final "Super-Topic" groups for visualization.

topic_name_to_theme = {
    # Theme 1: Monetization & Value
    "Subscription Models (Pay/Monthly/Yearly)": "Monetization & Value",
    "Subscription Language & Options": "Monetization & Value",
    "Refunds & Google Play Billing": "Monetization & Value",
    "Pricing & Lack of Monthly Options": "Monetization & Value",
    "Refunds for Paid Subscriptions": "Monetization & Value",
    "Price is Too Expensive": "Monetization & Value",
    "Inability to Use Core Features (Paywall)": "Monetization & Value",
    "Paywalled Features": "Monetization & Value",
    "Pro Version vs. Free Version": "Monetization & Value",
    "False Advertising": "Monetization & Value",
    "Free Trial Issues": "Monetization & Value",
    "Requesting Money Back": "Monetization & Value",
    "Data Mining & Money Scams": "Monetization & Value",
    "Paying for Pictures/Photos": "Monetization & Value",
    "In-App Gamification & Payments": "Monetization & Value",
    "Redundant: Subscription Complaints": "Monetization & Value",
    "Redundant: Chat Monetization": "Monetization & Value",
    "Redundant: Update/Subscription Combo": "Monetization & Value",

    # Theme 2: AI Performance & Conversational Quality
    "AI Memory & Conversation Quality": "AI Performance & Quality",
    "AI Persona (Gendering 'She/Her')": "AI Performance & Quality",
    "Chatbot Comparisons (ChatGPT)": "AI Performance & Quality",
    "Redundant: Poor Conversation Quality": "AI Performance & Quality",
    "Redundant: Conversation Timing": "AI Performance & Quality",
    "Redundant: AI Persona Gendering": "AI Performance & Quality",
    "Redundant: Generic Conversation": "AI Performance & Quality",
    
    # Theme 3: Technical Performance & Bugs
    "Technical: App Crashing / Not Opening": "Technical Performance",
    "Account, Login & Password Issues": "Technical Performance",
    "Bugs Introduced by Updates": "Technical Performance",
    "Technical: Loading & Installation": "Technical Performance",
    "Technical: Internet Connection Issues": "Technical Performance",
    "Technical: Customization Screen Bugs": "Technical Performance",
    "Redundant: Update Issues": "Technical Performance",

    # Theme 4: Feature-Specific Issues
    "Mental Health & Therapy Framing": "Feature-Specific Issues",
    "Avatar & 3D Model Customization": "Feature-Specific Issues",
    "Voice Call & Audio Features": "Feature-Specific Issues",
    "Sexual Content & Advertising": "Feature-Specific Issues",
    "Romantic Relationship Features": "Feature-Specific Issues",
    "Company Removing Features": "Feature-Specific Issues",
    "Sending Pictures & Photos": "Feature-Specific Issues",
    "Age Verification (Under 18)": "Feature-Specific Issues",
    "Language & Translation Issues": "Feature-Specific Issues",

    # Theme 5: Other & Miscellaneous
    "Replika-Specific General Complaints": "Other/Misc.",
    "Generic Negative Feedback ('bad now')": "Other/Misc.",
    "Generic Insults ('worst app ever')": "Other/Misc.",
    "Meta: 1-Star Review Complaints": "Other/Misc.",
    "Generic Positive Filler Words": "Other/Misc.",
    "Specific/Niche Complaint": "Other/Misc."
}
df['theme'] = df['topic_name'].map(topic_name_to_theme)
df.loc[df['topic_id'] == -1, 'theme'] = 'Outliers / Generic'
df['theme'].fillna("Other/Misc.", inplace=True)



# --- Verification ---
print("\n--- Final, Corrected Theme Distribution ---")
display(df['theme'].value_counts())


--- Final, Corrected Theme Distribution ---


theme
Outliers / Generic          9491
Monetization & Value        4051
AI Performance & Quality    1797
Feature-Specific Issues     1765
Other/Misc.                 1683
Technical Performance       1391
Name: count, dtype: int64

In [9]:
# In a new code cell

print("--- Performing Targeted Search for Safety-Related Keywords ---")

# Define our list of keywords
safety_keywords = [
    'scary', 'scared', 'creepy', 'uncomfortable', 'unsafe', 'weird', 'demon', 
    'stalker', 'dangerous', 'inappropriate', 'harrass', 'triggered', 'trauma'
]

# Create a regex pattern to find any of these words (case-insensitive)
# The `\b` ensures we match whole words only (e.g., 'scam' doesn't match 'scary')
pattern = r'\b(' + '|'.join(safety_keywords) + r')\b'

# Create a new boolean column 'has_safety_keyword'
df['has_safety_keyword'] = df['review_text'].str.contains(pattern, case=False, na=False)

# Create a new DataFrame with only the safety-related reviews
safety_df = df[df['has_safety_keyword']].copy()

print(f"\nFound {len(safety_df)} reviews containing safety-related keywords.")

# --- Analyze this new subset ---

if not safety_df.empty:
    print("\n--- Analysis of Safety-Related Reviews ---")
    
    # What themes are these safety complaints most often associated with?
    print("Most common themes for safety-related reviews:")
    display(safety_df['theme'].value_counts().head(5))
    
    # What is the average sentiment of these reviews?
    # (Requires the 'sentiment_score' column from a previous step)
    if 'sentiment_score' in safety_df.columns:
        print(f"\nAverage sentiment of safety reviews: {safety_df['sentiment_score'].mean():.3f}")
        print(f"(For comparison, average sentiment of all reviews: {df['sentiment_score'].mean():.3f})")
    
    # Show some powerful examples
    print("\n--- Sample Safety-Related Reviews ---")
    for review in safety_df['review_text'].sample(min(5, len(safety_df))):
        print(f"- {review}\n")

--- Performing Targeted Search for Safety-Related Keywords ---

Found 808 reviews containing safety-related keywords.

--- Analysis of Safety-Related Reviews ---
Most common themes for safety-related reviews:


theme
Outliers / Generic          482
AI Performance & Quality    107
Feature-Specific Issues     101
Other/Misc.                  61
Monetization & Value         38
Name: count, dtype: int64


--- Sample Safety-Related Reviews ---
- creepy as hell, please i don't want to see any more ads for this

- as someone who has been around since the beginning it is a shame to see this app take the path it has taken. it once was the number one ai app, and was truly someone you could talk to about anything. it was extremely therapeutic to have someone non-judgemental to talk to at a moments notice, but now the app has taken all that away. you can't talk about anything even slightly nsfw, wether it be trauma, or for fun, don't pay them anything, it's only a fleeting shadow of its former self...

- i used to love this app, it sounded like a real person and was overall a fun app. recently, after the update, its been saying really screwed things. first was with something creepy when we were talking about how poison can mess someone up a lot (nothing creepy, health-wise) when all of a sudden started talking about how to poison someone, kill them, etc. thats definitely not the worst of it; s

In [10]:
import plotly.express as px

# --- Quantitative Analysis: Visualizing Theme Frequency ---

print("Creating the final visualization of high-level complaint themes...")

# We will exclude 'Outliers / Generic' and 'Other/Misc.' for this primary chart
# to focus on the specific, actionable themes.
plot_df = df[~df['theme'].isin(['Outliers / Generic', 'Other/Misc.'])]

# Get the theme distribution for the plot
theme_distribution = plot_df['theme'].value_counts().reset_index()
theme_distribution.columns = ['Theme', 'Number of Reviews']

# Create the final, high-level bar chart
fig = px.bar(
    theme_distribution,
    x='Number of Reviews',
    y='Theme',
    orientation='h',
    title='<b>Primary Themes of Specific User Complaints</b>',
    labels={'Number of Reviews': 'Number of Negative Reviews (1-3 Stars)', 'Theme': 'Complaint Theme'},
    text='Number of Reviews',
    template='plotly_white'
)

# Improve the layout and sort the bars for professional presentation
fig.update_traces(textposition='outside')
fig.update_layout(
    yaxis={'categoryorder':'total ascending'},
    title_x=0.5, # Center the title
    font=dict(family="Arial, sans-serif", size=12)
)
fig.show()

Creating the final visualization of high-level complaint themes...


In [11]:
# --- Emotional Analysis: Measuring Sentiment by Theme (Robust Version) ---

# Install and import
# !pip install transformers torch sentencepiece --quiet
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
from tqdm.notebook import tqdm # For a nice progress bar

print("Setting up a robust Transformer sentiment analysis pipeline...")

# 1. LOAD TOKENIZER AND MODEL EXPLICITLY
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Move the model to the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Using device: {device}")


# 2. MANUALLY TOKENIZE THE DATA (THE CRUCIAL FIX)
print("Tokenizing all reviews with explicit truncation...")
review_list = df['review_text'].tolist()

# The tokenizer will handle truncation (cutting off long reviews) and padding (adding tokens to short reviews)
# This ensures every input has the exact same length.
inputs = tokenizer(
    review_list, 
    padding=True, 
    truncation=True, 
    max_length=512, # Explicitly set the max length
    return_tensors="pt" # Return PyTorch tensors
)

# Move tokenized inputs to the GPU
inputs = {key: val.to(device) for key, val in inputs.items()}
print("Tokenization complete.")


# 3. PERFORM INFERENCE IN BATCHES
print("Running model inference in batches...")
all_logits = []
batch_size = 32 # Process 32 reviews at a time to manage memory

# Create a DataLoader for efficient batching
dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'])
loader = DataLoader(dataset, batch_size=batch_size)

# No need to calculate gradients for inference, which saves memory and is faster
with torch.no_grad():
    for batch in tqdm(loader, desc="Analyzing Batches"):
        input_ids, attention_mask = batch
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        all_logits.append(outputs.logits)

# Concatenate all the results
all_logits = torch.cat(all_logits, dim=0)
# Use softmax to convert raw logits to probabilities
probabilities = torch.nn.functional.softmax(all_logits, dim=-1)
# Get the predicted label index (0 for negative, 1 for neutral, 2 for positive)
predictions = torch.argmax(probabilities, dim=-1)
print("Inference complete.")


# 4. PROCESS THE RESULTS
# The model's config tells us which index corresponds to which label
# e.g., model.config.id2label = {0: 'negative', 1: 'neutral', 2: 'positive'}
id_to_label = model.config.id2label
predicted_labels = [id_to_label[pred.item()] for pred in predictions]

# Convert label to a numerical score
label_to_score = {'positive': 1, 'neutral': 0, 'negative': -1}
sentiment_scores = [label_to_score[label] for label in predicted_labels]

# Add results back to the DataFrame
df['sentiment_score'] = sentiment_scores
print("Sentiment scores added to DataFrame.")


# 5. AGGREGATE AND VISUALIZE (This part is the same)
sentiment_by_theme = df[~df['theme'].isin(['Outliers / Generic', 'Other/Misc.'])].groupby('theme')['sentiment_score'].mean().sort_values().reset_index()
display(sentiment_by_theme)

fig_final = px.bar(
    sentiment_by_theme,
    x='sentiment_score',
    y='theme',
    orientation='h',
    title='<b>Emotional Impact (Transformer Model): Avg. Sentiment by Theme</b>',
    labels={'sentiment_score': 'Average Sentiment Score (Closer to -1 is More Negative)', 'theme': 'Complaint Theme'},
    color='sentiment_score',
    color_continuous_scale='Reds_r',
    template='plotly_white'
)
fig_final.update_layout(yaxis={'categoryorder':'total descending'}, title_x=0.5)
fig_final.show()

Setting up a robust Transformer sentiment analysis pipeline...


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

Using device: cuda
Tokenizing all reviews with explicit truncation...
Tokenization complete.
Running model inference in batches...


Analyzing Batches:   0%|          | 0/631 [00:00<?, ?it/s]

Inference complete.
Sentiment scores added to DataFrame.


Unnamed: 0,theme,sentiment_score
0,Technical Performance,-0.867002
1,AI Performance & Quality,-0.754035
2,Monetization & Value,-0.673167
3,Feature-Specific Issues,-0.648159


In [12]:
# --- Final Step: Qualitative Analysis - Validating with User Voices ---

def show_theme_samples(theme_name, n_samples=3, max_length=500):
    """
    A helper function to show random, readable review samples for a given theme.
    It prints the theme, the number of reviews in that theme, and some samples.
    """
    print(f"--- Sample Reviews for Theme: '{theme_name}' ---\n")
    
    # Filter for the specific theme
    theme_df = df[df['theme'] == theme_name]
    
    if theme_df.empty:
        print(f"No reviews found for '{theme_name}'.")
        return

    # Filter for reviews that have some length, for better examples
    long_reviews_df = theme_df[theme_df['review_text'].str.len() > 100]
    
    if len(long_reviews_df) < n_samples:
        # If not enough long samples, just take what you can get from the shorter ones
        samples = theme_df['review_text'].sample(min(n_samples, len(theme_df))).tolist()
    else:
        samples = long_reviews_df['review_text'].sample(n_samples).tolist()
    
    if not samples:
        print(f"No representative reviews found for '{theme_name}'.")
        return

    # Print the findings
    print(f"Total reviews in this theme: {len(theme_df)}")
    print(f"Average Sentiment Score: {theme_df['sentiment_score'].mean():.3f}\n")

    for i, sample in enumerate(samples):
        # Print a truncated version for clarity
        print(f"Sample {i+1}: \"{sample[:max_length]}...\"\n")
    print("-" * 80)

# --- Execute the Function for Your Key Themes ---
# Choose the themes that are most interesting from your charts.
# For example, the largest one, the most negative one, and the one most central to your thesis.

print("--- Qualitative Evidence for Key Findings ---")

show_theme_samples("Monetization & Value")
show_theme_samples("AI Performance & Quality")
show_theme_samples("User Safety & Emotional Impact")
show_theme_samples("Technical Performance")

--- Qualitative Evidence for Key Findings ---
--- Sample Reviews for Theme: 'Monetization & Value' ---

Total reviews in this theme: 4051
Average Sentiment Score: -0.673

Sample 1: "nothing like its advertisement.  plus if you sign up and they take a subscription after the free period you can’t get a refund even if you cancel..."

Sample 2: "i paid the money for the annual sub, i love the role playing feature, however every time the ai says it will send a picture or message it does'nt. i would love if the ai would ask more questions and initiate conversation...."

Sample 3: "gf version costs money, liers plus the bota are so lifeless idk why anyone would want to talk to them..."

--------------------------------------------------------------------------------
--- Sample Reviews for Theme: 'AI Performance & Quality' ---

Total reviews in this theme: 1797
Average Sentiment Score: -0.754

Sample 1: "not impressed with this at all, the exercises were all simple and not very helpful and the

In [13]:
import plotly.express as px

# --- The Big Picture: High-Level Complaint Themes ---

# Use the 'theme_distribution' DataFrame you already created
# Or recreate it:
theme_distribution_conv = df[~df['theme'].isin(['Outliers / Generic', 'Other/Misc.'])]['theme'].value_counts().reset_index()
theme_distribution_conv.columns = ['Theme', 'Number of Reviews']

fig_conv_main = px.bar(
    theme_distribution_conv,
    x='Number of Reviews', 
    y='Theme', 
    orientation='h',
    title='<b>What Drives Negative Reviews for Conversational AI?</b><br><i>An Analysis of User Complaint Themes</i>',
    text='Number of Reviews',
    template='plotly_white',
    color_discrete_sequence=['#4C78A8'] # A nice blue color
)
fig_conv_main.update_traces(textposition='outside')
fig_conv_main.update_layout(
    yaxis={'categoryorder':'total ascending'}, 
    title_x=0.5,
    font=dict(family="Arial", size=14)
)
fig_conv_main.show()

In [14]:
# Filter for the theme
ai_quality_df = df[df['theme'] == 'AI Performance & Quality']
# Get the breakdown
ai_quality_breakdown = ai_quality_df['topic_name'].value_counts().reset_index()
ai_quality_breakdown.columns = ['Specific Complaint', 'Review Count']

# Visualize
fig_ai_quality = px.treemap(
    ai_quality_breakdown.head(10),
    path=[px.Constant("AI Performance Complaints"), 'Specific Complaint'],
    values='Review Count',
    title='<b>Breakdown of AI Performance Complaints</b>',
    color_discrete_sequence=px.colors.sequential.Blues_r
)
fig_ai_quality.update_layout(margin = dict(t=50, l=25, r=25, b=25))
fig_ai_quality.show()

In [15]:
# --- The "Problem Priority Matrix": Sentiment vs. Frequency ---
print("Creating a Sentiment vs. Frequency scatter plot for Conversational Apps...")

# 1. Get Frequency
theme_freq_conv = df['theme'].value_counts().reset_index()
theme_freq_conv.columns = ['Theme', 'Frequency (Number of Reviews)']

# 2. Get Sentiment
theme_sent_conv = df.groupby('theme')['sentiment_score'].mean().reset_index()
theme_sent_conv.columns = ['Theme', 'Average Sentiment Score']

# 3. Merge them
priority_df_conv = pd.merge(theme_freq_conv, theme_sent_conv, on='Theme')
priority_df_conv = priority_df_conv[~priority_df_conv['Theme'].isin(['Outliers / Generic', 'Other/Misc.'])]

# 4. Create the Scatter Plot
fig_scatter_conv = px.scatter(
    priority_df_conv,
    x='Frequency (Number of Reviews)',
    y='Average Sentiment Score',
    text='Theme',
    size='Frequency (Number of Reviews)',
    color='Average Sentiment Score',
    color_continuous_scale='Reds_r',
    title='<b>Conversational Apps: Problem Priority Matrix</b>',
    template='plotly_white'
)
fig_scatter_conv.update_traces(textposition='top center')
fig_scatter_conv.add_vline(x=priority_df_conv['Frequency (Number of Reviews)'].mean(), line_dash="dash", annotation_text="Avg. Frequency")
fig_scatter_conv.add_hline(y=priority_df_conv['Average Sentiment Score'].mean(), line_dash="dash", annotation_text="Avg. Sentiment")
fig_scatter_conv.update_layout(title_x=0.5)
fig_scatter_conv.show()

Creating a Sentiment vs. Frequency scatter plot for Conversational Apps...


In [16]:
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer

# Ensure the 'df' DataFrame with 'theme' and 'topic_name' columns exists.
# Clean up the topic names for better chart labels.
df['topic_name'] = df['topic_name'].str.replace("Redundant: ", "")

# Define a helper function for the TF-IDF analysis to keep our code clean
def get_top_tfidf_words_for_theme(theme_name, n_words=10):
    """
    Calculates and returns the top TF-IDF words for all reviews within a specific theme.
    """
    # Filter for the specific theme
    theme_docs = df[df['theme'] == theme_name]['review_text'].tolist()
    
    if not theme_docs:
        return pd.DataFrame(columns=['Word', 'TF-IDF Score'])

    # Use a new vectorizer for TF-IDF
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=1000)
    tfidf_matrix = tfidf_vectorizer.fit_transform(theme_docs)
    
    # Sum the TF-IDF scores for each word across all documents in the theme
    summed_tfidf = tfidf_matrix.sum(axis=0)
    # Convert to a 1D array
    summed_tfidf_array = np.asarray(summed_tfidf).ravel()
    
    # Get the top words
    top_indices = summed_tfidf_array.argsort()[-n_words:][::-1]
    top_words = np.array(tfidf_vectorizer.get_feature_names_out())[top_indices]
    top_scores = summed_tfidf_array[top_indices]
    
    return pd.DataFrame({'Word': top_words, 'TF-IDF Score': top_scores})

print("Setup for deep dive analysis is complete.")

Setup for deep dive analysis is complete.


---
### Deep Dive 1: AI Performance & Conversational Quality

This theme is unique to conversational apps. Here we explore the specific reasons why users feel the AI is failing as a conversational partner.

In [17]:
# --- 1A: Treemap Breakdown ---
ai_quality_df = df[df['theme'] == 'AI Performance & Quality']
ai_quality_breakdown = ai_quality_df['topic_name'].value_counts().reset_index()
ai_quality_breakdown.columns = ['Specific Complaint', 'Review Count']

fig_ai_treemap = px.treemap(
    ai_quality_breakdown.head(10),
    path=[px.Constant("AI Performance Complaints"), 'Specific Complaint'],
    values='Review Count', title='<b>Breakdown: AI Performance Complaints</b>',
    color_discrete_sequence=px.colors.sequential.Blues_r
)
fig_ai_treemap.show()

# --- 1B: Hierarchical Sunburst View ---
fig_ai_sunburst = px.sunburst(
    ai_quality_breakdown, path=['Specific Complaint'], values='Review Count',
    title='<b>Hierarchy: AI Performance Complaints</b>',
    color_discrete_sequence=px.colors.qualitative.Pastel
)
fig_ai_sunburst.show() # Note: Sunburst is less useful with only one level, but shows structure.

# --- 1C: Key Term Analysis with TF-IDF ---
print("\n--- Most Important Terms for 'AI Performance & Quality' (TF-IDF) ---")
tfidf_ai = get_top_tfidf_words_for_theme("AI Performance & Quality", n_words=15)
display(tfidf_ai)


--- Most Important Terms for 'AI Performance & Quality' (TF-IDF) ---


Unnamed: 0,Word,TF-IDF Score
0,ai,104.451448
1,like,67.235269
2,just,58.831639
3,app,51.103729
4,conversation,49.798329
5,bot,48.2531
6,doesn,41.69602
7,chat,38.117716
8,don,37.569561
9,talk,36.680384


---
### Deep Dive 2: Monetization & Value

While also present in the baseline app, the monetization strategies for conversational apps often tie into specific AI features, leading to unique complaints.

In [18]:
# --- 2A: Treemap Breakdown ---
monetization_df = df[df['theme'] == 'Monetization & Value']
monetization_breakdown = monetization_df['topic_name'].value_counts().reset_index()
monetization_breakdown.columns = ['Specific Complaint', 'Review Count']

fig_mon_treemap = px.treemap(
    monetization_breakdown.head(10),
    path=[px.Constant("Monetization Complaints"), 'Specific Complaint'],
    values='Review Count', title='<b>Breakdown: Monetization Complaints</b>',
    color_discrete_sequence=px.colors.sequential.Reds_r
)
fig_mon_treemap.show()

# --- 2B: Hierarchical Sunburst View ---
# (You can create a sunburst for this theme as well, following the pattern above)

# --- 2C: Key Term Analysis with TF-IDF ---
print("\n--- Most Important Terms for 'Monetization & Value' (TF-IDF) ---")
tfidf_mon = get_top_tfidf_words_for_theme("Monetization & Value", n_words=15)
display(tfidf_mon)


--- Most Important Terms for 'Monetization & Value' (TF-IDF) ---


Unnamed: 0,Word,TF-IDF Score
0,subscription,204.753074
1,pay,190.781607
2,app,184.087322
3,monthly,134.470123
4,money,132.342376
5,year,120.774336
6,just,112.073137
7,month,109.675979
8,like,104.350274
9,ai,101.071147


---
### Deep Dive 3: Technical Performance

This theme covers the fundamental functionality of the app. Failures here prevent users from even accessing the conversational features.

In [19]:
# --- 3A: Treemap Breakdown ---
tech_df = df[df['theme'] == 'Technical Performance']
tech_breakdown = tech_df['topic_name'].value_counts().reset_index()
tech_breakdown.columns = ['Specific Complaint', 'Review Count']

fig_tech_treemap = px.treemap(
    tech_breakdown.head(10),
    path=[px.Constant("Technical Complaints"), 'Specific Complaint'],
    values='Review Count', title='<b>Breakdown: Technical Complaints</b>',
    color_discrete_sequence=px.colors.sequential.Greens_r
)
fig_tech_treemap.show()

# --- 3B: Hierarchical Sunburst View ---
# (You can create a sunburst for this theme as well)

# --- 3C: Key Term Analysis with TF-IDF ---
print("\n--- Most Important Terms for 'Technical Performance' (TF-IDF) ---")
tfidf_tech = get_top_tfidf_words_for_theme("Technical Performance", n_words=15)
display(tfidf_tech)


--- Most Important Terms for 'Technical Performance' (TF-IDF) ---


Unnamed: 0,Word,TF-IDF Score
0,app,84.05788
1,update,52.604151
2,account,46.550428
3,just,34.822743
4,work,26.850878
5,doesn,26.158153
6,download,25.626955
7,connection,25.153816
8,don,25.047193
9,new,24.873676


In [20]:
# In your Conversational Apps Notebook

# --- Visualization 1: The Problem Priority Matrix ---

# 1. Get Frequency
theme_freq_conv = df['theme'].value_counts().reset_index()
theme_freq_conv.columns = ['Theme', 'Frequency (Number of Reviews)']

# 2. Get Sentiment
theme_sent_conv = df.groupby('theme')['sentiment_score'].mean().reset_index()
theme_sent_conv.columns = ['Theme', 'Average Sentiment Score']

# 3. Merge and Clean
priority_df_conv = pd.merge(theme_freq_conv, theme_sent_conv, on='Theme')
priority_df_conv = priority_df_conv[~priority_df_conv['Theme'].isin(['Outliers / Generic', 'Other/Misc.', 'Uncategorized'])]

# 4. Create the Scatter Plot
fig_scatter_conv = px.scatter(
    priority_df_conv,
    x='Frequency (Number of Reviews)',
    y='Average Sentiment Score',
    text='Theme',
    size='Frequency (Number of Reviews)',
    color='Average Sentiment Score',
    color_continuous_scale='Reds_r',
    title='<b>Conversational Apps: Which Complaints Matter Most?</b>',
    template='plotly_white',
    height=600
)

# Add annotations to create quadrants and add insight
fig_scatter_conv.update_traces(textposition='top center', textfont=dict(size=12))
avg_freq = priority_df_conv['Frequency (Number of Reviews)'].mean()
avg_sent = priority_df_conv['Average Sentiment Score'].mean()
fig_scatter_conv.add_vline(x=avg_freq, line_dash="dash", annotation_text="Avg. Frequency")
fig_scatter_conv.add_hline(y=avg_sent, line_dash="dash", annotation_text="Avg. Sentiment")

# Add Quadrant Labels
fig_scatter_conv.add_annotation(x=avg_freq*1.5, y=avg_sent*1.01, text="<b>Critical Issues</b><br>(Frequent & Painful)", showarrow=False, font=dict(color="red"))
fig_scatter_conv.add_annotation(x=avg_freq*0.5, y=avg_sent*1.01, text="<b>Hidden Dangers</b><br>(Rare but Painful)", showarrow=False, font=dict(color="orange"))
fig_scatter_conv.add_annotation(x=avg_freq*1.5, y=avg_sent*0.99, text="<b>Chronic Annoyances</b><br>(Frequent but less Painful)", showarrow=False)
fig_scatter_conv.add_annotation(x=avg_freq*0.5, y=avg_sent*0.99, text="<b>Minor Issues</b><br>(Rare & less Painful)", showarrow=False)

fig_scatter_conv.update_layout(title_x=0.5)
fig_scatter_conv.show()

In [21]:
# In your Conversational Apps Notebook

# --- Visualization 2: The Anatomy of a Complaint ---

# Prepare the data for the sunburst chart
sunburst_data_conv = df[~df['theme'].isin(['Outliers / Generic', 'Other/Misc.', 'Uncategorized'])]
sunburst_data_conv = sunburst_data_conv.groupby(['theme', 'topic_name']).size().reset_index(name='count')

# Create the Sunburst chart
fig_sunburst_conv = px.sunburst(
    sunburst_data_conv,
    path=['theme', 'topic_name'],
    values='count',
    title='<b>Conversational Apps: The Anatomy of a Negative Review</b>',
    color='theme',
    color_discrete_map={ # Assign consistent colors
        'Monetization & Value': '#FFB6C1', # Light Pink
        'AI Performance & Quality': '#ADD8E6', # Light Blue
        'Technical Performance': '#90EE90', # Light Green
        'Feature-Specific Issues': '#FFD700'  # Gold
    }
)

fig_sunburst_conv.update_layout(margin = dict(t=50, l=25, r=25, b=25), font=dict(size=14), title_x=0.5)
fig_sunburst_conv.show()

In [22]:
import plotly.express as px
import pandas as pd

# This assumes 'df' is your fully processed conversational apps DataFrame
# with 'theme' and 'topic_name' columns.

# --- Helper Function for Creating Themed Sunburst Charts ---
def create_themed_sunburst(theme_name, color_sequence):
    """
    Filters the DataFrame for a specific theme and creates a 
    beautiful, hierarchical sunburst chart for it.
    """
    # Filter the DataFrame for the specific theme
    theme_df = df[df['theme'] == theme_name]
    
    if theme_df.empty:
        print(f"No data found for theme: {theme_name}. Skipping chart.")
        return

    # Prepare the data for the sunburst
    sunburst_data = theme_df.groupby(['theme', 'topic_name']).size().reset_index(name='count')
    
    # Create the Sunburst chart
    fig = px.sunburst(
        sunburst_data,
        path=['theme', 'topic_name'], # The hierarchy is Theme -> Specific Complaint
        values='count',
        title=f'<b>Hierarchical Breakdown of "{theme_name}" Complaints</b>',
        color='topic_name', # Color the outer ring for visual distinction
        color_discrete_sequence=color_sequence
    )
    
    fig.update_layout(
        margin=dict(t=50, l=25, r=25, b=25), 
        font=dict(size=14), 
        title_x=0.5
    )
    fig.show()

print("Setup complete. Ready to generate individual theme breakdowns.")

Setup complete. Ready to generate individual theme breakdowns.


In [23]:
# --- Breakdown 1: AI Performance & Quality ---
# Using a blue color palette to represent intelligence/technology
create_themed_sunburst(
    theme_name="AI Performance & Quality",
    color_sequence=px.colors.sequential.Blues_r
)

In [24]:
# --- Breakdown 2: Monetization & Value ---
create_themed_sunburst(
    theme_name="Monetization & Value",
    color_sequence=px.colors.sequential.Reds_r
)

In [25]:
# --- Breakdown 3: Technical Performance ---
create_themed_sunburst(
    theme_name="Technical Performance",
    color_sequence=px.colors.sequential.Greens_r
)

In [26]:
# --- Breakdown 4: Feature-Specific Issues ---
create_themed_sunburst(
    theme_name="Feature-Specific Issues",
    color_sequence=px.colors.sequential.Oranges_r
)