In [1]:
import os
import pandas as pd
from google_play_scraper import reviews, Sort
from tqdm import tqdm
import time

# Configuration
NUM_REVIEWS = 5000
LANG = "en"
COUNTRY = "us"
BASE_DIR = "data"  # Folder to store all apps
REQUEST_DELAY_SECONDS = 2  # Pause after every scrape attempt (success or failure)

# Scraper field name -> column name written to reviews.csv (order is preserved)
REVIEW_COLUMNS = {
    "reviewId": "id",
    "userName": "user",
    "content": "text",
    "score": "rating",
    "thumbsUpCount": "likes",
    "reviewCreatedVersion": "app_version",
    "at": "date",
}

# Load your top apps list
apps_df = pd.read_csv(os.path.join(BASE_DIR, "top_apps.csv"))  # Must have 'app_id' column

# Ensure base data directory exists
os.makedirs(BASE_DIR, exist_ok=True)

for _, row in tqdm(apps_df.iterrows(), total=len(apps_df), desc="Scraping apps"):
    app_id = row["app_id"]
    app_folder = os.path.join(BASE_DIR, app_id)
    os.makedirs(app_folder, exist_ok=True)

    # The presence of reviews.csv is the "already scraped" marker, so it must
    # only ever appear once it is complete (see the atomic rename below).
    out_path = os.path.join(app_folder, "reviews.csv")
    if os.path.exists(out_path):
        print(f"✔️  Skipping {app_id} (already exists)")
        continue

    try:
        result, _ = reviews(
            app_id,
            lang=LANG,
            country=COUNTRY,
            sort=Sort.NEWEST,
            count=NUM_REVIEWS,
            filter_score_with=None
        )
        if result:
            df = pd.DataFrame(result)
            df = df[list(REVIEW_COLUMNS)].rename(columns=REVIEW_COLUMNS)

            # Write to a temporary file and rename it into place: an interrupted
            # write would otherwise leave a truncated reviews.csv that every
            # later run skips as "already exists".
            tmp_path = out_path + ".tmp"
            df.to_csv(tmp_path, index=False)
            os.replace(tmp_path, out_path)
            print(f"✅ Saved {len(df)} reviews for {app_id}")
        else:
            # An empty result has none of the expected columns; report it
            # directly instead of letting the column selection raise KeyError.
            print(f"⚠️  No reviews returned for {app_id}")
    except Exception as e:
        print(f"❌ Error scraping {app_id}: {e}")

    # Throttle after failures as well, so an error is not followed by an
    # immediate request for the next app.
    time.sleep(REQUEST_DELAY_SECONDS)


Scraping apps:   0%|                                     | 0/17 [00:00<?, ?it/s]

✅ Saved 5000 reviews for com.roblox.client


Scraping apps:   6%|█▋                           | 1/17 [00:05<01:25,  5.33s/it]

✅ Saved 5000 reviews for com.mojang.minecraftpe


Scraping apps:  12%|███▍                         | 2/17 [00:10<01:17,  5.19s/it]

✅ Saved 5000 reviews for com.king.candycrushsaga


Scraping apps:  18%|█████                        | 3/17 [00:15<01:10,  5.05s/it]

✅ Saved 5000 reviews for com.outfit7.mytalkingtom2


Scraping apps:  24%|██████▊                      | 4/17 [00:20<01:08,  5.30s/it]

✅ Saved 5000 reviews for com.outfit7.mytalkingtomfriends


Scraping apps:  29%|████████▌                    | 5/17 [00:25<01:00,  5.08s/it]

✅ Saved 5000 reviews for com.tocaboca.tocalifeworld


Scraping apps:  35%|██████████▏                  | 6/17 [00:31<00:57,  5.27s/it]

✅ Saved 5000 reviews for org.scratchjr.android


Scraping apps:  41%|███████████▉                 | 7/17 [00:36<00:50,  5.08s/it]

✅ Saved 5000 reviews for com.getepic.Epic


Scraping apps:  47%|█████████████▋               | 8/17 [00:40<00:45,  5.02s/it]

✅ Saved 5000 reviews for org.pbskids.gamesapp


Scraping apps:  53%|███████████████▎             | 9/17 [00:45<00:39,  4.97s/it]

✅ Saved 5000 reviews for com.rvappstudios.abc_kids_toddler_tracing_phonics


Scraping apps:  59%|████████████████▍           | 10/17 [00:51<00:36,  5.15s/it]

✅ Saved 5000 reviews for com.sinyee.babybus.restaurant


Scraping apps:  65%|██████████████████          | 11/17 [00:56<00:30,  5.13s/it]

✅ Saved 5000 reviews for com.miniclip.carrom


Scraping apps:  71%|███████████████████▊        | 12/17 [01:00<00:24,  4.94s/it]

✅ Saved 5000 reviews for com.imangi.templerun2


Scraping apps:  76%|█████████████████████▍      | 13/17 [01:05<00:19,  4.86s/it]

✅ Saved 5000 reviews for com.google.android.apps.youtube.kids


Scraping apps:  82%|███████████████████████     | 14/17 [01:10<00:14,  4.89s/it]

✅ Saved 1201 reviews for com.future.HappyKids


Scraping apps:  88%|████████████████████████▋   | 15/17 [01:13<00:08,  4.39s/it]

✅ Saved 627 reviews for com.abqappsource.childgrowthtracker


Scraping apps:  94%|██████████████████████████▎ | 16/17 [01:16<00:03,  3.84s/it]

✅ Saved 5000 reviews for org.khankids.android


Scraping apps: 100%|████████████████████████████| 17/17 [01:20<00:00,  4.76s/it]


In [2]:
import os
from openai import OpenAI
from dotenv import load_dotenv

# Pull variables from a .env file into the process environment, if present
load_dotenv()

# Read the API key and fail fast when it is absent or empty
api_key = os.environ.get("OPENAI_API_KEY")
if api_key is None or api_key == "":
    raise ValueError("Missing OpenAI API key. Set OPENAI_API_KEY in your environment or .env file.")

# Build the OpenAI client with the validated key
client = OpenAI(api_key=api_key)

In [3]:
import pandas as pd
import os
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import pipeline
import torch
from tqdm import tqdm
import textstat
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from openai import OpenAI

# Global models: built once here and shared by the functions defined below

# Sentiment scorer used for the "vader_sentiment" column
analyzer = SentimentIntensityAnalyzer()

# Emotion classifier; top_k=1 asks for a single label per input text
emotion_pipeline = pipeline(
    task="text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    top_k=1,
    model_kwargs=dict(torch_dtype=torch.float32),
)

# Sentence encoder passed to BERTopic as its embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# NOTE(review): this rebinds `client`, which the previous cell already created
# with an explicit api_key. Constructed without arguments here, so it presumably
# picks the key up from the environment - confirm before removing either one.
client = OpenAI()

def batch_emotion_labels(texts, batch_size=32):
    """Classify the emotion of each text, returning one label per input.

    Texts are sent to the module-level ``emotion_pipeline`` in slices of
    ``batch_size``. The returned list always has exactly ``len(texts)``
    entries, in input order, so it can be assigned straight to a DataFrame
    column.

    Args:
        texts: Sequence of strings to classify (must support slicing).
        batch_size: Number of texts passed to the pipeline per call.

    Returns:
        list[str]: The predicted label for each text. Every text in a batch
        that fails (pipeline error, malformed prediction, or a prediction
        count that does not match the batch) is labelled ``"error"``.
    """
    results = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Emotion Classification"):
        batch = texts[i:i+batch_size]
        try:
            batch_preds = emotion_pipeline(batch, truncation=True)
            # Collect into a local list first: appending straight to `results`
            # would leave partial labels behind if a later prediction in the
            # same batch is malformed, making the output longer than the input.
            batch_labels = [
                # Predictions arrive either as [{"label": ...}] or {"label": ...}
                (pred[0] if isinstance(pred, list) else pred)['label']
                for pred in batch_preds
            ]
            if len(batch_labels) != len(batch):
                raise ValueError(
                    f"expected {len(batch)} predictions, got {len(batch_labels)}"
                )
        except Exception as e:
            print(f"Error on batch {i}: {e}")
            batch_labels = ["error"] * len(batch)
        results.extend(batch_labels)
    return results

def suggest_label(topic_num, top_words, sample_reviews):
    """Ask the chat model for a short label describing one topic.

    Builds a prompt from the topic's top words and up to three sample reviews
    and sends it through the module-level ``client``.

    Args:
        topic_num: Topic identifier, used in the prompt and in error messages.
        top_words: Iterable of strings; joined with ", " in the prompt.
        sample_reviews: Sequence of review texts. Only the first three are
            used; fewer than three (including none) is accepted.

    Returns:
        str: The model's reply with surrounding whitespace removed, or
        ``"Unknown"`` when the request fails or the reply is empty.
    """
    top_words_str = ', '.join(top_words)

    # Number whatever samples are available instead of indexing [0], [1], [2],
    # which raised IndexError (outside the try block) for short lists.
    samples = list(sample_reviews)[:3]
    if samples:
        if len(samples) == 1:
            reviews_header = "Here is 1 representative review:"
        else:
            reviews_header = f"Here are {len(samples)} representative reviews:"
        numbered = "\n".join(f"{pos}. {text}" for pos, text in enumerate(samples, start=1))
        reviews_section = f"{reviews_header}\n{numbered}\n\n"
    else:
        reviews_section = ""

    prompt = f"""
You are analyzing app reviews that have been grouped into topics. 
Your job is to give a concise label (2-5 words max) that summarizes the theme of the topic.

Here are the top words for Topic {topic_num}:
{top_words_str}

{reviews_section}What is the best label for this topic?
Respond with only the label.
"""
    try:
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.4,
        )
        # The message content may be None; treat that like an empty reply
        label = (response.choices[0].message.content or "").strip()
        return label or "Unknown"
    except Exception as e:
        print(f"❌ Error labeling topic {topic_num}: {e}")
        return "Unknown"

def _merge_topic_labels(labels):
    """Merge topics whose labels are equal after trimming and lower-casing.

    Args:
        labels: Mapping of original topic id -> label string.

    Returns:
        tuple[dict, dict]: ``(topic_remap, topic_to_label)`` where
        ``topic_remap`` maps each original topic id to a merged topic id
        (numbered from 0 in first-seen order) and ``topic_to_label`` maps each
        original topic id to its normalized label.
    """
    label_to_new_topic = {}
    topic_remap = {}
    topic_to_label = {}
    for old_topic, label in labels.items():
        norm_label = label.strip().lower()
        # A label seen for the first time gets the next free id
        topic_remap[old_topic] = label_to_new_topic.setdefault(norm_label, len(label_to_new_topic))
        topic_to_label[old_topic] = norm_label
    return topic_remap, topic_to_label


def process_reviews_file(app_id: str, base_dir="data", pain_threshold=0.4):
    """Enrich one app's scraped reviews and write per-topic summaries.

    Reads ``<base_dir>/<app_id>/reviews.csv``, adds sentiment, emotion,
    readability, keyword-flag and topic columns, labels the topics through
    ``suggest_label``, merges topics that share a label, flags "pain point"
    topics, and writes four CSV files into ``<base_dir>/<app_id>/``:

    * ``<app_id>_reviews_preprocessed.csv`` - the enriched reviews
    * ``topics.csv`` - per-label aggregates with a pain point flag
    * ``pain_point_summary.csv`` - negative-emotion share per label
    * ``emotion_distribution_by_topic.csv`` - label x emotion proportions

    Relies on the module-level ``analyzer``, ``embedding_model``,
    ``batch_emotion_labels`` and ``suggest_label``.

    Args:
        app_id: App identifier; also the name of the app's sub-folder.
        base_dir: Folder containing one sub-folder per app.
        pain_threshold: A label is flagged as a pain point when the share of
            its reviews classified with a negative emotion is strictly
            greater than this value.

    Returns:
        None. Returns early (after printing a message) when the input file is
        missing or no review has usable text.
    """
    app_folder = os.path.join(base_dir, app_id)
    input_path = os.path.join(app_folder, "reviews.csv")
    output_path = os.path.join(app_folder, f"{app_id}_reviews_preprocessed.csv")
    os.makedirs(app_folder, exist_ok=True)

    if not os.path.exists(input_path):
        print(f"❌ Missing input file for {app_id}")
        return

    print(f"🔄 Processing {app_id}...")
    df = pd.read_csv(input_path)
    df = df.dropna(subset=["text"])
    if not df.empty:
        # .copy() makes the filtered frame independent, so the column
        # assignments below do not trigger SettingWithCopyWarning.
        df = df[df["text"].str.strip().astype(bool)].copy()

    # Nothing to analyse: stop before the models are invoked on an empty corpus
    if df.empty:
        print(f"⚠️  No usable review text for {app_id}; skipping")
        return

    # Basic preprocessing
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    df["text_length"] = df["text"].str.len()
    df["vader_sentiment"] = df["text"].apply(lambda x: analyzer.polarity_scores(x)["compound"])
    df["emotion"] = batch_emotion_labels(df["text"].tolist())
    df["readability"] = df["text"].apply(textstat.flesch_reading_ease)

    # Keyword flags (case-insensitive substring match)
    bug_keywords = ['bug', 'crash', 'glitch', 'error', 'issue', 'broken', 'freeze']
    request_keywords = ['add', 'feature', 'please', 'would like', 'suggest', 'can you', 'request']
    lowered_text = df['text'].str.lower()
    df['contains_bug_keywords'] = lowered_text.apply(lambda x: any(k in x for k in bug_keywords))
    df['contains_request_keywords'] = lowered_text.apply(lambda x: any(k in x for k in request_keywords))

    # Topic modeling
    print("🔍 Running BERTopic...")
    docs = df['text'].tolist()
    topic_model = BERTopic(embedding_model=embedding_model, verbose=True)
    topics, probs = topic_model.fit_transform(docs)
    df['topic'] = topics
    df['topic_prob'] = probs

    # Collect the first three documents of every topic in a single pass
    # instead of rescanning all documents once per topic.
    samples_by_topic = {}
    for doc, doc_topic in zip(docs, topics):
        bucket = samples_by_topic.setdefault(doc_topic, [])
        if len(bucket) < 3:
            bucket.append(doc)

    # GPT topic labeling
    topic_info = topic_model.get_topic_info()
    labels = {}

    for topic_num in tqdm(topic_info['Topic']):
        # Topic -1 is never labelled; its reviews end up under "Unknown"
        if topic_num == -1:
            continue
        sample_reviews = samples_by_topic.get(topic_num, [])
        if len(sample_reviews) < 3:
            continue
        top_words = [word for word, _ in topic_model.get_topic(topic_num)[:10]]
        labels[topic_num] = suggest_label(topic_num, top_words, sample_reviews)

    # Merge topics with same label
    topic_remap, topic_to_label = _merge_topic_labels(labels)
    df["merged_topic"] = df["topic"].map(topic_remap)
    # Direct dictionary lookup per row; unlabelled topics fall back to "Unknown".
    # (Labels that came back as "Unknown" from suggest_label are lower-cased to
    # "unknown" by the merge step and therefore stay a separate group.)
    df["merged_topic_label"] = df["topic"].map(lambda t: topic_to_label.get(t, "Unknown"))

    # Pain point detection
    negative_emotions = ['anger', 'sadness', 'fear', 'disgust']
    topic_counts = df.groupby('merged_topic_label').size()
    neg_counts = df[df['emotion'].isin(negative_emotions)].groupby('merged_topic_label').size()
    # Labels without any negative review are missing from neg_counts -> NaN -> 0
    neg_proportion = (neg_counts / topic_counts).fillna(0)
    pain_points = neg_proportion[neg_proportion > pain_threshold]
    df['pain_point'] = df['merged_topic_label'].isin(pain_points.index)

    # Save processed reviews
    df.to_csv(output_path, index=False)

    # Save topic summary with pain point flag
    topic_summary = df.groupby("merged_topic_label").agg({
        "text": "count",
        "vader_sentiment": "mean",
        "text_length": "mean",
        "readability": "mean",
        "contains_bug_keywords": "mean",
        "contains_request_keywords": "mean"
    }).rename(columns={
        "text": "total_reviews",
        "vader_sentiment": "avg_vader_sentiment",
        "text_length": "avg_text_length",
        "readability": "avg_readability",
        "contains_bug_keywords": "bug_keyword_rate",
        "contains_request_keywords": "request_keyword_rate"
    }).reset_index()
    topic_summary["pain_point_flag"] = topic_summary["merged_topic_label"].isin(pain_points.index)
    topic_summary.to_csv(os.path.join(app_folder, "topics.csv"), index=False)

    # Save pain summary
    pain_summary = pd.DataFrame({
        'negative_proportion': neg_proportion,
        'total_reviews': topic_counts,
        'pain_point_flag': neg_proportion > pain_threshold
    }).reset_index()
    pain_summary.to_csv(os.path.join(app_folder, "pain_point_summary.csv"), index=False)

    # Save normalized emotion distribution
    # NOTE(review): normalize=True divides by the total number of reviews, so the
    # whole table sums to 1 rather than each topic row. If per-topic shares were
    # intended, this needs row-wise normalization - confirm with downstream use.
    emotion_dist = (
        df[["merged_topic_label", "emotion"]]
        .value_counts(normalize=True)
        .reset_index(name="proportion")
    )
    pivoted = emotion_dist.pivot(index="merged_topic_label", columns="emotion", values="proportion").fillna(0)
    pivoted.to_csv(os.path.join(app_folder, "emotion_distribution_by_topic.csv"))


Device set to use mps:0


In [4]:
# Re-read the app list and run the preprocessing pipeline once per app_id.
# process_reviews_file is called with its default base_dir ("data"), so each
# app's outputs are written under data/<app_id>/.
# Note: there is no skip-if-already-processed check and no per-app error
# handling here, so an exception for one app stops the loop.
top_apps = pd.read_csv("data/top_apps.csv")
for app_id in top_apps["app_id"]:
    process_reviews_file(app_id)


🔄 Processing com.roblox.client...


Emotion Classification: 100%|█████████████████| 157/157 [00:47<00:00,  3.32it/s]
2025-07-26 16:16:29,455 - BERTopic - Embedding - Transforming documents to embeddings.


🔍 Running BERTopic...


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

2025-07-26 16:16:34,584 - BERTopic - Embedding - Completed ✓
2025-07-26 16:16:34,585 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-26 16:16:41,364 - BERTopic - Dimensionality - Completed ✓
2025-07-26 16:16:41,365 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-26 16:16:41,428 - BERTopic - Cluster - Completed ✓
2025-07-26 16:16:41,430 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-26 16:16:41,476 - BERTopic - Representation - Completed ✓
100%|█████████████████████████████████████████| 110/110 [01:24<00:00,  1.30it/s]


🔄 Processing com.mojang.minecraftpe...


Emotion Classification: 100%|█████████████████| 157/157 [00:42<00:00,  3.65it/s]
2025-07-26 16:18:49,314 - BERTopic - Embedding - Transforming documents to embeddings.


🔍 Running BERTopic...


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

2025-07-26 16:18:52,935 - BERTopic - Embedding - Completed ✓
2025-07-26 16:18:52,935 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-26 16:18:54,244 - BERTopic - Dimensionality - Completed ✓
2025-07-26 16:18:54,245 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-26 16:18:54,312 - BERTopic - Cluster - Completed ✓
2025-07-26 16:18:54,313 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-26 16:18:54,364 - BERTopic - Representation - Completed ✓
100%|█████████████████████████████████████████| 120/120 [01:34<00:00,  1.27it/s]


🔄 Processing com.king.candycrushsaga...


Emotion Classification: 100%|█████████████████| 157/157 [00:39<00:00,  3.99it/s]
2025-07-26 16:21:08,623 - BERTopic - Embedding - Transforming documents to embeddings.


🔍 Running BERTopic...


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

2025-07-26 16:21:10,295 - BERTopic - Embedding - Completed ✓
2025-07-26 16:21:10,295 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-26 16:21:11,969 - BERTopic - Dimensionality - Completed ✓
2025-07-26 16:21:11,969 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-26 16:21:12,031 - BERTopic - Cluster - Completed ✓
2025-07-26 16:21:12,033 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-26 16:21:12,067 - BERTopic - Representation - Completed ✓
100%|█████████████████████████████████████████| 119/119 [01:31<00:00,  1.30it/s]


🔄 Processing com.outfit7.mytalkingtom2...


Emotion Classification: 100%|█████████████████| 157/157 [00:42<00:00,  3.73it/s]
2025-07-26 16:23:26,401 - BERTopic - Embedding - Transforming documents to embeddings.


🔍 Running BERTopic...


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

2025-07-26 16:23:28,424 - BERTopic - Embedding - Completed ✓
2025-07-26 16:23:28,424 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-26 16:23:29,945 - BERTopic - Dimensionality - Completed ✓
2025-07-26 16:23:29,946 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-26 16:23:30,009 - BERTopic - Cluster - Completed ✓
2025-07-26 16:23:30,010 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-26 16:23:30,047 - BERTopic - Representation - Completed ✓
100%|█████████████████████████████████████████| 115/115 [01:23<00:00,  1.38it/s]


🔄 Processing com.outfit7.mytalkingtomfriends...


Emotion Classification: 100%|█████████████████| 157/157 [00:41<00:00,  3.76it/s]
2025-07-26 16:25:35,658 - BERTopic - Embedding - Transforming documents to embeddings.


🔍 Running BERTopic...


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

2025-07-26 16:25:37,671 - BERTopic - Embedding - Completed ✓
2025-07-26 16:25:37,671 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-26 16:25:39,137 - BERTopic - Dimensionality - Completed ✓
2025-07-26 16:25:39,138 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-26 16:25:39,204 - BERTopic - Cluster - Completed ✓
2025-07-26 16:25:39,206 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-26 16:25:39,243 - BERTopic - Representation - Completed ✓
100%|█████████████████████████████████████████| 115/115 [01:27<00:00,  1.32it/s]


🔄 Processing com.tocaboca.tocalifeworld...


Emotion Classification: 100%|█████████████████| 157/157 [00:53<00:00,  2.93it/s]
2025-07-26 16:28:00,720 - BERTopic - Embedding - Transforming documents to embeddings.


🔍 Running BERTopic...


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

2025-07-26 16:28:05,825 - BERTopic - Embedding - Completed ✓
2025-07-26 16:28:05,825 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-26 16:28:07,000 - BERTopic - Dimensionality - Completed ✓
2025-07-26 16:28:07,000 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-26 16:28:07,069 - BERTopic - Cluster - Completed ✓
2025-07-26 16:28:07,071 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-26 16:28:07,123 - BERTopic - Representation - Completed ✓
100%|███████████████████████████████████████████| 82/82 [01:03<00:00,  1.30it/s]


🔄 Processing org.scratchjr.android...


Emotion Classification: 100%|█████████████████| 157/157 [00:57<00:00,  2.73it/s]
2025-07-26 16:30:08,376 - BERTopic - Embedding - Transforming documents to embeddings.


🔍 Running BERTopic...


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

2025-07-26 16:30:11,365 - BERTopic - Embedding - Completed ✓
2025-07-26 16:30:11,366 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-26 16:30:12,714 - BERTopic - Dimensionality - Completed ✓
2025-07-26 16:30:12,715 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-26 16:30:12,776 - BERTopic - Cluster - Completed ✓
2025-07-26 16:30:12,777 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-26 16:30:12,821 - BERTopic - Representation - Completed ✓
100%|█████████████████████████████████████████| 111/111 [01:26<00:00,  1.28it/s]


🔄 Processing com.getepic.Epic...


Emotion Classification: 100%|█████████████████| 157/157 [00:55<00:00,  2.81it/s]
2025-07-26 16:32:35,717 - BERTopic - Embedding - Transforming documents to embeddings.


🔍 Running BERTopic...


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

2025-07-26 16:32:38,711 - BERTopic - Embedding - Completed ✓
2025-07-26 16:32:38,711 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-26 16:32:40,006 - BERTopic - Dimensionality - Completed ✓
2025-07-26 16:32:40,007 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-26 16:32:40,070 - BERTopic - Cluster - Completed ✓
2025-07-26 16:32:40,072 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-26 16:32:40,119 - BERTopic - Representation - Completed ✓
100%|███████████████████████████████████████████| 94/94 [01:06<00:00,  1.42it/s]


🔄 Processing org.pbskids.gamesapp...


Emotion Classification: 100%|█████████████████| 157/157 [00:49<00:00,  3.15it/s]
2025-07-26 16:34:36,765 - BERTopic - Embedding - Transforming documents to embeddings.


🔍 Running BERTopic...


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

2025-07-26 16:34:39,952 - BERTopic - Embedding - Completed ✓
2025-07-26 16:34:39,952 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-26 16:34:41,255 - BERTopic - Dimensionality - Completed ✓
2025-07-26 16:34:41,255 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-26 16:34:41,333 - BERTopic - Cluster - Completed ✓
2025-07-26 16:34:41,334 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-26 16:34:41,378 - BERTopic - Representation - Completed ✓
100%|█████████████████████████████████████████| 102/102 [01:13<00:00,  1.39it/s]


🔄 Processing com.rvappstudios.abc_kids_toddler_tracing_phonics...


Emotion Classification: 100%|█████████████████| 157/157 [00:54<00:00,  2.89it/s]
2025-07-26 16:36:49,434 - BERTopic - Embedding - Transforming documents to embeddings.


🔍 Running BERTopic...


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

2025-07-26 16:36:51,509 - BERTopic - Embedding - Completed ✓
2025-07-26 16:36:51,510 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-26 16:36:53,027 - BERTopic - Dimensionality - Completed ✓
2025-07-26 16:36:53,027 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-26 16:36:53,088 - BERTopic - Cluster - Completed ✓
2025-07-26 16:36:53,089 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-26 16:36:53,129 - BERTopic - Representation - Completed ✓
100%|█████████████████████████████████████████| 117/117 [01:26<00:00,  1.35it/s]


🔄 Processing com.sinyee.babybus.restaurant...


Emotion Classification: 100%|█████████████████| 157/157 [00:41<00:00,  3.78it/s]
2025-07-26 16:39:01,722 - BERTopic - Embedding - Transforming documents to embeddings.


🔍 Running BERTopic...


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

2025-07-26 16:39:03,975 - BERTopic - Embedding - Completed ✓
2025-07-26 16:39:03,975 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-26 16:39:05,425 - BERTopic - Dimensionality - Completed ✓
2025-07-26 16:39:05,425 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-26 16:39:05,491 - BERTopic - Cluster - Completed ✓
2025-07-26 16:39:05,493 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-26 16:39:05,528 - BERTopic - Representation - Completed ✓
100%|█████████████████████████████████████████| 107/107 [01:22<00:00,  1.30it/s]


🔄 Processing com.miniclip.carrom...


Emotion Classification: 100%|█████████████████| 157/157 [00:39<00:00,  3.95it/s]
2025-07-26 16:41:08,169 - BERTopic - Embedding - Transforming documents to embeddings.


🔍 Running BERTopic...


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

2025-07-26 16:41:09,836 - BERTopic - Embedding - Completed ✓
2025-07-26 16:41:09,836 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-26 16:41:11,659 - BERTopic - Dimensionality - Completed ✓
2025-07-26 16:41:11,660 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-26 16:41:11,720 - BERTopic - Cluster - Completed ✓
2025-07-26 16:41:11,721 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-26 16:41:11,753 - BERTopic - Representation - Completed ✓
100%|█████████████████████████████████████████| 106/106 [01:14<00:00,  1.42it/s]


🔄 Processing com.imangi.templerun2...


Emotion Classification: 100%|█████████████████| 157/157 [00:40<00:00,  3.88it/s]
2025-07-26 16:43:07,550 - BERTopic - Embedding - Transforming documents to embeddings.


🔍 Running BERTopic...


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

2025-07-26 16:43:09,267 - BERTopic - Embedding - Completed ✓
2025-07-26 16:43:09,267 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-26 16:43:10,769 - BERTopic - Dimensionality - Completed ✓
2025-07-26 16:43:10,769 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-26 16:43:10,834 - BERTopic - Cluster - Completed ✓
2025-07-26 16:43:10,836 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-26 16:43:10,873 - BERTopic - Representation - Completed ✓
100%|█████████████████████████████████████████| 113/113 [01:22<00:00,  1.38it/s]


🔄 Processing com.google.android.apps.youtube.kids...


Emotion Classification: 100%|█████████████████| 157/157 [00:41<00:00,  3.79it/s]
2025-07-26 16:45:14,852 - BERTopic - Embedding - Transforming documents to embeddings.


🔍 Running BERTopic...


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

2025-07-26 16:45:16,307 - BERTopic - Embedding - Completed ✓
2025-07-26 16:45:16,308 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-26 16:45:17,956 - BERTopic - Dimensionality - Completed ✓
2025-07-26 16:45:17,956 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-26 16:45:18,019 - BERTopic - Cluster - Completed ✓
2025-07-26 16:45:18,021 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-26 16:45:18,056 - BERTopic - Representation - Completed ✓
100%|█████████████████████████████████████████| 109/109 [01:20<00:00,  1.35it/s]


🔄 Processing com.future.HappyKids...


Emotion Classification: 100%|███████████████████| 38/38 [00:10<00:00,  3.61it/s]
2025-07-26 16:46:49,761 - BERTopic - Embedding - Transforming documents to embeddings.


🔍 Running BERTopic...


Batches:   0%|          | 0/38 [00:00<?, ?it/s]

2025-07-26 16:46:50,355 - BERTopic - Embedding - Completed ✓
2025-07-26 16:46:50,355 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-26 16:46:52,478 - BERTopic - Dimensionality - Completed ✓
2025-07-26 16:46:52,478 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-26 16:46:52,492 - BERTopic - Cluster - Completed ✓
2025-07-26 16:46:52,493 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-26 16:46:52,507 - BERTopic - Representation - Completed ✓
100%|███████████████████████████████████████████| 31/31 [00:23<00:00,  1.34it/s]


🔄 Processing com.abqappsource.childgrowthtracker...


Emotion Classification: 100%|███████████████████| 20/20 [00:05<00:00,  3.64it/s]
2025-07-26 16:47:21,257 - BERTopic - Embedding - Transforming documents to embeddings.


🔍 Running BERTopic...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

2025-07-26 16:47:21,623 - BERTopic - Embedding - Completed ✓
2025-07-26 16:47:21,623 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-26 16:47:21,969 - BERTopic - Dimensionality - Completed ✓
2025-07-26 16:47:21,969 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-26 16:47:21,977 - BERTopic - Cluster - Completed ✓
2025-07-26 16:47:21,978 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-26 16:47:21,984 - BERTopic - Representation - Completed ✓
100%|█████████████████████████████████████████████| 9/9 [00:05<00:00,  1.58it/s]


🔄 Processing org.khankids.android...


Emotion Classification: 100%|█████████████████| 157/157 [00:51<00:00,  3.03it/s]
2025-07-26 16:48:19,741 - BERTopic - Embedding - Transforming documents to embeddings.


🔍 Running BERTopic...


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

2025-07-26 16:48:22,072 - BERTopic - Embedding - Completed ✓
2025-07-26 16:48:22,072 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-26 16:48:23,400 - BERTopic - Dimensionality - Completed ✓
2025-07-26 16:48:23,401 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-26 16:48:23,468 - BERTopic - Cluster - Completed ✓
2025-07-26 16:48:23,469 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-26 16:48:23,512 - BERTopic - Representation - Completed ✓
100%|█████████████████████████████████████████| 111/111 [01:21<00:00,  1.37it/s]
