In [None]:
# ============================================================
# 🧠 03_sentiment_topics_analysis.ipynb
# Multilingual Sentiment + Topic extraction for App Reviews
# ============================================================

# ✅ Requirements (run once)
%pip install pandas numpy transformers spacy keybert sentence-transformers langdetect tqdm

# For language support (download models)
#!python -m spacy download en_core_web_sm
#!python -m spacy download it_core_news_sm
#!python -m spacy download fr_core_news_sm
# ============================================================

import json
from pathlib import Path

import numpy as np
import pandas as pd
import spacy
from keybert import KeyBERT
from langdetect import DetectorFactory, LangDetectException, detect
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from transformers import AutoTokenizer, pipeline

# ============================================================
# 📁 Resolve project paths
# ============================================================

def find_project_root(marker="data"):
    """Locate the project root by walking upward from the working directory.

    Args:
        marker: Name of a file or directory whose presence identifies the
            project root (checked with ``Path.exists``).

    Returns:
        The closest directory, starting at ``Path.cwd()`` and moving towards
        the filesystem root, that contains ``marker``.

    Raises:
        FileNotFoundError: If neither the working directory nor any of its
            ancestors contains ``marker``.
    """
    start = Path.cwd()
    # cwd first, then each ancestor up to the filesystem root.
    for candidate in (start, *start.parents):
        if (candidate / marker).exists():
            return candidate
    # Name the marker that was actually searched for (identical text for the default).
    raise FileNotFoundError(f"Could not locate project root containing {marker}/.")

# Anchor every directory used by the notebook on the detected project root.
PROJECT_ROOT = find_project_root()
DATA_DIR = PROJECT_ROOT.joinpath("data")
CONFIG_DIR = PROJECT_ROOT.joinpath("config")
OUTPUT_DIR = DATA_DIR.joinpath("output")
# Create data/output (and any missing parents) up front; a pre-existing directory is fine.
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)



In [None]:
# ============================================================
# 📥 1. LOAD DATA
# ============================================================

INPUT_PATH = DATA_DIR / "ml" / "processed_reviews.csv"
df = pd.read_csv(INPUT_PATH)

# Keep only useful columns. 'language' is optional: when the dataset already
# provides it, it must survive this filter, otherwise the "detect language only
# if missing" step that follows could never reuse it.
base_columns = ['id','app_name','country','rating','cleaned_content','review_date']
optional_columns = [col for col in ['language'] if col in df.columns]
df = df[base_columns + optional_columns]

print(f"Loaded {len(df)} reviews")
df.head(3)


In [None]:
# ============================================================
# 🌍 2. DETECT LANGUAGE (fallback if not already in dataset)
# ============================================================

# langdetect samples randomly unless seeded; fixing the seed keeps the detected
# languages stable across Restart & Run All.
DetectorFactory.seed = 0


def detect_language(text):
    """Return the langdetect code for ``text``, or 'unknown' when it cannot be detected.

    Non-string values (e.g. NaN from missing reviews) and blank strings map to
    'unknown'. langdetect raises LangDetectException for text without usable
    features (digits only, emoji only, ...); that case also maps to 'unknown'
    instead of aborting the whole column.
    """
    if not isinstance(text, str) or not text.strip():
        return 'unknown'
    try:
        return detect(text)
    except LangDetectException:
        return 'unknown'


if 'language' not in df.columns:
    tqdm.pandas()
    df['language'] = df['cleaned_content'].progress_apply(detect_language)

print(df['language'].value_counts())


In [None]:
# ============================================================
# ✂️ 3. SPLIT LONG REVIEWS INTO SENTENCES
# ============================================================

# Countries to cover: prefer config/apps.json, otherwise use what the dataset contains.
CONFIG_PATH = CONFIG_DIR / "apps.json"
COUNTRIES = []
if CONFIG_PATH.exists():
    with open(CONFIG_PATH, encoding="utf-8") as f:
        cfg = json.load(f)
    COUNTRIES = cfg.get("countries") or []
if not COUNTRIES:
    # Option B — fallback from dataset (also used when the config lists no countries)
    COUNTRIES = df["country"].dropna().unique().tolist()

# Country → language mapping
COUNTRY_LANG_MAP = {
    "fr": "fr",
    "us": "en",
    "gb": "en",
    "ca": "en",  # could also be 'fr'
    "de": "de",
    "se": "sv",  # routed to the English pipeline below
    "it": "it",
    "es": "es",
}

# spaCy package per language. English is published as "en_core_web_sm", so the
# generic "<lang>_core_news_sm" pattern would miss it and only load it through
# the fallback path with a misleading warning. Swedish reuses the English pipeline.
ENGLISH_SPACY_PACKAGE = "en_core_web_sm"
SPACY_PACKAGE_OVERRIDES = {"en": ENGLISH_SPACY_PACKAGE, "sv": ENGLISH_SPACY_PACKAGE}

# Prepare language list from the selected countries (unknown countries default to English)
languages_to_load = sorted(set(COUNTRY_LANG_MAP.get(str(c).lower(), "en") for c in COUNTRIES))
print("Detected languages:", languages_to_load)

_loaded_packages = {}


def _load_spacy_package(name):
    """Load a spaCy package once and reuse the pipeline for every language mapped to it."""
    if name not in _loaded_packages:
        _loaded_packages[name] = spacy.load(name)
    return _loaded_packages[name]


# Load only what we need, plus English: split_sentences() falls back to
# models["en"] for any language without its own pipeline, so it must always exist.
models = {}
for lang in sorted(set(languages_to_load) | {"en"}):
    package = SPACY_PACKAGE_OVERRIDES.get(lang, f"{lang}_core_news_sm")
    try:
        models[lang] = _load_spacy_package(package)
    except OSError:
        print(f"‚ö†Ô∏è Missing spaCy model for {lang}, using English fallback.")
        models[lang] = _load_spacy_package(ENGLISH_SPACY_PACKAGE)

print(f"‚úÖ Loaded {len(models)} language models:", list(models.keys()))

def split_sentences(text, lang_code):
    """Split ``text`` into stripped, non-empty sentences with a spaCy pipeline.

    Args:
        text: Review text. ``None`` and NaN are treated as "no text"; any other
            value is converted with ``str()``.
        lang_code: Language (or country) code. It is first translated through
            COUNTRY_LANG_MAP, then looked up in ``models``.

    Returns:
        List of sentence strings; empty when there is no text.
    """
    # Missing values would otherwise be stringified into the fake sentences "None"/"nan".
    if text is None or (isinstance(text, float) and text != text):
        return []

    lang_code = COUNTRY_LANG_MAP.get(lang_code, lang_code)
    nlp = models.get(lang_code)
    if nlp is None:
        nlp = models.get("en")
    if nlp is None:
        # No English pipeline was preloaded: load it once and cache it, instead
        # of failing with "'NoneType' object is not callable".
        nlp = spacy.load("en_core_web_sm")
        models["en"] = nlp

    doc = nlp(str(text))
    return [sent.text.strip() for sent in doc.sents if sent.text.strip()]


# Split every review into sentences. Non-string content (e.g. NaN for a missing
# review) yields an empty list instead of being stringified into the bogus
# sentence "nan". Zipping the two columns avoids building a Series per row.
df['sentences'] = [
    split_sentences(text, lang) if isinstance(text, str) else []
    for text, lang in tqdm(zip(df['cleaned_content'], df['language']), total=len(df))
]
df['n_sentences'] = df['sentences'].apply(len)
df.head(2)


In [None]:
# ============================================================
# 💬 4. SENTIMENT ANALYSIS (multilingual tolerant)
# ============================================================

# NOTE(review): the checkpoint name suggests a Twitter-trained RoBERTa model;
# confirm it is adequate for the non-English reviews in this dataset.
sentiment_model = "cardiffnlp/twitter-roberta-base-sentiment-latest"
sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_model)

# When a tokenizer config does not define model_max_length, transformers reports
# a very large sentinel value, which would make truncation a no-op and let long
# reviews overflow the model. Clamp to the 512-token window of RoBERTa-base.
ROBERTA_MAX_TOKENS = 512
MAX_TOKENS = min(sentiment_tokenizer.model_max_length, ROBERTA_MAX_TOKENS)  # keep inputs within model limit

sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model=sentiment_model,
    tokenizer=sentiment_tokenizer,
)

def analyze_sentiments(sent_list, batch_size=32):
    """Label every sentence of one review with the sentiment pipeline.

    Args:
        sent_list: Sentences of a single review.
        batch_size: Number of sentences handed to the pipeline per call.

    Returns:
        A list of labels with the same length and order as ``sent_list``.
        Entries that are not non-blank strings are never sent to the model and
        receive the placeholder label "neutral". Keeping the result
        index-aligned matters because sentences, sentiments and topics are
        zipped together later; silently dropping entries would shift every
        following label onto the wrong sentence.
    """
    labels = ["neutral"] * len(sent_list)
    scorable = [
        (position, sentence)
        for position, sentence in enumerate(sent_list)
        if isinstance(sentence, str) and sentence.strip()
    ]

    for start in range(0, len(scorable), batch_size):
        chunk = scorable[start:start + batch_size]
        outputs = sentiment_analyzer(
            [sentence for _, sentence in chunk],
            truncation=True,
            padding=True,
            max_length=MAX_TOKENS,
        )
        # Write each prediction back to the slot of the sentence it belongs to.
        for (position, _), pred in zip(chunk, outputs):
            labels[position] = pred["label"]
    return labels

# Label every sentence of every review. tqdm.pandas() is called first so that
# progress_apply is available on the Series.
tqdm.pandas()
df["sentence_sentiments"] = df["sentences"].progress_apply(analyze_sentiments)


def aggregate_sentiments(labels):
    """Collapse per-sentence labels into a single review-level label.

    Labels containing "POS" / "NEG" (case-insensitive) are counted; all other
    labels are ignored. The share of positive labels among the counted ones
    decides the outcome: above 0.6 is "positive", below 0.4 is "negative",
    anything in between is "mixed". With nothing counted the result is "neutral".
    """
    positives = sum(1 for tag in labels if "POS" in tag.upper())
    negatives = sum(1 for tag in labels if "NEG" in tag.upper())
    polar = positives + negatives

    # Nothing carried a polarity.
    if polar == 0:
        return "neutral"

    share = positives / polar
    if share > 0.6:
        return "positive"
    if share < 0.4:
        return "negative"
    # Remaining band: 0.4 <= share <= 0.6.
    return "mixed"


# One overall label per review, derived from its per-sentence labels.
df['sentiment_mode'] = df['sentence_sentiments'].apply(aggregate_sentiments)
# Last expression of the cell: displays the first two rows.
df.head(2)


In [None]:
# ============================================================
# 🧩 5. TOPIC EXTRACTION (KeyBERT using SentenceTransformer)
# ============================================================

# Shared keyword extractor: KeyBERT on top of the "all-MiniLM-L6-v2" sentence-transformers model.
kw_model = KeyBERT(model=SentenceTransformer("all-MiniLM-L6-v2"))

def extract_topics_per_sentence(sent_list, lang):
    """Extract up to two keyphrases (1-2 words each) for every sentence.

    Args:
        sent_list: Sentences of a single review.
        lang: Language code of the review. Currently unused (English stop words
            are applied regardless); kept so existing callers keep working.

    Returns:
        One string per input sentence, in order: the keyphrases joined with
        "; ", or "" when extraction failed for that sentence.
    """
    topics = []
    for sentence in sent_list:
        try:
            keywords = kw_model.extract_keywords(
                sentence,
                keyphrase_ngram_range=(1, 2),
                stop_words='english',
                top_n=2,
            )
            topics.append("; ".join(phrase for phrase, _ in keywords))
        except Exception:
            # Best effort per sentence. Only ordinary errors are swallowed:
            # a bare `except:` would also trap KeyboardInterrupt/SystemExit and
            # make a long extraction run impossible to stop.
            topics.append("")
    return topics

# Extract topics sentence by sentence for every review. The row-wise apply
# (axis=1) gives the lambda access to both the sentences and the language.
tqdm.pandas()
df['sentence_topics'] = df.progress_apply(lambda r: extract_topics_per_sentence(r.sentences, r.language), axis=1)

# Aggregate top topics per review
def merge_topics(topics_lists, max_topics=5):
    """Merge per-sentence topic strings into one de-duplicated topic string.

    Args:
        topics_lists: Per-sentence topic strings such as ``"login; crash"``
            (the shape produced by extract_topics_per_sentence). Nested
            lists/tuples of such strings are accepted as well; entries of any
            other type are skipped.
        max_topics: Maximum number of topics kept, in first-seen order.

    Returns:
        The first ``max_topics`` distinct topics joined with "; ", or "" when
        there are none.
    """
    if isinstance(topics_lists, str):
        topics_lists = [topics_lists]

    phrases = []
    for entry in topics_lists:
        if isinstance(entry, str):
            # The normal case: one "; "-joined string per sentence. Accepting
            # only lists here would discard every topic.
            phrases.append(entry)
        elif isinstance(entry, (list, tuple)):
            phrases.extend(item for item in entry if isinstance(item, str))

    # dict.fromkeys de-duplicates while preserving first-seen order.
    ordered = dict.fromkeys(
        part.strip() for phrase in phrases for part in phrase.split(';')
    )
    unique = [topic for topic in ordered if topic]
    return "; ".join(unique[:max_topics])

# Review-level topic string built from the per-sentence topic strings.
df['topics'] = df['sentence_topics'].apply(merge_topics)
# Last expression of the cell: displays the first two rows.
df.head(2)


In [None]:
# ============================================================
# 🧠 6. STRUCTURE PER-REVIEW SENTIMENT/TOPIC TABLE
# ============================================================

def sentence_details(sentences, sentiments, topics):
    """Combine three parallel per-sentence sequences into a list of records.

    Each record has the keys "sentence", "sentiment" and "topics". The inputs
    are zipped, so the result is as long as the shortest of the three.
    """
    records = []
    for text, label, keywords in zip(sentences, sentiments, topics):
        records.append({"sentence": text, "sentiment": label, "topics": keywords})
    return records

# Attach, per review, one record per sentence (text, sentiment label, topic string).
df['details'] = df.apply(
    lambda row: sentence_details(
        row['sentences'], row['sentence_sentiments'], row['sentence_topics']
    ),
    axis=1,
)

# Columns of the per-review export, in output order.
structured_df = df[[
    'id',
    'app_name',
    'country',
    'language',
    'rating',
    'cleaned_content',
    'sentiment_mode',
    'topics',
    'details',
    'review_date',
]]

# Write the table without the index column and report where it went.
reviews_export_path = OUTPUT_DIR.joinpath("reviews_sentiment_topics.csv")
structured_df.to_csv(reviews_export_path, index=False)
print(f"‚úÖ Exported {len(structured_df)} rows to {reviews_export_path}")


In [None]:
# ============================================================
# 🗣️ 7. GENERATE NOTEBOOKLM SENTENCE SUMMARIES
# ============================================================

def notebook_sentence(row):
    """Render one review as a single plain-English summary line.

    ``row`` is indexed by key and must provide 'country', 'app_name',
    'language', 'sentiment_mode', 'topics' and 'details' (an iterable of
    mappings with 'sentence' and 'sentiment'). The country is upper-cased and
    the app name capitalised; each sentence is quoted and followed by its
    label in square brackets.
    """
    intro = "In {}, a {} user wrote in {}.".format(
        row['country'].upper(), row['app_name'].capitalize(), row['language']
    )
    tone = "The overall sentiment is {}.".format(row['sentiment_mode'])
    subject = "It mainly discusses: {}.".format(row['topics'])

    quoted = []
    for detail in row['details']:
        quoted.append("'{}' [{}]".format(detail['sentence'], detail['sentiment']))

    return " ".join([intro, tone, subject, "Example sentences: " + " ".join(quoted)])

# Build the one-line summary for every review (row-wise apply).
df['notebook_sentence'] = df.apply(
    lambda row: notebook_sentence(row),
    axis=1,
)

# Keep only the columns needed for the NotebookLM export.
notebook_df = df[[
    'id',
    'app_name',
    'country',
    'sentiment_mode',
    'topics',
    'notebook_sentence',
]]
notebook_export_path = OUTPUT_DIR.joinpath("notebooklm_reviews.csv")
notebook_df.to_csv(notebook_export_path, index=False)
print(f"‚úÖ Exported NotebookLM-ready summaries to {notebook_export_path}")


In [None]:
# ============================================================
# 📊 8. OPTIONAL — TOPIC SUMMARY TABLE (for NotebookLM clustering)
# ============================================================

# 'topics' holds one "; "-joined string per review, and explode() only expands
# list-like cells. Split the string into a list first so that each topic gets
# its own row; otherwise the counts are per combined string, not per topic.
summary = (
    structured_df
    .assign(topics=lambda d: d['topics'].fillna('').astype(str).str.split(';'))
    .explode('topics')
    .assign(topics=lambda d: d['topics'].str.strip())
    .loc[lambda d: d['topics'] != '']  # reviews without topics contribute no rows
    .groupby(['app_name','country','topics','sentiment_mode'])
    .size()
    .reset_index(name='count')
)

topic_summary_path = OUTPUT_DIR / "topic_summary.csv"
summary.to_csv(topic_summary_path, index=False)
print(f"‚úÖ Exported aggregated topic summary to {topic_summary_path}")
summary.head(5)
