# 04 · Sentiment & Topic Analysis

Robust multilingual sentiment and topic extraction for App Review Insights.

## Requirements

Optional setup: uncomment and run the commands in the cell below if the Python dependencies or the spaCy language models are not installed yet.

In [None]:
# Optional one-time setup. Every command below is commented out on purpose:
# uncomment the lines you need and run this cell manually.
#
# Project dependencies:
# !pip install -r requirements.txt
#
# Small spaCy pipelines, one per language (en, fr, de, es, it, sv):
# !python -m spacy download en_core_web_sm
# !python -m spacy download fr_core_news_sm
# !python -m spacy download de_core_news_sm
# !python -m spacy download es_core_news_sm
# !python -m spacy download it_core_news_sm
# !python -m spacy download sv_core_news_sm


## Imports & configuration


In [None]:
from __future__ import annotations
import json
import logging
import time
from pathlib import Path

import pandas as pd
from tqdm.auto import tqdm

from ml.pipeline import sentiment_topics as st

# Log INFO and above as "<timestamp> <level> <message>".
LOG_FORMAT = '%(asctime)s %(levelname)s %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

# Register tqdm's progress-aware pandas methods (e.g. `progress_apply`).
tqdm.pandas()


In [None]:
# --- Paths and tunables ------------------------------------------------------
CONFIG_PATH = Path('config/apps.json')          # JSON app configuration
DATA_PATH = Path('data/processed_reviews.csv')  # input reviews (CSV)
OUTPUT_DIR = Path('data/output')                # destination handed to st.write_csvs
BATCH_SIZE = 32                                 # batch size handed to st.run_sentiment

# Decode the config as UTF-8 explicitly: without `encoding`, Path.open() uses
# the platform's locale encoding, which can mis-read non-ASCII JSON.
with CONFIG_PATH.open(encoding='utf-8') as fh:
    app_config = json.load(fh)

# Countries come from the config's 'countries' key; when the key is absent,
# fall back to every country the pipeline ships a default language for.
COUNTRIES = app_config.get('countries', list(st.DEFAULT_COUNTRY_LANGUAGE_MAP.keys()))

# Start from the pipeline defaults, then map every configured country that has
# no default language to English. Existing entries are left untouched.
COUNTRY_LANGUAGE_MAP = {**st.DEFAULT_COUNTRY_LANGUAGE_MAP}
COUNTRY_LANGUAGE_MAP.update({c: 'en' for c in COUNTRIES if c not in COUNTRY_LANGUAGE_MAP})
COUNTRY_LANGUAGE_MAP


## Load processed reviews


In [None]:
# Start the wall-clock timer; the final cell reports the elapsed time.
start_time = time.perf_counter()

reviews_df = pd.read_csv(DATA_PATH)
if reviews_df.empty:
    # Report the configured path rather than a hard-coded copy of it.
    raise ValueError(f'No reviews found in {DATA_PATH}')

# Columns the rest of the notebook reads from each review.
required_cols = {'id', 'app_name', 'country', 'cleaned_content', 'rating', 'review_date'}
missing = required_cols - set(reviews_df.columns)
if missing:
    raise ValueError(f'Missing required columns: {missing}')

# Reviews without cleaned text cannot be analysed; drop them and work on a copy.
reviews_df = reviews_df.dropna(subset=['cleaned_content']).copy()
if reviews_df.empty:
    # The earlier emptiness check ran before the dropna, so re-check here.
    raise ValueError(f'No reviews with cleaned_content found in {DATA_PATH}')

# Later cells use `id` as a unique key (set_index('id').loc[...] and
# groupby('id')). A repeated id would make `.loc` return a DataFrame instead
# of a single row and corrupt the per-review aggregation, so fail fast here.
id_values = reviews_df['id']
duplicated_ids = id_values[id_values.notna() & id_values.duplicated(keep=False)].unique()
if len(duplicated_ids) > 0:
    raise ValueError(
        f'Duplicate review ids in {DATA_PATH}: {len(duplicated_ids)} ids repeat, '
        f'e.g. {list(duplicated_ids[:5])}'
    )

reviews_df.head()


## Detect languages


In [None]:
# Ask the pipeline for one language per review, using the country -> language
# map built in the configuration cell, and store it as a new column.
detected = st.detect_languages(reviews_df, COUNTRY_LANGUAGE_MAP)
reviews_df = reviews_df.assign(detected_language=detected)

# Distribution of detected languages across the reviews.
reviews_df['detected_language'].value_counts()


## Load spaCy language pipelines


In [None]:
# Only request pipelines for the languages that actually occur in the data.
detected_languages = reviews_df['detected_language'].unique()
models, lang_resolution = st.load_spacy_models(detected_languages, COUNTRY_LANGUAGE_MAP)

# Keys of the loaded-model mapping, sorted for a stable display.
sorted(models.keys())


## Split reviews into sentences


In [None]:
# Break each review into sentences with the loaded pipelines.
sentences_df = st.split_sentences(
    reviews_df,
    models,
    lang_resolution,
)

# Stop early: every later cell needs at least one sentence to work on.
nothing_split = sentences_df.empty
if nothing_split:
    raise ValueError('Sentence splitting produced no data')

sentences_df.head()


## Run multilingual sentiment analysis


In [None]:
# Score the sentences in batches of BATCH_SIZE.
sentiment_scores = st.run_sentiment(
    sentences_df,
    batch_size=BATCH_SIZE,
)

# Attach the scores to the sentences by index; a left join keeps every sentence
# row even when no score came back for it.
sentences_with_sentiment = sentences_df.join(
    sentiment_scores,
    how='left',
)
sentences_with_sentiment.head()


## Extract topics per sentence and aggregate per review


In [None]:
# Topic-extraction tunables. Defined once here instead of being rebuilt as a
# literal on every loop iteration, and named so they are easy to find and tune.
TOPIC_OPTIONS = {'top_n': 5, 'ngram_range': (1, 2), 'diversity': 0.6}
MAX_TOPICS_PER_REVIEW = 5

structured_rows = []
# Index the review metadata by id so each group can look up its source review.
reviews_by_id = reviews_df.set_index('id')

# One iteration per review: `group` holds that review's sentences and scores.
for review_id, group in tqdm(sentences_with_sentiment.groupby('id'), desc='Aggregate reviews'):
    review_meta = reviews_by_id.loc[review_id]
    sentences_list = group['sentence'].tolist()
    # Sentences without a sentiment label are treated as neutral.
    sentiments_list = group['sentiment_label'].fillna('neutral').tolist()
    language = review_meta['detected_language']

    # Pass a fresh copy of the options so a callee that mutates its argument
    # cannot affect later iterations.
    topics_per_sentence = st.extract_topics(sentences_list, language, options=dict(TOPIC_OPTIONS))
    review_topics = st.merge_topics(topics_per_sentence, limit=MAX_TOPICS_PER_REVIEW)
    details = st.build_details(sentences_list, sentiments_list, topics_per_sentence)
    label, score = st.aggregate_sentiment(sentiments_list)

    structured_rows.append({
        'id': review_id,
        'app_name': review_meta['app_name'],
        'country': review_meta['country'],
        'language': language,
        'rating': review_meta.get('rating'),
        'cleaned_content': review_meta.get('cleaned_content'),
        'sentiment_label': label,
        'sentiment_score': score,
        'topics': review_topics,
        'details': details,
        'review_date': review_meta.get('review_date'),
    })

# An empty list would yield a DataFrame with no columns, and the next cell
# would then fail with an obscure KeyError. Raise a clear error instead.
if not structured_rows:
    raise ValueError('Review aggregation produced no data')

structured_df = pd.DataFrame(structured_rows)
structured_df.head()


## NotebookLM export & topic summary


In [None]:
# Build one NotebookLM sentence per review, row by row, via the pipeline helper.
structured_df['notebook_sentence'] = structured_df.apply(st.make_notebook_sentence, axis=1)

# Slim frame for the NotebookLM export: identifiers, sentiment and the sentence.
notebook_columns = [
    'id',
    'app_name',
    'country',
    'language',
    'sentiment_label',
    'sentiment_score',
    'notebook_sentence',
]
notebook_df = structured_df[notebook_columns].copy()

# Topic summary: one row per (review, topic), with topic-less reviews dropped,
# then counted per app / country / topic / sentiment label.
topic_rows = structured_df.explode('topics').dropna(subset=['topics'])
topic_groups = topic_rows.groupby(
    ['app_name', 'country', 'topics', 'sentiment_label'],
    as_index=False,
)
topic_summary = (
    topic_groups
    .agg(review_count=('id', 'count'))
    .rename(columns={'topics': 'topic'})
)

structured_df.head()


## Export CSV artifacts


In [None]:
# Hand the three result frames to the pipeline's CSV writer. The structured
# frame is passed without the NotebookLM sentence column, which is already in
# `notebook_df`.
st.write_csvs(
    structured_df.drop(columns=['notebook_sentence']),
    notebook_df,
    topic_summary,
    OUTPUT_DIR,
)

# Run summary: label distribution, a few sample sentences, total wall time.
sentiment_counts = structured_df['sentiment_label'].value_counts()
print('Sentiment distribution:', sentiment_counts, sep='\n')

print('Sample NotebookLM sentences:')
for example in notebook_df['notebook_sentence'].iloc[:3]:
    print('-', example)

# `start_time` was captured at the top of the data-loading cell.
elapsed = time.perf_counter() - start_time
print(f'Total runtime: {elapsed:.2f}s')
