# Sentiment & Thematic Analysis (Task 2)
Outputs created:
- data/processed/reviews_with_sentiment_and_theme.csv
- data/processed/keyword_topics_per_bank.csv

Notes:
- If you want a lighter-weight run for quick experiments, set USE_DEEP_MODEL = False (it will use TextBlob only).
- For Transformer inference, a GPU is recommended but not required. The code detects CUDA if available.

In [1]:
# --- Imports and config ---
import os
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import math
import json
from tqdm.auto import tqdm

# Register tqdm with pandas so that Series.progress_apply (used by the sentiment
# and preprocessing helpers further down) is available.
tqdm.pandas()

# NLP
import nltk
# Download the NLTK resources this notebook relies on; 'stopwords' backs the
# stopwords corpus imported right below.
# NOTE(review): nothing visible in this notebook reads 'punkt' directly —
# presumably kept for TextBlob; confirm before removing.
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

import spacy
try:
    # Parser and NER are not needed for lemmatization, so they are disabled.
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
except Exception as exc:
    # Best-effort fallback: keep the notebook running with a blank English
    # pipeline. preprocess_texts() checks nlp.has_pipe('tagger') and skips
    # lemmatization when the tagger is missing, so tell the user why the
    # results will differ instead of degrading silently.
    print('spaCy model en_core_web_sm could not be loaded (', exc, ') - '
          'falling back to a blank English pipeline without lemmatization. '
          'Install the model with: python -m spacy download en_core_web_sm')
    nlp = spacy.blank('en')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize

from textblob import TextBlob

# Transformers (optional dependency)
# Flip USE_DEEP_MODEL to False to skip distilBERT and score with TextBlob only.
USE_DEEP_MODEL = True
try:
    import torch
    from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
    # Hugging Face device convention: 0 = first CUDA GPU, -1 = CPU.
    device = -1 if not torch.cuda.is_available() else 0
except Exception:
    # Either library is missing or broken: mark the pipeline factory as
    # unavailable and stay on CPU.
    pipeline, device = None, -1

# Paths
DEFAULT_IN = Path('./data/processed/reviews_processed.csv')
DEFAULT_OUT = Path('./data/processed/reviews_with_sentiment_and_theme.csv')
KEYWORDS_OUT = Path('./data/processed/keyword_topics_per_bank.csv')

# Optional override: if a DATA_PATHS mapping is already in the namespace (for
# example imported from a project config module before this cell runs), prefer
# its entries. Nothing in this notebook defines DATA_PATHS, so on a fresh kernel
# the NameError branch is the one that normally runs and the defaults are used.
try:
    IN_PATH = Path(DATA_PATHS.get('processed_reviews', DEFAULT_IN))
    OUT_PATH = Path(DATA_PATHS.get('sentiment_theme_output', DEFAULT_OUT))
except (NameError, AttributeError, TypeError):
    # NameError: DATA_PATHS not defined; AttributeError: not a mapping;
    # TypeError: an entry that cannot be turned into a Path (e.g. None).
    IN_PATH = DEFAULT_IN
    OUT_PATH = DEFAULT_OUT

# When running from a sub-folder (e.g. notebooks/), re-root the paths while
# keeping them Path objects (an f-string would turn them into plain str):
# IN_PATH = Path('..') / IN_PATH
# OUT_PATH = Path('..') / OUT_PATH
# KEYWORDS_OUT = Path('..') / KEYWORDS_OUT

print('Input:', IN_PATH)
print('Output:', OUT_PATH)
print('Keywords output:', KEYWORDS_OUT)
# Report availability rather than the function repr, whose memory address
# changes on every run.
print('Transformers available:', pipeline is not None)
print('Transformer device:', device)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Input: data/processed/reviews_processed.csv
<function pipeline at 0x7a4a6c77c7c0>
Output: data/processed/reviews_with_sentiment_and_theme.csv
Keywords output: data/processed/keyword_topics_per_bank.csv
Transformer device: -1


In [2]:
# Load the processed reviews and bring them to the canonical schema used by the
# rest of the notebook: review_id, review_text, rating, review_date, bank_code, source.
df = pd.read_csv(IN_PATH, index_col=None)
print('Loaded', df.shape[0], 'rows with columns:', list(df.columns))

# Ensure minimal columns and harmonize names.
# For each canonical name, the first variant found in the CSV is renamed to it.
expected_columns = {
    'review_text': ['review_text', 'review', 'text'],
    'rating': ['rating', 'stars'],
    'review_date': ['review_date', 'date'],
    'bank_code': ['bank_code', 'bank'],
    'source': ['source']
}
col_map = {}
for canonical, variants in expected_columns.items():
    present = next((v for v in variants if v in df.columns), None)
    if present is not None:
        col_map[present] = canonical

df = df.rename(columns=col_map)

if 'review_text' not in df.columns:
    raise KeyError('No review_text column found after renaming — please provide a processed CSV with a review_text column')

# Ensure review_id exists (fall back to the row index as a string id)
if 'review_id' not in df.columns:
    df.insert(0, 'review_id', df.index.astype(str))

# Normalize date; unparseable values become NaT
if 'review_date' in df.columns:
    df['review_date'] = pd.to_datetime(df['review_date'], utc=True, errors='coerce')
else:
    df['review_date'] = pd.NaT

# Fill small gaps.
# A scalar default is broadcast to every row. The previous one-element Series
# default was aligned on the index and therefore only filled row 0.
if 'bank_code' not in df.columns:
    df['bank_code'] = 'UNKNOWN'
if 'rating' in df.columns:
    df['rating'] = df['rating'].astype(pd.Int64Dtype())
else:
    df['rating'] = pd.Series(pd.NA, index=df.index, dtype=pd.Int64Dtype())

df = df.reset_index(drop=True)
print('After normalization: ', df.shape)


Loaded 1200 rows with columns: ['review_id', 'review_text', 'rating', 'review_date', 'review_year', 'review_month', 'bank_code', 'bank_name', 'user_name', 'thumbs_up', 'text_length', 'source']
After normalization:  (1200, 12)


## Sentiment scoring


In [3]:
# Build the Hugging Face sentiment pipeline when it is both requested and importable.
# sentiment_pipe stays None otherwise, which makes the scoring helpers use TextBlob.
sentiment_pipe = None
if not USE_DEEP_MODEL or pipeline is None:
    print('Skipping transformer model (USE_DEEP_MODEL=False or transformers not installed)')
else:
    model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
    print('Loading transformer model:', model_name)
    try:
        sentiment_pipe = pipeline('sentiment-analysis', model=model_name, device=device)
    except Exception as e:
        # Download or initialisation problems must not stop the notebook.
        print('Transformer model load failed, falling back to TextBlob. Error:', e)
        sentiment_pipe = None

def textblob_sentiment(text):
    """Return the TextBlob polarity of ``text`` (the value is coerced with ``str`` first)."""
    return TextBlob(str(text)).sentiment.polarity

def transformer_sentiment_batch(texts, batch_size=32, pipe=None):
    """Score texts with a sentiment pipeline and return signed confidences.

    Parameters
    ----------
    texts : iterable of str
        Texts to score; materialised into a list so any iterable works.
    batch_size : int, default 32
        Number of texts handed to the pipeline per call. Must be >= 1.
    pipe : callable, optional
        Pipeline to use, called as ``pipe(batch, truncation=True)`` and expected
        to return one ``{'label': ..., 'score': ...}`` dict per text. Defaults to
        the notebook-level ``sentiment_pipe``.

    Returns
    -------
    list of float
        One score per input text, in input order: ``+score`` when the label
        starts with 'POS' (case-insensitive), ``-score`` otherwise.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    RuntimeError
        If no pipeline was passed and ``sentiment_pipe`` is None.

    Notes
    -----
    NOTE(review): the value is the sign of the winning label times its score.
    For a two-class model that score is presumably always >= 0.5, which would
    explain why the +/-0.25 thresholds in sentiment_label_from_score produce no
    'Neutral' rows in the recorded run. Confirm whether that is intended.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be >= 1')
    # Resolve the pipeline lazily so an explicit `pipe` never touches the global.
    pipe = pipe if pipe is not None else sentiment_pipe
    if pipe is None:
        raise RuntimeError('No sentiment pipeline available; load the transformer model or use textblob_sentiment instead')

    texts = list(texts)
    out_scores = []
    for start in range(0, len(texts), batch_size):
        results = pipe(texts[start:start + batch_size], truncation=True)
        # results is list of {'label': 'POSITIVE'|'NEGATIVE', 'score': float}
        for r in results:
            sign = 1.0 if r['label'].upper().startswith('POS') else -1.0
            out_scores.append(sign * float(r['score']))
    return out_scores

def compute_sentiment_series(text_series):
    """Score every text in ``text_series`` and return a Series on the same index.

    Missing values are replaced by '' and everything is cast to ``str`` first.
    The transformer pipeline is used (batches of 64) when ``sentiment_pipe`` is
    set; otherwise each text is scored with ``textblob_sentiment``.
    """
    texts = text_series.fillna('').astype(str)
    if sentiment_pipe is None:
        print('Computing TextBlob sentiment (polarity [-1,1]) — slower but available')
        return texts.progress_apply(textblob_sentiment)

    print('Computing transformer sentiment in batches...')
    batch_scores = transformer_sentiment_batch(texts.tolist(), batch_size=64)
    return pd.Series(batch_scores, index=texts.index)


Loading transformer model: distilbert-base-uncased-finetuned-sst-2-english


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


In [4]:
# Run sentiment scoring and categorize.
# Scores that are already present (e.g. loaded from the CSV) are reused as-is.
if 'sentiment_score' in df.columns:
    print('sentiment_score already present — skipping compute')
else:
    df['sentiment_score'] = compute_sentiment_series(df['review_text'])

def sentiment_label_from_score(s, pos_thresh=0.25, neg_thresh=-0.25):
    """Map a signed sentiment score to 'Positive', 'Negative' or 'Neutral'.

    A score at or above ``pos_thresh`` is 'Positive', a score at or below
    ``neg_thresh`` is 'Negative'; anything in between, and any missing
    value, is 'Neutral'.
    """
    if not pd.isna(s):
        if s >= pos_thresh:
            return 'Positive'
        if s <= neg_thresh:
            return 'Negative'
    return 'Neutral'

# Label every review from its score, then show the class balance (NaN labels included).
df['sentiment_label'] = df['sentiment_score'].map(sentiment_label_from_score)

print(df['sentiment_label'].value_counts(dropna=False))


Computing transformer sentiment in batches...
sentiment_label
Positive    694
Negative    506
Name: count, dtype: int64


## Thematic analysis (per-bank)

Pipeline:
- Simple text cleaning + spaCy lemmatization
- TF-IDF (1-2 grams) per bank
- NMF topic modeling with n_components=4 (adjustable)
- Extract the top 12 keywords per topic and build an automatic theme name by joining the top 3 keywords
- Assign each review to the dominant topic (theme) by highest topic weight

Outputs: theme label per review and CSV of keywords/topics per bank.

In [5]:
# Preprocessing helpers
# NLTK English stop-word set.
# NOTE(review): not referenced by any helper visible in this notebook —
# spacy_lemmatize filters on spaCy's token.is_stop instead. Confirm before removing.
stop_words = set(stopwords.words('english'))
import re
# Runs of non-word characters and/or underscores.
RE_PUNCT = re.compile(r'[\W_]+')

def simple_clean(text):
    """Lower-case ``text`` and collapse punctuation/underscore runs to single spaces.

    Missing values (anything ``pd.isna`` reports as missing) yield ''.
    Other inputs are converted with ``str``; the result is stripped of
    leading and trailing whitespace.
    """
    if pd.isna(text):
        return ''
    lowered = str(text).lower()
    for line_break in ('\n', '\r'):
        lowered = lowered.replace(line_break, ' ')
    return RE_PUNCT.sub(' ', lowered).strip()

def spacy_lemmatize(text):
    """Return the space-joined lemmas of ``text`` as produced by the notebook's ``nlp`` pipeline.

    Tokens flagged as stop words, punctuation or whitespace are dropped.
    """
    kept_lemmas = (
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct or tok.is_space)
    )
    return ' '.join(kept_lemmas)

def preprocess_texts(series, use_spacy=True):
    """Clean every text in ``series`` and, when possible, lemmatize it.

    Each value is filled ('' for missing), cast to ``str`` and passed through
    ``simple_clean``. Lemmatization with ``spacy_lemmatize`` is applied only if
    ``use_spacy`` is true and the loaded ``nlp`` pipeline has a 'tagger'
    component; otherwise the cleaned text is returned unchanged.
    """
    cleaned = series.fillna('').astype(str).progress_apply(simple_clean)
    can_lemmatize = use_spacy and nlp.has_pipe('tagger')
    return cleaned.progress_apply(spacy_lemmatize) if can_lemmatize else cleaned


In [6]:
# Perform thematic extraction per bank
N_TOPICS = 4               # NMF components requested per bank (reduced for tiny corpora)
TOP_KEYWORDS = 12          # keywords stored per topic in the keywords CSV
THEME_NAME_KEYWORDS = 3    # leading keywords joined into the readable theme name
THEME_COLUMNS = ['identified_theme', 'dominant_topic', 'topic_confidence']

# Group on a filled copy of the bank column: comparing a NaN bank_code with
# 'UNKNOWN' never matches, so reviews without a bank would otherwise be skipped.
bank_codes = df['bank_code'].fillna('UNKNOWN')
banks = bank_codes.unique().tolist()
print('Banks found:', banks)

all_topic_rows = []      # one row per (bank, topic) for the keywords CSV
theme_assignments = []   # one frame per bank, indexed like df

for bank in banks:
    sub = df[bank_codes == bank].copy()
    if sub.shape[0] == 0:
        continue
    print('\n---', bank, '(', sub.shape[0], 'reviews ) ---')

    texts_for_tfidf = preprocess_texts(sub['review_text'])

    # TF-IDF on uni- and bi-grams; a term must occur in at least 2 documents
    # (or 0.5% of them, whichever is larger) and in at most 85% of them.
    vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=max(2, int(0.005*len(texts_for_tfidf))), max_df=0.85)
    try:
        X = vectorizer.fit_transform(texts_for_tfidf)
    except ValueError as exc:
        # Raised when no term survives the min_df/max_df pruning (very few or
        # very short reviews). Skip this bank instead of aborting the whole loop;
        # its reviews keep empty theme columns.
        print('Skipping topic modeling for', bank, '-', exc)
        continue
    print('TF-IDF shape:', X.shape)

    # NMF topic modeling
    n_topics = N_TOPICS
    if X.shape[1] < n_topics:
        n_topics = max(1, X.shape[1]//2)
    # The NNDSVD-based init cannot extract more components than there are documents.
    n_topics = max(1, min(n_topics, X.shape[0]))
    nmf = NMF(n_components=n_topics, random_state=42, init='nndsvda', max_iter=400)
    W = nmf.fit_transform(X)  # doc-topic
    H = nmf.components_       # topic-term

    feature_names = np.array(vectorizer.get_feature_names_out())

    topic_keywords = []
    for t in range(H.shape[0]):
        # indices of the highest-weighted terms of topic t, best first
        top_idx = H[t].argsort()[::-1][:TOP_KEYWORDS]
        top_words = feature_names[top_idx].tolist()
        topic_keywords.append(top_words)
        all_topic_rows.append({
            'bank_code': bank,
            'topic_id': t,
            'top_keywords': ', '.join(top_words)
        })

    # simple theme name: join the leading keywords
    topic_names = [' '.join(w.replace('_', ' ') for w in kws[:THEME_NAME_KEYWORDS]) for kws in topic_keywords]

    # assign dominant topic to each review
    # NOTE(review): a review whose row of W is all zeros gets argmax 0, i.e. it is
    # filed under topic 0 with topic_confidence 0.0 - filter on the confidence if
    # that matters downstream.
    normW = normalize(W, norm='l1', axis=1)
    dominant_topic = normW.argmax(axis=1)

    # save assignments, keeping df's row index so they can be joined back by position
    assignment = pd.DataFrame({
        'review_id': sub['review_id'].to_numpy(),
        'bank_code': bank,
        'dominant_topic': dominant_topic,
        'topic_confidence': normW.max(axis=1)
    }, index=sub.index)
    # map to readable theme
    assignment['identified_theme'] = assignment['dominant_topic'].apply(lambda t: topic_names[int(t)])
    theme_assignments.append(assignment)

    print('Extracted topics:')
    for tid, words in enumerate(topic_keywords):
        print(f'  Topic {tid}:', ', '.join(words[:8]))

# Attach the theme columns to df.
# - Stale theme columns from a previous run of this cell are dropped first, so
#   re-running does not create identified_theme_x / identified_theme_y columns.
# - The join is on the row index (unique after the reset_index in the loading
#   cell), so duplicated review_id values cannot multiply rows.
if theme_assignments:
    theme_df = pd.concat(theme_assignments)
else:
    theme_df = pd.DataFrame(columns=['review_id','bank_code','dominant_topic','topic_confidence','identified_theme'], index=df.index[:0])
df = df.drop(columns=THEME_COLUMNS, errors='ignore').join(theme_df[THEME_COLUMNS])

print('\nTheme assignment complete. Example:')
display(df[['review_id','bank_code','review_text','identified_theme']].head(6))


Banks found: ['BOA', 'CBE', 'DASHN']

--- BOA ( 400 reviews ) ---


  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

TF-IDF shape: (400, 334)
Extracted topics:
  Topic 0: good, good app, good good, good bank, application, good service, improve, business
  Topic 1: app, good app, bad, bad app, bank, banking app, banking, bank app
  Topic 2: work, doesn, doesn work, update, app work, fix, need, banking
  Topic 3: nice, use, easy, nice app, mobile, banking, mobile banking, banking app

--- CBE ( 400 reviews ) ---


  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

TF-IDF shape: (400, 254)
Extracted topics:
  Topic 0: good, good app, application, update, good job, need, job, good application
  Topic 1: app, good app, cbe, update, use, nice app, bad, smart
  Topic 2: nice, nice app, app, app nice, application, nice application, like, useful
  Topic 3: well, bank, app well, work, find, thing, cbe, friendly

--- DASHN ( 400 reviews ) ---


  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

TF-IDF shape: (400, 394)
Extracted topics:
  Topic 0: good, good app, app good, version, say, good application, service, 100
  Topic 1: app, bank, good app, dashen, super, super app, dashen bank, banking
  Topic 2: nice, nice app, app, application, limit, app transaction, app see, see
  Topic 3: wow, wow dashen, dashen bank, dashen, bank, bank super, super app, super

Theme assignment complete. Example:


Unnamed: 0,review_id,bank_code,review_text,identified_theme
0,a6cbfa34-f2b1-4a16-96b6-c94f58cea76f,BOA,Very Good,good good app good good
1,fc67d12c-92e2-45aa-a9e0-011f58a583bc,BOA,goof,work doesn doesn work
2,11306fb9-5571-4950-8d32-604c5402242f,BOA,good!,good good app good good
3,809c46d2-730e-446a-9061-2a45e978ad9d,BOA,good jop,good good app good good
4,f28a3a3c-eb94-4aab-88d2-89bcecebcc7b,BOA,bad exprience...it is so crushed,app good app bad
5,4ed89e8c-16dc-4763-94ca-04d05cf799a5,BOA,not user friendly at all it requires a huge co...,app good app bad


In [7]:
# Save outputs
# Make sure the destination folders exist; Path() also accepts plain-string paths.
Path(OUT_PATH).parent.mkdir(parents=True, exist_ok=True)
Path(KEYWORDS_OUT).parent.mkdir(parents=True, exist_ok=True)

# Preferred column order; columns missing from df are simply left out.
# ('bank_name' is listed once and filtered like the others - substituting
# 'bank_code' for it would duplicate the bank_code column in the CSV.)
df_out_columns = ['review_id','review_text','rating','review_date','bank_code','bank_name','source','sentiment_score','sentiment_label','identified_theme','topic_confidence']
df_out_columns = [c for c in df_out_columns if c in df.columns]
df[df_out_columns].to_csv(OUT_PATH, index=False)
print('Saved combined sentiment+theme CSV to', OUT_PATH)

# Explicit columns keep the header row even when no topics were extracted.
pd.DataFrame(all_topic_rows, columns=['bank_code','topic_id','top_keywords']).to_csv(KEYWORDS_OUT, index=False)
print('Saved keywords/topics per bank to', KEYWORDS_OUT)


Saved combined sentiment+theme CSV to data/processed/reviews_with_sentiment_and_theme.csv
Saved keywords/topics per bank to data/processed/keyword_topics_per_bank.csv
