# Thematic Analysis Notebook

This notebook assigns sentiment scores and extracts actionable themes from app reviews. It expects the preprocessed CSV at `Data/processed/reviews_processed.csv`. Outputs are saved to `Data/processed/reviews_final.csv` and `Data/processed/theme_examples.json`.

In [20]:
# (Optional) Install dependencies - run once in this environment
# !pip install -q scikit-learn nltk spacy tqdm matplotlib seaborn wordcloud
# !python -m spacy download en_core_web_sm -q
print('Skip installs if already available')

Skip installs if already available


In [25]:
# Imports and setup
import re
import json
from pathlib import Path
import pandas as pd
import numpy as np
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')

# Try to use project config for data paths when available
try:
    from Scripts.config import DATA_PATHS
    candidate = DATA_PATHS.get('processed_reviews')
except Exception:
    candidate = None

# Candidate paths to search (absolute-resolved later)
candidates = [candidate, 'Data/processed/reviews_processed.csv', 'data/processed/reviews_processed.csv']
searched = []
src = None
# Try multiple base directories so notebook works when running from `notebooks/` or repo root
bases = [Path.cwd(), Path.cwd().parent, Path.cwd().parent.parent, Path.cwd().parent.parent.parent]
for c in candidates:
    if not c:
        continue
    p = Path(c)
    # If absolute, check directly
    if p.is_absolute():
        searched.append(str(p))
        if p.exists():
            src = p
            break
    else:
        # Try resolving relative to several likely base directories
        for base in bases:
            candp = base / p
            searched.append(str(candp))
            if candp.exists():
                src = candp
                break
        if src:
            break
# As a final guard, look for Data/processed under upward parents of cwd
if src is None:
    for base in bases:
        repo_try = base / 'Data' / 'processed' / 'reviews_processed.csv'
        searched.append(str(repo_try))
        if repo_try.exists():
            src = repo_try
            break
if src is None:
    raise FileNotFoundError('Could not find processed reviews CSV. Searched:\n' + '\n'.join(searched))

print('Loading processed reviews from:', src)
df = pd.read_csv(src)
print(f'Loaded {len(df):,} reviews')

Loading processed reviews from: c:\Users\hp\Desktop\10  Academy\week 2\Assigniments\Customer-Experience-Analytics-for-Fintech-Apps\Data\processed\reviews_processed.csv
Loaded 2,100 reviews


## Sentiment analysis (VADER)

In [26]:
# Ensure resources
nltk.download('vader_lexicon', quiet=True)
nltk.download('stopwords', quiet=True)
sia = SentimentIntensityAnalyzer()

def vader_score(text):
    try:
        return sia.polarity_scores(str(text))['compound']
    except Exception:
        return 0.0

df['sentiment_score'] = df['review_text'].apply(vader_score)
df['sentiment_label'] = df['sentiment_score'].apply(lambda s: 'positive' if s>=0.05 else ('negative' if s<=-0.05 else 'neutral'))
df['sentiment_label'].value_counts()

sentiment_label
positive    1231
neutral      591
negative     278
Name: count, dtype: int64

## Preprocessing and lemmatization

In [27]:
# Clean text function
def clean_text(t):
    if pd.isna(t):
        return ''
    s = str(t)
    s = re.sub(r'http\S+|www\.\S+', ' ', s)
    s = re.sub(r'[^\x00-\x7F]+', ' ', s)
    s = re.sub(r'\s+', ' ', s).strip().lower()
    return s

df['clean_text'] = df['review_text'].apply(clean_text)

# Try spaCy lemmatization if available
try:
    import spacy
    nlp = spacy.load('en_core_web_sm')
    def lemmatize_texts(texts):
        docs = list(nlp.pipe(texts, batch_size=64))
        out = []
        for doc in docs:
            tokens = [token.lemma_.lower() for token in doc if token.is_alpha and not token.is_stop]
            out.append(' '.join(tokens))
        return out
except Exception:
    from nltk.corpus import stopwords
    sw = set(stopwords.words('english'))
    def lemmatize_texts(texts):
        out = []
        for s in texts:
            tokens = re.findall(r'\b[a-z]{2,}\b', s)
            tokens = [w for w in tokens if w not in sw]
            out.append(' '.join(tokens))
        return out

df['lemmatized'] = lemmatize_texts(df['clean_text'].fillna('').tolist())
df[['review_text','clean_text','lemmatized']].head()

Unnamed: 0,review_text,clean_text,lemmatized
0,üôèüëç,,
1,Very Good,very good,good
2,goof,goof,goof
3,good!,good!,good
4,good jop,good jop,good jop


## TF-IDF keyword extraction per bank

In [28]:
def top_tfidf(series, n=20, ngram_range=(1,2)):
    try:
        vec = TfidfVectorizer(max_features=500, ngram_range=ngram_range, token_pattern=r'\b[a-z]{2,}\b')
        X = vec.fit_transform(series.fillna(''))
        sums = X.sum(axis=0).A1
        terms = vec.get_feature_names_out()
        top_idx = sums.argsort()[::-1][:n]
        return [terms[i] for i in top_idx]
    except Exception:
        return []

bank_keywords = {}
for bank, sub in df.groupby('bank_name'):
    kws = top_tfidf(sub['lemmatized'], n=25)
    bank_keywords[bank] = kws
    print(f'Bank: {bank} ‚Üí ' + ', '.join(kws[:12]))

Bank: Bank of Abyssinia ‚Üí good, app, work, bad, nice, bank, boa, well, good app, mobile, banking, bad app
Bank: Commercial Bank of Ethiopia ‚Üí good, app, good app, nice, well, excellent, cbe, ok, bank, update, like, use
Bank: Dashen Bank ‚Üí good, app, nice, bank, dashen, super, wow, fast, good app, easy, work, banking


## Rule-based theme assignment

In [29]:
THEME_KEYWORDS = {
    'Account Access Issues': ['login','otp','password','pin','sign in','blocked','access'],
    'Performance & Reliability': ['slow','lag','crash','error','not working','freeze','hang','failed'],
    'User Interface & Experience': ['ui','user friendly','navigation','design','confusing','layout'],
    'Transactions & Payments': ['transfer','payment','deposit','withdraw','transaction','balance'],
    'Customer Support': ['support','service','response','contact','ignored','help']
}

def assign_theme(text):
    t = str(text).lower()
    hits = []
    for theme, kws in THEME_KEYWORDS.items():
        for kw in kws:
            if kw in t:
                hits.append(theme)
                break
    if not hits:
        return ['Other']
    return list(dict.fromkeys(hits))

df['identified_themes'] = df['lemmatized'].apply(assign_theme)
from collections import Counter
c = Counter([t for row in df['identified_themes'] for t in row])
print('Top themes (counts):', c.most_common(10))

Top themes (counts): [('Other', 1677), ('Transactions & Payments', 156), ('Performance & Reliability', 129), ('Customer Support', 109), ('User Interface & Experience', 98), ('Account Access Issues', 77)]


In [30]:
# Save final file and theme examples
out = Path('Data/processed/reviews_final.csv')
out.parent.mkdir(parents=True, exist_ok=True)
cols = ['review_id','review_text','rating','review_date','bank_name','sentiment_label','sentiment_score','identified_themes']
present = [c for c in cols if c in df.columns]
df.to_csv(out, columns=present, index=False)
print(f'Saved final CSV to: {out}')
# theme examples
theme_examples = {}
for theme in set([t for row in df['identified_themes'] for t in row]):
    theme_examples[theme] = []
for _, r in df.iterrows():
    for t in r['identified_themes']:
        if len(theme_examples[t]) < 5:
            theme_examples[t].append({'review_id': r.get('review_id'), 'text': r.get('review_text')})
with open('Data/processed/theme_examples.json','w',encoding='utf-8') as fh:
    json.dump(theme_examples, fh, ensure_ascii=False, indent=2)
print('Saved theme examples to Data/processed/theme_examples.json')

Saved final CSV to: Data\processed\reviews_final.csv
Saved theme examples to Data/processed/theme_examples.json
Saved theme examples to Data/processed/theme_examples.json


---
Notes:
- This notebook expects `Data/processed/reviews_processed.csv` (matching repo).
- If you prefer `data/` lowercase, change `candidate` or rename the folder.
- To improve themes, update `THEME_KEYWORDS` or add a clustering step.