In [None]:
# 📦 Install necessary libraries
!pip install vaderSentiment nltk scikit-learn

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [None]:
# 📚 Import libraries
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Download NLTK resources
# Tokenizer data (punkt, punkt_tab) and the stopword corpus used further down.
NLTK_RESOURCES = ("punkt", "punkt_tab", "stopwords")

# Keep the status returned for every resource instead of only the last one,
# so a failed download is visible in the cell result.
nltk_download_status = {resource: nltk.download(resource) for resource in NLTK_RESOURCES}
nltk_download_status

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
#  Load your cleaned reviews file
REVIEWS_CSV = 'clean_bank_reviews.csv'
# Columns that later cells read from the loaded frame.
REQUIRED_COLUMNS = ('review', 'bank')

df = pd.read_csv(REVIEWS_CSV)

# Fail right here with a clear message when the file has the wrong schema,
# instead of a bare KeyError several cells later.
missing_columns = [column for column in REQUIRED_COLUMNS if column not in df.columns]
if missing_columns:
    raise KeyError(f"{REVIEWS_CSV} is missing required column(s): {missing_columns}")

In [None]:
#  Sentiment Analysis
analyzer = SentimentIntensityAnalyzer()

# Cut-offs on the compound score: at or above the positive threshold is
# 'positive', at or below the negative threshold is 'negative', anything in
# between is 'neutral'.
POSITIVE_THRESHOLD = 0.05
NEGATIVE_THRESHOLD = -0.05


def compound_score(text):
    """Return the analyzer's compound polarity score for ``text``.

    ``text`` is coerced with ``str()``, so non-string values (for example a
    missing review) are scored on their string representation.
    """
    return analyzer.polarity_scores(str(text))['compound']


def label_sentiment(score):
    """Map a compound score to 'positive', 'negative' or 'neutral'."""
    if score >= POSITIVE_THRESHOLD:
        return 'positive'
    if score <= NEGATIVE_THRESHOLD:
        return 'negative'
    return 'neutral'


def get_sentiment(text):
    """Score a single review.

    Parameters
    ----------
    text : object
        Review text; coerced to ``str`` before scoring.

    Returns
    -------
    pandas.Series
        Two elements: the compound score and its sentiment label.
    """
    score = compound_score(text)
    return pd.Series([score, label_sentiment(score)])


# Fill the two columns with plain element-wise maps rather than an apply that
# returns one Series per row: no per-row Series is built and expanded, and the
# assignment also works when the frame has no rows.
df['sentiment_score'] = df['review'].map(compound_score).astype(float)
df['sentiment_label'] = df['sentiment_score'].map(label_sentiment)

In [None]:
# Text Preprocessing
# Build the stopword lookup once. Evaluating stopwords.words('english') inside
# the token filter repeats that call for every single token of every review.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalise one review for keyword extraction.

    Steps: coerce to ``str`` and lowercase, delete every character that is
    neither a word character nor whitespace, tokenize, and drop English
    stopwords.

    Parameters
    ----------
    text : object
        Raw review; non-string values are converted with ``str()``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)
    # NOTE(review): punctuation is removed before stopword filtering, so a
    # contraction such as "doesn't" becomes "doesnt" and presumably no longer
    # matches the stopword list ("doesnt" shows up among the extracted
    # keywords) — confirm whether that is intended.
    return ' '.join(
        token for token in word_tokenize(text) if token not in ENGLISH_STOPWORDS
    )


df['cleaned_review'] = df['review'].apply(preprocess_text)

In [None]:
# 🔍 TF-IDF Keyword Extraction per Bank
TOP_N_KEYWORDS = 10  # number of highest-scoring terms kept per bank

themes = {}

# groupby(sort=False) visits the banks in order of first appearance and hands
# over each bank's reviews directly, without re-masking the whole frame per
# bank. Rows with a missing bank are skipped; looping over unique() would
# produce an empty corpus for them and make the vectorizer raise.
for bank, bank_reviews in df.groupby('bank', sort=False)['cleaned_review']:
    # Unigrams and bigrams; drop terms found in more than 85% of this bank's
    # reviews or in fewer than 2 of them.
    tfidf = TfidfVectorizer(ngram_range=(1, 2), max_df=0.85, min_df=2, stop_words='english')
    X = tfidf.fit_transform(bank_reviews)
    # Total TF-IDF weight of each term across all of the bank's reviews.
    term_totals = X.sum(axis=0).A1
    scores = zip(tfidf.get_feature_names_out(), term_totals)
    top_keywords = sorted(scores, key=lambda x: -x[1])[:TOP_N_KEYWORDS]
    themes[bank] = [kw for kw, _ in top_keywords]

In [None]:
# Thematic Clustering (Rule-Based)
# Ordered rules: the first theme with a matching keyword wins.
THEME_RULES = (
    ('Account Access Issues', ('login', 'access', 'password', 'otp')),
    ('Transaction Performance', ('slow', 'transfer', 'crash', 'freeze', 'loading')),
    ('User Interface', ('interface', 'ui', 'design')),
    ('Customer Support', ('support', 'help', 'response')),
    ('Feature Request', ('feature', 'update', 'add', 'new')),
)

# One compiled pattern per theme. The leading \b anchors every keyword to the
# start of a word: inflected forms such as "crashes" or "updated" still match,
# but a short keyword buried inside another word ('ui' in "quick") does not.
_THEME_PATTERNS = tuple(
    (theme, re.compile(r'\b(?:' + '|'.join(re.escape(keyword) for keyword in keywords) + r')'))
    for theme, keywords in THEME_RULES
)


def assign_theme(text):
    """Return the rule-based theme for one review.

    Parameters
    ----------
    text : object
        Review text; coerced with ``str()`` and lowercased before matching.

    Returns
    -------
    str
        The first theme in ``THEME_RULES`` that has a keyword starting a word
        in the text, or ``'Other'`` when no rule matches.
    """
    text = str(text).lower()
    for theme, pattern in _THEME_PATTERNS:
        if pattern.search(text):
            return theme
    return 'Other'

# Label every cleaned review with its rule-based theme.
df['theme'] = df['cleaned_review'].map(assign_theme)

In [None]:
# Save Annotated CSV
# Single source of truth for the output path, so the file that is written and
# the path shown in the confirmation message cannot drift apart.
ANNOTATED_CSV = 'annotated_reviews_task2.csv'

# index=False leaves the frame's row index out of the file.
df.to_csv(ANNOTATED_CSV, index=False)
print(f"✅ Task 2 completed: Saved as '{ANNOTATED_CSV}'")

✅ Task 2 completed: Saved as 'annotated_reviews_task2.csv'


In [None]:
#  Show theme summary
# Count reviews for every (bank, theme, sentiment) combination ...
review_counts = df.groupby(['bank', 'theme', 'sentiment_label']).size()
# ... then spread the sentiment labels into columns, with 0 for absent combinations.
review_counts.unstack(fill_value=0)

Unnamed: 0_level_0,sentiment_label,negative,neutral,positive
bank,theme,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BOA,Account Access Issues,7,2,4
BOA,Customer Support,1,0,6
BOA,Feature Request,5,4,4
BOA,Other,85,135,107
BOA,Transaction Performance,17,11,14
BOA,User Interface,1,3,4
CBE,Account Access Issues,1,1,6
CBE,Customer Support,0,0,5
CBE,Feature Request,2,7,5
CBE,Other,28,99,211


In [None]:
# ✅ KPI CHECK 2 — Themes per bank
# Number of distinct theme labels assigned within each bank.
themes_per_bank = df.groupby('bank')['theme'].nunique()

print("\n🔍 Number of themes per bank (unique themes used):")
print(themes_per_bank)


🔍 Number of themes per bank (unique themes used):
bank
BOA       6
CBE       6
Dashen    6
Name: theme, dtype: int64


In [None]:
#  Example: show the keywords stored for each bank in `themes`
print("\n Top Keywords Extracted Per Bank:")
for bank, bank_keywords in themes.items():
    print(f"\n{bank}:")
    # Comma-separated on one line.
    print(*bank_keywords, sep=", ")


 Top Keywords Extracted Per Bank:

CBE:
app, good, best, nice, cbe, bank, like, good app, great, application

BOA:
app, good, work, bank, boa, working, doesnt, banking, worst, mobile

Dashen:
app, best, dashen, bank, super, good, banking, dashen bank, amazing, use
