In [None]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
import re
import os

# Make sure the NLTK English stop-word corpus is available locally.
nltk.download('stopwords')


# Raw reviews to analyse; the rest of the script reads its 'review' and
# 'bank' columns.
reviews_csv = "../data/bank_reviews.csv"
df = pd.read_csv(reviews_csv)

# VADER analyzer used to score each review's polarity.
analyzer = SentimentIntensityAnalyzer()

def get_sentiment_label(score):
    """Bucket a compound sentiment score into a coarse label.

    Args:
        score: Numeric sentiment score (the script passes VADER's
            ``compound`` value).

    Returns:
        ``'positive'`` when ``score >= 0.05``, ``'negative'`` when
        ``score <= -0.05``, and ``'neutral'`` for everything in between
        (including NaN, which fails both comparisons).
    """
    if score >= 0.05:
        return 'positive'
    return 'negative' if score <= -0.05 else 'neutral'

# Score every review with VADER's compound polarity, then bucket each score
# into a positive / negative / neutral label.
df['sentiment_score'] = [
    analyzer.polarity_scores(str(text))['compound'] for text in df['review']
]
df['sentiment_label'] = df['sentiment_score'].map(get_sentiment_label)


# English stop words, kept as a set for fast membership tests in preprocess().
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Normalise a review into a space-separated string of content words.

    The input is coerced to ``str`` and lower-cased, every character that is
    not ``a``-``z`` or whitespace is deleted, and the remainder is split on
    whitespace. Tokens that are stop words or have two characters or fewer
    are dropped.

    Args:
        text: Raw review text (any object; it is converted with ``str``).

    Returns:
        The surviving tokens joined by single spaces (possibly empty).
    """
    lowered = str(text).lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)

    kept = []
    for token in letters_only.split():
        # Skip very short tokens and stop words.
        if len(token) <= 2 or token in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

# Cleaned text (lower-cased, letters only, stop words removed) for vectorising.
df['clean_review'] = df['review'].apply(preprocess)

# TF-IDF vectorizer to extract keywords/ngrams: unigrams and bigrams, with the
# vocabulary capped at 100 features.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=100)
tfidf_matrix = vectorizer.fit_transform(df['clean_review'])


keywords = vectorizer.get_feature_names_out()


# For each bank, the 10 terms with the highest mean TF-IDF weight across that
# bank's reviews.
bank_keywords = {}
for bank in df['bank'].dropna().unique():  # a missing bank name selects no rows
    # tfidf_matrix rows are positional (row i == i-th review), so pick rows by
    # position rather than by index label; labels only coincide with
    # positions while df keeps its default 0..n-1 index.
    idxs = (df['bank'] == bank).to_numpy().nonzero()[0]
    bank_tfidf = tfidf_matrix[idxs, :].mean(axis=0).A1  # flatten 1xN matrix to 1-D
    top_indices = bank_tfidf.argsort()[::-1][:10]  # descending by weight
    bank_keywords[bank] = [keywords[i] for i in top_indices]

# Hand-picked theme -> keyword lists, per bank. A review is tagged with a
# theme when it contains a word starting with one of the theme's keywords.
themes = {
    "CBE": {
        "Account Access Issues": ["login", "password", "error", "account", "fail"],
        "Transaction Performance": ["slow", "transfer", "loading", "payment", "delay"],
        "User Interface": ["ui", "interface", "easy", "design", "navigation"]
    },
    "Dashen": {
        "Performance": ["slow", "loading", "crash", "lag", "freeze"],
        "Support": ["customer", "support", "help", "service"],
        "Features": ["fingerprint", "login", "feature", "update", "option"]
    },
    "BOA": {
        "App Stability": ["crash", "bug", "error", "freeze", "unresponsive"],
        "Transaction Issues": ["transfer", "payment", "delay", "fail", "processing"],
        "User Experience": ["ui", "design", "easy", "interface", "simple"]
    }
}


# Function to assign themes to reviews based on presence of theme keywords
def assign_themes(review, bank):
    """Return the comma-separated themes whose keywords occur in a review.

    Matching is case-insensitive and anchored at the start of a word, so a
    keyword also matches its inflections ("crash" -> "crashes", "fail" ->
    "failed") but not an unrelated word that merely contains it ("ui" no
    longer matches "quick", "lag" no longer matches "flag").

    Args:
        review: Review text. Coerced with ``str`` so missing values (NaN/None)
            do not raise, matching how the rest of the script treats reviews.
        bank: Bank name used to look up the keyword table in ``themes``.

    Returns:
        Matching theme names joined by ``", "`` in table order, or ``"Other"``
        when nothing matches or the bank has no entry in ``themes``.
    """
    text = str(review).lower()
    assigned = []
    # Unknown banks have no keyword table and fall through to "Other".
    for theme, keywords_list in themes.get(bank, {}).items():
        if not keywords_list:
            continue  # an empty alternation would match every word boundary
        pattern = r"\b(?:" + "|".join(re.escape(kw) for kw in keywords_list) + ")"
        if re.search(pattern, text):
            assigned.append(theme)
    if not assigned:
        assigned.append("Other")
    return ", ".join(assigned)

# Tag every review with its themes. Iterating the two columns directly avoids
# row-wise DataFrame.apply, which is slow and does not return a Series when
# the frame is empty.
df['themes'] = [
    assign_themes(review, bank)
    for review, bank in zip(df['review'], df['bank'])
]

# Save results CSV
# NOTE(review): the input is read from "../data" but results are written to
# "data" under the current working directory — confirm this is intended.
output_dir = "data"
output_path = f"{output_dir}/reviews_with_sentiment_themes.csv"
os.makedirs(output_dir, exist_ok=True)
df.to_csv(output_path, index=False)

print(f"✅ Analysis complete. Saved to {output_path}")
print("\nSample theme keywords by bank:")
for bank, kws in bank_keywords.items():
    print(f"{bank}: {kws}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bezat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


✅ Analysis complete. Saved to data/reviews_with_sentiment_themes.csv

Sample theme keywords by bank:
CBE: ['app', 'good', 'best', 'nice', 'cbe', 'like', 'bank', 'great', 'good app', 'screenshot']
Dashen: ['app', 'dashen', 'super', 'best', 'bank', 'banking', 'dashen bank', 'one', 'amazing', 'good']
BOA: ['app', 'work', 'bank', 'good', 'boa', 'working', 'worst', 'please', 'banking', 'use']


In [6]:
# Show the distinct bank names present in the dataset.
bank_names = df['bank'].unique()
print(bank_names)


['CBE' 'Dashen' 'BOA']
