In [6]:
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from google_play_scraper import Sort, reviews
sys.path.append(os.path.abspath("../scripts"))
from sentiment_analysis import compute_vader_sentiment
from thematic_analysis import extract_keywords, tag_themes, assign_themes
import warnings
warnings.filterwarnings('ignore')

In [3]:
# %%
# Switch to the project root so the relative "data/..." paths used by later
# cells resolve. The guard keeps this cell idempotent: a bare os.chdir("..")
# climbs one more directory on every re-run, after which the data files can no
# longer be found.
# NOTE(review): assumes the project root is the directory that holds both
# "data" and "scripts", and that the notebook's own folder holds neither - confirm.
if not (os.path.isdir("data") and os.path.isdir("scripts")):
    os.chdir("..")  # Go up a directory

In [8]:
# Load the cleaned bank reviews; the path is relative to the current working directory.
df = pd.read_csv(filepath_or_buffer="data/cleaned_bank_reviews.csv")

# --- Sentiment Analysis ---
# Hand the reviews to the project's VADER helper and keep its result as df_sentiment.
df_sentiment = compute_vader_sentiment(df)

In [9]:
# --- Keyword Extraction + Theming ---
# Thematic Analysis
keywords_df = extract_keywords(df_sentiment)
themes_df = tag_themes(keywords_df)

# Persist both results for the later cells that read them back.
# NOTE(review): despite its file name, the first CSV is written straight from
# df_sentiment; nothing in this cell merges themes into it, and assign_themes
# is imported but not called in the visible cells - confirm whether that merge
# is still intended.
df_sentiment.to_csv("data/sentiment_themes_output.csv", index=False)
themes_df.to_csv("data/theme_keywords_by_bank.csv", index=False)

# Leave the frame as the cell's last expression so Jupyter renders it as a
# table instead of the plain-text dump produced by print().
themes_df.head(100)

      bank      keyword      score                        theme
0      CBE          app  46.385938                        Other
1      CBE         good  36.230127                        Other
2      CBE         best  19.553832                        Other
3      CBE         nice  17.068269                        Other
4      CBE          cbe  16.376945                        Other
5      CBE         bank  14.841774                        Other
6      CBE         like  13.154101                        Other
7      CBE     good app  10.981530                        Other
8      CBE        great   9.648849                        Other
9      CBE  application   9.202452                        Other
10     BOA          app  51.882815                        Other
11     BOA         good  18.223353                        Other
12     BOA         work  16.478320                        Other
13     BOA         bank  16.126075                        Other
14     BOA      working  14.224103      

In [12]:
# Read the saved sentiment results back from disk and preview the first five rows.
sentiment_csv = "data/sentiment_themes_output.csv"
df_sen = pd.read_csv(sentiment_csv)
df_sen.head(5)

Unnamed: 0,Review Description,User,Rating,Date,Bank,Source,sentiment_label,sentiment_score
0,So bad now and hard to use,Daniel Ephrem,5,2025-06-09,CBE,Google Play Store,negative,-0.6361
1,"it is so amazing app. but, it is better to upd...",abdulkerim habib,5,2025-06-09,CBE,Google Play Store,positive,0.9049
2,v.good app,Abdulhalim Bedre,4,2025-06-09,CBE,Google Play Store,neutral,0.0
3,very good app,Moha Yimer,1,2025-06-09,CBE,Google Play Store,positive,0.4927
4,Very amazing app indeed. I'm enjoying it,Puoch chuol Wath,5,2025-06-08,CBE,Google Play Store,positive,0.8173


In [14]:
# Read the saved per-bank theme keywords back from disk and preview up to 100 rows.
theme_csv = "data/theme_keywords_by_bank.csv"
df_thematic = pd.read_csv(theme_csv)
df_thematic.head(n=100)

Unnamed: 0,bank,keyword,score,theme
0,CBE,app,46.385938,Other
1,CBE,good,36.230127,Other
2,CBE,best,19.553832,Other
3,CBE,nice,17.068269,Other
4,CBE,cbe,16.376945,Other
5,CBE,bank,14.841774,Other
6,CBE,like,13.154101,Other
7,CBE,good app,10.98153,Other
8,CBE,great,9.648849,Other
9,CBE,application,9.202452,Other


In [15]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
# Use English stopwords
stop_words = stopwords.words('english')

# One shared configuration so the corpus-wide and per-bank models cannot drift apart.
TFIDF_PARAMS = dict(
    stop_words=stop_words,
    ngram_range=(1, 2),     # Unigrams and bigrams
    max_df=0.85,            # Skip terms found in more than 85% of the documents
    min_df=5,               # Only terms appearing in at least 5 reviews
)


def rank_terms(scores, names, top_n):
    """Pick the highest-scoring terms.

    Args:
        scores: 1-D array holding one score per term.
        names: Sequence of terms aligned with ``scores``.
        top_n: Maximum number of terms to return.

    Returns:
        Tuple ``(indices, pairs)``: the positions of the selected terms in
        descending score order, and the matching ``(term, score)`` pairs with
        each score rounded to 4 decimals.
    """
    order = np.argsort(scores)[::-1][:top_n]
    return order, [(names[i], round(scores[i], 4)) for i in order]


# Fit TF-IDF model on every review. Missing reviews become empty strings, the
# same treatment the per-bank loop applies; TfidfVectorizer raises on NaN input.
review_text = df['Review Description'].fillna('')
vectorizer = TfidfVectorizer(**TFIDF_PARAMS)
tfidf_matrix = vectorizer.fit_transform(review_text)
feature_names = vectorizer.get_feature_names_out()

# Get top keywords by mean TF-IDF score
mean_scores = tfidf_matrix.mean(axis=0).A1  # .A1 flattens the 1 x n_terms matrix
top_idx, top_keywords = rank_terms(mean_scores, feature_names, 30)  # Top 30 terms

# Display results
print("Top keywords and n-grams by TF-IDF score:\n")
for keyword, score in top_keywords:
    print(f"{keyword}: {score}")


# Repeat the analysis per bank with a vectorizer fitted on that bank's reviews only.
# groupby(sort=False) keeps banks in order of first appearance and skips rows
# whose bank is missing (those would otherwise form an empty, unfittable group).
bank_keywords = {}
for bank, bank_rows in df.groupby('Bank', sort=False):
    bank_reviews = bank_rows['Review Description'].fillna('')
    tfidf = TfidfVectorizer(**TFIDF_PARAMS)
    matrix = tfidf.fit_transform(bank_reviews)
    _, bank_keywords[bank] = rank_terms(
        matrix.mean(axis=0).A1, tfidf.get_feature_names_out(), 15
    )

# Print top terms per bank
for bank, keywords in bank_keywords.items():
    print(f"\nTop Keywords for {bank}:")
    for term, score in keywords:
        print(f"  {term}: {score}")

Top keywords and n-grams by TF-IDF score:

app: 0.0883
good: 0.0506
best: 0.0369
bank: 0.0328
work: 0.0237
nice: 0.0236
use: 0.0224
banking: 0.0224
one: 0.0204
like: 0.0195
fast: 0.0192
application: 0.0188
dashen: 0.0187
easy: 0.0184
amazing: 0.0174
great: 0.017
super: 0.0167
working: 0.0163
mobile: 0.0159
cbe: 0.0152
ever: 0.015
good app: 0.0148
best app: 0.0136
please: 0.0125
mobile banking: 0.0121
ነው: 0.0116
always: 0.0114
apps: 0.0111
wow: 0.0111
user: 0.0111

Top Keywords for CBE:
  app: 0.1214
  good: 0.092
  best: 0.0486
  cbe: 0.0438
  nice: 0.0426
  bank: 0.0393
  like: 0.0343
  good app: 0.0279
  great: 0.0256
  ነው: 0.0249
  fast: 0.0229
  application: 0.0227
  use: 0.0225
  easy: 0.0221
  screenshot: 0.0218

Top Keywords for BOA:
  app: 0.112
  work: 0.0477
  good: 0.0424
  bank: 0.0371
  boa: 0.0312
  working: 0.0296
  please: 0.0259
  banking: 0.0255
  worst: 0.0236
  use: 0.0232
  ever: 0.0222
  mobile: 0.0222
  application: 0.0202
  best: 0.0191
  bad: 0.0188

Top Keywor

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Belay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
