In [13]:
import os
from google_play_scraper import Sort, reviews
import pandas as pd
from tqdm import tqdm
from langdetect import detect
from deep_translator import GoogleTranslator
from transformers import pipeline
import os
import time
from langdetect.lang_detect_exception import LangDetectException

from collections import Counter

In [14]:

# Load your preprocessed review file
df = pd.read_csv("../data/reviews_with_sentimentfinal.csv")
df.head()

Unnamed: 0,review,rating,date,bank,source,translated_review,lang_detected,sentiment_label,sentiment_score
0,"""Why don’t your ATMs support account-to-accoun...",4,2025-06-06 09:54:11,Cbe,Google Play,"""Why don’t your ATMs support account-to-accoun...",en,NEGATIVE,0.996465
1,what is this app problem???,1,2025-06-05 22:16:56,Cbe,Google Play,what is this app problem???,en,NEGATIVE,0.999623
2,the app is proactive and a good connections.,5,2025-06-05 15:55:10,Cbe,Google Play,the app is proactive and a good connections.,en,POSITIVE,0.999868
3,I cannot send to cbebirr app. through this app.,3,2025-06-05 11:12:49,Cbe,Google Play,I cannot send to cbebirr app. through this app.,en,NEGATIVE,0.995335
4,not functional,1,2025-06-05 07:38:12,Cbe,Google Play,not functional,en,NEGATIVE,0.999779


In [15]:
#Step 2: Perform Sentiment Analysis
from transformers import pipeline

# Load sentiment analysis pipeline
sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# Apply it on a sample (for speed, you can scale up later)
def analyze_sentiment(text):
    try:
        result = sentiment_pipeline(text[:512])[0]  # Truncate long texts
        return result['label'], result['score']
    except:
        return 'ERROR', 0.0

# Apply to your reviews
df[['sentiment_label', 'sentiment_score']] = df['review'].apply(lambda x: pd.Series(analyze_sentiment(str(x))))

Device set to use cpu


In [16]:
#Step 3: Aggregate Sentiment
# Group by bank and rating
sentiment_summary = df.groupby(['bank', 'rating'])[['sentiment_score']].mean().reset_index()
print(sentiment_summary)

        bank  rating  sentiment_score
0   Absiniya       1         0.989370
1   Absiniya       2         0.964561
2   Absiniya       3         0.983056
3   Absiniya       4         0.963430
4   Absiniya       5         0.977867
5        Cbe       1         0.994166
6        Cbe       2         0.998415
7        Cbe       3         0.986878
8        Cbe       4         0.956228
9        Cbe       5         0.984998
10    Dashin       1         0.998587
11    Dashin       2         0.986041
12    Dashin       3         0.997215
13    Dashin       4         0.994640
14    Dashin       5         0.991836


In [20]:
#Step 4: Keyword & Theme Extraction (TF-IDF)
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import nltk
import re
import spacy
stop_words = set(stopwords.words("english"))
nlp = spacy.load("en_core_web_sm")

stop_words = set(stopwords.words('english'))


In [22]:
# -------------------- SETUP SENTIMENT ANALYSIS --------------------
# Load sentiment analysis pipeline using DistilBERT
sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# Apply sentiment model to review text (truncate if too long)
def analyze_sentiment(text):
    try:
        result = sentiment_pipeline(text[:512])[0]
        return result['label'], result['score']
    except:
        return 'ERROR', 0.0

# Run sentiment analysis on the review column
df[['sentiment_label', 'sentiment_score']] = df['review'].astype(str).apply(lambda x: pd.Series(analyze_sentiment(x)))

# -------------------- AGGREGATE SENTIMENT BY BANK + RATING --------------------
# Calculate mean sentiment scores grouped by bank and rating
sentiment_summary = df.groupby(['bank', 'rating'])[['sentiment_score']].mean().reset_index()
print("Sentiment Summary:\n", sentiment_summary.head())

# -------------------- TEXT CLEANING FOR KEYWORD EXTRACTION --------------------
# Clean and normalize text
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"[^a-z\s]", " ", text)
    return text

df['cleaned_review'] = df['review'].astype(str).apply(clean_text)

# -------------------- KEYWORD EXTRACTION USING TF-IDF --------------------
# Use TF-IDF to identify important words/phrases
def extract_keywords(text_series, max_features=30):
    vectorizer = TfidfVectorizer(stop_words='english', max_features=max_features, ngram_range=(1, 2))
    tfidf_matrix = vectorizer.fit_transform(text_series)
    features = vectorizer.get_feature_names_out()
    scores = tfidf_matrix.sum(axis=0).A1
    keywords_df = pd.DataFrame({'keyword': features, 'score': scores})
    return keywords_df.sort_values(by='score', ascending=False)

# Example: Extract top keywords for CBE
keywords_cbe = extract_keywords(df[df['bank'] == 'CBE']['cleaned_review'])
print("\nTop Keywords for CBE:\n", keywords_cbe.head(10))

# -------------------- GROUP KEYWORDS INTO THEMES --------------------
# Manual theme mapping based on observed keywords
theme_groups = {
    "Login Issues": ["login", "access", "account", "password"],
    "Slow Performance": ["slow", "transfer", "load", "delay"],
    "Good UX": ["ui", "interface", "design", "navigation"],
    "Support Complaints": ["support", "help", "response", "contact"],
    "Feature Requests": ["feature", "add", "fingerprint", "notification"]
}

# -------------------- KEYWORD EXTRACTION PER BANK (Safe Version) --------------------
def extract_keywords(text_series, max_features=30):
    if text_series.isnull().all() or text_series.str.strip().str.len().sum() == 0:
        raise ValueError("No valid text to process.")
    vectorizer = TfidfVectorizer(stop_words='english', max_features=max_features, ngram_range=(1, 2))
    tfidf_matrix = vectorizer.fit_transform(text_series)
    features = vectorizer.get_feature_names_out()
    scores = tfidf_matrix.sum(axis=0).A1
    keywords_df = pd.DataFrame({'keyword': features, 'score': scores})
    return keywords_df.sort_values(by='score', ascending=False)

# -------------------- PROCESS ALL BANKS --------------------
print("\nTop Keywords by Bank:")
for bank_name in df['bank'].dropna().unique():
    subset = df[df['bank'] == bank_name]
    
    # Filter out too-short reviews
    valid_reviews = subset['cleaned_review'].dropna()
    valid_reviews = valid_reviews[valid_reviews.str.strip().str.len() > 10]

    if len(valid_reviews) == 0:
        print(f"\n⚠️ Skipped '{bank_name}' — no valid reviews to process.")
        continue
    
    try:
        keywords = extract_keywords(valid_reviews)
        print(f"\n🔹 {bank_name}:\n", keywords.head(10))
    except ValueError:
        print(f"\n⚠️ Skipped '{bank_name}' — empty vocabulary after cleaning.")

Device set to use cpu


Sentiment Summary:
        bank  rating  sentiment_score
0  Absiniya       1         0.989370
1  Absiniya       2         0.964561
2  Absiniya       3         0.983056
3  Absiniya       4         0.963430
4  Absiniya       5         0.977867


ValueError: empty vocabulary; perhaps the documents only contain stop words