In [1]:

### In[2]:

# Cell 1: Setup and Imports

import pandas as pd
import numpy as np
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import sys
import warnings

# Suppress warnings for a cleaner notebook output
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', 150)

print("Libraries imported successfully.")

  from .autonotebook import tqdm as notebook_tqdm


Libraries imported successfully.


In [2]:
# Cell 2: Load Project Configuration
# This cell imports the configuration from your `config.py` file.
# It adjusts the system path to find the config file at the project root.

try:
    # Add the project root directory (the parent of the current working
    # directory) to the Python path so that `import config` can be resolved.
    # The membership check keeps the cell idempotent: re-running it must not
    # keep appending the same directory to sys.path.
    project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
    if project_root not in sys.path:
        sys.path.append(project_root)
    import config
    print("Configuration file 'config.py' loaded successfully.")
    print(f"Processed data directory: {config.PROCESSED_DATA_DIR}")
except ImportError as import_error:
    print("ERROR: config.py not found.")
    print("Please ensure this notebook is in a 'notebooks' directory and config.py is in the project root.")
    # An ImportError can also originate from an import made inside config.py,
    # so show the underlying message instead of hiding it.
    print(f"Underlying import error: {import_error}")
    # Later cells test `if config:` to decide whether they can run.
    config = None

Configuration file 'config.py' loaded successfully.
Processed data directory: data/processed/


In [3]:
# Cell 3: Download NLTK Resources and Initialize
# This function checks for necessary NLTK resources and downloads them if missing.

def download_nltk_resources():
    """Make sure the NLTK resources used by this notebook are available.

    Each required resource is looked up with ``nltk.data.find`` and is
    downloaded with ``nltk.download`` only when the lookup fails.

    Both the plain resource path and the same path with a ``.zip`` suffix are
    probed. The recorded output of this cell shows ``wordnet`` and ``omw-1.4``
    being reported as "not found" while NLTK itself answers "already
    up-to-date", i.e. the plain-path lookup misses packages that are present
    on disk (presumably because they are stored only as zip archives).

    Returns:
        None. Progress is reported with ``print``.
    """
    resources = {
        "stopwords": "corpora/stopwords",
        "punkt": "tokenizers/punkt",
        "wordnet": "corpora/wordnet",
        "omw-1.4": "corpora/omw-1.4"
    }
    for resource_name, resource_path in resources.items():
        already_installed = False
        # Try the unpacked location first, then the zipped archive.
        for candidate_path in (resource_path, resource_path + ".zip"):
            try:
                nltk.data.find(candidate_path)
            except LookupError:
                continue
            already_installed = True
            break

        if already_installed:
            print(f"NLTK resource '{resource_name}' already downloaded.")
        else:
            print(f"NLTK resource '{resource_name}' not found. Downloading...")
            nltk.download(resource_name)

# Fetch any missing NLTK data before the objects below try to read it.
download_nltk_resources()

# Module-level NLP helpers shared by the text-processing functions defined
# in later cells.
lemmatizer = WordNetLemmatizer()
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

print("\nNLTK resources ready.")

NLTK resource 'stopwords' already downloaded.
NLTK resource 'punkt' already downloaded.
NLTK resource 'wordnet' not found. Downloading...
NLTK resource 'omw-1.4' not found. Downloading...

NLTK resources ready.


[nltk_data] Downloading package wordnet to /home/fentahun/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/fentahun/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [4]:
# Cell 4: Load Sentiment Analysis Model
# Loads the pre-trained sentiment analysis model from Hugging Face specified in the config.

# `sentiment_classifier` stays None unless the pipeline below is created
# successfully; later cells treat None as "no model available".
sentiment_classifier = None

if not config:
    print("Skipping model loading because config was not found.")
else:
    print("Loading sentiment analysis model...")
    try:
        sentiment_classifier = pipeline(
            "sentiment-analysis",
            model=config.SENTIMENT_MODEL_NAME,
            device=-1  # Use -1 for CPU, 0 for CUDA GPU if available
        )
    except Exception as e:
        # Loading is best-effort: report the failure and carry on without a model.
        print(f"Error loading sentiment model ({config.SENTIMENT_MODEL_NAME}): {e}")
        print("Sentiment analysis might be impacted or skipped.")
    else:
        print("Sentiment model loaded successfully.")

Loading sentiment analysis model...


Device set to use cpu


Sentiment model loaded successfully.


In [5]:
# Cell 5: Define Analysis Helper Functions
# These functions will be used to process the data in the subsequent steps.

def get_sentiment_batch(text_batch, max_chars=510):
    """Classify the sentiment of a batch of texts with the loaded pipeline.

    Args:
        text_batch: Iterable of review texts. ``None`` and NaN entries are
            treated as empty strings; any other non-string value is converted
            with ``str``.
        max_chars: Maximum number of characters of each text passed to the
            model (default 510, the value originally hard-coded here).

    Returns:
        A list with one ``{'label': ..., 'score': ...}`` dict per input text:
        * the pipeline's own predictions when it runs successfully;
        * ``NEUTRAL`` / ``0.0`` placeholders when no model is loaded;
        * ``ERROR`` / ``0.0`` placeholders when prediction raises.
        Placeholder dicts are independent objects, so mutating one entry does
        not affect the others.
    """
    texts = list(text_batch)

    if sentiment_classifier is None:
        return [{'label': 'NEUTRAL', 'score': 0.0} for _ in texts]
    if not texts:
        # Nothing to classify; avoid calling the model with an empty batch.
        return []

    try:
        truncated_batch = [
            "" if text is None or (isinstance(text, float) and text != text)
            else str(text)[:max_chars]
            for text in texts
        ]
        # The character cap above does not bound the number of tokens, so
        # also ask the pipeline's tokenizer to truncate over-long inputs.
        return sentiment_classifier(truncated_batch, truncation=True)
    except Exception as e:
        # Best-effort: a failing batch is labelled ERROR instead of aborting the run.
        print(f"Error during sentiment prediction: {e}")
        return [{'label': 'ERROR', 'score': 0.0} for _ in texts]

def preprocess_nlp_text(text):
    """Turn a review into a space-separated string of lemmatized tokens.

    The text is lower-cased, digit runs and every character that is neither a
    word character nor whitespace are removed, and the result is tokenized.
    Tokens that are stop words or have two characters or fewer are dropped;
    the remaining tokens are lemmatized. Missing values (as judged by
    ``pd.isna``) produce an empty string.
    """
    if pd.isna(text):
        return ""

    lowered = str(text).lower()
    without_digits = re.sub(r'\d+', '', lowered)
    without_punctuation = re.sub(r'[^\w\s]', '', without_digits)

    lemmas = []
    for token in word_tokenize(without_punctuation):
        # Skip very short tokens and stop words before lemmatizing.
        if len(token) <= 2 or token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))

    return " ".join(lemmas)

def extract_themes_from_text_v2(nlp_text, original_text):
    """Identify review themes by matching the keywords listed in the config.

    Args:
        nlp_text: The NLP-preprocessed review text.
        original_text: The original review text; it is lower-cased before
            searching.

    Returns:
        An alphabetically sorted list of every theme in
        ``config.THEMES_KEYWORDS`` that has at least one keyword occurring as
        a whole word (case-insensitively) in either text. Returns
        ``['General Feedback']`` when nothing matches or no config is loaded.
    """
    def _as_search_text(value):
        # Missing values must contribute nothing; str() would otherwise add
        # the literal words "None" / "nan" to the text being searched.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    search_text = (_as_search_text(nlp_text) + " " + _as_search_text(original_text).lower()).strip()

    identified_themes = set()
    if config:
        for theme, keywords in config.THEMES_KEYWORDS.items():
            # any() stops at the first matching keyword for this theme.
            if any(
                re.search(r'\b' + re.escape(keyword) + r'\b', search_text, re.IGNORECASE)
                for keyword in keywords
            ):
                identified_themes.add(theme)

    # Sort so the result does not depend on set iteration order.
    return sorted(identified_themes) if identified_themes else ['General Feedback']

print("Helper functions defined.")

Helper functions defined.


In [6]:
# Cell 6: Load Cleaned Data from File
# This cell loads the preprocessed data from the path specified in `config.py`.
# It assumes `preprocess.py` has already been run.

if config:
    # Input produced by the preprocessing step, and the output path that the
    # final cell of this notebook writes to.
    cleaned_file_path = os.path.join(config.PROCESSED_DATA_DIR, "all_banks_reviews_cleaned.csv")
    analyzed_file_path = os.path.join(config.PROCESSED_DATA_DIR, "all_banks_reviews_analyzed.csv")

    if not os.path.exists(cleaned_file_path):
        print(f"Cleaned data file not found: {cleaned_file_path}")
        print("Please run the `scripts/preprocess.py` script first.")
    else:
        df = pd.read_csv(cleaned_file_path)
        # Ensure cleaned_review_text is a string column with no missing values.
        # fillna must run BEFORE astype(str): astype(str) turns NaN into the
        # literal text "nan", after which fillna has nothing left to replace
        # and empty reviews would be analysed as the word "nan".
        df['cleaned_review_text'] = df['cleaned_review_text'].fillna('').astype(str)
        print(f"Loaded {len(df)} cleaned reviews for analysis.")
        display(df.head())
else:
    print("Skipping data loading because config was not found.")

Loaded 5413 cleaned reviews for analysis.


Unnamed: 0,review_id,cleaned_review_text,rating,date,bank_name,source,user_name,thumbs_up_count,app_version,review_text
0,54f070b9-895f-40e8-be18-acb16f8af7fa,best mobile banking app ever,5,2025-06-04,Commercial Bank of Ethiopia,Google Play,A Google user,0,5.0.4,Best Mobile Banking app ever
1,9af95d2a-e42c-45ff-a4c9-5c38765df4a4,it was good app but it have some issues like it doesnt give me the right amount that i have in the bank and have some issues in transferring,2,2025-06-04,Commercial Bank of Ethiopia,Google Play,A Google user,0,5.1.0,it was good app but it have some issues like it doesnt give me the right amount that I have in the bank and have some issues in transferring
2,fce00cda-d71a-486e-a4c2-7479ab7793bc,best app of finance,5,2025-06-04,Commercial Bank of Ethiopia,Google Play,A Google user,0,,best app of finance
3,2470d199-834a-4134-a0f9-8c684ba75491,engida kebede fetera,5,2025-06-03,Commercial Bank of Ethiopia,Google Play,A Google user,0,5.1.0,Engida Kebede Fetera
4,428e5117-387f-4c9d-b095-3230e3f83a8a,it is not safety,1,2025-06-03,Commercial Bank of Ethiopia,Google Play,A Google user,0,5.1.0,it is not safety


In [7]:
# Cell 7: Perform and Interpret Sentiment Analysis
# Runs the sentiment analysis model on the review texts in batches.
# It then adjusts the final sentiment label based on the star rating for better accuracy.

if 'df' in locals() and sentiment_classifier:
    print("Performing sentiment analysis...")
    batch_size = 64
    sentiments_data = []
    num_batches = (len(df) - 1) // batch_size + 1

    # Score the reviews batch by batch; .iloc makes the slice explicitly
    # positional regardless of the DataFrame's index labels.
    for i in range(0, len(df), batch_size):
        batch_texts = df['cleaned_review_text'].iloc[i:i+batch_size].tolist()
        batch_sentiments = get_sentiment_batch(batch_texts)
        sentiments_data.extend(batch_sentiments)
        print(f"  Processed sentiment batch {i//batch_size + 1}/{num_batches}")

    df['sentiment_label_raw'] = [s['label'] for s in sentiments_data]
    df['sentiment_score_raw'] = [s['score'] for s in sentiments_data]

    # Define the interpretation function to adjust sentiment based on rating
    def interpret_sentiment(row):
        """Combine the model's raw prediction with the star rating.

        Args:
            row: Mapping (DataFrame row or dict) providing
                'sentiment_label_raw', 'sentiment_score_raw' and 'rating'.

        Returns:
            Tuple ``(final_label, score_raw, numeric_sentiment)`` where
            ``final_label`` is 'POSITIVE', 'NEGATIVE' or 'NEUTRAL' and
            ``numeric_sentiment`` is 1, -1 or 0 respectively.
        """
        score_raw, rating = row['sentiment_score_raw'], row['rating']
        # Compare labels case-insensitively so a model that reports
        # 'positive' / 'negative' is not silently mapped to NEUTRAL.
        label_raw = str(row['sentiment_label_raw']).upper()

        final_label, numeric_sentiment = 'NEUTRAL', 0
        if label_raw == 'POSITIVE':
            final_label, numeric_sentiment = 'POSITIVE', 1
        elif label_raw == 'NEGATIVE':
            final_label, numeric_sentiment = 'NEGATIVE', -1

        # Override logic: the star rating wins when it contradicts the model,
        # and a 3-star rating is always treated as neutral.
        if rating == 3:
            final_label, numeric_sentiment = 'NEUTRAL', 0
        elif rating <= 2 and final_label == 'POSITIVE':
            final_label, numeric_sentiment = 'NEGATIVE', -1
        elif rating >= 4 and final_label == 'NEGATIVE':
            final_label, numeric_sentiment = 'POSITIVE', 1

        return final_label, score_raw, numeric_sentiment

    # Build the three result columns from plain lists rather than
    # df.apply(..., result_type='expand') followed by [0]/[1]/[2]; the list
    # form also works when the DataFrame has no rows.
    interpreted = [
        interpret_sentiment(record)
        for record in df[['sentiment_label_raw', 'sentiment_score_raw', 'rating']].to_dict('records')
    ]
    df['sentiment_label'] = [label for label, _, _ in interpreted]
    df['sentiment_score_model'] = [score for _, score, _ in interpreted]
    df['sentiment_numeric'] = [numeric for _, _, numeric in interpreted]

    print("\nSentiment analysis completed.")
    print("\nSentiment distribution (after adjustment):")
    print(df['sentiment_label'].value_counts(normalize=True).round(3))
else:
    print("Sentiment analysis skipped (DataFrame or model not loaded).")
    if 'df' in locals():
        # Keep the output schema stable even when no model is available.
        df['sentiment_label'] = 'NOT_COMPUTED'
        df['sentiment_score_model'] = 0.0
        df['sentiment_numeric'] = 0

Performing sentiment analysis...
  Processed sentiment batch 1/85
  Processed sentiment batch 2/85
  Processed sentiment batch 3/85
  Processed sentiment batch 4/85
  Processed sentiment batch 5/85
  Processed sentiment batch 6/85
  Processed sentiment batch 7/85
  Processed sentiment batch 8/85
  Processed sentiment batch 9/85
  Processed sentiment batch 10/85
  Processed sentiment batch 11/85
  Processed sentiment batch 12/85
  Processed sentiment batch 13/85
  Processed sentiment batch 14/85
  Processed sentiment batch 15/85
  Processed sentiment batch 16/85
  Processed sentiment batch 17/85
  Processed sentiment batch 18/85
  Processed sentiment batch 19/85
  Processed sentiment batch 20/85
  Processed sentiment batch 21/85
  Processed sentiment batch 22/85
  Processed sentiment batch 23/85
  Processed sentiment batch 24/85
  Processed sentiment batch 25/85
  Processed sentiment batch 26/85
  Processed sentiment batch 27/85
  Processed sentiment batch 28/85
  Processed sentiment ba

In [8]:
# Cell 8: Perform Thematic Analysis
# Applies NLP preprocessing and uses keyword matching to assign one or more themes to each review.

if 'df' in locals():
    print("Performing thematic analysis...")

    def _themes_for_review(review_row):
        """Match themes against both the NLP-processed and the original text of one row."""
        return extract_themes_from_text_v2(review_row['nlp_processed_text'], review_row['review_text'])

    # 1) normalise the text, 2) tag each review with its themes,
    # 3) store the themes as one alphabetically ordered, comma-separated string.
    df['nlp_processed_text'] = df['cleaned_review_text'].apply(preprocess_nlp_text)
    df['identified_themes'] = df.apply(_themes_for_review, axis=1)
    df['themes_str'] = [', '.join(sorted(themes)) for themes in df['identified_themes']]

    print("Thematic analysis completed.")

    # Display the top themes found across all reviews: one row per
    # (review, theme) pair, then count how often each theme occurs.
    themes_exploded = df.explode('identified_themes')
    theme_counts = themes_exploded['identified_themes'].value_counts()
    print("\nTop 15 themes identified overall:")
    display(theme_counts.nlargest(15))

Performing thematic analysis...
Thematic analysis completed.

Top 15 themes identified overall:


identified_themes
General Feedback               2651
Bugs & Reliability             1042
Transaction Performance         976
Feature Requests                845
User Interface & Experience     727
Account Access Issues           262
Customer Support                191
App Speed & Performance         153
Security Concerns               143
Name: count, dtype: int64

In [9]:
# Cell 9: Extract Top Keywords per Bank (TF-IDF)
# Uses TF-IDF to find the most relevant and important unigrams and bigrams for each bank,
# helping to identify unique bank-specific topics.

if 'df' in locals():
    print("Extracting top keywords per bank using TF-IDF...")

    for bank_name in sorted(df['bank_name'].unique()):
        print(f"\n--- Top Keywords for {bank_name} ---")
        bank_texts = df.loc[df['bank_name'] == bank_name, 'nlp_processed_text'].dropna()

        # TF-IDF needs at least two documents
        if len(bank_texts) <= 1:
            print(f"Not enough reviews to extract TF-IDF keywords.")
            continue

        try:
            vectorizer = TfidfVectorizer(max_features=100, ngram_range=(1,2), stop_words='english')
            tfidf_matrix = vectorizer.fit_transform(bank_texts)
            feature_names = vectorizer.get_feature_names_out()

            # Total TF-IDF weight of each term across this bank's reviews,
            # flattened to one score per feature.
            term_totals = np.asarray(tfidf_matrix.sum(axis=0)).ravel()
            ranked_terms = sorted(
                zip(feature_names, term_totals),
                key=lambda term_and_score: term_and_score[1],
                reverse=True,
            )
            top_keywords = [term for term, _ in ranked_terms[:15]]
        except ValueError as e:
            print(f"Could not extract keywords: {e}")
        else:
            print(', '.join(top_keywords))

Extracting top keywords per bank using TF-IDF...

--- Top Keywords for Bank of Abyssinia ---
app, bank, work, good, working, update, time, banking, worst, mobile, boa, doesnt, use, best, like

--- Top Keywords for Commercial Bank of Ethiopia ---
app, good, best, use, easy, like, update, bank, nice, application, transaction, cbe, working, work, time

--- Top Keywords for Dashen Bank ---
app, dashen, bank, super, best, banking, good, dashen bank, use, feature, easy, super app, fast, amazing, step

--- Top Keywords for Enat Bank ---
app, use, good, work, update, easy, best, working, bank, open, mobile, easy use, doesnt, banking, application


In [10]:
# Cell 10: Save Final Analyzed Data
# Selects the final set of columns and saves the enriched DataFrame to a new CSV file.

if 'df' in locals():
    # Columns written to the output file, in output order: the original
    # review fields first, then the sentiment and theme results.
    cols_to_save = [
        'review_id', 'review_text', 'cleaned_review_text', 'rating', 'date', 'bank_name', 'source',
        'user_name', 'thumbs_up_count', 'app_version',
        'sentiment_label', 'sentiment_score_model', 'sentiment_numeric',
        'nlp_processed_text', 'themes_str'
    ]

    # Keep only the requested columns that are present in the DataFrame,
    # preserving the order given above.
    final_df_cols = []
    for col in cols_to_save:
        if col in df.columns:
            final_df_cols.append(col)
    df_analyzed = df[final_df_cols]

    # Write the enriched reviews next to the cleaned input file.
    df_analyzed.to_csv(analyzed_file_path, index=False, encoding='utf-8')

    print(f"\nAnalyzed data with sentiment and themes saved to:")
    print(analyzed_file_path)

    print("\nPreview of the final analyzed data:")
    preview = df_analyzed.head()
    display(preview)


Analyzed data with sentiment and themes saved to:
data/processed/all_banks_reviews_analyzed.csv

Preview of the final analyzed data:


Unnamed: 0,review_id,review_text,cleaned_review_text,rating,date,bank_name,source,user_name,thumbs_up_count,app_version,sentiment_label,sentiment_score_model,sentiment_numeric,nlp_processed_text,themes_str
0,54f070b9-895f-40e8-be18-acb16f8af7fa,Best Mobile Banking app ever,best mobile banking app ever,5,2025-06-04,Commercial Bank of Ethiopia,Google Play,A Google user,0,5.0.4,POSITIVE,0.999729,1,best mobile banking app ever,General Feedback
1,9af95d2a-e42c-45ff-a4c9-5c38765df4a4,it was good app but it have some issues like it doesnt give me the right amount that I have in the bank and have some issues in transferring,it was good app but it have some issues like it doesnt give me the right amount that i have in the bank and have some issues in transferring,2,2025-06-04,Commercial Bank of Ethiopia,Google Play,A Google user,0,5.1.0,NEGATIVE,0.998119,-1,good app issue like doesnt give right amount bank issue transferring,Bugs & Reliability
2,fce00cda-d71a-486e-a4c2-7479ab7793bc,best app of finance,best app of finance,5,2025-06-04,Commercial Bank of Ethiopia,Google Play,A Google user,0,,POSITIVE,0.99965,1,best app finance,General Feedback
3,2470d199-834a-4134-a0f9-8c684ba75491,Engida Kebede Fetera,engida kebede fetera,5,2025-06-03,Commercial Bank of Ethiopia,Google Play,A Google user,0,5.1.0,POSITIVE,0.839674,1,engida kebede fetera,General Feedback
4,428e5117-387f-4c9d-b095-3230e3f83a8a,it is not safety,it is not safety,1,2025-06-03,Commercial Bank of Ethiopia,Google Play,A Google user,0,5.1.0,NEGATIVE,0.999787,-1,safety,General Feedback
