# Task 2: Sentiment and Thematic Analysis

This notebook performs sentiment analysis and thematic analysis on the Google Play Store reviews collected in Task 1.

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import sys
import os

# Make the project's src/ directory importable. The relative path is resolved
# against the current working directory, so this assumes the notebook is run
# from its own folder (a sibling of src/).
relative_src = os.path.join('..', 'src')
module_path = os.path.abspath(relative_src)
if not (module_path in sys.path):
    # Register the directory only once so re-running the cell does not add duplicates.
    sys.path.append(module_path)
    print(f"Added {module_path} to sys.path")

from utils.analyzer import SentimentAnalyzer, ThematicAnalyzer

# For downloading spacy model if not present
# import spacy
# try:
#     nlp = spacy.load('en_core_web_sm')
# except OSError:
#     print('Downloading language model for spaCy...')
#     from spacy.cli import download
#     download('en_core_web_sm')
#     nlp = spacy.load('en_core_web_sm')


Added c:\Users\Eyoel\Desktop\Files\scraping-week-2\src to sys.path


  from .autonotebook import tqdm as notebook_tqdm


## 1. Load Data

In [2]:
# CSV of reviews to analyse, given relative to the notebook's working directory.
DATA_PATH = '../data/google_play_reviews.csv'

# Read the whole file into memory; every later cell works on this frame.
df_reviews = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Report how many rows were read, then show the first rows as the cell output.
print(f"Loaded {len(df_reviews)} reviews from {DATA_PATH}")
df_reviews.head(n=5)

Loaded 1185 reviews from ../data/google_play_reviews.csv


Unnamed: 0,review,rating,date,bank,source
0,yetemeta,1,2025-06-03,Commercial Bank of Ethiopia,Google Play
1,Engida Kebede Fetera,5,2025-06-03,Commercial Bank of Ethiopia,Google Play
2,good,5,2025-06-03,Commercial Bank of Ethiopia,Google Play
3,it is not safety,1,2025-06-03,Commercial Bank of Ethiopia,Google Play
4,NICE bank,5,2025-06-03,Commercial Bank of Ethiopia,Google Play


## 2. Sentiment Analysis

In [3]:
# Score every review with the project's SentimentAnalyzer and attach the result
# to df_reviews as two new columns: 'sentiment_label' and 'sentiment_score'.
sentiment_analyzer = SentimentAnalyzer()

# Replace missing reviews with '' BEFORE casting to str. Casting first turns NaN
# into the literal text 'nan', which fillna() can no longer detect and which the
# model would then score as if it were a genuine review.
df_reviews['review_text_for_sentiment'] = df_reviews['review'].fillna('').astype(str)

# Predict sentiment for all reviews (can be time-consuming for large datasets)
# Consider processing in batches if memory/time is an issue
review_texts = df_reviews['review_text_for_sentiment'].tolist()

# For testing, process a small subset first
# test_sentiments = sentiment_analyzer.predict_sentiment(review_texts[:10])
# print(test_sentiments)

if sentiment_analyzer.sentiment_pipeline:  # Proceed only if model loaded
    print(f"Starting sentiment prediction for {len(review_texts)} reviews...")
    sentiments = sentiment_analyzer.predict_sentiment(review_texts)
    print("Finished sentiment prediction.")

    # Attach results only when there is exactly one prediction per row; otherwise
    # the label/score columns would be misaligned with the reviews.
    if sentiments and len(sentiments) == len(df_reviews):
        df_reviews['sentiment_label'] = [s['label'] for s in sentiments]
        df_reviews['sentiment_score'] = [s['score'] for s in sentiments]
        print("Sentiment labels and scores added to DataFrame.")
    else:
        # Mark the failure explicitly so downstream cells can tell it apart
        # from a real model label.
        print("Could not add sentiment labels/scores. Mismatch in length or empty results.")
        df_reviews['sentiment_label'] = 'Error'
        df_reviews['sentiment_score'] = np.nan
else:
    # No model available: still create both columns so later cells do not fail
    # on a missing column.
    print("Sentiment model not loaded. Skipping sentiment analysis.")
    df_reviews['sentiment_label'] = 'Not Processed'
    df_reviews['sentiment_score'] = np.nan

# Preview the original text next to its predicted label and score.
df_reviews[['review', 'sentiment_label', 'sentiment_score']].head()

Sentiment pipeline loaded successfully with model: distilbert-base-uncased-finetuned-sst-2-english
Starting sentiment prediction for 1185 reviews...
Finished sentiment prediction.
Sentiment labels and scores added to DataFrame.


Unnamed: 0,review,sentiment_label,sentiment_score
0,yetemeta,POSITIVE,0.655367
1,Engida Kebede Fetera,NEGATIVE,0.839674
2,good,POSITIVE,0.999816
3,it is not safety,NEGATIVE,0.999787
4,NICE bank,POSITIVE,0.999806


### 2.1. Aggregate Sentiment by Bank and Rating

In [4]:
# Summarise the predicted labels in two ways: raw counts per (bank, rating),
# and the percentage split of labels within each bank.
# Aggregation only makes sense when at least one row carries a real model label
# (POSITIVE/NEGATIVE) rather than a fallback marker.
has_label_column = 'sentiment_label' in df_reviews.columns
has_model_labels = has_label_column and df_reviews['sentiment_label'].isin(['POSITIVE', 'NEGATIVE']).any()

if not has_model_labels:
    print("Sentiment labels not available or not in expected format for aggregation.")
else:
    # Count reviews for every (bank, rating, label) combination, then pivot the
    # label level into columns; combinations that never occur become 0.
    label_counts = df_reviews.groupby(['bank', 'rating', 'sentiment_label']).size()
    sentiment_summary = label_counts.unstack(fill_value=0)
    print("Sentiment Distribution by Bank and Rating:")
    print(sentiment_summary)

    # The scores are deliberately not averaged here: they are treated as the
    # model's confidence in its chosen label, not as a signed polarity, so the
    # label distribution is the primary view.

    # Share of each label within a bank, expressed as a percentage.
    label_share = df_reviews.groupby('bank')['sentiment_label'].value_counts(normalize=True)
    bank_sentiment_dist = (label_share * 100).unstack(fill_value=0)
    print("Sentiment Percentage per Bank:")
    print(bank_sentiment_dist)

Sentiment Distribution by Bank and Rating:
sentiment_label                     NEGATIVE  POSITIVE
bank                        rating                    
Bank of Abysinnia           1            149        15
                            2             10         1
                            3             17        13
                            4              9         8
                            5             43       134
Commercial Bank of Ethiopia 1             36        10
                            2              9         4
                            3             16         5
                            4             19        20
                            5             44       224
Dashen Bank                 1             31         2
                            2             15         1
                            3              6         5
                            4             12        12
                            5             38       277
Sentiment Percentage p

## 3. Thematic Analysis (Placeholder)

In [5]:
# Thematic analysis is not implemented yet: the analyzer is only instantiated
# and no theme or keyword columns are added to df_reviews in this cell.
thematic_analyzer = ThematicAnalyzer()

# TODO: intended usage once implemented:
# df_reviews['keywords'] = thematic_analyzer.extract_keywords(df_reviews['review_text_for_sentiment'].tolist())
# df_reviews['themes'] = ... based on keyword clustering ...

# Show the frame as it currently stands (sentiment columns only).
df_reviews.head(n=5)

Unnamed: 0,review,rating,date,bank,source,review_text_for_sentiment,sentiment_label,sentiment_score
0,yetemeta,1,2025-06-03,Commercial Bank of Ethiopia,Google Play,yetemeta,POSITIVE,0.655367
1,Engida Kebede Fetera,5,2025-06-03,Commercial Bank of Ethiopia,Google Play,Engida Kebede Fetera,NEGATIVE,0.839674
2,good,5,2025-06-03,Commercial Bank of Ethiopia,Google Play,good,POSITIVE,0.999816
3,it is not safety,1,2025-06-03,Commercial Bank of Ethiopia,Google Play,it is not safety,NEGATIVE,0.999787
4,NICE bank,5,2025-06-03,Commercial Bank of Ethiopia,Google Play,NICE bank,POSITIVE,0.999806


## 4. Save Results

In [6]:
# Destination for the enriched reviews, relative to the notebook's working directory.
OUTPUT_TASK2_CSV_PATH = '../data/reviews_with_sentiment_themes.csv'

# Write the frame without the positional index so it is not stored as an extra column.
df_reviews.to_csv(path_or_buf=OUTPUT_TASK2_CSV_PATH, index=False)
print(f"Results saved to {OUTPUT_TASK2_CSV_PATH}")

Results saved to ../data/reviews_with_sentiment_themes.csv


## 5. KPI Check (Placeholder)

In [7]:
# KPI: Sentiment scores for 90%+ reviews.
# A review counts as covered only when it carries a real model label. Both
# fallback markers written by the sentiment cell are excluded: 'Error'
# (prediction failed or was misaligned) and 'Not Processed' (model not loaded).
# Excluding only 'Error' would report 100% coverage when nothing was scored.
UNPROCESSED_LABELS = ['Error', 'Not Processed']
is_processed = df_reviews['sentiment_label'].notna() & ~df_reviews['sentiment_label'].isin(UNPROCESSED_LABELS)
processed_sentiment_count = int(is_processed.sum())
total_reviews = len(df_reviews)
# Guard against an empty frame to avoid a division by zero.
sentiment_coverage = (processed_sentiment_count / total_reviews) * 100 if total_reviews > 0 else 0
print(f"Sentiment analysis coverage: {sentiment_coverage:.2f}%")

# KPI: 3+ themes per bank with examples. (To be implemented)

Sentiment analysis coverage: 100.00%
