# Task 1: Data Collection and Preprocessing
This notebook contains the script to scrape and preprocess reviews from the Google Play Store for CBE, BOA, and Dashen Bank.

In [8]:
import pandas as pd
from google_play_scraper import reviews, Sort
from datetime import datetime
import uuid
import os  
print(os.getcwd())

apps = {
   'Commercial Bank of Ethiopia': 'com.combanketh.mobilebanking',
    'Bank of Abyssinia': 'com.boa.boaMobileBanking',
    'Dashen Bank': 'com.dashen.dashensuperapp'
}

def scrape_reviews(app_id, app_name, count=400):
    try:
        result, _ = reviews(app_id, lang='en', country='et', sort=Sort.NEWEST, count=count)
        print(f"Scraped {len(result)} reviews for {app_name}")
        reviews_list = []
        for review in result:
            reviews_list.append({
                'review_id': review['reviewId'],
                'review': review['content'],
                'rating': review['score'],
                'date': review['at'],
                'bank': app_name,
                'source': 'Google Play'
            })
        return pd.DataFrame(reviews_list)
    except Exception as e:
        print(f"Error scraping {app_name}: {e}")
        return pd.DataFrame()

all_reviews = []
for app_name, app_id in apps.items():
    df = scrape_reviews(app_id, app_name)
    print(f"DataFrame for {app_name}: {df.shape}")
    all_reviews.append(df)

df_reviews = pd.concat(all_reviews, ignore_index=True)
print(f"Combined DataFrame shape: {df_reviews.shape}")
print(f"Columns: {df_reviews.columns.tolist()}")

# Create data directory if it doesn't exist
os.makedirs('data', exist_ok=True)

if df_reviews.empty:
    print("No reviews scraped. Consider using fallback dataset.")
else:
    df_reviews = df_reviews.drop_duplicates(subset=['review_id'])
    df_reviews['review'] = df_reviews['review'].fillna('')
    df_reviews['rating'] = df_reviews['rating'].fillna(df_reviews['rating'].median())
    df_reviews = df_reviews.dropna(subset=['date'])
    df_reviews['date'] = pd.to_datetime(df_reviews['date']).dt.strftime('%Y-%m-%d')
    df_reviews.to_csv('data/bank_reviews_cleaned.csv', index=False)
    print(f"Total reviews: {len(df_reviews)}")
    print(f"Missing data: {df_reviews.isnull().mean().mean() * 100:.2f}%")

c:\Users\hp\OneDrive\Desktop\Customer Experience Analytics for Fintech Apps\fintech-customer-analytics\notebook
Scraped 400 reviews for Commercial Bank of Ethiopia
DataFrame for Commercial Bank of Ethiopia: (400, 6)
Scraped 400 reviews for Bank of Abyssinia
DataFrame for Bank of Abyssinia: (400, 6)
Scraped 400 reviews for Dashen Bank
DataFrame for Dashen Bank: (400, 6)
Combined DataFrame shape: (1200, 6)
Columns: ['review_id', 'review', 'rating', 'date', 'bank', 'source']
Total reviews: 1200
Missing data: 0.00%


In [11]:
import os
print(os.getcwd())


c:\Users\hp\OneDrive\Desktop\Customer Experience Analytics for Fintech Apps\fintech-customer-analytics\notebook


# Task 2: Sentiment and Thematic Analysis
This section will include sentiment analysis using NLTK's VADER and thematic analysis of the reviews.

In [24]:
# Sentiment Analysis
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()
df = pd.read_csv('C:\\Users\\hp\\OneDrive\\Desktop\\Customer Experience Analytics for Fintech Apps\\fintech-customer-analytics\\notebook\\data\\bank_reviews_cleaned.csv')
print("Loaded DataFrame shape:", df.shape)
print("Unique banks:", df['bank'].unique())
print("Sample banks (first 50):", df['bank'].head(50).tolist())
print("Sample banks (middle 50):", df['bank'].iloc[575:625].tolist())  # Approx middle
print("Sample banks (last 50):", df['bank'].tail(50).tolist())
df['sentiment'] = df['review'].apply(lambda x: sia.polarity_scores(str(x))['compound'])
df['sentiment_label'] = df['sentiment'].apply(lambda x: 'Positive' if x > 0.05 else 'Negative' if x < -0.05 else 'Neutral')
print("Before save - Sample banks:", df['bank'].head(10).tolist())
df.to_csv('C:\\Users\\hp\\OneDrive\\Desktop\\Customer Experience Analytics for Fintech Apps\\fintech-customer-analytics\\notebook\\data\\bank_reviews_with_sentiment_new.csv', index=False, mode='w')
print("Saved to new file, verifying...")
saved_df = pd.read_csv('C:\\Users\\hp\\OneDrive\\Desktop\\Customer Experience Analytics for Fintech Apps\\fintech-customer-analytics\\notebook\\data\\bank_reviews_with_sentiment_new.csv')
print("Verified DataFrame shape:", saved_df.shape)
print("Verified Unique banks:", saved_df['bank'].unique())
print("Verified Sample banks (first 50):", saved_df['bank'].head(50).tolist())
print("Verified Sample banks (last 50):", saved_df['bank'].tail(50).tolist())
print("Sentiment Analysis Results:")
print(df['sentiment_label'].value_counts())
print("\nAverage Sentiment by Bank:")
print(df.groupby('bank')['sentiment'].mean())

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Loaded DataFrame shape: (1200, 6)
Unique banks: ['Commercial Bank of Ethiopia' 'Bank of Abyssinia' 'Dashen Bank']
Sample banks (first 50): ['Commercial Bank of Ethiopia', 'Commercial Bank of Ethiopia', 'Commercial Bank of Ethiopia', 'Commercial Bank of Ethiopia', 'Commercial Bank of Ethiopia', 'Commercial Bank of Ethiopia', 'Commercial Bank of Ethiopia', 'Commercial Bank of Ethiopia', 'Commercial Bank of Ethiopia', 'Commercial Bank of Ethiopia', 'Commercial Bank of Ethiopia', 'Commercial Bank of Ethiopia', 'Commercial Bank of Ethiopia', 'Commercial Bank of Ethiopia', 'Commercial Bank of Ethiopia', 'Commercial Bank of Ethiopia', 'Commercial Bank of Ethiopia', 'Commercial Bank of Ethiopia', 'Commercial Bank of Ethiopia', 'Commercial Bank of Ethiopia', 'Commercial Bank of Ethiopia', 'Commercial Bank of Ethiopia', 'Commercial Bank of Ethiopia', 'Commercial Bank of Ethiopia', 'Commercial Bank of Ethiopia', 'Commercial Bank of Ethiopia', 'Commercial Bank of Ethiopia', 'Commercial Bank of Eth

In [25]:
df.to_csv('C:\\Users\\hp\\OneDrive\\Desktop\\Customer Experience Analytics for Fintech Apps\\fintech-customer-analytics\\notebook\\data\\temp_sentiment.csv', index=False, mode='w')


In [26]:
# Thematic Analysis
from collections import Counter
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

df = pd.read_csv('C:\\Users\\hp\\OneDrive\\Desktop\\Customer Experience Analytics for Fintech Apps\\fintech-customer-analytics\\notebook\\data\\bank_reviews_with_sentiment.csv')
stop_words = set(stopwords.words('english'))
words = ' '.join(df['review']).lower()
words = re.findall(r'\w+', words)
words = [word for word in words if word not in stop_words]
common_themes = Counter(words).most_common(10)
print("Top 10 Themes (Common Words) Across All Banks:", common_themes)

for bank in df['bank'].unique():
    bank_words = ' '.join(df[df['bank'] == bank]['review']).lower()
    bank_words = re.findall(r'\w+', bank_words)
    bank_words = [word for word in bank_words if word not in stop_words]
    bank_themes = Counter(bank_words).most_common(5)
    print(f"\nTop 5 Themes for {bank}:", bank_themes)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\hp\\OneDrive\\Desktop\\Customer Experience Analytics for Fintech Apps\\fintech-customer-analytics\\notebook\\data\\bank_reviews_with_sentiment.csv'