In [1]:
from google_play_scraper import Sort, reviews
import pandas as pd

# Google Play package name for each bank's mobile app (update if incorrect)
apps = {
    "Commercial Bank of Ethiopia": "com.combanketh.mobilebanking",
    "Bank of Abyssinia": "com.boa.boaMobileBanking",
    "Dashen Bank": "com.cr2.amolelight"
}


# Desired number of reviews per app
N_REVIEWS = 400

# Per-bank frames are collected here and concatenated once after the loop.
# Growing a DataFrame with pd.concat inside the loop re-copies every row
# gathered so far on each iteration, and seeding it with an empty
# pd.DataFrame() triggers pandas' deprecation warning about concatenating
# empty entries.
review_frames = []

for bank_name, app_id in apps.items():
    print(f"📦 Fetching reviews for {bank_name}...")

    # The second return value is not needed here, so it is discarded.
    reviews_list, _ = reviews(
        app_id,
        lang='en',
        country='us',
        sort=Sort.NEWEST,
        count=N_REVIEWS,
        filter_score_with=None  # Pull all ratings, not just 1-star, etc.
    )

    # An empty result would contribute an empty frame to the concat (the
    # same deprecated case as above), so report it and move on.
    if not reviews_list:
        print(f"⚠️ No reviews returned for {bank_name}; skipping.")
        continue

    bank_reviews = pd.DataFrame(reviews_list)
    bank_reviews['bank'] = bank_name  # tag every row with the bank it belongs to
    review_frames.append(bank_reviews)

# Master DataFrame holding all reviews. pd.concat raises on an empty list,
# so fall back to an empty frame that still carries the 'bank' column.
if review_frames:
    all_reviews = pd.concat(review_frames, ignore_index=True)
else:
    all_reviews = pd.DataFrame(columns=["bank"])

# Save the scraped data
output_path = "bank_reviews.csv"
all_reviews.to_csv(output_path, index=False)
print(f"\n✅ Scraping complete. Data saved to: {output_path}")


📦 Fetching reviews for Commercial Bank of Ethiopia...
📦 Fetching reviews for Bank of Abyssinia...
📦 Fetching reviews for Dashen Bank...

✅ Scraping complete. Data saved to: bank_reviews.csv


  all_reviews = pd.concat([all_reviews, df], ignore_index=True)


In [2]:
import pandas as pd
from pathlib import Path

# Input and output locations for this cleaning step
raw_path = Path("bank_reviews.csv")
clean_path = Path("bank_reviews_clean.csv")

# 1️⃣  Read the raw scrape
df = pd.read_csv(raw_path)

# 2️⃣  Basic cleaning
# -------------------------------------------------
# a) De-duplicate on reviewId, but only when that column is in the file
if "reviewId" in df.columns:
    df = df.drop_duplicates(subset="reviewId")

# b) A row is kept only if it has both review text and a rating
df = df.dropna(subset=["content", "score"])

# 3️⃣  Normalise the date
# -------------------------------------------------
# Parse the 'at' column and keep just the calendar date (YYYY-MM-DD).
# Values that cannot be parsed are coerced to missing and dropped.
parsed_dates = pd.to_datetime(df["at"], errors="coerce").dt.date
df = df.assign(date=parsed_dates).dropna(subset=["date"])

# 4️⃣  Rename, then keep only the requested columns
# -------------------------------------------------
column_names = {"content": "review", "score": "rating"}
selected = df.rename(columns=column_names).loc[:, ["review", "rating", "date", "bank"]]

# Every row in this file comes from the same store
df_clean = selected.assign(source="Google Play")

# 5️⃣  Write the cleaned file and show a preview
# -------------------------------------------------
df_clean.to_csv(clean_path, index=False)

print("✅ Clean file saved to:", clean_path)
print(df_clean.head())


✅ Clean file saved to: bank_reviews_clean.csv
                                              review  rating        date  \
0                         So bad now and hard to use       5  2025-06-09   
1  it is so amazing app. but, it is better to upd...       5  2025-06-09   
2                                         v.good app       4  2025-06-09   
3                                      very good app       1  2025-06-09   
4           Very amazing app indeed. I'm enjoying it       5  2025-06-08   

                          bank       source  
0  Commercial Bank of Ethiopia  Google Play  
1  Commercial Bank of Ethiopia  Google Play  
2  Commercial Bank of Ethiopia  Google Play  
3  Commercial Bank of Ethiopia  Google Play  
4  Commercial Bank of Ethiopia  Google Play  


In [3]:
import pandas as pd

# Read back the cleaned file written by the cleaning step
df = pd.read_csv("bank_reviews_clean.csv")

# Distinct values of the 'bank' column, as a quick check of which banks are present
bank_names = df["bank"].unique()
print("🏦 Banks found in dataset:", bank_names)


🏦 Banks found in dataset: ['Commercial Bank of Ethiopia' 'Bank of Abyssinia' 'Dashen Bank']


In [5]:
import pandas as pd

# Load the scrape from bank_reviews.csv.
# NOTE(review): this is the raw file name, not bank_reviews_clean.csv, so the
# figures below describe the uncleaned data. Confirm which file was intended.
df = pd.read_csv("bank_reviews.csv")

# 1. 🔢 How many reviews each bank has
reviews_per_bank = df["bank"].value_counts()
print("🔍 Number of Reviews Per Bank:")
print(reviews_per_bank)

# 2. 📉 Share of missing values in each column, as a percentage
missing_percent = df.isna().mean().mul(100)
print("\n🧪 Percentage of Missing Data Per Column:")
print(missing_percent.round(2))


🔍 Number of Reviews Per Bank:
bank
Commercial Bank of Ethiopia    400
Bank of Abyssinia              400
Dashen Bank                    400
Name: count, dtype: int64

🧪 Percentage of Missing Data Per Column:
reviewId                 0.00
userName                 0.00
userImage                0.00
content                  0.00
score                    0.00
thumbsUpCount            0.00
reviewCreatedVersion    25.33
at                       0.00
replyContent            99.92
repliedAt               99.92
appVersion              25.33
bank                     0.00
dtype: float64
