In [21]:
from google_play_scraper import Sort, reviews
from google_play_scraper import reviews_all
import pandas as pd
from datetime import datetime
import os

In [2]:
# Bank label -> app id that is passed to the Google Play scraper for that bank.
apps = dict(
    [
        ("CBE", "com.combanketh.mobilebanking"),
        ("BOA", "com.boa.boaMobileBanking"),
        ("Dashen Bank", "com.dashen.dashensuperapp"),
    ]
)

In [10]:
def clean_text(text):
    """Return ``text`` without leading/trailing whitespace.

    Any value that is not a ``str`` (for example ``None`` or a number) is
    mapped to the empty string.
    """
    # Further normalisation steps can be slotted in here if they become necessary.
    return text.strip() if isinstance(text, str) else ""


In [14]:
def preprocess_reviews(app_id, bank_name):
    """Scrape, clean and label the reviews of a single app.

    NOTE: a later cell defines ``preprocess_reviews`` again (adding a 400-row
    cap and a CSV export). On a top-to-bottom run that later definition
    replaces this one.

    Args:
        app_id: App identifier handed to ``reviews_all``.
        bank_name: Label printed in the progress message and stored in the
            ``bank`` column.

    Returns:
        pandas.DataFrame with the columns ``review``, ``rating``, ``date``
        (``YYYY-MM-DD`` string, or ``None`` when missing), ``review_length``,
        ``bank`` and ``source``.
    """
    print(f"Scraping reviews for {bank_name}...")

    # Scrape all reviews. The local is deliberately not called ``reviews`` so it
    # does not shadow the ``reviews`` function imported at the top of the notebook.
    raw_reviews = reviews_all(app_id)

    # Selecting via ``columns=`` keeps only the needed fields and still yields
    # those columns when the scrape returns nothing (no KeyError on selection).
    df = pd.DataFrame(raw_reviews, columns=["content", "score", "at"]).rename(
        columns={"content": "review", "score": "rating", "at": "date"}
    )

    # clean_text already maps non-string content (None/NaN) to "". Casting to
    # str first can turn a missing review into the literal text "None"/"nan",
    # which would then pass the length filter below.
    df["review"] = df["review"].map(clean_text)
    df["review_length"] = df["review"].map(len)

    # Drop unusable rows first, then de-duplicate, so a valid review is never
    # discarded in favour of an invalid duplicate. Missing reviews are "" at
    # this point and are removed by the length filter.
    df = df.dropna(subset=["rating"])
    df = df[df["review_length"] > 2]
    # .copy() makes the frame independent of the filtered selection, so the
    # column assignments below cannot trigger SettingWithCopyWarning.
    df = df.drop_duplicates(subset=["review"]).copy()

    # Normalize date format
    df["date"] = df["date"].apply(
        lambda x: x.strftime("%Y-%m-%d") if pd.notnull(x) else None
    )

    # Add metadata
    df["bank"] = bank_name
    df["source"] = "Google Play Store"

    # The original fell off the end here and returned None.
    return df

In [18]:
def preprocess_reviews(app_id, bank_name, max_reviews=400, output_dir="data"):
    """Scrape, clean, cap and save the reviews of a single app.

    Args:
        app_id: App identifier handed to ``reviews_all``.
        bank_name: Label printed in progress messages, stored in the ``bank``
            column and used in the output file name.
        max_reviews: Maximum number of cleaned reviews to keep (default 400).
        output_dir: Directory the per-bank CSV is written to (default
            ``"data"``); it is created when it does not exist.

    Returns:
        pandas.DataFrame with the columns ``review``, ``rating``, ``date``
        (``YYYY-MM-DD`` string, or ``None`` when missing), ``review_length``,
        ``bank`` and ``source``. The same rows are written to
        ``{output_dir}/{bank_name}_reviews_cleaned.csv``.
    """
    print(f"Scraping reviews for {bank_name}...")

    # Scrape reviews. The local is deliberately not called ``reviews`` so it
    # does not shadow the ``reviews`` function imported at the top of the notebook.
    raw_reviews = reviews_all(app_id)

    # Selecting via ``columns=`` keeps only the needed fields and still yields
    # those columns when the scrape returns nothing (no KeyError on selection).
    df = pd.DataFrame(raw_reviews, columns=["content", "score", "at"]).rename(
        columns={"content": "review", "score": "rating", "at": "date"}
    )

    # clean_text already maps non-string content (None/NaN) to "". Casting to
    # str first can turn a missing review into the literal text "None"/"nan",
    # which would then pass the length filter below.
    df["review"] = df["review"].map(clean_text)
    df["review_length"] = df["review"].map(len)

    # Drop unusable rows first, then de-duplicate, so a valid review is never
    # discarded in favour of an invalid duplicate. Missing reviews are "" at
    # this point and are removed by the length filter.
    df = df.dropna(subset=["rating"])
    df = df[df["review_length"] > 2]
    # .copy() makes the frame independent of the filtered selection, so the
    # column assignments below cannot trigger SettingWithCopyWarning.
    df = df.drop_duplicates(subset=["review"]).copy()

    # Format dates
    df["date"] = df["date"].apply(
        lambda x: x.strftime("%Y-%m-%d") if pd.notnull(x) else None
    )

    # Add metadata
    df["bank"] = bank_name
    df["source"] = "Google Play Store"

    # Keep at most ``max_reviews`` rows; warn when fewer are available.
    count = len(df)
    if count < max_reviews:
        print(f"⚠️ Warning: Only {count} valid reviews scraped for {bank_name}.")
    else:
        df = df.head(max_reviews)
        print(f"✅ Collected {max_reviews} reviews for {bank_name}.")

    # Save to CSV. to_csv does not create missing directories, so make sure the
    # target directory exists first.
    os.makedirs(output_dir, exist_ok=True)
    filename = f"{output_dir}/{bank_name}_reviews_cleaned.csv"
    df.to_csv(filename, index=False)
    print(f"💾 Saved cleaned reviews to {filename}\n")
    return df


In [23]:
def main():
    """Scrape and clean the reviews of every bank in ``apps`` and save them combined.

    Calls ``preprocess_reviews`` once per entry of ``apps``, concatenates the
    resulting frames and writes them to ``data/all_banks_reviews_cleaned.csv``.

    Returns:
        The combined pandas.DataFrame that was written to disk.
    """
    # to_csv does not create missing directories; create data/ up front so
    # neither the per-bank files nor the combined file fail on a fresh checkout.
    os.makedirs("data", exist_ok=True)

    all_dfs = [
        preprocess_reviews(app_id, bank_name)
        for bank_name, app_id in apps.items()
    ]

    # Combine all reviews
    combined_df = pd.concat(all_dfs, ignore_index=True)
    combined_df.to_csv("data/all_banks_reviews_cleaned.csv", index=False)
    print("✅ Saved combined cleaned reviews for all banks to data/all_banks_reviews_cleaned.csv")
    return combined_df

if __name__ == "__main__":
    main()

Scraping reviews for CBE...
✅ Collected 400 reviews for CBE.
💾 Saved cleaned reviews to data/CBE_reviews_cleaned.csv

Scraping reviews for BOA...
✅ Collected 400 reviews for BOA.
💾 Saved cleaned reviews to data/BOA_reviews_cleaned.csv

Scraping reviews for Dashen Bank...
✅ Collected 400 reviews for Dashen Bank.
💾 Saved cleaned reviews to data/Dashen Bank_reviews_cleaned.csv

✅ Saved combined cleaned reviews for all banks to data/all_banks_reviews_cleaned.csv
