In [None]:
#  Install google-play-scraper
!pip install google-play-scraper

Collecting google-play-scraper
  Downloading google_play_scraper-1.2.7-py3-none-any.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.2/50.2 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading google_play_scraper-1.2.7-py3-none-any.whl (28 kB)
Installing collected packages: google-play-scraper
Successfully installed google-play-scraper-1.2.7


In [None]:
# Import required modules
from google_play_scraper import reviews, Sort
import pandas as pd
from datetime import datetime

In [None]:
# Function to scrape reviews for a given app
def scrape_reviews(app_id, bank_name, count=500, lang='en', country='us'):
    """
    Scrape reviews from the Google Play Store for a specific bank app.

    Parameters:
    - app_id (str): The app's unique package name.
    - bank_name (str): A label for which bank the app belongs to.
    - count (int): Number of reviews to request; fewer rows may come back
      (the recorded run of this notebook produced 1449 rows for 3 x 500 requested).
    - lang (str): Review language code passed to the scraper (default 'en').
    - country (str): Store region code passed to the scraper (default 'us').

    Returns:
    - DataFrame with columns: review, rating, date, bank, source.
      The columns are always present, even when no reviews are returned,
      so downstream steps that select these columns do not fail on an empty scrape.
    """
    print(f"Scraping {bank_name}...")
    # The second element of the returned pair is not needed here and is discarded.
    result, _ = reviews(
        app_id,
        lang=lang,         # Language of reviews
        country=country,   # Region
        sort=Sort.NEWEST,  # Sort by most recent
        count=count        # Number of reviews to fetch
    )

    # Fixed output schema: passing it explicitly keeps the columns even for zero rows.
    columns = ["review", "rating", "date", "bank", "source"]

    # Build a structured DataFrame, one record per scraped review
    reviews_list = [{
        "review": r['content'],
        "rating": r['score'],
        "date": r['at'].strftime('%Y-%m-%d'),  # keep only the calendar day
        "bank": bank_name,
        "source": "Google Play"
    } for r in result]

    return pd.DataFrame(reviews_list, columns=columns)

In [None]:
# Fetch reviews for the three Ethiopian bank apps (official package name, bank label).
# The generator is consumed in order, so the apps are scraped CBE -> BOA -> Dashen.
df_cbe, df_boa, df_dashen = (
    scrape_reviews(package_name, label)
    for package_name, label in (
        ('com.combanketh.mobilebanking', 'CBE'),
        ('com.boa.boaMobileBanking', 'BOA'),
        ('com.dashen.dashensuperapp', 'Dashen'),
    )
)

# Stack the per-bank frames into one table with a fresh 0..n-1 index
per_bank_frames = [df_cbe, df_boa, df_dashen]
all_reviews = pd.concat(per_bank_frames, ignore_index=True)
len(all_reviews)


Scraping CBE...
Scraping BOA...
Scraping Dashen...


1449

In [None]:
# Preview the first five combined rows as a quick sanity check
all_reviews.head(n=5)

Unnamed: 0,review,rating,date,bank,source
0,yetemeta,1,2025-06-03,CBE,Google Play
1,Engida Kebede Fetera,5,2025-06-03,CBE,Google Play
2,good,5,2025-06-03,CBE,Google Play
3,it is not safety,1,2025-06-03,CBE,Google Play
4,NICE bank,5,2025-06-03,CBE,Google Play


In [None]:
# Preprocessing function
def preprocess_reviews(df, dedupe_on='review'):
    """
    Cleans the review data and returns a new DataFrame (the input is not modified):
    - Trims whitespace from the review text
    - Drops rows with a missing review, rating or date
    - Drops reviews that are empty after trimming
    - Removes duplicates, keeping the first occurrence

    Trimming and dropping incomplete rows happen *before* de-duplication so that
    texts differing only by surrounding whitespace are treated as duplicates, and
    so that an incomplete row cannot displace a complete row with the same text.

    Parameters:
    - df (DataFrame): Must contain the columns 'review', 'rating' and 'date'.
    - dedupe_on (str or list of str): Column(s) that define a duplicate.
      Defaults to 'review' (duplicate texts). Pass e.g. ['review', 'bank'] to
      keep identical texts that belong to different banks.

    Returns:
    - DataFrame with the same columns as the input; the original index labels of
      the surviving rows are kept.
    """
    # .assign builds a new frame, so the caller's data is untouched and no
    # SettingWithCopyWarning is raised by assigning into a derived frame.
    cleaned = df.assign(review=df['review'].str.strip())          # Remove leading/trailing spaces
    cleaned = cleaned.dropna(subset=['review', 'rating', 'date'])  # Drop rows with missing key info
    cleaned = cleaned[cleaned['review'] != '']                     # Remove empty reviews
    cleaned = cleaned.drop_duplicates(subset=dedupe_on)            # Remove duplicates (first one wins)
    return cleaned

In [None]:
# Run the combined reviews through the cleaning step
clean_reviews = all_reviews.pipe(preprocess_reviews)

In [None]:
# Persist the cleaned reviews to disk; the row index is left out of the file
clean_reviews.to_csv(path_or_buf='clean_bank_reviews.csv', index=False)
print("✅ Cleaned reviews saved to 'clean_bank_reviews.csv'")

✅ Cleaned reviews saved to 'clean_bank_reviews.csv'
