In [36]:
from google_play_scraper import Sort, reviews
import pandas as pd

def scrape_reviews(app_id, bank_name, lang='am', count=500):
    # scraping review from Play_Store
    all_reviews = []
    try:
        result, _ = reviews(
            app_id,
            country='et',  # Ethiopia
            lang=lang,        # defaults to 'en'
            count=count,
            sort=Sort.NEWEST  # defaults to Sort.NEWEST
        )
        # Creating DataFrame from reviews data
        for entry in result:
            all_reviews.append({
                'Review Description': entry.get('content', ''),
                'User': entry.get('userName', 'Anonymous'),
                'Rating': entry.get('score', None),
                'Date': entry.get('at', None),
                'Bank': bank_name,
                'Source': 'Google Play Store'
            })
        print(f"✅ Scraped {len(all_reviews)} reviews for {bank_name}")
    except Exception as e:
        print(f"❌ Failed to scrape {bank_name}: {e}")
    return all_reviews


In [37]:
if __name__ == "__main__":
    apps = {
        "CBE": "com.combanketh.mobilebanking",
        "BOA": "com.boa.boaMobileBanking",
        "Dashen": "com.dashen.dashensuperapp"
    }

    all_data = []
    for bank, app_id in apps.items():
        all_data.extend(scrape_reviews(app_id, bank))

    df = pd.DataFrame(all_data)
    df.to_csv("raw_reviews.csv", index=False)


✅ Scraped 54 reviews for CBE
✅ Scraped 12 reviews for BOA
✅ Scraped 1 reviews for Dashen


In [38]:
df_app = pd.read_csv("raw_reviews.csv")
df_app

Unnamed: 0,Review Description,User,Rating,Date,Bank,Source
0,በጠምጡሩ,Semira K,5,2025-06-04 03:18:52,CBE,Google Play Store
1,ከሠማይየሚወርደውንዕምድርይችለዋል,Eidafnediadem Aboobker,5,2025-06-04 02:54:11,CBE,Google Play Store
2,ዋው,محمد سعيد,5,2025-05-21 13:12:31,CBE,Google Play Store
3,it makes trading easier keep it up,Rediet Asmrom,5,2025-04-13 17:05:44,CBE,Google Play Store
4,betam konjo,lemii lemii,5,2025-03-21 17:33:43,CBE,Google Play Store
...,...,...,...,...,...,...
62,አስፈላጊ,Engdawork,5,2024-05-01 02:52:05,BOA,Google Play Store
63,tebedu arif app new,hola holie,4,2024-04-27 10:31:44,BOA,Google Play Store
64,በርቱ,Siraj,5,2024-04-25 14:25:55,BOA,Google Play Store
65,Good application,Nureadine Eliyas Badawi,5,2024-02-16 20:58:25,BOA,Google Play Store


In [39]:
import pandas as pd

def preprocess_reviews(file_path, output_path):
    df = pd.read_csv(file_path)

    # Drop duplicates
    df.drop_duplicates(subset=['Review Description'], inplace=True)

    # Drop rows with missing critical values
    df.dropna(subset=['Review Description', 'Rating', 'Date', 'Bank'], inplace=True)

    # Normalize date
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce').dt.date
    df.dropna(subset=['Date'], inplace=True)

    # Add source
    df['source'] = 'Google Play Store'

    # Save cleaned CSV
    df[['Review Description', 'Rating', 'Date', 'Bank', 'Source']].to_csv(output_path, index=False)

    print(f"Preprocessed data saved to {output_path}")
    return df


In [40]:
if __name__ == "__main__":
    df_new = preprocess_reviews("raw_reviews.csv", "cleaned_reviews.csv")

Preprocessed data saved to cleaned_reviews.csv


In [41]:
df_new

Unnamed: 0,Review Description,User,Rating,Date,Bank,Source,source
0,በጠምጡሩ,Semira K,5,2025-06-04,CBE,Google Play Store,Google Play Store
1,ከሠማይየሚወርደውንዕምድርይችለዋል,Eidafnediadem Aboobker,5,2025-06-04,CBE,Google Play Store,Google Play Store
2,ዋው,محمد سعيد,5,2025-05-21,CBE,Google Play Store,Google Play Store
3,it makes trading easier keep it up,Rediet Asmrom,5,2025-04-13,CBE,Google Play Store,Google Play Store
4,betam konjo,lemii lemii,5,2025-03-21,CBE,Google Play Store,Google Play Store
...,...,...,...,...,...,...,...
62,አስፈላጊ,Engdawork,5,2024-05-01,BOA,Google Play Store,Google Play Store
63,tebedu arif app new,hola holie,4,2024-04-27,BOA,Google Play Store,Google Play Store
64,በርቱ,Siraj,5,2024-04-25,BOA,Google Play Store,Google Play Store
65,Good application,Nureadine Eliyas Badawi,5,2024-02-16,BOA,Google Play Store,Google Play Store


In [6]:
df_app.shape

(1449, 6)

In [7]:
df_app.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1449 entries, 0 to 1448
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Review Description  1449 non-null   object
 1   User                1449 non-null   object
 2   Rating              1449 non-null   int64 
 3   Date                1449 non-null   object
 4   Bank                1449 non-null   object
 5   Source              1449 non-null   object
dtypes: int64(1), object(5)
memory usage: 68.0+ KB
