# Task 1: Data Collection and Preprocessing using Google Play

In [33]:
import pandas as pd
import os 
import sys
import pandas as pd
import matplotlib.pyplot as plt
from google_play_scraper import Sort, reviews
sys.path.append(os.path.abspath("../scripts"))
from play_store_scraping import play_store_scrape
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Switch the working directory from the notebook folder to the project root so
# that relative paths such as "data/..." resolve there.
# A bare os.chdir("..") is not idempotent: every re-run of this cell would climb
# one more level. Only move up when the current directory does not yet contain
# the "scripts" folder (the one added to sys.path above) but its parent does.
if not os.path.isdir("scripts") and os.path.isdir(os.path.join("..", "scripts")):
    os.chdir("..")

In [34]:
# Google Play package IDs of the banking apps to scrape, keyed by the short
# bank label that is handed to the scraper together with the ID.
bank_apps = dict(
    CBE="com.combanketh.mobilebanking",
    BOA="com.boa.boaMobileBanking",
    Dashen="com.dashen.dashensuperapp",
)

# Scrape each app in turn (English reviews, 500 requested per app) and gather
# everything into one flat list. play_store_scrape is imported at the top of the
# notebook from scripts/play_store_scraping.py.
all_reviews = []
for bank_name, app_id in bank_apps.items():
    bank_reviews = play_store_scrape(app_id, bank_name, lang='en', count=500)
    all_reviews += bank_reviews

Scraped 500 reviews for CBE
Scraped 500 reviews for BOA
Scraped 454 reviews for Dashen


In [35]:
# Build one DataFrame out of the scraped review records and display it
df = pd.DataFrame(data=all_reviews)
df

Unnamed: 0,Review Description,User,Rating,Date,Bank,Source
0,So bad now and hard to use,Daniel Ephrem,5,2025-06-09 18:31:56,CBE,Google Play Store
1,"it is so amazing app. but, it is better to upd...",abdulkerim habib,5,2025-06-09 16:20:06,CBE,Google Play Store
2,v.good app,Abdulhalim Bedre,4,2025-06-09 11:49:09,CBE,Google Play Store
3,very good app,Moha Yimer,1,2025-06-09 01:24:23,CBE,Google Play Store
4,Very amazing app indeed. I'm enjoying it,Puoch chuol Wath,5,2025-06-08 21:52:23,CBE,Google Play Store
...,...,...,...,...,...,...
1449,Best,Geresu Ku,5,2025-01-13 21:11:58,Dashen,Google Play Store
1450,"Waw Great and innovated,user friendly, always ...",TSEGAW WORKIE,5,2025-01-13 21:01:12,Dashen,Google Play Store
1451,It's Best waww 🙏,Kemal Keyire,5,2025-01-13 09:37:19,Dashen,Google Play Store
1452,Always one step ahead,Anwarf ahmed,5,2025-01-13 09:15:03,Dashen,Google Play Store


In [36]:
# Quick size check: (number of rows, number of columns) of df
df.shape

(1454, 6)

In [42]:
# Clean the scraped reviews: de-duplicate, drop incomplete rows, normalise the
# dates, export to CSV and compute the summary figures (total_reviews,
# missing_data_ratio) that the KPI cell prints.
# Each step assigns a new frame back to `df` instead of using inplace=True, so
# no step silently discards its result.

# Remove duplicate reviews. Keying on the review text alone already removes every
# row a (text, user, date) key would remove, so a single call is enough.
# NOTE(review): identical short texts (e.g. "Best") written by different users
# are collapsed as well - confirm this is intended.
df = df.drop_duplicates(subset='Review Description')

# Share of missing values per column, as a percentage (fraction * 100)
missing_percentage = df.isnull().mean() * 100
print("Missing Data Percentage per Column:\n", missing_percentage)

# Drop rows with critical missing data (no review text or no rating).
# dropna returns a new frame, so the result has to be assigned back.
df = df.dropna(subset=['Review Description', 'Rating'])

# Normalize the date to YYYY-MM-DD strings; assign() returns a fresh frame, which
# avoids writing into a possibly-chained copy.
df = df.assign(Date=pd.to_datetime(df['Date']).dt.strftime('%Y-%m-%d'))

# Save to CSV, both in the working directory and under data/. Create data/ first
# so the export does not fail when the folder is missing.
# NOTE(review): the copy in the working directory looks redundant - confirm
# nothing reads it before removing it.
os.makedirs("data", exist_ok=True)
df.to_csv("cleaned_bank_reviews.csv", index=False)
df.to_csv("data/cleaned_bank_reviews.csv", index=False)

# Summary figures
total_reviews = len(df)
# Overall fraction of missing cells; guard against an empty frame (0 / 0)
total_cells = df.shape[0] * df.shape[1]
missing_data_ratio = df.isnull().sum().sum() / total_cells if total_cells else 0.0

# Display result
print(f" Final dataset contains {total_reviews} cleaned reviews.")
print(f"Total cleaned reviews: {total_reviews}")
print(f"Missing data ratio: {missing_data_ratio:.5%}")


Missing Data Percentage per Column:
 Review Description    0.0
User                  0.0
Rating                0.0
Date                  0.0
Bank                  0.0
Source                0.0
dtype: float64
 Final dataset contains 1183 cleaned reviews.
Total cleaned reviews: 1183
Missing data ratio: 0.00000%
 Final dataset contains 1183 cleaned reviews.


In [38]:
# Preview the first five cleaned rows. A bare last expression uses the notebook's
# rich table display; print() would fall back to wrapped plain text.
df.head()

                                  Review Description              User  \
0                         So bad now and hard to use     Daniel Ephrem   
1  it is so amazing app. but, it is better to upd...  abdulkerim habib   
2                                         v.good app  Abdulhalim Bedre   
3                                      very good app        Moha Yimer   
4           Very amazing app indeed. I'm enjoying it  Puoch chuol Wath   

   Rating        Date Bank             Source  
0       5  2025-06-09  CBE  Google Play Store  
1       5  2025-06-09  CBE  Google Play Store  
2       4  2025-06-09  CBE  Google Play Store  
3       1  2025-06-09  CBE  Google Play Store  
4       5  2025-06-08  CBE  Google Play Store  


In [39]:
# Write the cleaned reviews to data/bank_reviews_cleaned.csv, creating the
# folder when it is missing, then show the frame that was saved.
output_dir = "data"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "bank_reviews_cleaned.csv")
df.to_csv(output_path, index=False)
print(f"Saved cleaned reviews to {output_path}")
df

Saved cleaned reviews to data\bank_reviews_cleaned.csv


Unnamed: 0,Review Description,User,Rating,Date,Bank,Source
0,So bad now and hard to use,Daniel Ephrem,5,2025-06-09,CBE,Google Play Store
1,"it is so amazing app. but, it is better to upd...",abdulkerim habib,5,2025-06-09,CBE,Google Play Store
2,v.good app,Abdulhalim Bedre,4,2025-06-09,CBE,Google Play Store
3,very good app,Moha Yimer,1,2025-06-09,CBE,Google Play Store
4,Very amazing app indeed. I'm enjoying it,Puoch chuol Wath,5,2025-06-08,CBE,Google Play Store
...,...,...,...,...,...,...
1448,Faster and userfriendly,bamlak fekadu,5,2025-01-14,Dashen,Google Play Store
1450,"Waw Great and innovated,user friendly, always ...",TSEGAW WORKIE,5,2025-01-13,Dashen,Google Play Store
1451,It's Best waww 🙏,Kemal Keyire,5,2025-01-13,Dashen,Google Play Store
1452,Always one step ahead,Anwarf ahmed,5,2025-01-13,Dashen,Google Play Store


In [43]:
# KPI summary for the cleaned dataset.
# total_reviews and missing_data_ratio are computed in the preprocessing cell
# after de-duplication, so the count reflects cleaned reviews, not the raw
# number scraped.
print("🎯 KPI Summary:")
print(' ================')
print(f"✔️ Total cleaned reviews: {total_reviews}")
print(f"✔️ Missing data rate: {missing_data_ratio:.2%}")
print("✔️ CSV export complete.")

🎯 KPI Summary:
✔️ Total reviews collected: 1183
✔️ Missing data rate: 0.00%
✔️ CSV export complete.
