# Task 1: Data Collection and Preprocessing (v2)

This notebook handles the scraping of Google Play Store reviews for three bank apps, preprocessing the data, and saving it to a CSV file.

**Instructions:**
1. Update the `bank_apps` dictionary in the 'Configuration' section with the actual Google Play Store app IDs and desired names for your three target banks.
2. Run all cells to perform scraping, preprocessing, and save the data.

In [None]:
# Import necessary libraries
import pandas as pd
import sys
import os

# Make the sibling `src` directory importable so the `utils` package imported
# right after this block can be resolved. The path is resolved against the
# current working directory, and it is appended only once so re-running the
# cell does not pile up duplicate entries.
module_path = os.path.abspath(os.path.join(os.pardir, 'src'))
already_on_path = module_path in sys.path
if not already_on_path:
    sys.path.append(module_path)
    print(f"Added {module_path} to sys.path")

from utils.scraper import PlayStoreScraper
from utils.preprocessor import ReviewPreprocessor

## 1. Configuration

**IMPORTANT:** The `bank_apps` dictionary below maps each Google Play Store app ID to the bank name used for that app's reviews. Replace any placeholder or outdated `app_id` values with the actual Google Play Store IDs for the apps you want to scrape. You can find the app ID in the URL of the app's Play Store page (e.g., for Gmail, the URL is `https://play.google.com/store/apps/details?id=com.google.android.gm`, so the ID is `com.google.android.gm`).

In [None]:
# Google Play app IDs (keys) mapped to the bank name (values) that is passed to
# the scraper and the preprocessor for each app. Edit this mapping to target
# different apps; the ID is the `id=` query parameter of the app's Play Store URL.
bank_apps = {
    'com.combanketh.mobilebanking': 'Commercial Bank of Ethiopia',
    # NOTE(review): "Abysinnia" looks like a misspelling of "Abyssinia". It is
    # left unchanged here because this label is handed to the scraper and
    # preprocessor and presumably ends up in the saved data; confirm nothing
    # depends on the current spelling before correcting it.
    'com.boa.boaMobileBanking': 'Bank of Abysinnia',
    'com.dashen.dashensuperapp': 'Dashen Bank',
}

TARGET_REVIEWS_PER_APP = 400  # Passed as `count` to the scraper for every app
LANG = 'en'  # Language for reviews
COUNTRY = 'us'  # Country for Play Store context (affects review availability)

DATA_DIR = '../data'
# Built from DATA_DIR so the directory and the file path cannot drift apart.
OUTPUT_CSV_PATH = f'{DATA_DIR}/google_play_reviews.csv'
DATA_DIR = '../data'

## 2. Initialize Scraper and Preprocessor

In [None]:
# Build one instance of each project helper (imported from utils.scraper and
# utils.preprocessor), using their default constructor arguments.
# `scraper.get_reviews` and `preprocessor.preprocess_data` are then called once
# per app in the scraping loop.
scraper = PlayStoreScraper()
preprocessor = ReviewPreprocessor()

## 3. Scrape and Preprocess Reviews for Each App

In [None]:
# One preprocessed DataFrame per app ends up in this list; it is rebuilt from
# scratch every time the cell runs, so re-running never duplicates data.
all_processed_dfs = []

# Scrape settings that are the same for every app.
scrape_kwargs = dict(lang=LANG, country=COUNTRY, count=TARGET_REVIEWS_PER_APP)

for app_id, bank_name in bank_apps.items():
    print(f"--- Processing: {bank_name} ({app_id}) ---")

    # Step 1: fetch the raw reviews for this app.
    raw_reviews = scraper.get_reviews(app_id=app_id, app_name=bank_name, **scrape_kwargs)

    # Nothing came back: report it and move on to the next app. Note that the
    # closing separator line is not printed on this path.
    if not raw_reviews:
        print(f"No raw reviews fetched for {bank_name}. Skipping preprocessing.")
        continue

    print(f"Fetched {len(raw_reviews)} raw reviews for {bank_name}.")

    # Step 2: preprocess, and keep the result only if any rows remain.
    bank_df = preprocessor.preprocess_data(raw_reviews, bank_name)

    if bank_df.empty:
        print(f"No reviews remaining after preprocessing for {bank_name}.")
    else:
        all_processed_dfs.append(bank_df)
        print(f"Finished preprocessing for {bank_name}. {len(bank_df)} reviews added.")
    print("------------------------------------\n")

## 4. Combine Data and Save to CSV

In [None]:
# Combine the per-app frames into one dataset and write it to OUTPUT_CSV_PATH.
# `final_df` is bound to an empty frame first so that it always exists for the
# KPI cell, even when nothing was collected.
final_df = pd.DataFrame()

if all_processed_dfs:
    final_df = pd.concat(all_processed_dfs, ignore_index=True)
    print(f"Total reviews combined from all apps: {len(final_df)}")

    # Create the directory the CSV is actually written to. Deriving it from
    # OUTPUT_CSV_PATH (instead of a separately configured directory) keeps the
    # write from failing if the two settings ever point at different places.
    # A bare file name has no directory part, hence the current-dir fallback.
    output_dir = os.path.dirname(OUTPUT_CSV_PATH) or os.curdir
    os.makedirs(output_dir, exist_ok=True)

    final_df.to_csv(OUTPUT_CSV_PATH, index=False)
    print(f"Combined dataset saved to: {OUTPUT_CSV_PATH}")
    print("\nFirst 5 rows of the combined dataset:")
    print(final_df.head())
else:
    print("No data was processed or collected from any app. CSV file not saved.")

## 5. KPIs Check

In [None]:
# Report the two KPIs for the combined dataset:
#   1. enough reviews were collected overall, and
#   2. the share of missing (null) cells across the whole frame is below 5%.
if not final_df.empty:
    total_reviews_collected = len(final_df)
    print(f"Total reviews in final dataset: {total_reviews_collected}")

    # Missing data calculation
    missing_data_summary = final_df.isnull().sum()  # null count per column
    total_cells = final_df.size  # Total number of cells (rows * columns)
    total_missing_cells = missing_data_summary.sum()
    missing_percentage = (total_missing_cells / total_cells) * 100 if total_cells > 0 else 0

    print("\nMissing data summary (per column):")
    print(missing_data_summary[missing_data_summary > 0])  # Show only columns with missing data
    print(f"\nOverall missing data: {total_missing_cells} cells out of {total_cells} ({missing_percentage:.2f}%)")

    # KPI 1: 1,200+ reviews collected
    # The expected total follows the number of configured apps rather than a
    # hard-coded count, so it stays correct if `bank_apps` gains or loses entries.
    target_total_reviews = len(bank_apps) * TARGET_REVIEWS_PER_APP
    min_total_reviews = 1200  # Absolute minimum from brief
    if total_reviews_collected >= target_total_reviews:
        print(f"\nKPI Met: {total_reviews_collected} reviews collected (Target: {target_total_reviews}+). ({total_reviews_collected/target_total_reviews*100:.2f}% of target)")
    elif total_reviews_collected >= min_total_reviews:
        # Only reachable when the configured target is higher than the minimum.
        print(f"\nKPI Met (Minimum): {total_reviews_collected} reviews collected (Overall Target: {min_total_reviews}+). ({total_reviews_collected/min_total_reviews*100:.2f}% of minimum target)")
    else:
        print(f"\nKPI Not Met: Expected {target_total_reviews}+ (or min {min_total_reviews}) reviews, got {total_reviews_collected}.")

    # KPI 2: <5% missing data
    if missing_percentage < 5:
        print("KPI Met: Missing data is less than 5%.")
    else:
        print(f"KPI Not Met: Expected missing data < 5%, got {missing_percentage:.2f}%.")
else:
    print("No data in the final DataFrame. KPIs cannot be checked.")