In [2]:
# %pip install google-play-scraper pandas nltk

In [3]:
# scrape_and_preprocess.py

import pandas as pd
from google_play_scraper import Sort, reviews
import time
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK assets once
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Belay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Belay\AppData\Roaming\nltk_data...


True

In [4]:
# ========== FUNCTION: Scrape Reviews ==========
def fetch_reviews(app_id, total_reviews=200):
    all_reviews = []
    try:
        for _ in range(0, total_reviews, 100):
            result, _ = reviews(
                app_id,
                lang='en',
                country='us',
                sort=Sort.NEWEST,
                count=100
            )
            all_reviews.extend(result)
            time.sleep(1)  # avoid rate limit
        df = pd.DataFrame(all_reviews)[['userName', 'content', 'score', 'at', 'reviewCreatedVersion']]
        df.drop_duplicates(subset='content', inplace=True)
        print(f"✅ Successfully scraped {len(df)} reviews.")
        return df
    except Exception as e:
        print(f"❌ Error during scraping: {e}")
        return pd.DataFrame()

# ========== FUNCTION: Clean and Preprocess Text ==========
def clean_text(text):
    try:
        # Remove URLs, special characters, and lowercase
        text = re.sub(r"http\S+|[^a-zA-Z\s]", "", text.lower())
        stop_words = set(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        words = text.split()
        words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
        return " ".join(words)
    except Exception as e:
        print(f"❌ Error cleaning text: {e}")
        return ""

# ========== FUNCTION: Apply Preprocessing ==========
def preprocess_reviews(df):
    try:
        df['clean_text'] = df['content'].apply(clean_text)
        return df
    except Exception as e:
        print(f"❌ Error preprocessing reviews: {e}")
        return df

# ========== MAIN EXECUTION ==========
if __name__ == "__main__":
    # Example for Commercial Bank of Ethiopia (CBE) app ID
    app_id = 'com.cbe.mobile'
    
    # Step 1: Scrape Reviews
    reviews_df = fetch_reviews(app_id, total_reviews=300)
    
    if not reviews_df.empty:
        # Step 2: Preprocess Text
        cleaned_df = preprocess_reviews(reviews_df)

        # Step 3: Save for analysis
        cleaned_df.to_csv("cbe_reviews_cleaned.csv", index=False)
        print("✅ Saved cleaned reviews to 'cbe_reviews_cleaned.csv'.")


❌ Error during scraping: "None of [Index(['userName', 'content', 'score', 'at', 'reviewCreatedVersion'], dtype='object')] are in the [columns]"
