In [1]:
from google_play_scraper import reviews, Sort
import pandas as pd
from tqdm import tqdm
import time
from datetime import datetime

# Display name -> Google Play package id for every banking app to scrape.
# Insertion order is the order in which the apps are scraped.
BANKS = dict([
    ("Chase", "com.chase.sig.android"),
    ("Bank of America", "com.infonow.bofa"),
    ("Wells Fargo", "com.wf.wellsfargomobile"),
])

def scrape_reviews(
    app_id: str,
    bank_name: str,
    count: int = 400,
    max_retries: int = 5,
    delay: float = 2,
    retry_delay: float = 30,
) -> pd.DataFrame:
    """Scrape the newest English reviews of one app, with rate limiting.

    Reviews are requested in pages of at most 100 via ``reviews(...)``,
    following the continuation token it returns, until ``count`` reviews
    have been collected or a page comes back empty.

    Args:
        app_id: Google Play package id passed to ``reviews``.
        bank_name: Label stored in the ``bank`` column and shown on the
            progress bar.
        count: Maximum number of reviews to collect.
        max_retries: Number of consecutive failed requests tolerated before
            giving up and returning what has been collected so far.
        delay: Seconds to wait between successful page requests.
        retry_delay: Seconds to wait after a failed request before retrying.

    Returns:
        DataFrame with columns ``review``, ``rating``, ``date``
        (``YYYY-MM-DD`` string), ``bank`` and ``source``. The columns are
        present even when no review could be fetched.
    """
    columns = ['review', 'rating', 'date', 'bank', 'source']
    all_reviews = []
    continuation_token = None
    consecutive_failures = 0

    with tqdm(total=count, desc=f"Scraping {bank_name}") as pbar:
        while len(all_reviews) < count:
            remaining = count - len(all_reviews)

            # Only the network call is guarded: a parsing problem must not be
            # mistaken for a transient request failure and retried.
            try:
                result, continuation_token = reviews(
                    app_id,
                    lang='en',
                    count=min(100, remaining),
                    sort=Sort.NEWEST,
                    continuation_token=continuation_token
                )
            except Exception as e:
                consecutive_failures += 1
                print(f"Error: {e}")
                if consecutive_failures >= max_retries:
                    # Bounded retries: a persistent failure would otherwise
                    # keep this loop spinning forever.
                    print(
                        f"Giving up on {bank_name} after "
                        f"{consecutive_failures} consecutive failed requests"
                    )
                    break
                time.sleep(retry_delay)
                continue

            consecutive_failures = 0
            if not result:
                break

            # Never collect more than was asked for, even if a page is larger.
            batch = result[:remaining]
            for review in batch:
                # `content` / `at` may be present but None; fall back the same
                # way as when the key is missing instead of raising.
                text = review.get('content') or ''
                posted_at = review.get('at') or datetime.now()
                all_reviews.append({
                    'review': text.strip(),
                    'rating': review.get('score', 0),
                    'date': posted_at.strftime('%Y-%m-%d'),
                    'bank': bank_name,
                    'source': 'Google Play Store'
                })

            pbar.update(len(batch))
            if len(all_reviews) < count:
                time.sleep(delay)  # Rate limiting between page requests

    # Explicit columns keep the schema stable when nothing was collected.
    return pd.DataFrame(all_reviews, columns=columns)

def main(output_file: str = 'bank_reviews.csv'):
    """Scrape every app in ``BANKS`` and save the combined reviews as CSV.

    Each bank is scraped with ``scrape_reviews``; the per-bank frames are
    concatenated once, exact duplicate reviews within a bank and rows without
    review text are dropped, and the result is written to ``output_file``.

    Args:
        output_file: Path of the CSV file to write.
    """
    # Collect per-bank frames and concatenate once instead of growing a
    # DataFrame inside the loop; empty frames are skipped so they cannot
    # disturb the dtypes/columns of the combined result.
    frames = []
    for bank_name, app_id in BANKS.items():
        df = scrape_reviews(app_id, bank_name)
        if not df.empty:
            frames.append(df)

    if not frames:
        # Without any rows there is no 'review' column to deduplicate on.
        print("No reviews were scraped; nothing saved")
        return

    all_data = pd.concat(frames, ignore_index=True)

    # Preprocessing
    all_data = all_data.drop_duplicates(subset=['review', 'bank'])
    all_data = all_data.dropna(subset=['review'])
    all_data.to_csv(output_file, index=False)
    print(f"Saved {len(all_data)} reviews to {output_file}")

# Entry point: run the full scrape when this code is executed directly
# rather than imported as a module.
if __name__ == "__main__":
    main()

Scraping Chase: 100%|██████████| 400/400 [00:11<00:00, 33.52it/s] 
Scraping Bank of America: 100%|██████████| 400/400 [00:47<00:00,  8.35it/s] 
Scraping Wells Fargo: 100%|██████████| 400/400 [00:12<00:00, 32.27it/s]

Saved 1102 reviews to bank_reviews.csv





In [3]:
import pandas as pd
from datetime import datetime

def preprocess_reviews(input_file='bank_reviews.csv', output_file='clean_reviews.csv'):
    """Clean the scraped reviews CSV and write the result to a new CSV.

    Steps, in order: drop rows with no review text, fill missing ratings
    with 0 and cast them to int, re-format dates as ``YYYY-MM-DD``
    (unparseable dates become missing), strip surrounding whitespace from
    the review text, and keep only reviews longer than 5 characters.

    Args:
        input_file: Path of the CSV to read.
        output_file: Path of the cleaned CSV to write (without the index).

    Returns:
        The cleaned DataFrame, keeping the row labels of the rows that
        survived filtering.
    """
    raw = pd.read_csv(input_file)

    # Missing data, date normalisation and text trimming in one chain; each
    # lambda only touches its own column, so evaluation order is irrelevant.
    tidy = raw.dropna(subset=['review']).assign(
        rating=lambda frame: frame['rating'].fillna(0).astype(int),
        date=lambda frame: pd.to_datetime(
            frame['date'], errors='coerce'
        ).dt.strftime('%Y-%m-%d'),
        review=lambda frame: frame['review'].str.strip(),
    )

    # Discard reviews that are empty or too short to be useful.
    long_enough = tidy['review'].str.len().gt(5)
    cleaned = tidy.loc[long_enough]

    # Persist and report per-bank counts.
    cleaned.to_csv(output_file, index=False)
    bank_counts = cleaned['bank'].value_counts()
    print(f"Cleaned data saved to {output_file}")
    print(f"Final stats:\n{bank_counts}")
    return cleaned

# Entry point: run the cleaning pipeline with its default file names when
# this code is executed directly rather than imported as a module.
if __name__ == "__main__":
    preprocess_reviews()

Cleaned data saved to clean_reviews.csv
Final stats:
bank
Wells Fargo        358
Chase              355
Bank of America    353
Name: count, dtype: int64
