In [None]:
Preprocessing includes:
    - Removing duplicates
    - Dropping rows with missing critical fields
    - Normalizing the date format to 'YYYY-MM-DD'

The final cleaned dataset is saved to: `data/clean_reviews.csv` (the raw input is read from `data/raw_reviews.csv`)

KPI Targets:
------------
- Minimum 400 reviews per bank (1,200 total)
- <5% missing data after preprocessing
- Clean, structured CSV output for downstream analysis

In [2]:
import pandas as pd


In [3]:
# 2. Define preprocessing function
def preprocess_reviews(input_path, output_path):
    """Clean a raw reviews CSV and write the result to a new CSV.

    Steps, in order:
      1. Drop rows whose 'review' text duplicates an earlier row
         (the first occurrence is kept).
      2. Drop rows missing any of 'review', 'rating' or 'date'.
      3. Parse 'date'; rows whose date cannot be parsed are dropped and the
         remaining dates are rendered as 'YYYY-MM-DD' strings.

    Parameters
    ----------
    input_path : str or path-like
        CSV file to read. Must contain 'review', 'rating' and 'date' columns.
    output_path : str or path-like
        Destination CSV file (written without the index column).

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, identical in content to what was written to
        ``output_path``, so callers can inspect it without re-reading the file.

    Raises
    ------
    KeyError
        If one of the required columns is absent from the input file.
    """
    raw_reviews = pd.read_csv(input_path)

    # Non-mutating chain: the frame as loaded stays untouched, which keeps
    # the function safe to re-run and easy to debug.
    deduped = (
        raw_reviews
        .drop_duplicates(subset='review')
        .dropna(subset=['review', 'rating', 'date'])
    )

    # Unparseable dates become NaT (errors='coerce') and are filtered out.
    parsed_dates = pd.to_datetime(deduped['date'], errors='coerce')
    has_valid_date = parsed_dates.notna()

    # Format explicitly instead of relying on to_csv's implicit str() of
    # date objects, so the in-memory frame and the file agree.
    clean_reviews = deduped.loc[has_valid_date].assign(
        date=parsed_dates.loc[has_valid_date].dt.strftime('%Y-%m-%d')
    )

    # Save cleaned data
    clean_reviews.to_csv(output_path, index=False)
    print(f"Cleaned data saved to {output_path}")

    return clean_reviews

# 3. Define main()
def main():
    """Run the cleaning step from the raw reviews CSV to the clean reviews CSV."""
    source_csv, target_csv = '../data/raw_reviews.csv', '../data/clean_reviews.csv'
    preprocess_reviews(source_csv, target_csv)

# 4. Run main() -- executing this cell reads the raw CSV and writes the cleaned CSV
#    (both paths are relative, so they resolve against the current working directory)
main()

Cleaned data saved to ../data/clean_reviews.csv
