In [1]:
!pip install pandas
!pip install seaborn
!pip install matplotlib



In [2]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Make the project's modules importable from this notebook.
# The project root is taken to be one level above the current working
# directory (the same assumption the original scripts path relied on).
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
scripts_path = os.path.join(project_root, 'scripts')

# `from scripts.<module> import ...` resolves `scripts` as a package, so the
# directory that CONTAINS it (the project root) has to be on sys.path; adding
# only the scripts directory itself does not make `scripts.*` importable.
# The root is inserted first so the project's `scripts` package wins over any
# other importable module or package that happens to share the name.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Keep the scripts directory on the path as before, in case modules inside it
# import their siblings by bare name — TODO confirm whether this is still needed.
if scripts_path not in sys.path:
    sys.path.append(scripts_path)

from scripts.preprocessing import ReviewPreprocessor
from scripts.utils import CLEANED_DATA_DIR, APP_ID_TO_BANK_NAME, TODAY_DATE_STR

ModuleNotFoundError: No module named 'scripts.preprocessing'


2. Set Paths and List Raw Files

In [None]:
# Input and output locations, resolved relative to the notebook's working directory.
RAW_DATA_DIR = os.path.abspath(os.path.join(os.getcwd(), '..', 'data', 'raw'))
# NOTE(review): this rebinds the CLEANED_DATA_DIR name imported from
# scripts.utils in the import cell; from here on the notebook uses this value.
CLEANED_DATA_DIR = os.path.abspath(os.path.join(os.getcwd(), '..', 'data', 'cleaned'))

# Create the output directory up front; a pre-existing directory is not an error.
os.makedirs(CLEANED_DATA_DIR, exist_ok=True)

# Collect every CSV in the raw directory. os.listdir() returns entries in
# arbitrary order, so the list is sorted to make the processing order
# reproducible between runs and machines.
raw_files = sorted(
    os.path.join(RAW_DATA_DIR, f) for f in os.listdir(RAW_DATA_DIR) if f.endswith('.csv')
)
print("Raw files found:", raw_files)

3. Preprocess Raw Data

In [None]:
# Configure the preprocessor with the output directory and the
# app-id -> bank-name mapping, both passed by keyword.
preprocessor = ReviewPreprocessor(app_id_to_bank_name=APP_ID_TO_BANK_NAME, cleaned_data_dir=CLEANED_DATA_DIR)

# Run the whole list of raw CSV paths through the preprocessor in one batch
# and keep whatever it returns.
combined_df = preprocessor.preprocess_batch(raw_files)

4. Load Cleaned Data

In [None]:
# Read the combined cleaned CSV for today's date back from disk.
# NOTE(review): presumably this file is written by the preprocessing step
# above — confirm against ReviewPreprocessor.
combined_cleaned_filename = f'all_reviews_cleaned_{TODAY_DATE_STR}.csv'
combined_cleaned_path = os.path.join(CLEANED_DATA_DIR, combined_cleaned_filename)
df = pd.read_csv(combined_cleaned_path)

# Preview the first rows.
df.head()

5. Exploratory Data Analysis (EDA)
5.1 Overview

In [None]:
# Structural summary of the loaded table: size, column names, per-column
# dtypes / non-null counts, then summary statistics for every column.
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
df.info()
df.describe(include='all')

5.2 Missing Values

In [None]:
# Number of missing values in each column.
df.isna().sum()

5.3 Distribution of Ratings

In [None]:
# Number of reviews for each rating value. This cell previously repeated the
# reviews-per-bank plot from the next section instead of showing ratings.
sns.countplot(data=df, x='rating')
plt.title('Distribution of Ratings')
plt.xlabel('Rating')
plt.ylabel('Number of Reviews')
plt.show()

5.4 Reviews per Bank

In [None]:
# Order the banks by how many reviews each has, so the bars follow the
# ranking produced by value_counts().
banks_by_review_count = df['bank_name'].value_counts().index

# Horizontal bars: one per bank, length = number of reviews.
ax = sns.countplot(data=df, y='bank_name', order=banks_by_review_count)
ax.set_title('Number of Reviews per Bank')
plt.show()

5.5 Review Length Distribution


In [None]:
# Length of every review in characters. Missing reviews are replaced by an
# empty string first: otherwise astype(str) turns NaN into the text 'nan'
# and the review would be counted as 3 characters long.
df['review_length'] = df['review_text'].fillna('').astype(str).str.len()

# Histogram of the lengths, split into 30 bins.
sns.histplot(df['review_length'], bins=30)
plt.title('Distribution of Review Lengths')
plt.xlabel('Review Length (characters)')
plt.show()

5.6 Reviews Over Time

In [None]:
# Parse the review dates; values that cannot be parsed become NaT instead of
# raising.
df['review_date'] = pd.to_datetime(df['review_date'], errors='coerce')

# Bucket each review into its calendar month.
df['year_month'] = df['review_date'].dt.to_period('M')

# seaborn draws non-numeric categories in order of first appearance, which
# follows the row order of the data rather than the calendar. Pass the months
# explicitly in chronological order (rows without a valid date are left out).
month_order = sorted(df['year_month'].dropna().unique())

sns.countplot(data=df, x='year_month', hue='bank_name', order=month_order)
plt.title('Reviews Over Time by Bank')
plt.xticks(rotation=45)
plt.show()

In [None]:
# --- Data Cleaning: remove duplicates, normalize dates, handle missing data, and select columns ---

# Remove rows that are exact duplicates across all current columns
df = df.drop_duplicates()

# Rename columns to match the required output. This is done before the steps
# below so they can use the final names; rename() ignores keys that are not
# present, which keeps the cell re-runnable after the columns were renamed.
df = df.rename(columns={
    'review_text': 'review',
    'bank_name': 'bank',
    # 'source' and 'rating' assumed to already match
})

# Normalize dates to YYYY-MM-DD. Parsing happens BEFORE the missing-data
# filter: values that cannot be parsed become missing here, so the filter
# below removes them instead of letting them reach the output files.
df['review_date'] = pd.to_datetime(df['review_date'], errors='coerce').dt.strftime('%Y-%m-%d')

# Handle missing data: drop rows with missing essential fields
df = df.dropna(subset=['review', 'rating', 'review_date', 'bank', 'source'])

# Select and reorder columns
df = df[['review', 'rating', 'review_date', 'bank', 'source']]

# Save cleaned combined CSV
final_cleaned_path = os.path.join(CLEANED_DATA_DIR, f'all_reviews_cleaned_final_{TODAY_DATE_STR}.csv')
df.to_csv(final_cleaned_path, index=False)
print(f"Final cleaned combined CSV saved to: {final_cleaned_path}")

# Save per-bank cleaned CSVs, one file per distinct bank in order of first
# appearance.
# NOTE(review): the bank name is used verbatim in the file name — confirm it
# never contains path separators or other characters invalid in file names.
for bank, bank_df in df.groupby('bank', sort=False):
    bank_path = os.path.join(CLEANED_DATA_DIR, f'{bank}_cleaned_{TODAY_DATE_STR}.csv')
    bank_df.to_csv(bank_path, index=False)
    print(f"Cleaned CSV for {bank} saved to: {bank_path}")

df.head()

NameError: name 'df' is not defined