In [5]:
!pip install pandas
!pip install seaborn
!pip install matplotlib

Collecting pandas
  Downloading pandas-2.3.0-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting numpy>=1.23.2 (from pandas)
  Downloading numpy-2.3.0-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.0-cp311-cp311-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
    --------------------------------------- 0.3/11.1 MB ? eta -:--:--
    --------------------------------------- 0.3/11.1 MB ? eta -:--:--
    --------------------------------------- 0.3/11.1 MB ? eta -:--:--
    --------------------------------------

In [2]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Make the sibling ``scripts`` directory importable: resolve it relative to the
# current working directory and register it on sys.path only once.
scripts_path = os.path.abspath(
    os.path.join(os.getcwd(), '..', 'scripts')
)
already_registered = any(entry == scripts_path for entry in sys.path)
if not already_registered:
    sys.path.append(scripts_path)

from preprocessing import ReviewPreprocessor
from utils import CLEANED_DATA_DIR, APP_ID_TO_BANK_NAME, TODAY_DATE_STR

ModuleNotFoundError: No module named 'tqdm'


2. Set Paths and List Raw Files

In [None]:
# Resolve the raw and cleaned data directories relative to the notebook's
# working directory.
# NOTE(review): CLEANED_DATA_DIR is also imported from `utils` in the imports
# cell and is rebound here — confirm both point at the same directory.
RAW_DATA_DIR = os.path.abspath(os.path.join(os.getcwd(), '..', 'data', 'raw'))
CLEANED_DATA_DIR = os.path.abspath(os.path.join(os.getcwd(), '..', 'data', 'cleaned'))

# Create the output directory up front; a no-op when it already exists.
os.makedirs(CLEANED_DATA_DIR, exist_ok=True)

# os.listdir returns entries in arbitrary order, so sort the CSV paths to keep
# the processing order (and anything derived from it) reproducible across runs.
raw_files = sorted(
    os.path.join(RAW_DATA_DIR, f)
    for f in os.listdir(RAW_DATA_DIR)
    if f.endswith('.csv')
)
print("Raw files found:", raw_files)

3. Preprocess Raw Data

In [None]:
# Configure the preprocessor with CLEANED_DATA_DIR and the APP_ID_TO_BANK_NAME
# mapping, then run it over the list of raw CSV paths.
# NOTE(review): presumably preprocess_batch also writes the cleaned CSVs that
# the next section loads — confirm in scripts/preprocessing.py.
preprocessor = ReviewPreprocessor(cleaned_data_dir=CLEANED_DATA_DIR, app_id_to_bank_name=APP_ID_TO_BANK_NAME)
combined_df = preprocessor.preprocess_batch(raw_files)

4. Load Cleaned Data

In [None]:
# Read the combined cleaned CSV (named after TODAY_DATE_STR) from
# CLEANED_DATA_DIR and preview its first five rows.
combined_cleaned_path = os.path.join(
    CLEANED_DATA_DIR,
    f'all_reviews_cleaned_{TODAY_DATE_STR}.csv',
)
df = pd.read_csv(combined_cleaned_path)
df.head(n=5)

5. Exploratory Data Analysis (EDA)
5.1 Overview

In [None]:
# Quick structural overview: dimensions, column names, dtypes / non-null
# counts, and summary statistics for every column.
column_names = df.columns.tolist()
print("Shape:", df.shape)
print("Columns:", column_names)
df.info()
# Last expression of the cell, so it is rendered as the cell's output.
df.describe(include='all')

ModuleNotFoundError: No module named 'pandas'

5.2 Missing Values

In [None]:
df.isnull().sum()

5.3 Distribution of Ratings

In [None]:
# Number of reviews at each rating value. The previous version of this cell
# repeated the reviews-per-bank plot and never showed ratings.
# NOTE(review): assumes the cleaned data stores the score in a 'rating'
# column — confirm against df.columns before relying on this plot.
sns.countplot(data=df, x='rating')
plt.title('Distribution of Ratings')
plt.xlabel('Rating')
plt.ylabel('Number of Reviews')
plt.show()

ModuleNotFoundError: No module named 'pandas'

5.4 Reviews per Bank

In [None]:
# Horizontal bar chart with one bar per bank, ordered by review count
# (value_counts sorts from most to fewest reviews).
ax = sns.countplot(
    y='bank_name',
    data=df,
    order=df['bank_name'].value_counts().index,
)
ax.set_title('Number of Reviews per Bank')
plt.show()

ModuleNotFoundError: No module named 'pandas'

5.5 Review Length Distribution


In [None]:
# Character length of each review. Missing text is replaced with an empty
# string first: a plain astype(str) would turn NaN into the literal 'nan' and
# report a bogus length of 3. `.str.len()` is the vectorised form of apply(len).
df['review_length'] = df['review_text'].fillna('').astype(str).str.len()
sns.histplot(df['review_length'], bins=30)
plt.title('Distribution of Review Lengths')
plt.xlabel('Review Length (characters)')
plt.show()

5.6 Reviews Over Time

In [None]:
# Parse review dates (unparseable values become NaT) and bucket them by month.
df['review_date'] = pd.to_datetime(df['review_date'], errors='coerce')
df['year_month'] = df['review_date'].dt.to_period('M')

# Without an explicit order the months appear on the axis in the order they
# are first encountered in the data, not chronologically. Build a sorted month
# order; dropping NaT here also keeps rows with unparseable dates off the axis.
month_order = sorted(df['year_month'].dropna().unique())

sns.countplot(data=df, x='year_month', hue='bank_name', order=month_order)
plt.title('Reviews Over Time by Bank')
plt.xticks(rotation=45)
plt.show()

ModuleNotFoundError: No module named 'pandas'