In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

sns.set_style("whitegrid")

# Load data
raw_data_path = Path('../data/synthetic/stree_shakti_trips_raw.csv')
df = pd.read_csv(raw_data_path)

print("="*60)
print("DATA LOADING & INSPECTION")
print("="*60)
print(f"\nDataset shape: {df.shape}")
print(f"Memory usage: {df.memory_usage().sum() / 1e6:.2f} MB")
print(f"\nFirst few rows:\n{df.head()}")
print(f"\nMissing values: {df.isnull().sum().sum()}")


DATA LOADING & INSPECTION

Dataset shape: (1000000, 14)
Memory usage: 112.00 MB

First few rows:
         trip_id        date      time  hour day_of_week     bus_id  \
0  TRIP_00000001  2025-08-27  15:34:30    15   Wednesday  BUS_07143   
1  TRIP_00000002  2025-09-15  14:09:21    14      Monday  BUS_03866   
2  TRIP_00000003  2025-08-30  08:44:16     8    Saturday  BUS_03331   
3  TRIP_00000004  2025-10-08  20:01:27    20   Wednesday  BUS_04814   
4  TRIP_00000005  2025-11-12  16:53:19    16   Wednesday  BUS_06517   

  route_category  distance_km passenger_gender  passenger_type age_group  \
0          urban          6.2                M            Paid     18-30   
1          urban         17.5                F  Senior_Citizen     45-60   
2          urban          4.8                F    Stree_Shakti     18-30   
3          urban          7.0                F    Stree_Shakti     18-30   
4          urban          8.3                F         Student     18-30   

   normal_fare  rev

In [2]:
print("\n" + "="*60)
print("DATA TYPE CONVERSION")
print("="*60)

# Convert date and time columns
df['date'] = pd.to_datetime(df['date'])
df['time'] = pd.to_datetime(df['time'], format='%H:%M:%S').dt.time
df['hour'] = df['hour'].astype(int)

# Convert numeric columns
df['distance_km'] = df['distance_km'].astype(float)
df['normal_fare'] = df['normal_fare'].astype(int)
df['revenue_loss'] = df['revenue_loss'].astype(int)
df['occupancy_pct'] = df['occupancy_pct'].astype(int)

# Ensure categorical columns
df['passenger_gender'] = df['passenger_gender'].astype('category')
df['passenger_type'] = df['passenger_type'].astype('category')
df['age_group'] = df['age_group'].astype('category')
df['route_category'] = df['route_category'].astype('category')
df['day_of_week'] = df['day_of_week'].astype('category')

print(f"\n✓ Data types converted successfully")
print(f"\nNew memory usage: {df.memory_usage().sum() / 1e6:.2f} MB")
print(f"\nData types:\n{df.dtypes}")



DATA TYPE CONVERSION

✓ Data types converted successfully

New memory usage: 61.00 MB

Data types:
trip_id                     object
date                datetime64[ns]
time                        object
hour                         int32
day_of_week               category
bus_id                      object
route_category            category
distance_km                float64
passenger_gender          category
passenger_type            category
age_group                 category
normal_fare                  int32
revenue_loss                 int32
occupancy_pct                int32
dtype: object


In [3]:
print("\n" + "="*60)
print("OUTLIER DETECTION")
print("="*60)

df_clean = df.copy()
outliers_removed = 0

# Invalid occupancy (should be 0-100%)
invalid_occupancy = df_clean[(df_clean['occupancy_pct'] < 0) | (df_clean['occupancy_pct'] > 100)]
if len(invalid_occupancy) > 0:
    df_clean = df_clean[(df_clean['occupancy_pct'] >= 0) & (df_clean['occupancy_pct'] <= 100)]
    outliers_removed += len(invalid_occupancy)
print(f"Invalid occupancy removed: {len(invalid_occupancy)}")

# Invalid distance (should be positive)
invalid_distance = df_clean[df_clean['distance_km'] <= 0]
if len(invalid_distance) > 0:
    df_clean = df_clean[df_clean['distance_km'] > 0]
    outliers_removed += len(invalid_distance)
print(f"Invalid distance removed: {len(invalid_distance)}")

# Invalid revenue loss (should be non-negative)
invalid_revenue = df_clean[df_clean['revenue_loss'] < 0]
if len(invalid_revenue) > 0:
    df_clean = df_clean[df_clean['revenue_loss'] >= 0]
    outliers_removed += len(invalid_revenue)
print(f"Invalid revenue loss removed: {len(invalid_revenue)}")

print(f"\n✓ Total outliers removed: {outliers_removed}")
print(f"  Remaining records: {len(df_clean):,}")
print(f"  Retention rate: {len(df_clean)/len(df)*100:.2f}%")



OUTLIER DETECTION
Invalid occupancy removed: 0
Invalid distance removed: 0
Invalid revenue loss removed: 0

✓ Total outliers removed: 0
  Remaining records: 1,000,000
  Retention rate: 100.00%


In [4]:
print("\n" + "="*60)
print("FEATURE ENGINEERING")
print("="*60)

# Date components
df_clean['year'] = df_clean['date'].dt.year
df_clean['month'] = df_clean['date'].dt.month
df_clean['week'] = df_clean['date'].dt.isocalendar().week
df_clean['day'] = df_clean['date'].dt.day
df_clean['is_weekend'] = df_clean['date'].dt.dayofweek.isin([5, 6]).astype(int)
print("✓ Date components added")

# Time period classification
def classify_time_period(hour):
    if 6 <= hour < 9:
        return 'Early_Peak'
    elif 9 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 14:
        return 'Midday'
    elif 14 <= hour < 17:
        return 'Afternoon'
    elif 17 <= hour < 20:
        return 'Evening_Peak'
    else:
        return 'Night'

df_clean['time_period'] = df_clean['hour'].apply(classify_time_period)
print("✓ Time period classification added")

# Occupancy category
def classify_occupancy(occ):
    if occ > 90:
        return 'Overcrowded'
    elif occ > 75:
        return 'High'
    elif occ > 50:
        return 'Medium'
    else:
        return 'Low'

df_clean['occupancy_category'] = df_clean['occupancy_pct'].apply(classify_occupancy)
print("✓ Occupancy categories added")

# Beneficiary indicators
df_clean['beneficiary_trip'] = (df_clean['passenger_type'] == 'Stree_Shakti').astype(int)
df_clean['concessional_trip'] = (df_clean['passenger_type'].isin(['Student', 'Senior_Citizen'])).astype(int)
print("✓ Beneficiary indicators added")

print(f"\n✓ Feature engineering complete!")
print(f"  Total columns: {len(df_clean)}")



FEATURE ENGINEERING
✓ Date components added
✓ Time period classification added
✓ Occupancy categories added
✓ Beneficiary indicators added

✓ Feature engineering complete!
  Total columns: 1000000


In [5]:
# Save to CSV
processed_path = Path('../data/processed/stree_shakti_trips_cleaned.csv')
processed_path.parent.mkdir(parents=True, exist_ok=True)
df_clean.to_csv(processed_path, index=False)

print(f"\n✓ Cleaned data saved!")
print(f"✓ File: {processed_path}")
print(f"\n✓✓✓ DATA CLEANING COMPLETE ✓✓✓")
print(f"Final dataset size: {len(df_clean):,} records")



✓ Cleaned data saved!
✓ File: ..\data\processed\stree_shakti_trips_cleaned.csv

✓✓✓ DATA CLEANING COMPLETE ✓✓✓
Final dataset size: 1,000,000 records
