In [None]:
import pandas as pd
import numpy as np


# Load the raw observations; the date strings are parsed as day-first
# timestamps, and exact duplicate rows are discarded afterwards.
df = pd.read_csv("chennai_final_dataset.csv")
df = df.assign(date=pd.to_datetime(df['date'], dayfirst=True)).drop_duplicates()

# Replace missing event metadata with explicit placeholder labels so the
# columns contain no NaN: 'None' for event/type, 'Low' for traffic impact.
df = df.assign(
    event=df['event'].fillna('None'),
    type=df['type'].fillna('None'),
    traffic_impact=df['traffic_impact'].fillna('Low'),
)



# Collapse the records to one row per (date, Junction) pair.
# Vehicle counts and the rain/precipitation/snow columns are summed,
# temperature is averaged, and the event descriptors keep the first value
# found inside each group.
aggregation_rules = {
    'Vehicles': 'sum',
    'temperature': 'mean',
    **dict.fromkeys(['rain (mm)', 'precipitation (mm)', 'snow_depth (m)'], 'sum'),
    **dict.fromkeys(['event', 'type', 'traffic_impact'], 'first'),
}
grouped_by_day = df.groupby(['date', 'Junction'], observed=False)
daily_df = grouped_by_day.agg(aggregation_rules).reset_index()


# Calendar features derived from the timestamp column.
date_parts = daily_df['date'].dt
daily_df['day_of_week'] = date_parts.dayofweek
daily_df['month'] = date_parts.month
daily_df['week_of_year'] = date_parts.isocalendar().week

# dayofweek counts Monday as 0, so values 5 and 6 are Saturday and Sunday.
daily_df['is_weekend'] = daily_df['day_of_week'].ge(5).astype(int)

# Any event label other than the 'None' placeholder marks an event day.
daily_df['is_event_day'] = daily_df['event'].ne('None').astype(int)


# Order rows chronologically inside each junction so that shifts and rolling
# windows walk along that junction's own timeline.
daily_df = daily_df.sort_values(['Junction', 'date'])

# Lag features: the vehicle count 1 and 7 rows earlier for the same junction.
# NOTE(review): a "row" equals a day only if every junction has one row per
# consecutive date - confirm there are no gaps in the data.
vehicles_by_junction = daily_df.groupby('Junction', observed=False)['Vehicles']
daily_df['vehicles_lag_1'] = vehicles_by_junction.shift(1)
daily_df['vehicles_lag_7'] = vehicles_by_junction.shift(7)

# Rolling means are taken over the *lagged* series, i.e. over the 3 (or 7)
# rows strictly before the current one. Rolling directly over 'Vehicles'
# would fold the current row's count into the window, and because these
# columns are later used as predictors of 'Vehicles' that would leak the
# target into the features.
previous_by_junction = daily_df.groupby('Junction', observed=False)['vehicles_lag_1']

daily_df['vehicles_roll_3'] = (
    previous_by_junction.rolling(3).mean().reset_index(level=0, drop=True)
)

daily_df['vehicles_roll_7'] = (
    previous_by_junction.rolling(7).mean().reset_index(level=0, drop=True)
)


# One-hot encode the categorical descriptors; drop_first removes the first
# level of each column so the indicator columns are not redundant.
categorical_columns = ['event', 'type', 'traffic_impact']
encoded_df = pd.get_dummies(daily_df, columns=categorical_columns, drop_first=True)

# Work on an independent copy so later steps leave encoded_df untouched.
preprocessed_df = encoded_df.copy()

# Preview of the first rows (only displayed when run as a notebook cell).
preprocessed_df.head()


Explain this code:

# Standardise vehicle counts against the overall mean and standard deviation.
df['traffic_zscore'] = (df['Vehicles'] - df['Vehicles'].mean()) / df['Vehicles'].std()

# Rolling means per junction. The window is applied in row order, so the rows
# are first arranged chronologically within each junction; a stable sort keeps
# the original order of rows sharing the same date. The results are assigned
# back by index label, which leaves df's own row order unchanged.
vehicles_in_time_order = (
    df.sort_values(['Junction', 'date'], kind='stable')
      .groupby('Junction')['Vehicles']
)
df['rolling_7'] = vehicles_in_time_order.rolling(7).mean().reset_index(level=0, drop=True)
df['rolling_3'] = vehicles_in_time_order.rolling(3).mean().reset_index(level=0, drop=True)

# Flag rows whose vehicle count is at or above the 90th percentile.
threshold = df['Vehicles'].quantile(0.90)
df['is_peak_day'] = (df['Vehicles'] >= threshold).astype(int)

# Mean vehicle count per weekday name and per month, for each junction.
df['day_of_week'] = df['date'].dt.day_name()
weekday_pattern = df.groupby(['day_of_week', 'Junction'])['Vehicles'].mean()
df['month'] = df['date'].dt.month
monthly_pattern = df.groupby(['month', 'Junction'])['Vehicles'].mean()

# Keep the correlation matrix under a name; as a bare expression in the middle
# of the code its result was computed and then thrown away.
correlation_matrix = df[['Vehicles', 'temperature', 'rain (mm)']].corr()

# Mean vehicle count per event label, highest first.
event_impact = df.groupby('event')['Vehicles'].mean().sort_values(ascending=False)


import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns


# Arrange rows chronologically so the split below is time-based.
data = preprocessed_df.sort_values('date')

# Target is the vehicle count; every other column except the raw date is a feature.
y = data['Vehicles']
X = data.drop(columns=['Vehicles', 'date'])

# Remove rows with any missing feature value and keep the matching targets.
X_clean = X.dropna()
y_clean = y.loc[X_clean.index]

# Earliest 80% of the rows form the training set, the remainder validation.
split_idx = int(len(X_clean) * 0.8)

X_train, X_val = X_clean.iloc[:split_idx], X_clean.iloc[split_idx:]
y_train, y_val = y_clean.iloc[:split_idx], y_clean.iloc[split_idx:]


# Baseline: random forest with default hyper-parameters, fit on the training
# split only.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)


# Gradient boosting tuned by an exhaustive grid search.
gb = GradientBoostingRegressor(random_state=42)

param_grid_gb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5],
    'subsample': [0.8, 1.0]
}

# Time-ordered folds: each fold validates on rows that come after its
# training rows.
tscv = TimeSeriesSplit(n_splits=5)

gb_grid = GridSearchCV(
    gb,
    param_grid=param_grid_gb,
    scoring='neg_mean_absolute_error',
    cv=tscv,
    n_jobs=-1
)

# Search on the training split only. GridSearchCV refits the best
# configuration on everything passed to fit(), so fitting on X_clean would
# train best_gb on the validation rows and make its validation metrics
# meaningless.
gb_grid.fit(X_train, y_train)
best_gb = gb_grid.best_estimator_


def evaluate(model, X_train, y_train, X_val, y_val):
    """Score a fitted model on the training and validation splits.

    Args:
        model: Object exposing ``predict``; it is called once per split.
        X_train: Training features.
        y_train: Training targets.
        X_val: Validation features.
        y_val: Validation targets.

    Returns:
        A ``(results, y_val_pred)`` tuple. ``results`` maps the keys
        'Train MAE', 'Val MAE', 'Train RMSE', 'Val RMSE', 'Train R2' and
        'Val R2' to their scores; ``y_val_pred`` holds the predictions for
        ``X_val``.
    """
    def rmse(actual, predicted):
        # Root of the mean squared error.
        return np.sqrt(mean_squared_error(actual, predicted))

    train_pred = model.predict(X_train)
    val_pred = model.predict(X_val)

    results = {}
    for label, metric in (
        ('MAE', mean_absolute_error),
        ('RMSE', rmse),
        ('R2', r2_score),
    ):
        results[f'Train {label}'] = metric(y_train, train_pred)
        results[f'Val {label}'] = metric(y_val, val_pred)

    return results, val_pred


# Score both fitted models on the same train/validation split.
split_data = (X_train, y_train, X_val, y_val)
rf_results, rf_pred = evaluate(rf, *split_data)
gb_results, gb_pred = evaluate(best_gb, *split_data)

# Report the metric tables and the selected gradient-boosting configuration.
print(" Random Forest Results:\n", rf_results)
print("\n Best Gradient Boosting Results:\n", gb_results)
print("\n Best GB Model:", best_gb)


# Time-based cross-validation of a gradient-boosting model with default
# hyper-parameters: one fresh model is trained and scored per fold.
cv_scores = []
for fold, (train_idx, val_idx) in enumerate(tscv.split(X_clean)):
    X_tr, X_vl = X_clean.iloc[train_idx], X_clean.iloc[val_idx]
    y_tr, y_vl = y_clean.iloc[train_idx], y_clean.iloc[val_idx]

    model = GradientBoostingRegressor(random_state=42)
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_vl)

    cv_scores.append({
        'Fold': fold + 1,
        'MAE': mean_absolute_error(y_vl, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_vl, y_pred)),
        'R2': r2_score(y_vl, y_pred)
    })

cv_df = pd.DataFrame(cv_scores)
print("\n Time-Based Cross-Validation Results:\n", cv_df)
# 'Fold' is only a label; leave it out so the summary averages metrics alone
# instead of also reporting a meaningless mean fold number.
print("\n Average CV Performance:\n", cv_df.drop(columns='Fold').mean())


# Dates belonging to the validation rows.
val_dates = data['date'].loc[X_val.index]

# The features are built per (date, Junction), and the rows are ordered by
# date only, so a single line drawn through the raw rows would jump between
# junctions on every date. Sum actual and predicted counts per date so each
# curve has exactly one point per day.
# NOTE(review): assumes several junctions share a date; with a single junction
# the aggregation leaves the values unchanged.
daily_comparison = (
    pd.DataFrame({
        'date': val_dates.values,
        'actual': y_val.values,
        'predicted': gb_pred,
    })
    .groupby('date')[['actual', 'predicted']]
    .sum()
)

plt.figure(figsize=(12,5))
plt.plot(daily_comparison.index, daily_comparison['actual'], label='Actual', linewidth=2)
plt.plot(daily_comparison.index, daily_comparison['predicted'], label='Predicted (GB)', alpha=0.8)
plt.title("Actual vs Predicted Traffic (Gradient Boosting)")
plt.xlabel("Date")
plt.ylabel("Vehicles (summed over junctions)")
plt.legend()
plt.show()


# Validation residuals of the gradient-boosting model: actual minus predicted.
residuals = y_val - gb_pred

# Histogram of the residuals with a kernel-density overlay.
hist_fig, hist_ax = plt.subplots(figsize=(6,4))
sns.histplot(residuals, kde=True, ax=hist_ax)
hist_ax.set_title("Residual Distribution (GB)")
hist_ax.set_xlabel("Residuals")
plt.show()

# Residuals against predictions; the dashed red line marks zero error.
scatter_fig, scatter_ax = plt.subplots(figsize=(6,4))
scatter_ax.scatter(gb_pred, residuals, alpha=0.5)
scatter_ax.axhline(0, color='red', linestyle='--')
scatter_ax.set_title("Residuals vs Predicted (GB)")
scatter_ax.set_xlabel("Predicted")
scatter_ax.set_ylabel("Residual")
plt.show()
