In [None]:
import pandas as pd  
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Location of the final, processed modelling dataset
FINAL_DATA_CSV = "D:/traffic-congestion-predictor/data/processed/final_modeling_data.csv"

# Read the CSV, parsing pickup_hour_dt as a datetime column
df = pd.read_csv(FINAL_DATA_CSV, parse_dates=['pickup_hour_dt'])

# Optional sanity checks on the loaded columns (uncomment to run)
#df.describe()
#df.info()

In [None]:
# congestion tiers 
def nyc_congestion_tier(speed, low_above=13, medium_above=9):
    """Classify an average speed into a congestion tier.

    Args:
        speed: Average speed value to classify (same units as the thresholds).
        low_above: Speeds strictly greater than this are 'low' congestion.
        medium_above: Speeds strictly greater than this (and not above
            ``low_above``) are 'medium' congestion.

    Returns:
        'low', 'medium' or 'high'; ``np.nan`` when ``speed`` is missing.
    """
    # Every comparison with NaN is False, so a missing speed would otherwise
    # fall through to the last branch and be mislabelled as 'high'.
    if pd.isna(speed):
        return np.nan
    if speed > low_above:
        return 'low'
    if speed > medium_above:
        return 'medium'
    return 'high'

# Label every row with the tier derived from its avg_speed
congestion_labels = df['avg_speed'].apply(nyc_congestion_tier)
df['congestion'] = congestion_labels

# Verify distribution: share of rows falling into each tier
tier_shares = df['congestion'].value_counts(normalize=True)
print(tier_shares)

In [None]:
# 1. Drop the columns excluded from modelling (the original note calls them
#    object-type; pickup_hour_dt is parsed as a datetime when the CSV is read)
dropped_columns = ['PUBorough', 'DOBorough', 'PUZone', 'DOZone', 'pickup_day', 'pickup_hour_dt']
df_model = df.drop(columns=dropped_columns)

# 2. Cyclical time encoding: map the hour onto a 24-step circle, then
#    replace the raw hour with its sine/cosine pair
hour_angle = 2 * np.pi * df['pickup_hour'] / 24
df_model['hour_sin'] = np.sin(hour_angle)
df_model['hour_cos'] = np.cos(hour_angle)
df_model = df_model.drop(columns=['pickup_hour'])

# 3. Convert LocationIDs to category dtype
for location_column in ('PULocationID', 'DOLocationID'):
    df_model[location_column] = df_model[location_column].astype('category')

In [None]:
# 1. Temperature: summary statistics
print("Temperature stats (Celsius expected):")
temp_summary = df['temp'].describe()
print(temp_summary)

# 2. Precipitation: count negative readings and report the maximum
negative_prcp_rows = int((df['prcp'] < 0).sum())
print("\nPrecipitation issues:", negative_prcp_rows)
print("Max precipitation:", df['prcp'].max())

# 3. Wind speed: count negative readings and show selected percentiles
negative_wspd_rows = int((df['wspd'] < 0).sum())
print("\nWind speed issues:", negative_wspd_rows)
print("Wind speed percentiles:")
wind_quantiles = [0.25, 0.5, 0.75, 0.99]
print(df['wspd'].quantile(wind_quantiles))

# 4. Weather condition (coco): list the distinct codes present
# NOTE(review): the original note expected WMO codes
# (https://open-meteo.com/en/docs) — confirm which code table `coco` follows.
observed_codes = sorted(df['coco'].unique())
print("\nUnique weather codes:", observed_codes)

# 5. Snow flag: value counts, including missing entries
print("\nSnow flag counts:")
snow_counts = df['snowed'].value_counts(dropna=False)
print(snow_counts)

In [None]:
# Collapse the weather condition code (coco) into a severity score.
# NOTE(review): any code absent from this mapping becomes NaN after .map();
# confirm every code present in df['coco'] is covered.
severity_map = {
    0: 0, 1: 0, 2: 0, 3: 0,
    5: 2,
    7: 1, 8: 1, 9: 1,
    12: 3, 14: 3,
    15: 2,
    16: 3, 17: 3,
}
coco_severity = df['coco'].map(severity_map)
df_model['weather_severity'] = coco_severity

# Alternative: one-hot encode the codes instead of scoring them
#df_model = pd.get_dummies(df_model, columns=['coco'], prefix='weather')

In [None]:
from sklearn.model_selection import train_test_split

# Separate the predictors (X) from the two targets
target_columns = ['avg_speed', 'congestion']
X = df_model.drop(columns=target_columns)
y_speed, y_congestion = df_model['avg_speed'], df_model['congestion']

# A single 80/20 split over all three keeps X and both targets row-aligned
(
    X_train, X_test,
    y_speed_train, y_speed_test,
    y_congestion_train, y_congestion_test,
) = train_test_split(X, y_speed, y_congestion, test_size=0.2, random_state=42)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Fit the congestion-tier classifier (fit returns the estimator itself)
clf = RandomForestClassifier(random_state=42).fit(X_train, y_congestion_train)

# Score it on the held-out rows
y_pred = clf.predict(X_test)
congestion_report = classification_report(y_congestion_test, y_pred)
print(congestion_report)

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Fit the speed regressor; enable_categorical is switched on because the
# LocationID features are stored with category dtype
reg = XGBRegressor(random_state=42, enable_categorical=True)
reg.fit(X_train, y_speed_train)

# Score it on the held-out rows
y_pred_speed = reg.predict(X_test)
speed_mae = mean_absolute_error(y_speed_test, y_pred_speed)
speed_r2 = r2_score(y_speed_test, y_pred_speed)
print("MAE:", speed_mae)
print("R²:", speed_r2)

In [None]:
import matplotlib.pyplot as plt

# One horizontal bar chart per fitted model:
# Random Forest (congestion) first, then XGBoost (speed)
for fitted_model, plot_title in (
    (clf, "Feature Importance (Congestion Model)"),
    (reg, "Feature Importance (Speed Model)"),
):
    plt.barh(X.columns, fitted_model.feature_importances_)
    plt.title(plot_title)
    plt.show()