In [None]:
import pandas as pd  
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Read the final modeling table from disk; `pickup_hour_dt` is parsed into
# a datetime column while loading.
df = pd.read_csv(
    "D:/traffic-congestion-predictor/data/processed/final_modeling_data.csv",
    parse_dates=['pickup_hour_dt'],
)

# Summarise column names, dtypes and non-null counts so the loaded columns
# can be checked against the features used further down.
#df.describe()
df.info()

In [None]:
# Columns that must be present on every row and that get frequency-encoded.
location_cols = ['PUBorough', 'DOBorough', 'PUZone', 'DOZone']

# Row filtering in one pass:
#   1. keep rows whose avg_speed is at most 60 (rows with a missing
#      avg_speed fail the comparison and are removed as well),
#   2. discard rows missing any borough/zone value,
#   3. take an explicit copy so the column assignments below act on an
#      independent frame.
df = (
    df[df['avg_speed'] <= 60]
    .dropna(subset=location_cols)
    .copy()
)

# Frequency encoding: each category is replaced by its share of the
# remaining rows, stored in a new `<column>_freq` column.
for col in location_cols:
    freq_map = df[col].value_counts(normalize=True)
    df[f'{col}_freq'] = df[col].map(freq_map)

# One-hot encode the weekday column (all levels kept, none dropped), then
# renumber rows 0..n-1 now that filtering has left gaps in the index.
df = pd.get_dummies(df, columns=['pickup_day']).reset_index(drop=True)




In [None]:
# Print column names, dtypes and non-null counts for the current `df`.
df.info()
# Optional export of the current frame to CSV (disabled).
#df.to_csv("D:/traffic-congestion-predictor/data/processed/xbdata_data.csv", index=False)

In [None]:
# Alternative quick look at the first rows (disabled).
#df.head()
# Print column names, dtypes and non-null counts for the current `df`.
df.info()

## FEATURE SELECTION AND CORRELATION HEATMAP

In [None]:
# Pairwise correlations between all numeric columns of `df`.
corr_matrix = df.corr(numeric_only=True)

# Correlation of every other column with the target, strongest positive first.
# - The target's own entry is removed: it is always 1.0 and would otherwise
#   take the first slot of the "top positive" list.
# - NaN correlations (pandas returns NaN for e.g. constant columns) are
#   dropped: sort_values places them last, where they would otherwise show
#   up in the "top negative" list.
target_corr = (
    corr_matrix['avg_speed']
    .drop(labels='avg_speed')
    .dropna()
    .sort_values(ascending=False)
)

# Split by sign so that neither list can contain entries of the other sign
# when there are fewer than ten features on one side.
top_positive = target_corr[target_corr > 0].head(10)
# Re-sort ascending so the most negative correlation is listed first.
top_negative = target_corr[target_corr < 0].sort_values().head(10)

print("Top positive correlations:\n", top_positive)
print("\nTop negative correlations:\n", top_negative)

# Full annotated heatmap of the complete matrix (target included).
# The figure is written to disk before plt.show() so the saved file is not blank.
plt.figure(figsize=(14, 10))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0)
plt.title("Correlation Heatmap")
plt.savefig("D:/traffic-congestion-predictor/outputs/plots/correlation_heatmap.png")
plt.show()


In [None]:
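#features to keep
# Commented-out draft of the feature list, kept for reference only; the list
# actually used for modeling is the `features` variable defined further down.
#selected_features = [
#    'pickup_hour',
#    'pickup_day_Monday', 'pickup_day_Sunday',  # Only high-corr ones
#    'is_rush_hour',
#    'is_midweek',
#
#    'PULocationID', 'DOLocationID',
#    'PUZone_freq', 'DOZone_freq',
#    'PUBorough_freq', 'DOBorough_freq',
#
#    'temp', 'prcp', 'wspd', 'snowed', 'coco'
#]
#target = 'avg_speed'
#df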

In [None]:
#pip install --user xgboost

In [None]:
# Optional one-off upgrade of scikit-learn (disabled):
#pip install --upgrade scikit-learn
import sklearn
# Show which scikit-learn version this kernel is using.
print(sklearn.__version__)


In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np

# === Model inputs ===
# Predictor columns fed to every model in this notebook, and the column
# being predicted.
features = [
    'pickup_hour', 'pickup_day_Sunday', 'pickup_day_Monday',
    'is_rush_hour', 'is_midweek',
    'PULocationID', 'DOLocationID',
    'PUZone_freq', 'DOZone_freq',
    'PUBorough_freq', 'DOBorough_freq',
    'temp', 'prcp', 'wspd', 'snowed', 'coco',
]
target = 'avg_speed'

X, y = df[features], df[target]

# === Hold-out split ===
# 80% train / 20% test, seeded so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# === Standardization ===
# The scaler learns its statistics from the training rows only and is then
# applied unchanged to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# === Linear regression on the scaled features ===
lr = LinearRegression().fit(X_train_scaled, y_train)

# === Test-set evaluation ===
# RMSE is taken as the square root of the MSE.
y_pred = lr.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
rmse, r2 = np.sqrt(mse), r2_score(y_test, y_pred)

print(f"Linear Regression RMSE: {rmse:.3f}")
print(f"Linear Regression R²: {r2:.3f}")


In [None]:
#

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Baseline random forest: 300 trees, depth capped at 10, fixed seed.
# It is trained on the unscaled training features.
rf = RandomForestRegressor(
    n_estimators=300,
    max_depth=10,
    random_state=42,
).fit(X_train, y_train)

# Score the forest on the held-out rows.
y_pred_rf = rf.predict(X_test)
rf_mse = mean_squared_error(y_test, y_pred_rf)
rf_rmse, rf_r2 = np.sqrt(rf_mse), r2_score(y_test, y_pred_rf)

print(f"Random Forest RMSE: {rf_rmse:.3f}")
print(f"Random Forest R²: {rf_r2:.3f}")


In [None]:
from xgboost import XGBRegressor

# Baseline gradient-boosted model with hand-picked settings, trained on the
# unscaled training features.
xgb = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
)
xgb.fit(X_train, y_train)

# Held-out predictions and the error metrics derived from them.
y_pred_xgb = xgb.predict(X_test)
xgb_mse = mean_squared_error(y_test, y_pred_xgb)
xgb_rmse, xgb_r2 = np.sqrt(xgb_mse), r2_score(y_test, y_pred_xgb)

print(f"XGBoost RMSE: {xgb_rmse:.3f}")
print(f"XGBoost R²: {xgb_r2:.3f}")


In [None]:
import matplotlib.pyplot as plt

# Importance scores reported by the fitted `xgb` model, one per column of X.
importances = xgb.feature_importances_
feature_names = X.columns

# Column positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Bar positions and the matching feature labels, in ranked order.
bar_positions = range(X.shape[1])
tick_labels = list(feature_names[indices])

plt.figure(figsize=(10, 6))
plt.bar(bar_positions, importances[indices])
plt.xticks(bar_positions, tick_labels, rotation=90)  # vertical labels
plt.title("XGBoost Feature Importances")
plt.tight_layout()
# Write the figure to disk before showing it.
plt.savefig("D:/traffic-congestion-predictor/outputs/plots/xgb1_feature_importance.png")
plt.show()



In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values per hyperparameter; the randomized search samples
# combinations from these lists instead of trying all of them.
param_dist = {
    'n_estimators':     [100, 200, 300, 400],
    'max_depth':        [3, 4, 5, 6, 8],
    'learning_rate':    [0.01, 0.05, 0.1, 0.2],
    'subsample':        [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'min_child_weight': [1, 3, 5, 7],
}

# Untuned estimator that the search clones for each candidate.
xgb_model = XGBRegressor(random_state=42, n_jobs=-1)

# 30 sampled candidates x 5-fold CV, ranked by (negated) RMSE.
random_search_xgb = RandomizedSearchCV(
    xgb_model,
    param_dist,
    n_iter=30,  # keep it light
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
random_search_xgb.fit(X_train, y_train)

# Best candidate found by the search.
best_xgb = random_search_xgb.best_estimator_

# Score the tuned model on the held-out test rows.
# NOTE: this rebinds `y_pred_xgb`, `mse`, `rmse` and `r2` from earlier cells.
y_pred_xgb = best_xgb.predict(X_test)
mse = mean_squared_error(y_test, y_pred_xgb)
rmse, r2 = np.sqrt(mse), r2_score(y_test, y_pred_xgb)

print(f"Tuned XGBoost RMSE: {rmse:.3f}")
print(f"Tuned XGBoost R²: {r2:.3f}")
print("Best Params:", random_search_xgb.best_params_)


In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values per hyperparameter for the random forest search.
#
# max_features: the legacy string 'auto' was deprecated in scikit-learn 1.1
# and removed in 1.3, so on current versions every sampled candidate using it
# fails to fit and is scored NaN. For regressors 'auto' meant "use all
# features", which the float 1.0 expresses on both old and new versions.
rf_param_grid = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [5, 10, 15, None],      # None = no depth limit
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [1.0, 'sqrt', 'log2']
}

# Untuned estimator that the search clones for each candidate.
# NOTE: this rebinds `rf`, which an earlier cell used for the baseline forest.
rf = RandomForestRegressor(random_state=42, n_jobs=-1)

# 30 sampled candidates x 5-fold CV, ranked by (negated) RMSE; seeded so the
# sampled candidates are reproducible.
rf_random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=rf_param_grid,
    n_iter=30,
    cv=5,
    verbose=1,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    random_state=42
)

rf_random_search.fit(X_train, y_train)

# Evaluate the best candidate on the held-out test rows.
best_rf = rf_random_search.best_estimator_
y_pred_rf = best_rf.predict(X_test)

# RMSE is taken as the square root of the MSE.
rf_mse = mean_squared_error(y_test, y_pred_rf)
rf_rmse = np.sqrt(rf_mse)
rf_r2 = r2_score(y_test, y_pred_rf)

print(f"Tuned Random Forest RMSE: {rf_rmse:.3f}")
print(f"Tuned Random Forest R²: {rf_r2:.3f}")
print("Best Params:", rf_random_search.best_params_)
