In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab-only helper); every dataset and
# model path used later in this notebook lives under /content/drive/MyDrive.
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Step 1: Importing the necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Step 2: Import the merged dataset
merged_df = pd.read_csv("/content/drive/MyDrive/ML PROJECT/Bus_Service_Predictions/Data Preprocessing/merged_bus_data.csv")

# Step 3: Encode 'From_Location' and 'To_Location'
# Each column gets its own encoder. Re-fitting a single LabelEncoder on the
# second column overwrites its `classes_`, so the 'From_Location' mapping could
# no longer be inverted or re-applied to new data.
from_location_encoder = LabelEncoder()
to_location_encoder = LabelEncoder()
merged_df['From_Location_Encoded'] = from_location_encoder.fit_transform(merged_df['From_Location'])
merged_df['To_Location_Encoded'] = to_location_encoder.fit_transform(merged_df['To_Location'])

# Backward-compatible alias: `label_encoder` previously ended up fitted on
# 'To_Location' (the last column it was fitted on).
label_encoder = to_location_encoder

# Step 4: Define the features and target
features = [
    'Time_Slot_Encoded',
    'Day_Type_Encoded',
    'From_Location_Encoded',
    'To_Location_Encoded',
    'Discount_Offered (%)',
    'Special_Event_Encoded',
    'Distance_km'
]
target = 'Passenger_Count'

# Step 5: Select features and target
X = merged_df[features]
y = merged_df[target]

# Step 6: Split the data (80% train / 20% test); fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Output the shapes of splits
print("Training and Testing Data Shapes")
print(f"Training Features Shape: {X_train.shape}")
print(f"Test Features Shape: {X_test.shape}")
print(f"Training Target Shape: {y_train.shape}")
print(f"Test Target Shape: {y_test.shape}")


Training and Testing Data Shapes
Training Features Shape: (1600, 7)
Test Features Shape: (400, 7)
Training Target Shape: (1600,)
Test Target Shape: (400,)


In [None]:
# Import libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd

# Load the CSV file
df = pd.read_csv("/content/drive/MyDrive/ML PROJECT/Bus_Service_Predictions/Data Preprocessing/merged_bus_data.csv")

# Drop rows with missing values
df_cleaned = df.dropna()

# Define features and target
# NOTE(review): 'Total_Revenue' (and possibly 'Ticket_Price') is presumably
# derived from the passenger count; if so, using it as a feature leaks the
# target into the inputs and inflates every score below. TODO confirm against
# the preprocessing step that produced the CSV.
features = [
    'Time_Slot_Encoded',
    'Day_Type_Encoded',
    'Special_Event_Encoded',
    'Ticket_Price',
    'Discount_Offered (%)',
    'Total_Revenue',
    'Distance_km'
]
target = 'Passenger_Count'

# Check if all required columns exist.
# Raise instead of calling exit(): in a notebook kernel exit() does not cleanly
# abort the cell (it targets the kernel itself), whereas an exception stops
# execution right here with a readable message.
missing_cols = [col for col in features + [target] if col not in df_cleaned.columns]
if missing_cols:
    raise KeyError(f"Missing columns in dataset: {missing_cols}")

# Prepare input and output variables
X = df_cleaned[features]
y = df_cleaned[target]

# Split data into training and test sets (80/20)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Define models to evaluate (all with library-default hyperparameters)
# NOTE(review): SVR is fitted on unscaled features here; SVMs are sensitive to
# feature scale, which likely explains its weak baseline score.
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree Regressor": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "Support Vector Regressor": SVR()
}

# Train, predict, and evaluate each model on the held-out test split
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)  # computed once, reused for RMSE
    results[name] = {
        "MAE": round(mean_absolute_error(y_test, y_pred), 4),
        "MSE": round(mse, 4),
        "RMSE": round(np.sqrt(mse), 4),
        "R2 Score": round(r2_score(y_test, y_pred), 4)
    }

# Display model performance
print("\nModel Evaluation Results (Before Hyperparameter Tuning)")
print("=" * 50)
for model_name, metrics in results.items():
    print(f"\n{model_name}")
    for metric_name, value in metrics.items():
        print(f"{metric_name}: {value}")



Model Evaluation Results (Before Hyperparameter Tuning)

Linear Regression
MAE: 22.17
MSE: 946.4125
RMSE: 30.7638
R2 Score: 0.7849

Decision Tree Regressor
MAE: 12.6225
MSE: 318.8025
RMSE: 17.855
R2 Score: 0.9275

Random Forest Regressor
MAE: 8.7625
MSE: 138.6595
RMSE: 11.7754
R2 Score: 0.9685

Support Vector Regressor
MAE: 45.602
MSE: 3162.4875
RMSE: 56.236
R2 Score: 0.2812


In [None]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Local imports: only the SVR pipeline below needs these.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


def _tune_and_evaluate(estimator, param_grid):
    """Grid-search `estimator` on the training split and score it on the test split.

    Reads the notebook-level `X_train`, `y_train`, `X_test`, `y_test` at call
    time, so it must be called after the train/test split below.

    Args:
        estimator: an unfitted scikit-learn regressor (or pipeline).
        param_grid: parameter grid passed to GridSearchCV.

    Returns:
        (search, predictions, metrics): the fitted GridSearchCV object, its
        test-set predictions (from the best estimator, which GridSearchCV
        refits by default), and a dict with MAE / MSE / RMSE / R2 Score rounded
        to 4 decimals.
    """
    search = GridSearchCV(estimator, param_grid, cv=2, scoring='neg_mean_squared_error', n_jobs=1)
    search.fit(X_train, y_train)
    predictions = search.predict(X_test)
    mse = mean_squared_error(y_test, predictions)  # computed once, reused for RMSE
    metrics = {
        "MAE": round(mean_absolute_error(y_test, predictions), 4),
        "MSE": round(mse, 4),
        "RMSE": round(np.sqrt(mse), 4),
        "R2 Score": round(r2_score(y_test, predictions), 4)
    }
    return search, predictions, metrics


# Timer start
start = time.time()

# Load and clean dataset
df = pd.read_csv("/content/drive/MyDrive/ML PROJECT/Bus_Service_Predictions/Data Preprocessing/merged_bus_data.csv")
df_cleaned = df.dropna()

# Features and target
features = [
    'Time_Slot_Encoded', 'Day_Type_Encoded', 'Special_Event_Encoded',
    'Ticket_Price', 'Discount_Offered (%)', 'Total_Revenue', 'Distance_km'
]
target = 'Passenger_Count'

# Check for column integrity.
# Raise instead of calling exit(): in a notebook kernel exit() does not cleanly
# abort the cell, whereas an exception stops execution with a readable message.
missing_cols = [col for col in features + [target] if col not in df_cleaned.columns]
if missing_cols:
    raise KeyError(f"Missing columns in dataset: {missing_cols}")

X = df_cleaned[features]
y = df_cleaned[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize model results dictionary
results = {}

# === 1. Ridge Regression ===
ridge = Ridge()
params_ridge = {'alpha': [0.01, 0.1, 1]}
gs_ridge, y_pred_ridge, results["Ridge Regression"] = _tune_and_evaluate(ridge, params_ridge)

# === 2. Decision Tree ===
params_dt = {'max_depth': [5, 10]}
gs_dt, y_pred_dt, results["Decision Tree"] = _tune_and_evaluate(
    DecisionTreeRegressor(random_state=42), params_dt
)

# === 3. Random Forest (light tuning) ===
# Single-candidate grid: this fixes the hyperparameters rather than searching over them.
params_rf = {'n_estimators': [50], 'max_depth': [10]}
gs_rf, y_pred_rf, results["Random Forest"] = _tune_and_evaluate(
    RandomForestRegressor(random_state=42), params_rf
)

# === 4. Support Vector Regression (simplified) ===
# SVR is not scale-invariant, so the features are standardised inside a pipeline
# (the scaler is fitted on each training fold only). The previously recorded run
# used a linear kernel on the raw features and ended with R2 = -88.4 after a
# ~1250 s cell execution. Grid keys carry the `svr__` prefix because the
# estimator is now a pipeline step.
params_svr = {'svr__C': [1], 'svr__kernel': ['linear'], 'svr__epsilon': [0.1]}
gs_svr, y_pred_svr, results["Support Vector Regressor"] = _tune_and_evaluate(
    make_pipeline(StandardScaler(), SVR()), params_svr
)

# Timer end
end = time.time()

# Print results
print("\n========== Model Evaluation Results (After Hyperparameter Tuning Processs) ==========")
for model_name, metrics in results.items():
    print(f"\n{model_name}")
    for metric, value in metrics.items():
        print(f"{metric}: {value}")
print("\nTotal Execution Time:", round(end - start, 2), "seconds")




Ridge Regression
MAE: 22.1643
MSE: 946.0823
RMSE: 30.7585
R2 Score: 0.785

Decision Tree
MAE: 14.7879
MSE: 435.9175
RMSE: 20.8786
R2 Score: 0.9009

Random Forest
MAE: 10.9161
MSE: 210.8023
RMSE: 14.519
R2 Score: 0.9521

Support Vector Regressor
MAE: 438.4453
MSE: 393334.3417
RMSE: 627.1637
R2 Score: -88.3973

Total Execution Time: 1250.27 seconds


In [None]:
# Import necessary libraries
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import numpy as np

# Local import: multi-metric cross-validation in a single pass.
from sklearn.model_selection import cross_validate

# Define models for evaluation
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree Regressor": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "Support Vector Regressor": SVR()
}

# All three metrics are scored on the same 5 fits per model. Calling
# cross_val_score once per metric refits every model 15 times for the same numbers.
cv_scoring = {
    'mae': 'neg_mean_absolute_error',
    'mse': 'neg_mean_squared_error',
    'r2': 'r2'
}

# Perform cross-validation and evaluate each model.
# NOTE: `X` and `y` are not defined in this cell; they are the full feature
# matrix and target left in the kernel by the preceding cell.
print("\nCross-Validation Results")
print("=" * 40)
for model_name, model in models.items():
    print(f"\nResults for {model_name}")
    print("-" * (20 + len(model_name)))

    cv_results = cross_validate(model, X, y, cv=5, scoring=cv_scoring)

    # Mean Absolute Error (MAE) - scorer is negated, so flip the sign
    mae_scores = cv_results['test_mae']
    mean_mae = -mae_scores.mean()
    print(f"Mean MAE: {mean_mae:.4f}")

    # Mean Squared Error (MSE) - scorer is negated, so flip the sign
    mse_scores = cv_results['test_mse']
    mean_mse = -mse_scores.mean()
    print(f"Mean MSE: {mean_mse:.4f}")

    # Root Mean Squared Error (RMSE): square root of the mean MSE across folds
    mean_rmse = np.sqrt(mean_mse)
    print(f"Mean RMSE: {mean_rmse:.4f}")

    # R² Score
    r2_scores = cv_results['test_r2']
    mean_r2 = r2_scores.mean()
    print(f"Mean R² Score: {mean_r2:.4f}")



Cross-Validation Results

Results for Linear Regression
-------------------------------------
Mean MAE: 22.3485
Mean MSE: 962.7912
Mean RMSE: 31.0289
Mean R² Score: 0.7710

Results for Decision Tree Regressor
-------------------------------------------
Mean MAE: 12.5555
Mean MSE: 322.0235
Mean RMSE: 17.9450
Mean R² Score: 0.9235

Results for Random Forest Regressor
-------------------------------------------
Mean MAE: 8.7218
Mean MSE: 159.9069
Mean RMSE: 12.6454
Mean R² Score: 0.9620

Results for Support Vector Regressor
--------------------------------------------
Mean MAE: 44.6682
Mean MSE: 3030.6860
Mean RMSE: 55.0517
Mean R² Score: 0.2789


# Final Model Evaluation and Selection Report

---

## Model Evaluation Summary

### Evaluation Metrics Comparison

| Model                     | MAE       | MSE        | RMSE      | R² Score  | Evaluation Type             |
|--------------------------|-----------|------------|-----------|-----------|-----------------------------|
| Linear Regression        | 22.1700   | 946.4125   | 30.7638   | 0.7849    | Before Tuning               |
| Decision Tree Regressor | 12.6225   | 318.8025   | 17.8550   | 0.9275    | Before Tuning               |
| Random Forest Regressor | 8.7625    | 138.6595   | 11.7754   | 0.9685    | Before Tuning               |
| Support Vector Regressor| 45.6020   | 3162.4875  | 56.2360   | 0.2812    | Before Tuning               |
| Ridge Regression         | 22.1643   | 946.0823   | 30.7585   | 0.7850    | After Tuning                |
| Decision Tree            | 14.7879   | 435.9175   | 20.8786   | 0.9009    | After Tuning                |
| Random Forest            | 10.9161   | 210.8023   | 14.5190   | 0.9521    | After Tuning                |
| Support Vector Regressor| 438.4453  | 393334.3417| 627.1637  | -88.3973  | After Tuning (Failed Model) |
| Linear Regression (CV)   | 22.3485   | 962.7912   | 31.0289   | 0.7710    | Cross-Validation            |
| Decision Tree (CV)       | 12.5555   | 322.0235   | 17.9450   | 0.9235    | Cross-Validation            |
| Random Forest (CV)       | 8.7218    | 159.9069   | 12.6454   | 0.9620    | Cross-Validation            |
| Support Vector Regressor (CV)| 44.6682 | 3030.6860 | 55.0517   | 0.2789    | Cross-Validation            |

---

## Final Model Ranking

| Rank | Model                  | Key Strengths                           |
|------|------------------------|------------------------------------------|
| 1    | Random Forest          | Best accuracy (highest R²), lowest errors |
| 2    | Decision Tree          | Strong performance, simple structure     |
| 3    | Ridge / Linear Regression | Consistent but less accurate             |
| -    | Support Vector Regressor | Failed after tuning (R² = -88.40 with a linear kernel on unscaled features), very high error |

---

## Selected Final Model: Random Forest Regressor

### Why Random Forest Was Selected

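- Highest R² Score  
  - Test R² (after tuning): 0.9521 (0.9685 with default hyperparameters)  
  - Cross-Validation R²: 0.9620

- Lowest Error Metrics (after tuning; the default-hyperparameter run was lower still, with MAE 8.76, MSE 138.66, RMSE 11.78)  
  - MAE: 10.92  
  - MSE: 210.80  
  - RMSE: 14.52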

- Consistent and Stable Performance  
  - Maintains top results across testing and cross-validation

- Robust and Non-linear  
  - Averaging many trees reduces overfitting relative to a single decision tree, and the model captures non-linear patterns

---

### Conclusion:
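**Random Forest Regressor** is the most **robust, accurate, and generalizable** model for this regression task based on evaluations before tuning, after tuning, and through cross-validation. Note that all of these scores were obtained with `Ticket_Price` and `Total_Revenue` among the input features; the passenger-count model trained in the next cell excludes both and reaches a test R² of about 0.78.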


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

import os  # local import: used only to build and create the output directory

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/ML PROJECT/Bus_Service_Predictions/Data Preprocessing/merged_bus_data.csv")

# Convert 'Date' to datetime and extract calendar features, then drop the raw column
df['Date'] = pd.to_datetime(df['Date'])
df['Day'] = df['Date'].dt.day
df['Month'] = df['Date'].dt.month
df['Weekday_Num'] = df['Date'].dt.weekday
df = df.drop(columns=["Date"])


# Define categorical columns to encode
categorical_cols = ['From_Location', 'To_Location', 'Time_Slot', 'Day_Type', 'Special_Event', 'Weekday']
encoded_cols = [col + '_Encoded' for col in categorical_cols]

# Initialize OrdinalEncoder with handle_unknown to avoid errors in inference
# (categories not seen during fit are mapped to -1)
ordinal_enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Fit the encoder on the categorical columns
ordinal_enc.fit(df[categorical_cols])

# Transform categorical columns
encoded_array = ordinal_enc.transform(df[categorical_cols])

# Convert encoded array to DataFrame with appropriate column names.
# Reusing df's index keeps the concat below row-aligned even if rows are ever
# filtered before this point.
encoded_df = pd.DataFrame(encoded_array, columns=encoded_cols, index=df.index)

# The CSV already ships some '*_Encoded' columns (earlier cells read
# 'Time_Slot_Encoded', 'Day_Type_Encoded' and 'Special_Event_Encoded' straight
# from it). Drop those first: otherwise the concat yields duplicate column
# names, i.e. redundant features whose values do not come from the saved encoder.
# NOTE(review): this reduces the number of model input columns compared with
# earlier saved models - confirm that any inference code builds its input from
# the saved encoder only.
preexisting_encoded = [col for col in encoded_cols if col in df.columns]

# Combine encoded columns with the rest of the dataset (dropping original categorical columns)
df = pd.concat([df.drop(columns=categorical_cols + preexisting_encoded), encoded_df], axis=1)

# ------------------ Model 1: Ticket Price Prediction ------------------

# The two other outcome columns are excluded so they cannot leak into the features
X_price = df.drop(["Ticket_Price", "Total_Revenue", "Passenger_Count"], axis=1)
y_price = df["Ticket_Price"]

X_train_price, X_test_price, y_train_price, y_test_price = train_test_split(
    X_price, y_price, test_size=0.2, random_state=42
)

price_model = RandomForestRegressor(n_estimators=100, random_state=42)
price_model.fit(X_train_price, y_train_price)

# Evaluate
y_pred_price = price_model.predict(X_test_price)
mse_price = mean_squared_error(y_test_price, y_pred_price)  # computed once, reused for RMSE
print("Ticket Price Prediction Metrics:")
print("MAE:", mean_absolute_error(y_test_price, y_pred_price))
print("MSE:", mse_price)
print("RMSE:", np.sqrt(mse_price))
print("R² Score:", r2_score(y_test_price, y_pred_price))


# ------------------ Model 2: Passenger Count Prediction ------------------

X_passenger = df.drop(["Passenger_Count", "Ticket_Price", "Total_Revenue"], axis=1)
y_passenger = df["Passenger_Count"]

X_train_pass, X_test_pass, y_train_pass, y_test_pass = train_test_split(
    X_passenger, y_passenger, test_size=0.2, random_state=42
)

passenger_model = RandomForestRegressor(n_estimators=100, random_state=42)
passenger_model.fit(X_train_pass, y_train_pass)

# Evaluate
y_pred_pass = passenger_model.predict(X_test_pass)
mse_pass = mean_squared_error(y_test_pass, y_pred_pass)  # computed once, reused for RMSE
print("\nPassenger Count Prediction Metrics:")
print("MAE:", mean_absolute_error(y_test_pass, y_pred_pass))
print("MSE:", mse_pass)
print("RMSE:", np.sqrt(mse_pass))
print("R² Score:", r2_score(y_test_pass, y_pred_pass))


# ================== Save Models and Encoders ==================

model_dir = '/content/drive/MyDrive/ML PROJECT/Bus_Service_Predictions/Resultant Models'
os.makedirs(model_dir, exist_ok=True)  # joblib.dump does not create missing directories

joblib.dump(price_model, os.path.join(model_dir, 'price_prediction_model.pkl'))
joblib.dump(passenger_model, os.path.join(model_dir, 'passenger_count_prediction_model.pkl'))
joblib.dump(ordinal_enc, os.path.join(model_dir, 'ordinal_encoder.pkl'))

print("\nModels and label encoders saved successfully to:")
print("   ➤ /Resultant Models/price_prediction_model.pkl")
print("   ➤ /Resultant Models/passenger_count_prediction_model.pkl")
print("   ➤ /Resultant Models/ordinal_encoder.pkl")


Ticket Price Prediction Metrics:
MAE: 0.07407499999999807
MSE: 0.08194725000000097
RMSE: 0.28626430095280997
R² Score: 0.9999987556603391

Passenger Count Prediction Metrics:
MAE: 25.518175
MSE: 957.1383170277778
RMSE: 30.937652093004374
R² Score: 0.7824608503241804

Models and label encoders saved successfully to:
   ➤ /Resultant Models/price_prediction_model.pkl
   ➤ /Resultant Models/passenger_count_prediction_model.pkl
   ➤ /Resultant Models/ordinal_encoder.pkl


# Conclusion

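- Four regression algorithms were implemented: Linear/Ridge Regression, Decision Tree, Random Forest, and Support Vector Regression.

- Hyperparameter tuning was applied with `GridSearchCV` (2-fold).

- 5-fold cross-validation was conducted for each model.

- MAE, MSE, RMSE, and R² were calculated for every configuration.

- Default, tuned, and cross-validated configurations were compared, and Random Forest was selected as the final model.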