In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from scipy.stats import zscore
import lightgbm as lgb
from sklearn.neural_network import MLPRegressor
import warnings
warnings.filterwarnings('ignore')


def load_and_preprocess_data(file_path):
    data = pd.read_csv("/content/drive/MyDrive/1000 Buses Trajectory Dataset (1).csv")

    
    data['Start'] = pd.to_datetime(data['Start'], format='%H:%M:%S', errors='coerce')
    data['Hour'] = data['Start'].dt.hour
    data['Weekday'] = data['Start'].dt.weekday
    data['Month'] = data['Start'].dt.month
    data['Day_of_Year'] = data['Start'].dt.dayofyear
    data['Is_Weekend'] = data['Weekday'].apply(lambda x: 1 if x >= 5 else 0)

    def convert_to_minutes(time_str):
        try:
            h, m, s = map(int, time_str.split(':'))
            return h * 60 + m + s / 60
        except:
            return np.nan

    data['Duration'] = data['Duration'].astype(str).apply(convert_to_minutes)
    data = data.dropna(subset=['Start', 'Duration'])

    
    stop_features = [col for col in data.columns if "Stop" in col and "to" in col]
    data['Route_Avg_Duration'] = data.groupby('Route')['Duration'].transform('mean')
    data['Delay_Factor'] = data['Duration'] - data['Route_Avg_Duration']

    
    Q1 = data['Duration'].quantile(0.25)
    Q3 = data['Duration'].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    data.loc[(data['Duration'] < lower_bound) | (data['Duration'] > upper_bound), 'Duration'] = data['Duration'].median()

    
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    encoded_cats = pd.DataFrame(encoder.fit_transform(data[['Route', 'Service', 'Vehicle']]))
    encoded_cats.columns = encoder.get_feature_names_out(['Route', 'Service', 'Vehicle'])
    data = data.drop(columns=['Route', 'Service', 'Vehicle']).reset_index(drop=True)
    data = pd.concat([data, encoded_cats], axis=1)

    return data, stop_features, encoded_cats.columns

class LightGBMAFT:
    def __init__(self):
        self.model = None
        self.scaler = StandardScaler()
        self.params = {
            'objective': 'regression',
            'metric': 'mae',
            'boosting_type': 'gbdt',
            'num_leaves': 31,
            'learning_rate': 0.01,
            'feature_fraction': 0.9,
            'bagging_fraction': 0.8,
            'bagging_freq': 5,
            'verbose': -1,
            'max_depth': 8,
            'min_data_in_leaf': 20,
            'reg_alpha': 0.1,
            'reg_lambda': 0.1
        }

    def fit(self, X, y):
        
        X_scaled = self.scaler.fit_transform(X)
        X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

        
        mean_duration = np.mean(y)
        X_scaled['mean_duration'] = mean_duration
        X_scaled['time_ratio'] = y / mean_duration

        
        train_data = lgb.Dataset(X_scaled, label=y)
        self.model = lgb.train(self.params, train_data, num_boost_round=200)
        return self

    def predict(self, X):
        X_scaled = self.scaler.transform(X)
        X_scaled = pd.DataFrame(X_scaled, columns=X.columns)
        X_scaled['mean_duration'] = X_scaled.mean().mean()  
        X_scaled['time_ratio'] = 1.0 

        return self.model.predict(X_scaled)

class CustomNNRegressor:
    def __init__(self):
        self.scaler = StandardScaler()
        self.model = MLPRegressor(
            hidden_layer_sizes=(200, 100, 50),
            activation='relu',
            solver='adam',
            alpha=0.001,
            batch_size=32,
            learning_rate='adaptive',
            learning_rate_init=0.001,
            max_iter=1000,
            early_stopping=True,
            validation_fraction=0.1,
            n_iter_no_change=20,
            random_state=42
        )

    def fit(self, X, y):
        X_scaled = self.scaler.fit_transform(X)
        self.model.fit(X_scaled, y)
        return self

    def predict(self, X):
        X_scaled = self.scaler.transform(X)
        return self.model.predict(X_scaled)

def evaluate_models(models, X_train, X_test, y_train, y_test):
    results = {}
    for name, model in models.items():
        print(f"\nTraining {name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        results[name] = {
            "MAE": mean_absolute_error(y_test, y_pred),
            "RMSE": mean_squared_error(y_test, y_pred) ** 0.5,
            "MAPE": mean_absolute_percentage_error(y_test, y_pred),
            "R² (Accuracy)": r2_score(y_test, y_pred)
        }

    return pd.DataFrame(results).T

def main(file_path):
    print("Loading and preprocessing data...")
    data, stop_features, encoded_cat_columns = load_and_preprocess_data(file_path)

    features = ['Hour', 'Weekday', 'Month', 'Day_of_Year', 'Is_Weekend',
                'Route_Avg_Duration', 'Delay_Factor'] + stop_features + list(encoded_cat_columns)
    target = 'Duration'

    print("Splitting data...")
    X_train, X_test, y_train, y_test = train_test_split(
        data[features], data[target], test_size=0.2, random_state=42
    )

    models = {
        "Random Forest": RandomForestRegressor(
            n_estimators=200,
            max_depth=15,
            min_samples_split=5,
            random_state=42
        ),
        "Gradient Boosting": GradientBoostingRegressor(
            n_estimators=200,
            learning_rate=0.05,
            max_depth=5,
            random_state=42
        ),
        "LightGBM-AFT": LightGBMAFT(),
        "Neural Network": CustomNNRegressor()
    }

    print("Evaluating models...")
    results_df = evaluate_models(models, X_train, X_test, y_train, y_test)
    print("\nModel Evaluation:\n", results_df)

    return models, results_df

if __name__ == "__main__":
    file_path = "1000 Buses Trajectory Dataset (1).csv"
    models, results = main(file_path)

Loading and preprocessing data...
Splitting data...
Evaluating models...

Training Random Forest...

Training Gradient Boosting...

Training LightGBM-AFT...

Training Neural Network...

Model Evaluation:
                         MAE      RMSE      MAPE  R² (Accuracy)
Random Forest      0.007292  0.040060  0.000143       0.999985
Gradient Boosting  0.014442  0.025128  0.000266       0.999994
LightGBM-AFT       1.329266  2.017358  0.024303       0.960921
Neural Network     1.151935  3.003186  0.019878       0.913394
