# Libraries

In [343]:
# Data manipulation libraries
import pandas as pd # Dataframes

# Statistical libraries
from sklearn.model_selection import train_test_split # Split dataset for validation
from sklearn.model_selection import cross_val_score # Cross validation for models
from sklearn.model_selection import GridSearchCV # Hyperparameter fine-tuning

# Modeling libraries
from sklearn.linear_model import LinearRegression, GammaRegressor, BayesianRidge
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import LinearSVR
from lightgbm import LGBMRegressor

# Warnings
import warnings

# Data

In [332]:
df = pd.read_csv('data/clean_data_2.csv')\
           .drop(columns = ['Order_ID'])\
           .dropna()

target_col = 'Delivery_Time_min'
X = df.drop(columns = target_col)
y = df[target_col]

# Baseline model

In [333]:
def eval_model(model_name:str, X:pd.DataFrame, y:pd.Series, 
               models:dict, test_size:float = 0.3, random_state:int = 42, 
               verbose:bool = False) -> tuple[str,float,float,float]:
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, 
        test_size = test_size, 
        random_state = random_state
    )  

    try:
        # Train model      
        model = models[model_name]
        model.fit(X_train, y_train)

        # Evaluate
        score_cv = cross_val_score(model, X_train, y_train, cv=5)
        score_test = model.score(X_test, y_test)
        if verbose:
            print(f'{model_name} \n'\
                f'mean cross-validation score: {score_cv.mean():0.4f} '\
                f'with a standard deviation of {score_cv.std():0.4f}\n'\
                f'test score: {score_test:0.4f}\n'\
                '-----')
        
        return (model_name, score_cv.mean(), score_cv.std(), score_test)
    
    except:
        return (model_name, 0, 0, 0)

In [334]:
models = {'Linear Regression': LinearRegression()}

In [335]:
eval_model('Linear Regression', X, y, models, verbose = True)

Linear Regression 
mean cross-validation score: 0.7517 with a standard deviation of 0.0366
test score: 0.8351
-----


('Linear Regression',
 0.7516632570579345,
 0.0365981173095549,
 0.8351348286803177)

# Model comparison

In [None]:
def eval_dataset(path:str, target_col:str, models:dict[str,object], 
                     drop_cols:list[str] = [], dropna:bool = True) -> pd.DataFrame:

    df = pd.read_csv(path)\
           .drop(columns = drop_cols)
    
    if dropna:
        df = df.dropna()

    X = df.drop(columns = target_col)
    y = df[target_col]

    model_scores = []

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        for m in models.keys():
            model_scores.append(eval_model(m, X, y, models, verbose = False))

    df_scores = pd.DataFrame(
        model_scores,
        columns = ['model_name','score_cv_mean','score_cv_std','score_test']
    ).sort_values(
        by = 'score_test',
        ascending = False
    )

    return df_scores

In [337]:
def get_best_models(root_file_name:str, target_col:str, 
                    models: dict[str,object],drop_cols:list[str] = []) -> pd.DataFrame:

    models_df = pd.DataFrame(columns = ['dataset', 'model_name','score_cv_mean','score_cv_std','score_test'])

    for i in range(2,5):

        file_name = f'{root_file_name}{i}'

        temp_df = eval_dataset(
            path = f'data/{file_name}.csv', 
            target_col = target_col, 
            models = models,
            drop_cols = drop_cols
        )

        temp_df['dataset'] = file_name

        models_df = pd.concat([models_df, temp_df])
    
    return models_df.sort_values(by = 'score_test', ascending = False)

In [None]:
models = {
    # Linear models
    'Gamma Regression': GammaRegressor(),

    # Bayesian
    'Bayesian Ridge': BayesianRidge(),
    
    # Decision Trees and Ensembles
    'Gradient Boosting': GradientBoostingRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Decision Tree': DecisionTreeRegressor(),
    'LightGBM': LGBMRegressor(),
    
    # Support Vector Machines
    'Linear Support Vector Regression': LinearSVR(),
    
    # Instance Based Learning
    'KNeighbors': KNeighborsRegressor()
}

In [342]:
get_best_models(
    root_file_name = 'clean_data_',
    target_col = 'Delivery_Time_min',
    models = models,
    drop_cols = ['Order_ID']
).drop_duplicates(subset = ['model_name'])

  models_df = pd.concat([models_df, temp_df])


Unnamed: 0,dataset,model_name,score_cv_mean,score_cv_std,score_test
6,clean_data_2,Linear Support Vector Regression,0.749421,0.029543,0.837113
1,clean_data_3,Bayesian Ridge,0.751822,0.035869,0.835099
2,clean_data_4,Gradient Boosting,0.724955,0.035503,0.770208
3,clean_data_4,Random Forest,0.693352,0.056056,0.768855
5,clean_data_4,LightGBM,0.717549,0.03974,0.761799
0,clean_data_2,Gamma Regression,0.723414,0.034846,0.759337
7,clean_data_2,KNeighbors,0.655945,0.061167,0.720834
4,clean_data_4,Decision Tree,0.439434,0.095677,0.379672


# Hyperparameter fine-tuning

In [None]:
def finetune_model(path:str, target_col: str, model, param_grid,
               drop_cols:list[str] = [], dropna:bool = True) -> dict:
    
    df = pd.read_csv(path)\
           .drop(columns = drop_cols)
    
    if dropna:
        df = df.dropna()

    X = df.drop(columns = target_col)
    y = df[target_col]

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, 
        test_size = 0.3, 
        random_state = 42
    ) 

    grid_search = GridSearchCV(
        model,
        param_grid,
        cv = 5, scoring = 'r2'
    )

    grid_search.fit(X_train,y_train)

    return grid_search.best_params_

## Linear Support Vector Regression

In [None]:
param_grid = {
    'C': [0.1, 1, 10, 100],
    'tol': [0.000001, 0.00001, 0.0001, 0.001, 0.01],
    'epsilon': [0, 0.01, 0.05, 0.1, 0.5],
    'loss': ['epsilon_insensitive', 'squared_epsilon_insensitive']
}

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    best_params = finetune_model(
        'data/clean_data_2.csv',
        'Delivery_Time_min',
        LinearSVR(),
        param_grid,
        drop_cols = ['Order_ID']
    )

print(best_params)

{'C': 0.1, 'epsilon': 0.01, 'loss': 'squared_epsilon_insensitive', 'tol': 1e-05}


## Bayesian Ridge

In [None]:
param_grid = {
    'alpha_1': [1e-7, 1e-6, 1e-5, 1e-4, 1e-3],
    'alpha_2': [1e-7, 1e-6, 1e-5, 1e-4, 1e-3],
    'lambda_1': [1e-7, 1e-6, 1e-5, 1e-4, 1e-3],
    'lambda_2': [1e-7, 1e-6, 1e-5, 1e-4, 1e-3]
}

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    best_params = finetune_model(
        'data/clean_data_2.csv',
        'Delivery_Time_min',
        BayesianRidge(),
        param_grid,
        drop_cols = ['Order_ID']
    )

print(best_params)

{'alpha_1': 0.001, 'alpha_2': 1e-07, 'lambda_1': 1e-07, 'lambda_2': 0.001}


In [419]:
models = {
    'Baseline Linear SVR': LinearSVR(),
    'Fine-tuned Linear SVR': LinearSVR(
        C = 0.1, epsilon = 0.01, tol = 1e-05,
        loss = 'squared_epsilon_insensitive'),
    
    'Baseline Bayesian Ridge': BayesianRidge(),
    'Fine-tuned Bayesian Ridge': BayesianRidge(
        alpha_1 = 0.001, alpha_2 = 1e-07, 
        lambda_1 = 1e-07, lambda_2 = 0.001)
}

In [422]:
get_best_models(
    root_file_name = 'clean_data_',
    target_col = 'Delivery_Time_min',
    models = models,
    drop_cols = ['Order_ID']
).drop_duplicates(subset = ['model_name'])

  models_df = pd.concat([models_df, temp_df])


Unnamed: 0,dataset,model_name,score_cv_mean,score_cv_std,score_test
0,clean_data_2,Baseline Linear SVR,0.749425,0.030659,0.837862
1,clean_data_2,Fine-tuned Linear SVR,0.751859,0.035624,0.835404
3,clean_data_3,Fine-tuned Bayesian Ridge,0.751822,0.035869,0.835099
2,clean_data_3,Baseline Bayesian Ridge,0.751822,0.035869,0.835099
