In [11]:
import logging
import os

import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Configure logging for the notebook: INFO level, records formatted as
# "timestamp - logger name - level - message".
logging.basicConfig(level=logging.INFO,format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Logger named after the current module.
logger = logging.getLogger(__name__)

## data_preprocessing

In [3]:
def load_data(filepath:str)-> pd.DataFrame:
    """
    Read the CSV located at `filepath` and return its contents as a DataFrame.
    """
    return pd.read_csv(filepath)

In [27]:
def preprocess_data(df:pd.DataFrame, random_state:int = 42,
                    preprocessor_path:str = "../models/preprocessor.pkl"):
    """
    Splits the data and fits the preprocessing pipeline on the training split.

    Steps:
    - Separate the target column ``Delivery_Time_min`` from the features.
    - Divide into training (80%) and test (20%) sets.
    - Impute missing values (median for numeric, most frequent for categorical).
    - Scale numerical variables and one-hot encode categorical variables.
    - Save the fitted preprocessor with joblib.

    Parameters
    ----------
    df : pd.DataFrame
        Raw data containing ``Delivery_Time_min`` plus the numerical and
        categorical feature columns listed in the body. Other columns are
        dropped by the ColumnTransformer (``remainder='drop'``).
    random_state : int
        Seed forwarded to ``train_test_split``.
    preprocessor_path : str
        Where the fitted preprocessor is written. Missing parent directories
        are created.

    Returns
    -------
    tuple
        ``(X_train_processed_df, X_test_processed_df, y_train, y_test)``.
        The processed frames keep the row index of their split, so they stay
        aligned with ``y_train`` / ``y_test``.
    """
    target_column = 'Delivery_Time_min'

    ## Identify numeric and categorical columns
    numerical_features = ['Distance_km', 'Preparation_Time_min', 'Courier_Experience_yrs']
    categorical_features = ['Weather', 'Traffic_Level', 'Time_of_Day', 'Vehicle_Type']

    ## Create transformers for preprocessing
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    ## Create a preprocessor that applies the transformations
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        remainder='drop',
        # Always return a dense array: the result is wrapped in a DataFrame
        # below, which would not work with a sparse matrix.
        sparse_threshold=0,
    )

    X = df.drop(columns=[target_column])
    y = df[target_column]

    ## Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

    ## Fit on the training split only, then apply to both splits
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)
    logging.info("Data successfully preprocessed")

    ## Save the preprocessor
    os.makedirs(os.path.dirname(preprocessor_path) or ".", exist_ok=True)
    joblib.dump(preprocessor, preprocessor_path)
    logging.info("Preprocessor saved successfully")

    ## Recover output column names; look the fitted steps up by name rather
    ## than by position so reordering the transformers cannot break this
    ohe = preprocessor.named_transformers_['cat'].named_steps['onehot']
    ohe_feature_names = ohe.get_feature_names_out(categorical_features)
    all_feature_names = numerical_features + list(ohe_feature_names)

    ## Convert the processed data with the new columns, preserving each
    ## split's index so the rows still line up with y_train / y_test
    X_train_processed_df = pd.DataFrame(X_train_processed, columns=all_feature_names, index=X_train.index)
    X_test_processed_df = pd.DataFrame(X_test_processed, columns=all_feature_names, index=X_test.index)

    return X_train_processed_df, X_test_processed_df, y_train, y_test


In [42]:
# Load the raw delivery dataset and build the processed train/test splits.
df = load_data("../data/Food_Delivery_Times.csv")
X_train, X_test, y_train, y_test = preprocess_data(df)

2025-06-30 22:26:33,415 - root - INFO - Data successfully preprocessed
2025-06-30 22:26:33,421 - root - INFO - Preprocessor saved successfully


## model_training

In [100]:
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer
from sklearn.model_selection import cross_val_score, KFold
import optuna
import joblib
from functools import partial
from collections import defaultdict
import os
import logging

# Same logging configuration call as in the preprocessing cell.
logging.basicConfig(level=logging.INFO,format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Set Optuna's own logging verbosity to WARNING.
optuna.logging.set_verbosity(optuna.logging.WARNING)
# current_script_directory = os.path.dirname(os.path.abspath(__file__))

# Lowest cross-validated MAE recorded so far for each model name; a name that
# has not been seen yet reads as +inf, so its first trial always registers.
best_model_scores = defaultdict(lambda: float("inf"))
# Trial parameters that produced the corresponding entry in best_model_scores.
best_model_params = {}

def objective(trial,X,y):
    """
    Optuna objective: sample a model family and its hyperparameters, then
    return the mean 5-fold cross-validated MAE of that model on (X, y).

    Side effect: whenever the trial beats the lowest MAE recorded so far for
    its model family, the module-level ``best_model_scores`` and
    ``best_model_params`` are updated.

    Parameters
    ----------
    trial : optuna trial
        Source of the suggested model name and hyperparameters.
    X, y :
        Features and target passed to ``cross_val_score``.

    Returns
    -------
    float
        Mean MAE across the 5 folds (lower is better).

    Raises
    ------
    ValueError
        If the sampled model name has no matching branch.
    """
    model_name = trial.suggest_categorical("model", ["elasticnet","random_forest", "svm","lgbm", "xgb"])

    # Single if/elif chain: exactly one branch builds the estimator.
    if model_name == "elasticnet":
        alpha = trial.suggest_float("alpha", 1e-4, 10.0, log=True)
        l1_ratio = trial.suggest_float("l1_ratio", 0.0, 1.0)
        model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio,max_iter=5000, random_state=42)

    elif model_name == "random_forest":
        n_estimators = trial.suggest_int("rf_n_estimators", 100, 1000)
        max_depth = trial.suggest_int("rf_max_depth", 3, 30)
        model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)

    elif model_name == "svm":
        C = trial.suggest_float("svm_C", 0.1, 100.0, log=True)
        epsilon = trial.suggest_float("svm_epsilon", 0.01, 1.0, log=True)
        kernel = trial.suggest_categorical("svm_kernel", ["linear", "rbf"])
        model = SVR(C=C, epsilon=epsilon, kernel=kernel)

    elif model_name == "xgb":
        n_estimators = trial.suggest_int("xgb_n_estimators", 100, 1000)
        max_depth = trial.suggest_int("xgb_max_depth", 3, 30)
        learning_rate = trial.suggest_float("xgb_lr", 0.01, 0.3)
        model = XGBRegressor(n_estimators=n_estimators, max_depth=max_depth,
                             learning_rate=learning_rate, random_state=42,
                             objective="reg:squarederror", verbosity=0)

    elif model_name == "lgbm":
        n_estimators = trial.suggest_int("lgb_n_estimators", 100, 1000)
        max_depth = trial.suggest_int("lgb_max_depth", 3, 30)
        learning_rate = trial.suggest_float("lgb_lr", 0.01, 0.3)
        model = LGBMRegressor(n_estimators=n_estimators, max_depth=max_depth,
                              learning_rate=learning_rate, random_state=42)

    else:
        # Fail loudly instead of hitting an unbound `model` further down.
        raise ValueError(f"Unsupported model: {model_name!r}")

    ## Cross-validation MAE
    ## make_scorer with its defaults leaves the MAE positive, so lower is better
    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    mae = cross_val_score(model, X, y, cv=cv, scoring=make_scorer(mean_absolute_error)).mean()

    ## Save if it's better for that model
    if mae < best_model_scores[model_name]:
        best_model_scores[model_name] = mae
        best_model_params[model_name] = trial.params

    return mae


In [101]:
def build_best_model(name, params):
    """
    Build an unfitted estimator for the model family `name` from the
    hyperparameters found during the Optuna search.

    Parameters
    ----------
    name : str
        One of "elasticnet", "random_forest", "svm", "lgbm", "xgb".
    params : dict
        Trial parameters, keyed by the names used in `objective`
        (e.g. "rf_n_estimators", "svm_C", "xgb_lr").

    Returns
    -------
    The configured, unfitted estimator.

    Raises
    ------
    ValueError
        If `name` is not one of the supported model families.
    """
    if name == "elasticnet":
        # max_iter matches the value used when the model was evaluated in
        # `objective`, so the final model is the same one that was scored.
        return ElasticNet(alpha=params['alpha'], l1_ratio=params['l1_ratio'],
                          max_iter=5000, random_state=42)
    elif name == "random_forest":
        return RandomForestRegressor(n_estimators=params['rf_n_estimators'],
                                     max_depth=params['rf_max_depth'], random_state=42)
    elif name == "svm":
        return SVR(C=params['svm_C'], epsilon=params['svm_epsilon'], kernel=params['svm_kernel'])
    elif name == "lgbm":
        return LGBMRegressor(n_estimators=params['lgb_n_estimators'],
                             max_depth=params['lgb_max_depth'],
                             learning_rate=params['lgb_lr'],
                             random_state=42)
    elif name == "xgb":
        return XGBRegressor(n_estimators=params['xgb_n_estimators'],
                            max_depth=params['xgb_max_depth'],
                            learning_rate=params['xgb_lr'],
                            random_state=42,
                            objective="reg:squarederror", verbosity=0)

    # Previously an unknown name fell through and returned None.
    raise ValueError(f"Unsupported model: {name!r}")

In [None]:
def train_model(X_train_processed, y_train, n_trials=50, random_state=42,
                model_path="../models/model.pkl"):
    """
    Uses optuna for hyperparameter tuning, then refits and saves the best model.

    Parameters
    ----------
    X_train_processed, y_train :
        Training features and target, used both for the cross-validated
        search and for the final fit.
    n_trials : int
        Number of Optuna trials to run.
    random_state : int or None
        Seed for the Optuna sampler so the search is reproducible. Pass None
        for a non-deterministic search.
    model_path : str
        Where the fitted model is written. Missing parent directories are
        created.

    Returns
    -------
    The best estimator, fitted on the full training data.
    """
    # Reset the per-model bookkeeping so a re-run does not report stale
    # results left over from a previous study.
    best_model_scores.clear()
    best_model_params.clear()

    objective_with_data = partial(objective, X=X_train_processed, y=y_train)

    logging.info("Hyperparameterization with OPTUNA")
    sampler = optuna.samplers.TPESampler(seed=random_state)
    study = optuna.create_study(direction="minimize", sampler=sampler)
    study.optimize(objective_with_data, n_trials=n_trials)
    logging.info("Training completed")

    print("Best MAE by model")
    for model_name, mae in best_model_scores.items():
        print(f"  {model_name}: {mae:.4f}")

    print("Details best model")
    best_model_name = study.best_params['model']
    print(f"  Model: {best_model_name}")
    print(f"  MAE: {study.best_value:.4f}")
    print(f"  Hyperparameters: {study.best_params}")

    # Refit the winning configuration on the whole training set.
    final_model = build_best_model(best_model_name, study.best_params)
    final_model.fit(X_train_processed, y_train)

    os.makedirs(os.path.dirname(model_path) or ".", exist_ok=True)
    joblib.dump(final_model, model_path)
    logging.info("Model saved successfully")

    return final_model

In [None]:
# Run the hyperparameter search on the processed training split and keep the fitted best model.
model = train_model(X_train,y_train)

2025-07-01 00:23:11,598 - root - INFO - Hyperparameterization with OPTUNA
2025-07-01 00:24:34,440 - root - INFO - Training completed
2025-07-01 00:24:34,502 - root - INFO - Model saved successfully


Best MAE by model
  elasticnet: 6.8095
  xgb: 8.4059
  svm: 6.5549
  lgbm: 8.5493
  random_forest: 7.9636
Details best model
  Modelo: svm
  MAE: 6.5549
  Hiperparámetros: {'model': 'svm', 'svm_C': 5.7345428219985575, 'svm_epsilon': 0.057104846911244816, 'svm_kernel': 'linear'}


: 

In [1]:
!pip install shap

Collecting shap
  Downloading shap-0.48.0-cp313-cp313-win_amd64.whl.metadata (25 kB)
Collecting slicer==0.0.8 (from shap)
  Downloading slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Collecting numba>=0.54 (from shap)
  Downloading numba-0.61.2-cp313-cp313-win_amd64.whl.metadata (2.8 kB)
Collecting cloudpickle (from shap)
  Downloading cloudpickle-3.1.1-py3-none-any.whl.metadata (7.1 kB)
Collecting llvmlite<0.45,>=0.44.0dev0 (from numba>=0.54->shap)
  Downloading llvmlite-0.44.0-cp313-cp313-win_amd64.whl.metadata (5.0 kB)
Collecting numpy (from shap)
  Downloading numpy-2.2.6-cp313-cp313-win_amd64.whl.metadata (60 kB)
Downloading shap-0.48.0-cp313-cp313-win_amd64.whl (545 kB)
   ---------------------------------------- 0.0/545.1 kB ? eta -:--:--
   --------------------------------------- 545.1/545.1 kB 20.9 MB/s eta 0:00:00
Downloading slicer-0.0.8-py3-none-any.whl (15 kB)
Downloading numba-0.61.2-cp313-cp313-win_amd64.whl (2.8 MB)
   ---------------------------------------- 0.0/2.8 M

  You can safely remove it manually.
  You can safely remove it manually.

[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
