In [120]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, TweedieRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [79]:
path='../../data/data2/processed/'

In [80]:
steps = pd.read_csv(f"{path}steps_with_regions_2.1.csv")
columns_to_drop=["cost","route_id","delivery_id","CODIGO","shipper_id","effective_dt","lat","lng"]
steps.drop(columns=columns_to_drop, inplace=True)
steps=pd.get_dummies(data = steps, columns=["action","NOME"])

In [88]:
steps[['time_from_previous']]

Unnamed: 0,time_from_previous
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
181451,768.0
181452,514.0
181453,3458.0
181454,2620.0


In [81]:
all_categorical=['action_delivery',
       'action_pickup', 'NOME_BARREIRO', 'NOME_CENTRO-SUL', 'NOME_LESTE',
       'NOME_NORDESTE', 'NOME_NOROESTE', 'NOME_NORTE', 'NOME_OESTE',
       'NOME_PAMPULHA', 'NOME_VENDA NOVA']
all_continue=['step_order', 'distance_from_previous', 'total_size', 'final_price',
       'shipper_capacity', 'shipper_deliveries_completed',
       'route_deliveries_count', 'time_from_previous']

In [89]:
d_combinations = {
    'cb-1': {
        'categories_col': [
            'action_delivery',
            'action_pickup', 'NOME_BARREIRO', 'NOME_CENTRO-SUL', 'NOME_LESTE',
            'NOME_NORDESTE', 'NOME_NOROESTE', 'NOME_NORTE', 'NOME_OESTE',
            'NOME_PAMPULHA', 'NOME_VENDA NOVA'
        ],
        'continue_cols': [
            'step_order', 'distance_from_previous', 'total_size', 'final_price',
            'shipper_capacity', 'shipper_deliveries_completed',
            'route_deliveries_count'
        ]
    },
    'cb-2': {
        'categories_col': [
            'action_delivery',
            'action_pickup'
        ],
        'continue_cols': [
            'step_order', 'distance_from_previous', 'total_size', 'final_price',
            'shipper_capacity', 'shipper_deliveries_completed',
            'route_deliveries_count'
        ]
    },
    'cb-3': {
        'categories_col': [
            'action_delivery',
            'action_pickup'
        ],
        'continue_cols': [
            'step_order', 'distance_from_previous', 'total_size', 'final_price',
            'shipper_capacity', 'shipper_deliveries_completed',
            'route_deliveries_count'
        ]
    },
    'cb-3': {
        'categories_col': [
            'action_delivery',
            'action_pickup'
        ],
        'continue_cols': [
            'distance_from_previous', 'final_price',
            'shipper_deliveries_completed',
            'route_deliveries_count'
        ]
    },
    'cb-4': {
        'categories_col': [
            'action_delivery',
            'action_pickup',
            'NOME_BARREIRO', 'NOME_CENTRO-SUL', 'NOME_LESTE',
            'NOME_NORDESTE', 'NOME_NOROESTE', 'NOME_NORTE', 'NOME_OESTE',
            'NOME_PAMPULHA', 'NOME_VENDA NOVA'
        ],
        'continue_cols': [
            'distance_from_previous', 'final_price',
            'shipper_deliveries_completed',
            'route_deliveries_count'
        ]
    }
}


In [110]:
def create_model(df,continue_cols,categories_col,target_col,model):
    scaler = StandardScaler()
    y=df[target_col]
    X=df.drop(columns=[target_col])

    X_NORM = X.loc[:,categories_col]
    X_NORM[continue_cols] = scaler.fit_transform(X.loc[:,continue_cols])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
    # model = LinearRegression()
    model.fit(X_train,y_train)
    return model,X_test, y_test


def metrics(y_pred, y_test):
    return {
        'MSE' : mean_squared_error(y_pred,y_test),
        'MAE' : mean_absolute_error(y_pred,y_test),
        'SQRT_MSE' : np.sqrt(mean_squared_error(y_pred,y_test)),
        'R2_SCORE': r2_score(y_test,y_pred)
    }

In [123]:
def generate_models_metrics(d_combinations,type_model):
    d_models={}
    d_metrics={}
    if type_model=='regression':
        model=LinearRegression()
    if type_model=='ridge':
        model=Ridge()
    if type_model=='lasso':
        model=Lasso()
    if type_model=='elasticnet':
        model=ElasticNet()
    if type_model=='tweedie':
        model=TweedieRegressor(power=1, alpha=0.5, link='log',max_iter=1000)
    for key,value in d_combinations.items():
        d_models[key]=create_model(steps,value['continue_cols'],value['categories_col'],'time_from_previous',model)
    for key,value in d_models.items():
        d_metrics.setdefault(key,metrics(value[0].predict(value[1]),value[2]))
    return d_models,pd.DataFrame(d_metrics)

In [116]:
generate_models_metrics(d_combinations,'regression')[1]

Unnamed: 0,cb-1,cb-2,cb-3,cb-4
MSE,43358950.0,43358950.0,43358950.0,43358950.0
MAE,1937.312,1937.312,1937.312,1937.312
SQRT_MSE,6584.751,6584.751,6584.751,6584.751
R2_SCORE,0.05790714,0.05790714,0.05790714,0.05790714


In [117]:
generate_models_metrics(d_combinations,'ridge')[1]

Unnamed: 0,cb-1,cb-2,cb-3,cb-4
MSE,43358950.0,43358950.0,43358950.0,43358950.0
MAE,1937.309,1937.309,1937.309,1937.309
SQRT_MSE,6584.751,6584.751,6584.751,6584.751
R2_SCORE,0.05790715,0.05790715,0.05790715,0.05790715


In [118]:
generate_models_metrics(d_combinations,'lasso')[1]

Unnamed: 0,cb-1,cb-2,cb-3,cb-4
MSE,43358250.0,43358250.0,43358250.0,43358250.0
MAE,1936.043,1936.043,1936.043,1936.043
SQRT_MSE,6584.698,6584.698,6584.698,6584.698
R2_SCORE,0.05792232,0.05792232,0.05792232,0.05792232


In [119]:
generate_models_metrics(d_combinations,'elasticnet')[1]

Unnamed: 0,cb-1,cb-2,cb-3,cb-4
MSE,43436860.0,43436860.0,43436860.0,43436860.0
MAE,1925.745,1925.745,1925.745,1925.745
SQRT_MSE,6590.665,6590.665,6590.665,6590.665
R2_SCORE,0.05621423,0.05621423,0.05621423,0.05621423


In [124]:
generate_models_metrics(d_combinations,'tweedie')[1]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


Unnamed: 0,cb-1,cb-2,cb-3,cb-4
MSE,44075110.0,44075110.0,44075110.0,44075110.0
MAE,1950.31,1950.31,1950.31,1950.31
SQRT_MSE,6638.908,6638.908,6638.908,6638.908
R2_SCORE,0.04234664,0.04234664,0.04234664,0.04234664
