In [67]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

## Feature Selection

In [18]:
# Load the processed steps dataset (path is relative to the notebook's working directory).
steps = pd.read_csv(filepath_or_buffer="../data/processed/steps_with_regions.csv")

In [19]:
# Peek at the first five rows of the raw frame to check columns and value formats.
steps.head(n=5)

Unnamed: 0,route_id,delivery_id,action,step_order,distance_from_previous,total_size,cost,final_price,shipper_id,effective_dt,lat,lng,shipper_capacity,shipper_deliveries_completed,route_deliveries_count,time_from_previous,CODIGO,NOME
0,r-shiinc-1247126,t-petz-3171208,pickup,1,0.0,3,10.557,11.98,245014.0,2022-05-01 16:49:22,-19.9524,-43.9388,3,343,7.0,0.0,19,CENTRO-SUL
1,r-shiinc-1247126,t-petz-3171121,pickup,2,0.0,3,9.752,11.3,245014.0,2022-05-01 16:49:22,-19.9524,-43.9388,3,343,7.0,0.0,19,CENTRO-SUL
2,r-shiinc-1247126,t-petz-3171210,pickup,3,0.0,4,9.752,11.3,245014.0,2022-05-01 16:49:22,-19.9524,-43.9388,3,343,7.0,0.0,19,CENTRO-SUL
3,r-shiinc-1247126,t-petz-3171223,pickup,4,0.0,1,13.846,15.72,245014.0,2022-05-01 16:49:22,-19.9524,-43.9388,3,343,7.0,0.0,19,CENTRO-SUL
4,r-shiinc-1247126,t-petz-3171209,pickup,5,0.0,5,12.351,14.02,245014.0,2022-05-01 16:49:22,-19.9524,-43.9388,3,343,7.0,0.0,19,CENTRO-SUL


In [20]:
# Pairwise correlations (pandas default: Pearson) between the numeric columns.
# The numeric columns are selected explicitly because, from pandas 2.0 on,
# DataFrame.corr() no longer silently drops string columns such as
# route_id / action / NOME and raises instead.
steps.select_dtypes(include="number").corr()

Unnamed: 0,step_order,distance_from_previous,total_size,cost,final_price,shipper_id,lat,lng,shipper_capacity,shipper_deliveries_completed,route_deliveries_count,time_from_previous,CODIGO
step_order,1.0,-0.174906,-0.465992,-0.771874,-0.737494,0.109969,0.039568,-0.087461,-0.157335,0.363029,0.924417,0.01651,0.032152
distance_from_previous,-0.174906,1.0,0.088267,0.235152,0.233503,0.039879,0.069432,-0.092786,0.007399,-0.101709,-0.168981,0.208082,0.070658
total_size,-0.465992,0.088267,1.0,0.474134,0.469468,-0.066079,0.044109,0.06253,0.060378,-0.185495,-0.494238,-0.01904,0.022319
cost,-0.771874,0.235152,0.474134,1.0,0.953111,-0.129436,0.011586,0.036824,0.169052,-0.273685,-0.843287,-0.023454,0.029346
final_price,-0.737494,0.233503,0.469468,0.953111,1.0,-0.126102,0.002786,0.038774,0.147673,-0.274258,-0.802043,-0.013559,0.020773
shipper_id,0.109969,0.039879,-0.066079,-0.129436,-0.126102,1.0,0.117777,-0.07674,-0.055347,-0.314364,0.113175,0.03568,0.133912
lat,0.039568,0.069432,0.044109,0.011586,0.002786,0.117777,1.0,0.079593,-0.094973,-0.161497,0.023366,0.010096,0.564613
lng,-0.087461,-0.092786,0.06253,0.036824,0.038774,-0.07674,0.079593,1.0,0.065412,-0.068389,-0.089603,-0.02707,-0.442399
shipper_capacity,-0.157335,0.007399,0.060378,0.169052,0.147673,-0.055347,-0.094973,0.065412,1.0,-0.179099,-0.165459,0.052296,-0.087558
shipper_deliveries_completed,0.363029,-0.101709,-0.185495,-0.273685,-0.274258,-0.314364,-0.161497,-0.068389,-0.179099,1.0,0.391206,-0.077425,-0.104256


In [23]:
# Number of rows per NOME value, to see how the categories are balanced.
steps["NOME"].value_counts()

CENTRO-SUL    5903
OESTE         2291
PAMPULHA      1992
LESTE         1946
NORDESTE      1404
NOROESTE      1327
BARREIRO       659
VENDA NOVA     609
NORTE          454
Name: NOME, dtype: int64

In [21]:
# Same correlation matrix as a colour-coded table (red = positive, blue = negative).
# Restricting to numeric dtypes keeps the cell working on pandas >= 2.0, where
# corr() raises on string columns instead of dropping them silently.
steps.select_dtypes(include="number").corr().style.background_gradient(cmap="coolwarm")

Unnamed: 0,step_order,distance_from_previous,total_size,cost,final_price,shipper_id,lat,lng,shipper_capacity,shipper_deliveries_completed,route_deliveries_count,time_from_previous,CODIGO
step_order,1.0,-0.174906,-0.465992,-0.771874,-0.737494,0.109969,0.039568,-0.087461,-0.157335,0.363029,0.924417,0.01651,0.032152
distance_from_previous,-0.174906,1.0,0.088267,0.235152,0.233503,0.039879,0.069432,-0.092786,0.007399,-0.101709,-0.168981,0.208082,0.070658
total_size,-0.465992,0.088267,1.0,0.474134,0.469468,-0.066079,0.044109,0.06253,0.060378,-0.185495,-0.494238,-0.01904,0.022319
cost,-0.771874,0.235152,0.474134,1.0,0.953111,-0.129436,0.011586,0.036824,0.169052,-0.273685,-0.843287,-0.023454,0.029346
final_price,-0.737494,0.233503,0.469468,0.953111,1.0,-0.126102,0.002786,0.038774,0.147673,-0.274258,-0.802043,-0.013559,0.020773
shipper_id,0.109969,0.039879,-0.066079,-0.129436,-0.126102,1.0,0.117777,-0.07674,-0.055347,-0.314364,0.113175,0.03568,0.133912
lat,0.039568,0.069432,0.044109,0.011586,0.002786,0.117777,1.0,0.079593,-0.094973,-0.161497,0.023366,0.010096,0.564613
lng,-0.087461,-0.092786,0.06253,0.036824,0.038774,-0.07674,0.079593,1.0,0.065412,-0.068389,-0.089603,-0.02707,-0.442399
shipper_capacity,-0.157335,0.007399,0.060378,0.169052,0.147673,-0.055347,-0.094973,0.065412,1.0,-0.179099,-0.165459,0.052296,-0.087558
shipper_deliveries_completed,0.363029,-0.101709,-0.185495,-0.273685,-0.274258,-0.314364,-0.161497,-0.068389,-0.179099,1.0,0.391206,-0.077425,-0.104256


In [25]:
# Remove the columns that are not used as model features: identifiers
# (route_id, delivery_id, shipper_id), the region code CODIGO (the NOME column
# is kept instead), the timestamp, the raw coordinates, and cost.
# `cost` is ~0.95 correlated with `final_price` in the correlation matrix,
# which is presumably why only one of the two is kept.
#
# The result is reassigned rather than dropped in place, and already-missing
# columns are ignored, so re-running this cell does not raise KeyError.
steps = steps.drop(
    columns=[
        "cost",
        "route_id",
        "delivery_id",
        "CODIGO",
        "shipper_id",
        "effective_dt",
        "lat",
        "lng",
    ],
    errors="ignore",
)

In [26]:
# Confirm which columns remain after removing identifiers, timestamp, coordinates and cost.
steps.columns

Index(['action', 'step_order', 'distance_from_previous', 'total_size',
       'final_price', 'shipper_capacity', 'shipper_deliveries_completed',
       'route_deliveries_count', 'time_from_previous', 'NOME'],
      dtype='object')

In [29]:
# One-hot encode the two categorical columns, `action` and `NOME`.
# dtype=int pins the indicator columns to 0/1 integers; pandas >= 2.0 would
# otherwise emit booleans, which changes both the displayed frame and the
# values written to CSV at the end of the notebook.
# NOTE(review): every category level is kept (no drop_first). Later cells slice
# the feature matrix assuming exactly 11 indicator columns at the end, so the
# number of generated columns must not change without updating those slices.
steps = pd.get_dummies(data=steps, columns=["action", "NOME"], dtype=int)

In [31]:
# Inspect the first five rows after one-hot encoding to verify the new indicator columns.
steps.head(n=5)

Unnamed: 0,step_order,distance_from_previous,total_size,final_price,shipper_capacity,shipper_deliveries_completed,route_deliveries_count,time_from_previous,action_delivery,action_pickup,NOME_BARREIRO,NOME_CENTRO-SUL,NOME_LESTE,NOME_NORDESTE,NOME_NOROESTE,NOME_NORTE,NOME_OESTE,NOME_PAMPULHA,NOME_VENDA NOVA
0,1,0.0,3,11.98,3,343,7.0,0.0,0,1,0,1,0,0,0,0,0,0,0
1,2,0.0,3,11.3,3,343,7.0,0.0,0,1,0,1,0,0,0,0,0,0,0
2,3,0.0,4,11.3,3,343,7.0,0.0,0,1,0,1,0,0,0,0,0,0,0
3,4,0.0,1,15.72,3,343,7.0,0.0,0,1,0,1,0,0,0,0,0,0,0
4,5,0.0,5,14.02,3,343,7.0,0.0,0,1,0,1,0,0,0,0,0,0,0


In [56]:
# Standardizer for the continuous features: centre on the mean and scale to unit variance.
scaler = StandardScaler(with_mean=True, with_std=True)

In [50]:
# Regression target: the time_from_previous column.
y = steps.loc[:, "time_from_previous"]

In [34]:
# Feature matrix: every column except the regression target.
X = steps.drop(labels="time_from_previous", axis="columns")

In [35]:
# Check the first five rows of the feature matrix (the target column must be absent).
X.head(n=5)

Unnamed: 0,step_order,distance_from_previous,total_size,final_price,shipper_capacity,shipper_deliveries_completed,route_deliveries_count,action_delivery,action_pickup,NOME_BARREIRO,NOME_CENTRO-SUL,NOME_LESTE,NOME_NORDESTE,NOME_NOROESTE,NOME_NORTE,NOME_OESTE,NOME_PAMPULHA,NOME_VENDA NOVA
0,1,0.0,3,11.98,3,343,7.0,0,1,0,1,0,0,0,0,0,0,0
1,2,0.0,3,11.3,3,343,7.0,0,1,0,1,0,0,0,0,0,0,0
2,3,0.0,4,11.3,3,343,7.0,0,1,0,1,0,0,0,0,0,0,0
3,4,0.0,1,15.72,3,343,7.0,0,1,0,1,0,0,0,0,0,0,0
4,5,0.0,5,14.02,3,343,7.0,0,1,0,1,0,0,0,0,0,0,0


In [57]:
# Start the normalized feature frame from the last 11 columns of X, i.e. the
# one-hot indicators (2 action_* + 9 NOME_*), which must not be rescaled.
# .copy() makes X_NORM an independent frame: columns are assigned into it in
# the next cell, and assigning into a plain column selection of X triggers
# pandas' SettingWithCopyWarning.
X_NORM = X[X.columns[-11:]].copy()

In [58]:
# Standardize the continuous features (all columns of X before the 11 trailing
# one-hot indicators) and append them to X_NORM under their original names.
# Note that the scaler is fit on all rows here, i.e. before the train/test split below.
continuous_cols = X.columns[:-11]
X_NORM[continuous_cols] = scaler.fit_transform(X[continuous_cols])

In [59]:
# Display the assembled feature frame: indicator columns first, followed by the standardized continuous columns.
X_NORM

Unnamed: 0,action_delivery,action_pickup,NOME_BARREIRO,NOME_CENTRO-SUL,NOME_LESTE,NOME_NORDESTE,NOME_NOROESTE,NOME_NORTE,NOME_OESTE,NOME_PAMPULHA,NOME_VENDA NOVA,step_order,distance_from_previous,total_size,final_price,shipper_capacity,shipper_deliveries_completed,route_deliveries_count
0,0,1,0,1,0,0,0,0,0,0,0,-1.554513,-0.416467,0.830749,1.202486,-0.31618,-1.090767,-1.614371
1,0,1,0,1,0,0,0,0,0,0,0,-1.530916,-0.416467,0.830749,1.024599,-0.31618,-1.090767,-1.614371
2,0,1,0,1,0,0,0,0,0,0,0,-1.507318,-0.416467,2.024268,1.024599,-0.31618,-1.090767,-1.614371
3,0,1,0,1,0,0,0,0,0,0,0,-1.483720,-0.416467,-1.556288,2.180867,-0.31618,-1.090767,-1.614371
4,0,1,0,1,0,0,0,0,0,0,0,-1.460122,-0.416467,3.217787,1.736148,-0.31618,-1.090767,-1.614371
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16580,1,0,0,0,0,0,0,1,0,0,0,1.253621,-0.416467,-0.362769,-0.623475,-0.31618,-0.352265,0.641699
16581,1,0,0,0,0,0,0,1,0,0,0,1.277219,-0.150656,-0.362769,-0.675795,-0.31618,-0.352265,0.641699
16582,1,0,0,0,0,0,0,1,0,0,0,1.300817,-0.327863,-0.362769,-0.675795,-0.31618,-0.352265,0.641699
16583,1,0,0,0,0,0,0,1,0,0,0,1.324415,-0.416467,-0.362769,-0.675795,-0.31618,-0.352265,0.641699


In [60]:
# Split the *unscaled* features first and only then fit the scaler, using the
# training rows alone. Fitting the scaler on the full dataset (as done for
# X_NORM) lets test-set means/variances leak into the training features and
# makes the evaluation optimistic. With the same random_state and row count the
# train/test row partition is identical to splitting X_NORM.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

# Column layout mirrors X_NORM: the 11 trailing one-hot indicators are passed
# through untouched, and the remaining (continuous) columns are standardized.
_indicator_cols = list(X.columns[-11:])
_continuous_cols = list(X.columns[:-11])

# Refit so that `scaler` holds training-set statistics only.
scaler.fit(X_train[_continuous_cols])


def _scale_split(frame):
    """Return `frame` with indicator columns first and standardized continuous columns after."""
    scaled = frame[_indicator_cols].copy()
    scaled[_continuous_cols] = scaler.transform(frame[_continuous_cols])
    return scaled


X_train = _scale_split(X_train)
X_test = _scale_split(X_test)

In [63]:
# Baseline model: ordinary least squares with an intercept term.
model = LinearRegression(fit_intercept=True)

In [64]:
# Fit the baseline on the training split.
model.fit(X=X_train, y=y_train)

LinearRegression()

In [65]:
# Predict the target for the held-out rows.
y_pred = model.predict(X=X_test)

In [68]:
# Mean absolute error on the test split (same units as time_from_previous).
mean_absolute_error(y_true=y_test, y_pred=y_pred)

2459.519694533762

In [69]:
# Mean squared error on the test split (squared units of time_from_previous).
mean_squared_error(y_true=y_test, y_pred=y_pred)

60442007.32315113

In [70]:
# Coefficient of determination on the test split: share of target variance explained by the model.
r2_score(y_true=y_test, y_pred=y_pred)

0.06972068108807117

In [None]:
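# Conclusion
# A linear model does not fit this target: R^2 is about 0.07 on the test split,
# so these features have essentially no linear relationship with time_from_previous.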

In [71]:
# Persist the four splits as CSV files (without the index) so other notebooks
# can reuse exactly the same train/test partition.
split_outputs = {
    "../data/split_data/x_train.csv": X_train,
    "../data/split_data/x_test.csv": X_test,
    "../data/split_data/y_train.csv": y_train,
    "../data/split_data/y_test.csv": y_test,
}
for csv_path, split in split_outputs.items():
    split.to_csv(csv_path, index=False)