In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

## Feature Selection

In [2]:
path='../../data/data2/processed/'

In [3]:
steps = pd.read_csv(f"{path}steps_with_regions_2.csv")

In [4]:
steps.head()

Unnamed: 0,route_id,delivery_id,action,step_order,distance_from_previous,total_size,cost,final_price,shipper_id,effective_dt,lat,lng,shipper_capacity,shipper_deliveries_completed,route_deliveries_count,time_from_previous,CODIGO,NOME
0,r-shiinc-1011382,t-petz-2548122,pickup,1,0.0,1,6.48,10.0,170162.0,2022-01-03 12:30:51,-19.9627,-43.957,3.0,1736.0,20.0,0.0,24,OESTE
1,r-shiinc-1011382,t-petz-2547366,pickup,2,0.0,4,6.48,10.0,170162.0,2022-01-03 12:30:51,-19.9627,-43.957,3.0,1736.0,20.0,0.0,24,OESTE
2,r-shiinc-1011382,t-petz-2546288,pickup,3,0.0,3,6.48,10.0,170162.0,2022-01-03 12:30:51,-19.9627,-43.957,3.0,1736.0,20.0,0.0,24,OESTE
3,r-shiinc-1011382,t-petz-2546397,pickup,4,0.0,1,6.48,10.0,170162.0,2022-01-03 12:30:51,-19.9627,-43.957,3.0,1736.0,20.0,0.0,24,OESTE
4,r-shiinc-1011382,t-petz-2547114,pickup,5,0.0,3,6.48,10.0,170162.0,2022-01-03 12:30:51,-19.9627,-43.957,3.0,1736.0,20.0,0.0,24,OESTE


In [5]:
steps.corr()

Unnamed: 0,step_order,distance_from_previous,total_size,cost,final_price,shipper_id,lat,lng,shipper_capacity,shipper_deliveries_completed,route_deliveries_count,time_from_previous,CODIGO
step_order,1.0,-0.163549,-0.444896,-0.703323,-0.700385,0.111176,0.000427,-0.111491,-0.037509,0.407261,0.928319,0.014109,0.021693
distance_from_previous,-0.163549,1.0,0.064077,0.235608,0.225911,0.029728,0.068246,-0.09299,-0.001281,-0.098472,-0.159699,0.223219,0.071478
total_size,-0.444896,0.064077,1.0,0.425529,0.432299,-0.076605,0.030921,0.083047,0.007658,-0.197353,-0.470718,-0.017094,0.002308
cost,-0.703323,0.235608,0.425529,1.0,0.908485,-0.096344,0.024971,0.063699,0.064186,-0.306003,-0.764406,-0.007477,0.0114
final_price,-0.700385,0.225911,0.432299,0.908485,1.0,-0.104769,0.019428,0.066851,0.044225,-0.312861,-0.75636,-0.005106,0.005643
shipper_id,0.111176,0.029728,-0.076605,-0.096344,-0.104769,1.0,0.048583,-0.084192,0.08368,-0.290479,0.109268,0.021617,0.081399
lat,0.000427,0.068246,0.030921,0.024971,0.019428,0.048583,1.0,0.08337,-0.038575,-0.144457,-0.010844,0.02164,0.564361
lng,-0.111491,-0.09299,0.083047,0.063699,0.066851,-0.084192,0.08337,1.0,-0.055395,-0.045057,-0.109746,-0.035183,-0.444434
shipper_capacity,-0.037509,-0.001281,0.007658,0.064186,0.044225,0.08368,-0.038575,-0.055395,1.0,-0.089137,-0.032792,0.014636,0.026415
shipper_deliveries_completed,0.407261,-0.098472,-0.197353,-0.306003,-0.312861,-0.290479,-0.144457,-0.045057,-0.089137,1.0,0.439738,-0.048645,-0.073649


In [6]:
steps.NOME.value_counts()

CENTRO-SUL    63513
OESTE         24699
PAMPULHA      21502
LESTE         21477
NORDESTE      17245
NOROESTE      14272
VENDA NOVA     6631
BARREIRO       6533
NORTE          5584
Name: NOME, dtype: int64

In [7]:
steps.corr().style.background_gradient(cmap='coolwarm')

Unnamed: 0,step_order,distance_from_previous,total_size,cost,final_price,shipper_id,lat,lng,shipper_capacity,shipper_deliveries_completed,route_deliveries_count,time_from_previous,CODIGO
step_order,1.0,-0.163549,-0.444896,-0.703323,-0.700385,0.111176,0.000427,-0.111491,-0.037509,0.407261,0.928319,0.014109,0.021693
distance_from_previous,-0.163549,1.0,0.064077,0.235608,0.225911,0.029728,0.068246,-0.09299,-0.001281,-0.098472,-0.159699,0.223219,0.071478
total_size,-0.444896,0.064077,1.0,0.425529,0.432299,-0.076605,0.030921,0.083047,0.007658,-0.197353,-0.470718,-0.017094,0.002308
cost,-0.703323,0.235608,0.425529,1.0,0.908485,-0.096344,0.024971,0.063699,0.064186,-0.306003,-0.764406,-0.007477,0.0114
final_price,-0.700385,0.225911,0.432299,0.908485,1.0,-0.104769,0.019428,0.066851,0.044225,-0.312861,-0.75636,-0.005106,0.005643
shipper_id,0.111176,0.029728,-0.076605,-0.096344,-0.104769,1.0,0.048583,-0.084192,0.08368,-0.290479,0.109268,0.021617,0.081399
lat,0.000427,0.068246,0.030921,0.024971,0.019428,0.048583,1.0,0.08337,-0.038575,-0.144457,-0.010844,0.02164,0.564361
lng,-0.111491,-0.09299,0.083047,0.063699,0.066851,-0.084192,0.08337,1.0,-0.055395,-0.045057,-0.109746,-0.035183,-0.444434
shipper_capacity,-0.037509,-0.001281,0.007658,0.064186,0.044225,0.08368,-0.038575,-0.055395,1.0,-0.089137,-0.032792,0.014636,0.026415
shipper_deliveries_completed,0.407261,-0.098472,-0.197353,-0.306003,-0.312861,-0.290479,-0.144457,-0.045057,-0.089137,1.0,0.439738,-0.048645,-0.073649


In [8]:
steps.drop(columns=["cost","route_id","delivery_id","CODIGO","shipper_id","effective_dt","lat","lng"], inplace=True)

In [9]:
steps.columns

Index(['action', 'step_order', 'distance_from_previous', 'total_size',
       'final_price', 'shipper_capacity', 'shipper_deliveries_completed',
       'route_deliveries_count', 'time_from_previous', 'NOME'],
      dtype='object')

In [10]:
steps = pd.get_dummies(data = steps, columns=["action","NOME"])

In [11]:
steps.head()

Unnamed: 0,step_order,distance_from_previous,total_size,final_price,shipper_capacity,shipper_deliveries_completed,route_deliveries_count,time_from_previous,action_delivery,action_pickup,NOME_BARREIRO,NOME_CENTRO-SUL,NOME_LESTE,NOME_NORDESTE,NOME_NOROESTE,NOME_NORTE,NOME_OESTE,NOME_PAMPULHA,NOME_VENDA NOVA
0,1,0.0,1,10.0,3.0,1736.0,20.0,0.0,0,1,0,0,0,0,0,0,1,0,0
1,2,0.0,4,10.0,3.0,1736.0,20.0,0.0,0,1,0,0,0,0,0,0,1,0,0
2,3,0.0,3,10.0,3.0,1736.0,20.0,0.0,0,1,0,0,0,0,0,0,1,0,0
3,4,0.0,1,10.0,3.0,1736.0,20.0,0.0,0,1,0,0,0,0,0,0,1,0,0
4,5,0.0,3,10.0,3.0,1736.0,20.0,0.0,0,1,0,0,0,0,0,0,1,0,0


In [12]:
scaler = StandardScaler()

In [13]:
y = steps["time_from_previous"]

In [14]:
X = steps.drop(columns=["time_from_previous"])

In [15]:
X.head()

Unnamed: 0,step_order,distance_from_previous,total_size,final_price,shipper_capacity,shipper_deliveries_completed,route_deliveries_count,action_delivery,action_pickup,NOME_BARREIRO,NOME_CENTRO-SUL,NOME_LESTE,NOME_NORDESTE,NOME_NOROESTE,NOME_NORTE,NOME_OESTE,NOME_PAMPULHA,NOME_VENDA NOVA
0,1,0.0,1,10.0,3.0,1736.0,20.0,0,1,0,0,0,0,0,0,1,0,0
1,2,0.0,4,10.0,3.0,1736.0,20.0,0,1,0,0,0,0,0,0,1,0,0
2,3,0.0,3,10.0,3.0,1736.0,20.0,0,1,0,0,0,0,0,0,1,0,0
3,4,0.0,1,10.0,3.0,1736.0,20.0,0,1,0,0,0,0,0,0,1,0,0
4,5,0.0,3,10.0,3.0,1736.0,20.0,0,1,0,0,0,0,0,0,1,0,0


In [16]:
X_NORM = X[X.columns[-11:]]

In [17]:
X_NORM[X.columns[:-11]] = scaler.fit_transform(X.iloc[:,:-11])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_NORM[X.columns[:-11]] = scaler.fit_transform(X.iloc[:,:-11])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_NORM[X.columns[:-11]] = scaler.fit_transform(X.iloc[:,:-11])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_NORM[X.columns[:-11]] = scaler.fit_transform(X.iloc[:,:-11])
A value is tryin

In [18]:
X_NORM

Unnamed: 0,action_delivery,action_pickup,NOME_BARREIRO,NOME_CENTRO-SUL,NOME_LESTE,NOME_NORDESTE,NOME_NOROESTE,NOME_NORTE,NOME_OESTE,NOME_PAMPULHA,NOME_VENDA NOVA,step_order,distance_from_previous,total_size,final_price,shipper_capacity,shipper_deliveries_completed,route_deliveries_count
0,0,1,0,0,0,0,0,0,1,0,0,-1.500528,-0.416704,-1.556155,0.790015,-0.193204,-0.894619,-1.028844
1,0,1,0,0,0,0,0,0,1,0,0,-1.477234,-0.416704,2.014850,0.790015,-0.193204,-0.894619,-1.028844
2,0,1,0,0,0,0,0,0,1,0,0,-1.453940,-0.416704,0.824515,0.790015,-0.193204,-0.894619,-1.028844
3,0,1,0,0,0,0,0,0,1,0,0,-1.430646,-0.416704,-1.556155,0.790015,-0.193204,-0.894619,-1.028844
4,0,1,0,0,0,0,0,0,1,0,0,-1.407352,-0.416704,0.824515,0.790015,-0.193204,-0.894619,-1.028844
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181451,1,0,0,0,0,0,0,1,0,0,0,-0.522178,-0.006973,-0.365820,-0.635508,-0.193204,1.275561,-0.826779
181452,1,0,0,0,0,0,0,1,0,0,0,-0.498883,0.539334,-0.365820,-0.672572,-0.193204,1.275561,-0.826779
181453,1,0,0,0,0,0,0,1,0,0,0,-0.475589,1.176693,-0.365820,-0.424531,-0.193204,1.275561,-0.826779
181454,1,0,0,0,0,0,0,1,0,0,0,-0.429001,0.129604,-0.365820,-0.672572,-0.193204,1.275561,-0.826779


In [19]:
X_train, X_test, y_train, y_test = train_test_split(X_NORM, y, test_size=0.30, random_state=42)

In [63]:
model = LinearRegression()

In [64]:
model.fit(X_train, y_train)

LinearRegression()

In [65]:
y_pred = model.predict(X_test)

In [68]:
mean_absolute_error(y_test,y_pred)

2459.519694533762

In [69]:
mean_squared_error(y_test, y_pred)

60442007.32315113

In [70]:
r2_score(y_test,y_pred)

0.06972068108807117

In [None]:
#Conclusion
#NO hay regresion lineal xd

In [20]:
path_save='../../data/data2/slipdata/'

In [71]:
X_train.to_csv(f"{path_save}x_train.csv",index=False)
X_test.to_csv(f"{path_save}x_test.csv",index=False)
y_train.to_csv(f"{path_save}y_train.csv",index=False)
y_test.to_csv(f"{path_save}y_test.csv",index=False)