In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
from itertools import product
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import numpy as np
from sklearn.metrics import mean_squared_error , r2_score
from math import sqrt
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor, XGBClassifier
import xgboost as xgb
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,Carrier,Item Quantity,City,Country,Status,Weight,Shipping Date,Delivery Date,Delivery Duration
0,A,2.0,Jeddah,SA,Delivered,0.0625,2021-09-30,2021-10-02,2
1,A,2.0,Madinah,SA,Delivered,0.345,2021-09-30,2021-10-03,3
2,A,3.0,Makkah,SA,Delivered,2.1,2021-09-30,2021-10-03,3
3,B,1.0,Riyadh,SA,Delivered,0.7,2021-09-30,2021-10-03,3
4,B,2.0,Riyadh,SA,Delivered,1.4,2021-09-30,2021-10-02,2


In [3]:
df['Carrier'] = df['Carrier'].astype("category")
df['City'] = df['City'].astype("category")
df['Shipping Date'] =  pd.to_datetime(df['Shipping Date'])
df['Year'] = df['Shipping Date'].dt.year.astype("category")
df['Month'] = df['Shipping Date'].dt.month.astype("category")
df['Day'] = df['Shipping Date'].dt.day.astype("category")
df['DayOfWeek'] = df['Shipping Date'].dt.dayofweek.astype("category")

In [4]:
df.drop(['Country', 'Status', 'Delivery Date','Item Quantity', 'Shipping Date'], axis=1, inplace=True)
df.head()

Unnamed: 0,Carrier,City,Weight,Delivery Duration,Year,Month,Day,DayOfWeek
0,A,Jeddah,0.0625,2,2021,9,30,3
1,A,Madinah,0.345,3,2021,9,30,3
2,A,Makkah,2.1,3,2021,9,30,3
3,B,Riyadh,0.7,3,2021,9,30,3
4,B,Riyadh,1.4,2,2021,9,30,3


In [5]:
df_fast = df[df['Delivery Duration'] <= 7].copy()
df_slow = df[df['Delivery Duration'] > 7].copy()

In [8]:
df_fast['Delivery Duration'].value_counts()

Delivery Duration
1    58897
0    40770
2    38303
3    16637
4     6482
5     2631
6     1308
7      790
Name: count, dtype: int64

In [60]:
y_f = df_fast['Delivery Duration']
X_f = df_fast.drop('Delivery Duration', axis=1)
X_train, X_test, y_train, y_test = train_test_split(
X_f, y_f, test_size=0.2, random_state=42)
xgb_model = XGBRegressor(enable_categorical=True)
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)
rmse = mean_squared_error(y_test, xgb_preds, squared=False)
print(f"RMSE: {rmse}")

RMSE: 0.8524072532376933


In [44]:
X_train.dtypes

Carrier      category
City         category
Weight        float64
Year         category
Month        category
Day          category
DayOfWeek    category
dtype: object

In [61]:
model = XGBRegressor(enable_categorical=True)
model.fit(X_f, y_f, 
          eval_set=[(X_train, y_train), (X_test, y_test)])

[0]	validation_0-rmse:1.16008	validation_1-rmse:1.16585
[1]	validation_0-rmse:1.07865	validation_1-rmse:1.08524
[2]	validation_0-rmse:1.03156	validation_1-rmse:1.03769
[3]	validation_0-rmse:0.99884	validation_1-rmse:1.00491
[4]	validation_0-rmse:0.97795	validation_1-rmse:0.98363
[5]	validation_0-rmse:0.96262	validation_1-rmse:0.96772
[6]	validation_0-rmse:0.94890	validation_1-rmse:0.95386
[7]	validation_0-rmse:0.94050	validation_1-rmse:0.94473
[8]	validation_0-rmse:0.92899	validation_1-rmse:0.93291
[9]	validation_0-rmse:0.92248	validation_1-rmse:0.92661
[10]	validation_0-rmse:0.91963	validation_1-rmse:0.92356
[11]	validation_0-rmse:0.91406	validation_1-rmse:0.91707
[12]	validation_0-rmse:0.90986	validation_1-rmse:0.91310
[13]	validation_0-rmse:0.90664	validation_1-rmse:0.90990
[14]	validation_0-rmse:0.90497	validation_1-rmse:0.90822
[15]	validation_0-rmse:0.90143	validation_1-rmse:0.90490
[16]	validation_0-rmse:0.89898	validation_1-rmse:0.90255
[17]	validation_0-rmse:0.89305	validation

In [62]:
model.save_model("model_sklearn.json")

In [64]:
model2 = xgb.XGBRegressor()
model2.load_model("model_sklearn.json")

In [75]:
list(X_test.head(1).values)

[array(['W', 'Riyadh', 2.56, 2023, 5, 24, 2], dtype=object)]

In [21]:
df_pred = pd.DataFrame()
df_pred['preds'] = np.floor(xgb_preds)
df_pred['preds'] = df_pred['preds'].replace({-1:0})
df_pred['true'] = y_test.values
df_pred['err'] = np.absolute(df_pred['true'] - df_pred['preds'])
for i in X_test.columns: 
    df_pred[i] = X_test[i].values
df_pred.true = df_pred.true.astype(str)
df_pred[['true','preds']].groupby('true').agg(mean=('preds',np.mean),
                                              stdv=('preds',np.std),
                                              min=('preds',np.min),
                                              max=('preds',np.max),
                                             )

Unnamed: 0_level_0,mean,stdv,min,max
true,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.128866,0.348362,0.0,3.0
1,0.740577,0.545264,0.0,4.0
2,1.402866,0.702052,0.0,4.0
3,1.902292,0.861151,0.0,5.0
4,2.333587,1.077441,0.0,5.0
5,2.581921,1.241717,0.0,5.0
6,2.772549,1.390077,0.0,6.0
7,3.029412,1.607771,0.0,6.0


In [29]:
from sklearn.utils import resample

time_4 = df_fast[(df_fast['Delivery Duration'] == 4)]
time_5 = df_fast[(df_fast['Delivery Duration'] == 5)]
time_6 = df_fast[(df_fast['Delivery Duration'] == 6)]
time_7 = df_fast[(df_fast['Delivery Duration'] == 7)]


    
upsampled_time_4 = resample(time_4,
                             replace=True,
                             n_samples=7000)


upsampled_time_5 = resample(time_5,
                             replace=True,
                             n_samples=5000)


upsampled_time_6 = resample(time_6,
                             replace=True,
                             n_samples=4000)


upsampled_time_7 = resample(time_7,
                             replace=True,
                             n_samples=3000)

In [30]:
new_df_fast = pd.concat([upsampled_time_4,upsampled_time_5,upsampled_time_6,upsampled_time_7,
                         df_fast[df_fast['Delivery Duration'] <=3 ]],axis=0)
new_df_fast['Delivery Duration'].value_counts()

Delivery Duration
1    58897
0    40770
2    38303
3    16637
4     7000
5     5000
6     4000
7     3000
Name: count, dtype: int64

In [31]:
y_f = new_df_fast['Delivery Duration']
X_f = new_df_fast.drop('Delivery Duration', axis=1)
X_train, X_test, y_train, y_test = train_test_split(
X_f, y_f, test_size=0.2, random_state=42)
xgb_model = XGBRegressor(enable_categorical=True)
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)
rmse = mean_squared_error(y_test, xgb_preds, squared=False)
print(f"RMSE: {rmse}")


RMSE: 0.9831535289501847


In [37]:
df_pred = pd.DataFrame()
df_pred['preds'] = np.floor(xgb_preds)
df_pred['preds'] = df_pred['preds'].replace({-1:0})
df_pred['true'] = y_test.values
df_pred['err'] = np.absolute(df_pred['true'] - df_pred['preds'])
for i in X_test.columns: 
    df_pred[i] = X_test[i].values
df_pred.true = df_pred.true.astype(str)
df_pred[['true','err']].groupby('true').agg(mean=('err',np.mean),
                                             stdv=('err',np.std),
                                              min=('err',np.min),
                                              max=('err',np.max),
                                             )

Unnamed: 0_level_0,mean,stdv,min,max
true,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.1756,0.418454,0.0,3.0
1,0.400051,0.511277,0.0,4.0
2,0.658175,0.651848,0.0,3.0
3,1.052427,0.831257,0.0,4.0
4,1.462191,1.046219,0.0,4.0
5,2.025559,1.27272,0.0,5.0
6,2.582697,1.518936,0.0,6.0
7,2.930743,1.68308,0.0,7.0


In [80]:
y_f = df_slow['Delivery Duration']
X_f = df_slow.drop('Delivery Duration', axis=1)
X_train, X_test, y_train, y_test = train_test_split(
X_f, y_f, test_size=0.2, random_state=42)
xgb_model = XGBRegressor(enable_categorical=True)
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)
rmse = mean_squared_error(y_test, xgb_preds, squared=False)
print(f"RMSE: {rmse}")

RMSE: 5.245974554480871


In [81]:
df_pred = pd.DataFrame()
df_pred['preds'] = np.floor(xgb_preds)
df_pred['preds'] = df_pred['preds'].replace({-1:0})
df_pred['true'] = y_test.values
df_pred['err'] = np.absolute(df_pred['true'] - df_pred['preds'])
for i in X_test.columns: 
    df_pred[i] = X_test[i].values
df_pred.true = df_pred.true.astype(str)
df_pred[['true','err']].groupby('true').agg(mean=('err',np.mean),
                                             stdv=('err',np.std),
                                              min=('err',np.min),
                                              max=('err',np.max),
                                             )

Unnamed: 0_level_0,mean,stdv,min,max
true,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10,1.5,1.732051,0.0,6.0
11,2.228571,2.015903,0.0,11.0
12,2.117647,1.317306,0.0,5.0
13,4.5,2.081666,2.0,7.0
14,4.666667,2.503331,0.0,7.0
15,3.6,2.073644,1.0,6.0
16,4.0,1.870829,1.0,6.0
17,4.25,3.304038,0.0,8.0
18,6.333333,5.507571,1.0,12.0
19,10.666667,1.527525,9.0,12.0


In [82]:
y_f = df['Delivery Duration']
X_f = df.drop('Delivery Duration', axis=1)
X_train, X_test, y_train, y_test = train_test_split(
X_f, y_f, test_size=0.2, random_state=42)
xgb_model = XGBRegressor(enable_categorical=True)
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)
rmse = mean_squared_error(y_test, xgb_preds, squared=False)
print(f"RMSE: {rmse}")

RMSE: 1.1542509919542023


In [84]:
df_pred = pd.DataFrame()
df_pred['preds'] = np.floor(xgb_preds)
df_pred['preds'] = df_pred['preds'].replace({-1:0})
df_pred['true'] = y_test.values
df_pred['err'] = np.absolute(df_pred['true'] - df_pred['preds'])
for i in X_test.columns: 
    df_pred[i] = X_test[i].values
df_pred.true = df_pred.true.astype(str)
df_pred[['true','err']].groupby('true').agg(mean=('err',np.mean),
                                             stdv=('err',np.std),
                                              min=('err',np.min),
                                              max=('err',np.max),
                                             ).sort_values('mean')

Unnamed: 0_level_0,mean,stdv,min,max
true,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.14562,0.391323,0.0,6.0
1,0.408663,0.578045,0.0,25.0
2,0.676659,0.665416,0.0,11.0
3,1.102222,0.832386,0.0,5.0
4,1.633962,1.097054,0.0,4.0
5,2.294224,1.298936,0.0,5.0
6,2.935484,1.496241,0.0,6.0
7,3.562092,1.935914,0.0,7.0
8,3.929412,2.487053,0.0,8.0
10,4.8,3.220623,1.0,9.0


In [88]:
df['Delivery Duration'].value_counts()

Delivery Duration
1     58897
0     40770
2     38303
3     16637
4      6482
5      2631
6      1308
7       790
8       408
9       223
10      176
11      122
12       97
13       55
14       26
15       21
18       19
16       17
17       17
19       10
21        9
24        6
23        5
20        5
35        3
33        2
29        2
22        2
41        1
27        1
80        1
77        1
25        1
39        1
36        1
57        1
28        1
50        1
26        1
56        1
45        1
Name: count, dtype: int64

In [95]:
from sklearn.utils import resample

df_slow = df[(df['Delivery Duration'] > 7)]


ls_df = []
for i in range(7,81):
    if i in list(df['Delivery Duration'].value_counts().index):
        duration_df = df[df['Delivery Duration'] == i]
        upsampled_data = resample(duration_df,
                      replace=True,
                  n_samples= int(1.1*len(duration_df) + 200))
        ls_df.append(upsampled_data)

In [98]:
new_df = pd.concat([*ls_df, df[df['Delivery Duration'] <= 7]],axis=0)
# new_df
# new_df['Delivery Duration'].value_counts()

In [99]:
y_f = new_df['Delivery Duration']
X_f = new_df.drop('Delivery Duration', axis=1)
X_train, X_test, y_train, y_test = train_test_split(
X_f, y_f, test_size=0.2, random_state=42)
xgb_model = XGBRegressor(enable_categorical=True)
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)
rmse = mean_squared_error(y_test, xgb_preds, squared=False)
print(f"RMSE: {rmse}")

RMSE: 2.082489687599943


In [100]:
df_pred = pd.DataFrame()
df_pred['preds'] = np.floor(xgb_preds)
df_pred['preds'] = df_pred['preds'].replace({-1:0})
df_pred['true'] = y_test.values
df_pred['err'] = np.absolute(df_pred['true'] - df_pred['preds'])
for i in X_test.columns: 
    df_pred[i] = X_test[i].values
df_pred.true = df_pred.true.astype(str)
df_pred[['true','err']].groupby('true').agg(mean=('err',np.mean),
                                             stdv=('err',np.std),
                                              min=('err',np.min),
                                              max=('err',np.max),
                                             ).sort_values('mean')

Unnamed: 0_level_0,mean,stdv,min,max
true,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.312722,1.354201,0.0,48.0
1,0.638692,1.402161,0.0,68.0
2,0.904535,1.38423,0.0,46.0
57,1.0,0.0,1.0,1.0
56,1.0,0.0,1.0,1.0
39,1.0,0.0,1.0,1.0
80,1.0,0.0,1.0,1.0
26,1.0,0.0,1.0,1.0
25,1.0,0.0,1.0,1.0
27,1.0,0.0,1.0,1.0


In [6]:
y = df['Delivery Duration']
X = df.drop('Delivery Duration', axis=1)
xgb_model = XGBRegressor(enable_categorical=True)
xgb_model.fit(X, y)
xgb_preds = xgb_model.predict(X)
rmse = mean_squared_error(y, xgb_preds, squared=False)
print(f"RMSE: {rmse}")

RMSE: 1.0914957936758252


In [12]:
df_pred = pd.DataFrame()
df_pred['preds'] = np.floor(xgb_preds)
df_pred['preds'] = df_pred['preds'].replace({-1:0})
df_pred['true'] = y.values
df_pred['err'] = np.absolute(df_pred['true'] - df_pred['preds'])
for i in X.columns: 
    df_pred[i] = X[i].values
df_pred.true = df_pred.true.astype(str)
df_pred[['true','err']].groupby('true').agg(mean=('err',np.mean),
                                             stdv=('err',np.std),
                                              min=('err',np.min),
                                              max=('err',np.max),
                                             ).sort_values('mean').to_csv('error.csv')

In [11]:
xgb_model.save_model("reg_model.json")