In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
from itertools import product
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import numpy as np
from sklearn.metrics import mean_squared_error , r2_score
from math import sqrt
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor, XGBClassifier
import xgboost as xgb
from sklearn.pipeline import Pipeline

In [2]:
# Read the raw shipment records from disk and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='data.csv')
df.head(5)

Unnamed: 0,Carrier,Item Quantity,City,Country,Status,Weight,Shipping Date,Delivery Date,Delivery Duration
0,A,2.0,Jeddah,SA,Delivered,0.0625,2021-09-30,2021-10-02,2
1,A,2.0,Madinah,SA,Delivered,0.345,2021-09-30,2021-10-03,3
2,A,3.0,Makkah,SA,Delivered,2.1,2021-09-30,2021-10-03,3
3,B,1.0,Riyadh,SA,Delivered,0.7,2021-09-30,2021-10-03,3
4,B,2.0,Riyadh,SA,Delivered,1.4,2021-09-30,2021-10-02,2


In [3]:
# Store the carrier and the destination city as pandas categoricals.
for col in ('Carrier', 'City'):
    df[col] = df[col].astype("category")

# Parse the shipping date, then derive categorical calendar features from it.
df['Shipping Date'] = pd.to_datetime(df['Shipping Date'])
ship_dt = df['Shipping Date'].dt
calendar_features = (
    ('Year', 'year'),
    ('Month', 'month'),
    ('Day', 'day'),
    ('DayOfWeek', 'dayofweek'),
)
for feature, attr in calendar_features:
    df[feature] = getattr(ship_dt, attr).astype("category")

In [4]:
# Remove the columns that are not used as model inputs. 'Shipping Date' has
# already been expanded into Year/Month/Day/DayOfWeek in the previous cell.
# The frame is rebound instead of mutated with inplace=True, and
# errors="ignore" skips columns that are already gone, so re-running this cell
# is a no-op rather than a KeyError.
df = df.drop(
    columns=['Country', 'Status', 'Delivery Date', 'Item Quantity', 'Shipping Date'],
    errors="ignore",
)
df.head()

Unnamed: 0,Carrier,City,Weight,Delivery Duration,Year,Month,Day,DayOfWeek
0,A,Jeddah,0.0625,2,2021,9,30,3
1,A,Madinah,0.345,3,2021,9,30,3
2,A,Makkah,2.1,3,2021,9,30,3
3,B,Riyadh,0.7,3,2021,9,30,3
4,B,Riyadh,1.4,2,2021,9,30,3


In [5]:
# Separate the regression target from the feature matrix.
target_col = 'Delivery Duration'
X = df.drop(columns=[target_col])
y = df[target_col]

In [6]:

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [7]:
# Baseline regressor: fit on the training split and score on the held-out split.
# enable_categorical=True is passed because X carries pandas `category` columns.
xgb_model = XGBRegressor(enable_categorical=True)
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)
# Take the square root explicitly: the `squared=False` flag of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in later
# releases, whereas sqrt(MSE) works on every version.
rmse = sqrt(mean_squared_error(y_test, xgb_preds))
print(f"RMSE: {rmse}")

RMSE: 1.1542509919542023


In [9]:
# Summarise the floored hold-out predictions for each true delivery duration.
df_pred = pd.DataFrame()
# Durations cannot be negative. Clip every negative floored prediction to 0;
# the previous `.replace({-1: 0})` only caught a floored value of exactly -1
# and let anything lower through.
df_pred['preds'] = pd.Series(np.floor(xgb_preds)).clip(lower=0)
# .values discards the shuffled index of y_test so rows pair up positionally
# with the prediction array.
df_pred['true'] = y_test.values

# Group on the numeric target directly (no str -> int round-trip needed) and
# use string aggregator names: passing np.mean/np.std/... to `agg` is
# deprecated in recent pandas. 'std' is the sample standard deviation, which is
# what pandas computed for np.std inside a groupby aggregation as well.
agg = df_pred.groupby('true').agg(
    mean=('preds', 'mean'),
    stdv=('preds', 'std'),
    min=('preds', 'min'),
    max=('preds', 'max'),
).reset_index()
agg.sort_values('true')

Unnamed: 0,true,mean,stdv,min,max
0,0,0.14562,0.391323,0.0,6.0
1,1,0.766307,0.668236,0.0,26.0
12,2,1.46682,0.785104,0.0,13.0
16,3,1.999111,0.951864,0.0,8.0
18,4,2.44,1.199975,0.0,8.0
19,5,2.745487,1.366821,0.0,8.0
21,6,3.177419,1.700433,0.0,9.0
22,7,3.490196,2.029771,0.0,9.0
23,8,4.070588,2.487053,0.0,8.0
24,9,3.28,2.449823,0.0,10.0


In [10]:
# Uncomment and run this cell if you want to perform hyperparameter tuning.
# NOTE: the grid search takes a long time to run.


# from sklearn.model_selection import GridSearchCV

# def hyperParameterTuning(X_train, y_train):
#     param_tuning = {
#         'learning_rate': [0.01, 0.1],
#         'max_depth': [ 5, 7, 10],
#         'min_child_weight': [ 3, 5],
#         'subsample': [0.5, 0.7],
#         'colsample_bytree': [0.5, 0.7],
#         'n_estimators' : [200, 500],
#     }
#     xgb_model = XGBRegressor(enable_categorical=True)

#     gsearch = GridSearchCV(estimator = xgb_model,
#                            param_grid = param_tuning,
#                            cv = 4,
#                            n_jobs = -1,
#                            )
#     gsearch.fit(X_train,y_train)
#     return gsearch.best_params_



In [19]:
# Refit on the complete dataset. This rebinds `xgb_model` and `xgb_preds`, so
# the hold-out model and predictions from the earlier cell are replaced.
xgb_model = XGBRegressor(enable_categorical=True)
xgb_model.fit(X, y)
xgb_preds = xgb_model.predict(X)
# NOTE: this is an in-sample (training) error, because the model is scored on
# the same rows it was fitted on. It is not comparable to the hold-out RMSE.
# sqrt(MSE) replaces the `squared=False` flag, which was deprecated in
# scikit-learn 1.4 and removed in later releases.
rmse = sqrt(mean_squared_error(y, xgb_preds))
print(f"RMSE: {rmse}")

RMSE: 1.0914957936758252


In [36]:
# Absolute error of the floored predictions, summarised per true delivery duration.
df_pred = pd.DataFrame()
# Durations cannot be negative. Clip every negative floored prediction to 0;
# the previous `.replace({-1: 0})` only caught a floored value of exactly -1.
df_pred['preds'] = pd.Series(np.floor(xgb_preds)).clip(lower=0)
# .values pairs the targets with the predictions by position.
df_pred['true'] = y.values

df_pred['error'] = (df_pred['preds'] - df_pred['true']).abs()

# The group labels are kept as strings, as before, so the resulting index of
# the summary table is unchanged.
df_pred['true'] = df_pred['true'].astype(str)

# String aggregator names replace np.mean/np.std/np.min/np.max/len: passing
# NumPy callables to `agg` is deprecated in recent pandas. 'std' is the sample
# standard deviation and 'size' is the number of rows in each group.
agg = df_pred.groupby('true').agg(
    mean=('error', 'mean'),
    stdv=('error', 'std'),
    min=('error', 'min'),
    max=('error', 'max'),
    count=('error', 'size'),
).reset_index()

# Order the classes from smallest to largest mean error. dropna() removes rows
# with a missing statistic (e.g. the std of a single-row group).
agg.sort_values('mean').set_index('true').dropna()

Unnamed: 0_level_0,mean,stdv,min,max,count
true,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.133014,0.36,0.0,6.0,40770
1,0.367863,0.492195,0.0,8.0,58897
2,0.63371,0.634149,0.0,6.0,38303
3,1.063834,0.817583,0.0,5.0,16637
4,1.57652,1.042867,0.0,10.0,6482
5,2.247054,1.26362,0.0,5.0,2631
6,2.847859,1.481,0.0,6.0,1308
7,3.331646,1.94249,0.0,7.0,790
8,3.919118,2.30283,0.0,8.0,408
10,4.914773,3.085192,1.0,10.0,176


In [12]:
# Persist whichever model `xgb_model` currently refers to as a JSON file.
xgb_model.save_model(fname="reg_model.json")