In [343]:
import pandas as pd
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [344]:
# from lazypredict.Supervised import LazyClassifier
import pickle
import joblib as jb

In [345]:
# Load the raw dataset from a CSV file expected in the current working directory.
train_data=pd.read_csv("supply_chain_data.csv")

In [346]:
# Subset of columns to preview. The previous leading `train_data.head(10)` was
# removed: a notebook cell only displays its final expression, so that call
# computed a frame and silently discarded it.
col=['SKU','Availability','Costs','Customer demographics','Defect rates','Inspection results','Lead time',
     'Lead times','Manufacturing lead time','Number of products sold','Order quantities','Price','Product type','Production volumes']
# .loc slices by label and includes the end label, so this selects index labels 0 through 10.
train_data.loc[:10,col]

Unnamed: 0,SKU,Availability,Costs,Customer demographics,Defect rates,Inspection results,Lead time,Lead times,Manufacturing lead time,Number of products sold,Order quantities,Price,Product type,Production volumes
0,SKU0,55,187.752075,Non-binary,0.22641,Pending,29,7,29,802,96,69.808006,haircare,215
1,SKU1,95,503.065579,Female,4.854068,Pending,23,30,30,736,37,14.843523,skincare,517
2,SKU2,34,141.920282,Unknown,4.580593,Pending,12,10,27,8,88,11.319683,haircare,971
3,SKU3,68,254.776159,Non-binary,4.746649,Fail,24,13,18,83,59,61.163343,skincare,937
4,SKU4,26,923.440632,Non-binary,3.14558,Fail,5,3,3,871,56,4.805496,skincare,414
5,SKU5,87,235.461237,Non-binary,2.779194,Fail,10,27,17,147,66,1.699976,haircare,104
6,SKU6,48,134.369097,Male,1.000911,Pending,14,15,24,65,58,4.078333,skincare,314
7,SKU7,59,802.056312,Female,0.398177,Fail,22,17,1,426,11,42.958384,cosmetics,564
8,SKU8,78,505.557134,Female,2.709863,Pending,13,10,8,150,15,68.717597,cosmetics,769
9,SKU9,35,995.929461,Unknown,3.844614,Pending,29,27,23,980,83,64.015733,skincare,963


In [347]:
# Per-column count of missing values.
train_data.isnull().sum()

Product type               0
SKU                        0
Price                      0
Availability               0
Number of products sold    0
Revenue generated          0
Customer demographics      0
Stock levels               0
Lead times                 0
Order quantities           0
Shipping times             0
Shipping carriers          0
Shipping costs             0
Supplier name              0
Location                   0
Lead time                  0
Production volumes         0
Manufacturing lead time    0
Manufacturing costs        0
Inspection results         0
Defect rates               0
Transportation modes       0
Routes                     0
Costs                      0
dtype: int64

In [348]:
# Print column names, non-null counts and dtypes to see which columns are non-numeric.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Product type             100 non-null    object 
 1   SKU                      100 non-null    object 
 2   Price                    100 non-null    float64
 3   Availability             100 non-null    int64  
 4   Number of products sold  100 non-null    int64  
 5   Revenue generated        100 non-null    float64
 6   Customer demographics    100 non-null    object 
 7   Stock levels             100 non-null    int64  
 8   Lead times               100 non-null    int64  
 9   Order quantities         100 non-null    int64  
 10  Shipping times           100 non-null    int64  
 11  Shipping carriers        100 non-null    object 
 12  Shipping costs           100 non-null    float64
 13  Supplier name            100 non-null    object 
 14  Location                 10

In [349]:
# Single encoder instance shared by object_to_int below.
# NOTE(review): object_to_int calls fit_transform on this same instance for every
# column, so after encoding a whole frame it only holds the mapping of the last
# column it was fit on; earlier columns cannot be inverse-transformed from it.
labelencoder=LabelEncoder()
def object_to_int(data):
    """Label-encode a column whose dtype is ``object``; pass anything else through.

    Parameters
    ----------
    data
        A single column exposing a ``dtypes`` attribute (e.g. the Series
        handed over by ``DataFrame.apply``).

    Returns
    -------
    The result of ``labelencoder.fit_transform(data)`` for an object-dtype
    column, otherwise ``data`` itself, unchanged.
    """
    is_object_column = data.dtypes=='object'
    # Guard clause: columns that are not object dtype need no encoding.
    if not is_object_column:
        return data
    return labelencoder.fit_transform(data)

In [369]:
# Run every column through object_to_int: object columns become integer codes,
# the remaining columns are returned as they are. The result overwrites train_data.
train_data=train_data.apply(object_to_int)
# Same preview columns as before, now showing the encoded values.
col=[
    'SKU',
    'Availability',
    'Costs',
    'Customer demographics',
    'Defect rates',
    'Inspection results',
    'Lead time',
    'Lead times',
    'Manufacturing lead time',
    'Number of products sold',
    'Order quantities',
    'Price',
    'Product type',
    'Production volumes',
]
train_data.loc[:10,col]

Unnamed: 0,SKU,Availability,Costs,Customer demographics,Defect rates,Inspection results,Lead time,Lead times,Manufacturing lead time,Number of products sold,Order quantities,Price,Product type,Production volumes
0,0,55,187.752075,2,0.22641,2,29,7,29,802,96,69.808006,1,215
1,1,95,503.065579,0,4.854068,2,23,30,30,736,37,14.843523,2,517
2,12,34,141.920282,3,4.580593,2,12,10,27,8,88,11.319683,1,971
3,23,68,254.776159,2,4.746649,0,24,13,18,83,59,61.163343,2,937
4,34,26,923.440632,2,3.14558,0,5,3,3,871,56,4.805496,2,414
5,45,87,235.461237,2,2.779194,0,10,27,17,147,66,1.699976,1,104
6,56,48,134.369097,1,1.000911,2,14,15,24,65,58,4.078333,2,314
7,67,59,802.056312,0,0.398177,0,22,17,1,426,11,42.958384,0,564
8,78,78,505.557134,0,2.709863,2,13,10,8,150,15,68.717597,0,769
9,89,35,995.929461,3,3.844614,2,29,27,23,980,83,64.015733,2,963


In [370]:
# Number of distinct values present in the 'Product type' column.
num_unique_values = train_data['Product type'].nunique()

print("Number of different outputs in the column:", num_unique_values)

Number of different outputs in the column: 3


In [351]:
# Binary target: 1 when 'Stock levels' minus 'Order quantities' is negative
# (stock does not cover the order), otherwise 0.
# Computed with vectorised column arithmetic instead of a Python-level
# iterrows() loop; for the integer-valued columns shown by train_data.info()
# this yields the same 0/1 values.
# NOTE(review): the target is a direct function of two columns of this same
# frame, so any model that keeps both as features can reconstruct it — confirm
# this is intended.
stock_minus_orders = train_data['Stock levels'] - train_data['Order quantities']
train_data['Customer_demand'] = (stock_minus_orders < 0).astype(int)
train_data

Unnamed: 0,Product type,SKU,Price,Availability,Number of products sold,Revenue generated,Customer demographics,Stock levels,Lead times,Order quantities,...,Lead time,Production volumes,Manufacturing lead time,Manufacturing costs,Inspection results,Defect rates,Transportation modes,Routes,Costs,Customer_demand
0,1,0,69.808006,55,802,8661.996792,2,58,7,96,...,29,215,29,46.279879,2,0.226410,2,1,187.752075,1
1,2,1,14.843523,95,736,7460.900065,0,53,30,37,...,23,517,30,33.616769,2,4.854068,2,1,503.065579,0
2,1,12,11.319683,34,8,9577.749626,3,1,10,88,...,12,971,27,30.688019,2,4.580593,0,2,141.920282,1
3,2,23,61.163343,68,83,7766.836426,2,23,13,59,...,24,937,18,35.624741,0,4.746649,1,0,254.776159,1
4,2,34,4.805496,26,871,2686.505152,2,5,3,56,...,5,414,3,92.065161,0,3.145580,0,0,923.440632,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1,95,77.903927,65,672,7386.363944,3,15,14,26,...,18,450,26,58.890686,2,1.210882,0,0,778.864241,1
96,0,96,24.423131,29,324,7698.424766,2,67,2,32,...,28,648,28,17.803756,2,3.872048,2,0,188.742141,0
97,1,97,3.526111,56,62,4370.916580,1,46,19,4,...,10,535,13,65.765156,0,3.376238,2,0,540.132423,0
98,2,98,19.754605,43,913,8525.952560,0,53,1,27,...,28,581,9,5.604691,2,2.908122,1,0,882.198864,0


In [352]:
# Target vector: the derived 0/1 'Customer_demand' column.
y=train_data['Customer_demand']
# Feature matrix: every other column of train_data.
X=train_data.drop(columns=['Customer_demand'])

In [353]:
# Keep only the first 70 rows of the features and of the target; both names
# are rebound, so later cells see the 70-row versions.
X, y = X.iloc[:70], y.iloc[:70]

In [354]:
# Chi-squared test of each feature column against the target. The result is
# indexed below with [1] to obtain the per-feature p-values.
# NOTE(review): sklearn's chi2 is defined for non-negative features such as
# counts or booleans; X here also contains continuous and label-encoded
# columns, so treat these p-values as a rough ranking only — confirm the
# choice of test.
f_p_value=chi2(X,y)

In [355]:
# Label each p-value with its feature name, then list them alphabetically by feature.
p_value=pd.Series(f_p_value[1], index=X.columns)
p_value.sort_index()

Availability                1.284822e-03
Costs                       4.018163e-07
Customer demographics       6.846693e-01
Defect rates                9.726777e-01
Inspection results          3.953924e-01
Lead time                   7.404922e-02
Lead times                  6.371322e-01
Location                    5.247325e-01
Manufacturing costs         1.168911e-01
Manufacturing lead time     2.933497e-01
Number of products sold     1.138683e-16
Order quantities            2.730577e-69
Price                       6.137958e-03
Product type                5.236762e-01
Production volumes          6.028791e-28
Revenue generated           0.000000e+00
Routes                      8.022176e-01
SKU                         7.075475e-01
Shipping carriers           3.272508e-01
Shipping costs              1.930272e-01
Shipping times              2.559881e-01
Stock levels               3.867851e-192
Supplier name               8.250314e-02
Transportation modes        6.903763e-01
dtype: float64

In [356]:
# Remove the shipping/logistics columns from the feature matrix.
# Rebinding X instead of using inplace=True avoids mutating a frame that was
# derived from another frame via head() (which can emit SettingWithCopyWarning)
# and keeps the cell free of hidden in-place state.
X = X.drop(columns=['Routes','Shipping carriers','Shipping costs','Shipping times','Supplier name','Transportation modes'])

In [357]:
# Remove three more columns from the feature matrix.
# Rebinding X instead of using inplace=True avoids mutating a frame that was
# derived from another frame via head() (which can emit SettingWithCopyWarning)
# and keeps the cell free of hidden in-place state.
X = X.drop(columns=['Location','Revenue generated','Manufacturing costs'])

In [358]:
# Last 30 rows of the current feature matrix, used further down for the
# prediction table.
# NOTE(review): X was already cut to its first 70 rows (X = X.head(70)), so
# these are rows 40-69 of the original data, and the same X is passed to
# train_test_split below. test_data therefore overlaps the rows the model is
# trained on and is not an independent hold-out set. A fix has to change the
# head(70) cell, this cell and the y.tail(30) comparison together.
test_data = X.tail(30)

In [359]:
# 70/30 train/test split. A fixed random_state makes the shuffle, and with it
# the accuracy figures printed below, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [360]:
# clf = LazyClassifier()
# model, predict = clf.fit(X_train, X_test, y_train, y_test)
# model

In [361]:
# Gradient-boosted tree classifier with library-default hyperparameters,
# fitted on the training split.
# NOTE(review): the variable name `xgb` is the conventional import alias of the
# xgboost package; a name such as `model` would avoid confusion.
xgb = XGBClassifier()
xgb.fit(X_train, y_train)


In [362]:
# Score of the fitted classifier on the rows it was trained on.
train_accuracy = xgb.score(X_train, y_train)
print("Training Accuracy: ", train_accuracy)

Training Accuracy:  1.0


In [363]:
# Accuracy of the classifier's predictions on the held-out split from train_test_split.
test_predictions = xgb.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
print("Testing Accuracy: ", test_accuracy)

Testing Accuracy:  0.9523809523809523


In [364]:
# Predict a class for every row of test_data.
y_pred=xgb.predict(test_data)
# Start the report table from the SKU column (these are the encoded SKU values),
# keeping test_data's index.
result = test_data['SKU'].to_frame()

In [365]:
# Predicted class for each row of the report table.
result["Customer_demand"] = y_pred
# True labels for comparison; this is a Series, so it is aligned to `result`
# by index label rather than by position.
result["Actual Output:"]= y.tail(30)

result

Unnamed: 0,SKU,Customer_demand,Actual Output:
40,35,0,0
41,36,0,0
42,37,1,1
43,38,1,1
44,39,1,1
45,40,0,0
46,41,0,0
47,42,1,1
48,43,0,0
49,44,0,0


In [366]:
# Persist the fitted classifier to disk. The module is imported as `jb`
# (`import joblib as jb`), so the bare name `joblib` is undefined on a fresh
# kernel and the original call raised NameError under Restart & Run All.
jb.dump(xgb,'model.pkl')



['model.pkl']