In [37]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split,cross_val_score,RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

import pickle


DATA LOADING AND UNDERSTANDING

In [38]:
df= pd.read_csv('data/DataCoSupplyChainDataset.csv', encoding='latin1')

In [39]:
df.sample(4)

Unnamed: 0,Type,Days for shipping (real),Days for shipment (scheduled),Benefit per order,Sales per customer,Delivery Status,Late_delivery_risk,Category Id,Category Name,Customer City,...,Order Zipcode,Product Card Id,Product Category Id,Product Description,Product Image,Product Name,Product Price,Product Status,shipping date (DateOrders),Shipping Mode
150453,DEBIT,6,4,99.360001,293.950012,Late delivery,1,17,Cleats,Caguas,...,43229.0,365,17,,http://images.acmesports.sports/Perfect+Fitnes...,Perfect Fitness Perfect Rip Deck,59.990002,0,6/12/2016 10:28,Standard Class
19706,PAYMENT,2,1,2.97,118.699997,Late delivery,1,40,Accessories,Woonsocket,...,,897,40,,http://images.acmesports.sports/Team+Golf+New+...,Team Golf New England Patriots Putter Grip,24.99,0,4/2/2017 8:39,First Class
53273,DEBIT,2,2,22.15,195.990005,Shipping on time,0,48,Water Sports,Fontana,...,,1073,48,,http://images.acmesports.sports/Pelican+Sunstr...,Pelican Sunstream 100 Kayak,199.990005,0,8/27/2017 2:19,Second Class
6354,PAYMENT,6,4,-8.3,41.5,Late delivery,1,24,Women's Apparel,Caguas,...,90301.0,502,24,,http://images.acmesports.sports/Nike+Men%27s+D...,Nike Men's Dri-FIT Victory Golf Polo,50.0,0,8/27/2016 10:01,Standard Class


In [40]:
df.shape

(180519, 53)

In [41]:
df.describe()

Unnamed: 0,Days for shipping (real),Days for shipment (scheduled),Benefit per order,Sales per customer,Late_delivery_risk,Category Id,Customer Id,Customer Zipcode,Department Id,Latitude,...,Order Item Quantity,Sales,Order Item Total,Order Profit Per Order,Order Zipcode,Product Card Id,Product Category Id,Product Description,Product Price,Product Status
count,180519.0,180519.0,180519.0,180519.0,180519.0,180519.0,180519.0,180516.0,180519.0,180519.0,...,180519.0,180519.0,180519.0,180519.0,24840.0,180519.0,180519.0,0.0,180519.0,180519.0
mean,3.497654,2.931847,21.974989,183.107609,0.548291,31.851451,6691.379495,35921.126914,5.44346,29.719955,...,2.127638,203.772096,183.107609,21.974989,55426.132327,692.509764,31.851451,,141.23255,0.0
std,1.623722,1.374449,104.433526,120.04367,0.497664,15.640064,4162.918106,37542.461122,1.629246,9.813646,...,1.453451,132.273077,120.04367,104.433526,31919.279101,336.446807,15.640064,,139.732492,0.0
min,0.0,0.0,-4274.97998,7.49,0.0,2.0,1.0,603.0,2.0,-33.937553,...,1.0,9.99,7.49,-4274.97998,1040.0,19.0,2.0,,9.99,0.0
25%,2.0,2.0,7.0,104.379997,0.0,18.0,3258.5,725.0,4.0,18.265432,...,1.0,119.980003,104.379997,7.0,23464.0,403.0,18.0,,50.0,0.0
50%,3.0,4.0,31.52,163.990005,1.0,29.0,6457.0,19380.0,5.0,33.144863,...,1.0,199.919998,163.990005,31.52,59405.0,627.0,29.0,,59.990002,0.0
75%,5.0,4.0,64.800003,247.399994,1.0,45.0,9779.0,78207.0,7.0,39.279617,...,3.0,299.950012,247.399994,64.800003,90008.0,1004.0,45.0,,199.990005,0.0
max,6.0,4.0,911.799988,1939.98999,1.0,76.0,20757.0,99205.0,12.0,48.781933,...,5.0,1999.98999,1939.98999,911.799988,99301.0,1363.0,76.0,,1999.98999,0.0


In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180519 entries, 0 to 180518
Data columns (total 53 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   Type                           180519 non-null  object 
 1   Days for shipping (real)       180519 non-null  int64  
 2   Days for shipment (scheduled)  180519 non-null  int64  
 3   Benefit per order              180519 non-null  float64
 4   Sales per customer             180519 non-null  float64
 5   Delivery Status                180519 non-null  object 
 6   Late_delivery_risk             180519 non-null  int64  
 7   Category Id                    180519 non-null  int64  
 8   Category Name                  180519 non-null  object 
 9   Customer City                  180519 non-null  object 
 10  Customer Country               180519 non-null  object 
 11  Customer Email                 180519 non-null  object 
 12  Customer Fname                

In [43]:
# convert sales per customer into integer
df['Sales per customer']=df['Sales per customer'].astype(int)

In [44]:
df.head(2)

Unnamed: 0,Type,Days for shipping (real),Days for shipment (scheduled),Benefit per order,Sales per customer,Delivery Status,Late_delivery_risk,Category Id,Category Name,Customer City,...,Order Zipcode,Product Card Id,Product Category Id,Product Description,Product Image,Product Name,Product Price,Product Status,shipping date (DateOrders),Shipping Mode
0,DEBIT,3,4,91.25,314,Advance shipping,0,73,Sporting Goods,Caguas,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,2/3/2018 22:56,Standard Class
1,TRANSFER,5,4,-249.089996,311,Late delivery,1,73,Sporting Goods,Caguas,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/18/2018 12:27,Standard Class


In [45]:
df.describe

<bound method NDFrame.describe of             Type  Days for shipping (real)  Days for shipment (scheduled)  \
0          DEBIT                         3                              4   
1       TRANSFER                         5                              4   
2           CASH                         4                              4   
3          DEBIT                         3                              4   
4        PAYMENT                         2                              4   
...          ...                       ...                            ...   
180514      CASH                         4                              4   
180515     DEBIT                         3                              2   
180516  TRANSFER                         5                              4   
180517   PAYMENT                         3                              4   
180518   PAYMENT                         4                              4   

        Benefit per order  Sales per cust

In [46]:
df.columns

Index(['Type', 'Days for shipping (real)', 'Days for shipment (scheduled)',
       'Benefit per order', 'Sales per customer', 'Delivery Status',
       'Late_delivery_risk', 'Category Id', 'Category Name', 'Customer City',
       'Customer Country', 'Customer Email', 'Customer Fname', 'Customer Id',
       'Customer Lname', 'Customer Password', 'Customer Segment',
       'Customer State', 'Customer Street', 'Customer Zipcode',
       'Department Id', 'Department Name', 'Latitude', 'Longitude', 'Market',
       'Order City', 'Order Country', 'Order Customer Id',
       'order date (DateOrders)', 'Order Id', 'Order Item Cardprod Id',
       'Order Item Discount', 'Order Item Discount Rate', 'Order Item Id',
       'Order Item Product Price', 'Order Item Profit Ratio',
       'Order Item Quantity', 'Sales', 'Order Item Total',
       'Order Profit Per Order', 'Order Region', 'Order State', 'Order Status',
       'Order Zipcode', 'Product Card Id', 'Product Category Id',
       'Product De

In [47]:
df.isnull().sum()

Type                                  0
Days for shipping (real)              0
Days for shipment (scheduled)         0
Benefit per order                     0
Sales per customer                    0
Delivery Status                       0
Late_delivery_risk                    0
Category Id                           0
Category Name                         0
Customer City                         0
Customer Country                      0
Customer Email                        0
Customer Fname                        0
Customer Id                           0
Customer Lname                        8
Customer Password                     0
Customer Segment                      0
Customer State                        0
Customer Street                       0
Customer Zipcode                      3
Department Id                         0
Department Name                       0
Latitude                              0
Longitude                             0
Market                                0


In [48]:
# dropping columns
df=df.drop(columns=['Customer Id','Category Id', 'Department Id','Product Category Id','Order Item Cardprod Id','Order Id','Product Card Id'])

In [49]:
df=df.drop(columns=['Customer Email', 'Customer Password','Customer Zipcode','Order Zipcode','Customer State','Customer City','Product Description','Product Image'])

In [50]:
df.shape

(180519, 38)

In [51]:
df.columns

Index(['Type', 'Days for shipping (real)', 'Days for shipment (scheduled)',
       'Benefit per order', 'Sales per customer', 'Delivery Status',
       'Late_delivery_risk', 'Category Name', 'Customer Country',
       'Customer Fname', 'Customer Lname', 'Customer Segment',
       'Customer Street', 'Department Name', 'Latitude', 'Longitude', 'Market',
       'Order City', 'Order Country', 'Order Customer Id',
       'order date (DateOrders)', 'Order Item Discount',
       'Order Item Discount Rate', 'Order Item Id', 'Order Item Product Price',
       'Order Item Profit Ratio', 'Order Item Quantity', 'Sales',
       'Order Item Total', 'Order Profit Per Order', 'Order Region',
       'Order State', 'Order Status', 'Product Name', 'Product Price',
       'Product Status', 'shipping date (DateOrders)', 'Shipping Mode'],
      dtype='object')

In [52]:
df['Customer Name'] = df['Customer Fname'].astype(str) + ' ' + df['Customer Lname'].astype(str)


In [53]:
df= df.drop(columns=['Order City','Order Country','Sales per customer'])

In [54]:
df=df.drop(columns=['Benefit per order','Order Item Profit Ratio','Product Status'])

In [55]:
df['Estimated_Order_Value']=df['Order Item Quantity'].astype(float)* df['Product Price'].astype(float)

In [56]:
df['Shipping_Delay_Days'] = (
    df['Days for shipping (real)'] -
    df['Days for shipment (scheduled)']
)


In [57]:
df.shape

(180519, 35)

target class distribution

In [58]:
high_demand_threshold = df['Estimated_Order_Value'].quantile(0.75)


In [59]:
def define_disruption_risk(row):
    conditions_met = 0

    # Condition 1: High shipping delay
    if row['Shipping_Delay_Days'] > 3:
        conditions_met += 1

    # Condition 2: Late delivery risk
    if row['Late_delivery_risk'] == 1:
        conditions_met += 1

    # Condition 3: High demand pressure
    if row['Estimated_Order_Value'] > high_demand_threshold:
        conditions_met += 1

    # Assign risk level
    if conditions_met >= 2:
        return 2   # High Risk
    elif conditions_met == 1:
        return 1   # Medium Risk
    else:
        return 0   # Low Risk


In [60]:
df['Disruption_Risk'] = df.apply(define_disruption_risk, axis=1)


In [61]:
df.duplicated().sum()

np.int64(0)

In [62]:
df

Unnamed: 0,Type,Days for shipping (real),Days for shipment (scheduled),Delivery Status,Late_delivery_risk,Category Name,Customer Country,Customer Fname,Customer Lname,Customer Segment,...,Order State,Order Status,Product Name,Product Price,shipping date (DateOrders),Shipping Mode,Customer Name,Estimated_Order_Value,Shipping_Delay_Days,Disruption_Risk
0,DEBIT,3,4,Advance shipping,0,Sporting Goods,Puerto Rico,Cally,Holloway,Consumer,...,Java Occidental,COMPLETE,Smart watch,327.750000,2/3/2018 22:56,Standard Class,Cally Holloway,327.750000,-1,1
1,TRANSFER,5,4,Late delivery,1,Sporting Goods,Puerto Rico,Irene,Luna,Consumer,...,Rajastán,PENDING,Smart watch,327.750000,1/18/2018 12:27,Standard Class,Irene Luna,327.750000,1,2
2,CASH,4,4,Shipping on time,0,Sporting Goods,EE. UU.,Gillian,Maldonado,Consumer,...,Rajastán,CLOSED,Smart watch,327.750000,1/17/2018 12:06,Standard Class,Gillian Maldonado,327.750000,0,1
3,DEBIT,3,4,Advance shipping,0,Sporting Goods,EE. UU.,Tana,Tate,Home Office,...,Queensland,COMPLETE,Smart watch,327.750000,1/16/2018 11:45,Standard Class,Tana Tate,327.750000,-1,1
4,PAYMENT,2,4,Advance shipping,0,Sporting Goods,Puerto Rico,Orli,Hendricks,Corporate,...,Queensland,PENDING_PAYMENT,Smart watch,327.750000,1/15/2018 11:24,Standard Class,Orli Hendricks,327.750000,-2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180514,CASH,4,4,Shipping on time,0,Fishing,EE. UU.,Maria,Peterson,Home Office,...,Shanghái,CLOSED,Field & Stream Sportsman 16 Gun Fire Safe,399.980011,1/20/2016 3:40,Standard Class,Maria Peterson,399.980011,0,1
180515,DEBIT,3,2,Late delivery,1,Fishing,EE. UU.,Ronald,Clark,Corporate,...,Osaka,COMPLETE,Field & Stream Sportsman 16 Gun Fire Safe,399.980011,1/19/2016 1:34,Second Class,Ronald Clark,399.980011,1,2
180516,TRANSFER,5,4,Late delivery,1,Fishing,EE. UU.,John,Smith,Corporate,...,Australia del Sur,PENDING,Field & Stream Sportsman 16 Gun Fire Safe,399.980011,1/20/2016 21:00,Standard Class,John Smith,399.980011,1,2
180517,PAYMENT,3,4,Advance shipping,0,Fishing,Puerto Rico,Mary,Smith,Consumer,...,Australia del Sur,PENDING_PAYMENT,Field & Stream Sportsman 16 Gun Fire Safe,399.980011,1/18/2016 20:18,Standard Class,Mary Smith,399.980011,-1,1


In [63]:
df=df.drop(columns=['Customer Fname','Customer Lname','Late_delivery_risk','Estimated_Order_Value','Shipping_Delay_Days'])

In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180519 entries, 0 to 180518
Data columns (total 31 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   Type                           180519 non-null  object 
 1   Days for shipping (real)       180519 non-null  int64  
 2   Days for shipment (scheduled)  180519 non-null  int64  
 3   Delivery Status                180519 non-null  object 
 4   Category Name                  180519 non-null  object 
 5   Customer Country               180519 non-null  object 
 6   Customer Segment               180519 non-null  object 
 7   Customer Street                180519 non-null  object 
 8   Department Name                180519 non-null  object 
 9   Latitude                       180519 non-null  float64
 10  Longitude                      180519 non-null  float64
 11  Market                         180519 non-null  object 
 12  Order Customer Id             

In [66]:
df.columns

Index(['Type', 'Days for shipping (real)', 'Days for shipment (scheduled)',
       'Delivery Status', 'Category Name', 'Customer Country',
       'Customer Segment', 'Customer Street', 'Department Name', 'Latitude',
       'Longitude', 'Market', 'Order Customer Id', 'order date (DateOrders)',
       'Order Item Discount', 'Order Item Discount Rate', 'Order Item Id',
       'Order Item Product Price', 'Order Item Quantity', 'Sales',
       'Order Item Total', 'Order Profit Per Order', 'Order Region',
       'Order State', 'Order Status', 'Product Name', 'Product Price',
       'shipping date (DateOrders)', 'Shipping Mode', 'Customer Name',
       'Disruption_Risk'],
      dtype='object')

In [67]:
df=df.drop(columns=['Order Item Id','Customer Name'])

In [68]:
print('Shipping_Delay_Days' in df.columns)

False


In [69]:
for col in df.columns:
    if 'shipping' in col.lower() or 'shipment' in col.lower():
        print(col)


Days for shipping (real)
Days for shipment (scheduled)
shipping date (DateOrders)
Shipping Mode


In [70]:
df.columns = df.columns.str.strip()


In [71]:
print(df.columns.tolist())


['Type', 'Days for shipping (real)', 'Days for shipment (scheduled)', 'Delivery Status', 'Category Name', 'Customer Country', 'Customer Segment', 'Customer Street', 'Department Name', 'Latitude', 'Longitude', 'Market', 'Order Customer Id', 'order date (DateOrders)', 'Order Item Discount', 'Order Item Discount Rate', 'Order Item Product Price', 'Order Item Quantity', 'Sales', 'Order Item Total', 'Order Profit Per Order', 'Order Region', 'Order State', 'Order Status', 'Product Name', 'Product Price', 'shipping date (DateOrders)', 'Shipping Mode', 'Disruption_Risk']


In [72]:
df['Shipping_Delay_Days'] = (
    df['Days for shipping (real)'] -
    df['Days for shipment (scheduled)']
)


In [73]:
print('Shipping_Delay_Days' in df.columns)


True


In [74]:
df=df.drop(columns=['Days for shipping (real)','Days for shipment (scheduled)'])

In [75]:
df.columns

Index(['Type', 'Delivery Status', 'Category Name', 'Customer Country',
       'Customer Segment', 'Customer Street', 'Department Name', 'Latitude',
       'Longitude', 'Market', 'Order Customer Id', 'order date (DateOrders)',
       'Order Item Discount', 'Order Item Discount Rate',
       'Order Item Product Price', 'Order Item Quantity', 'Sales',
       'Order Item Total', 'Order Profit Per Order', 'Order Region',
       'Order State', 'Order Status', 'Product Name', 'Product Price',
       'shipping date (DateOrders)', 'Shipping Mode', 'Disruption_Risk',
       'Shipping_Delay_Days'],
      dtype='object')

In [76]:
df.shape

(180519, 28)

In [77]:
df.to_csv('supplier_chain_processing.csv',index=False)