In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder,StandardScaler


In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
train=pd.read_csv("Kaggle_Training_Dataset_v2.csv")
test=pd.read_csv("Kaggle_Test_Dataset_v2.csv")
train.shape,test.shape

((1687861, 23), (242076, 23))

In [4]:

def preprocess(df):
    #train['lead_time']=train['lead_time'].fillna(train['lead_time'].median())
    drop_cols=['forecast_6_month','forecast_9_month','perf_12_month_avg','sales_3_month','sales_6_month','sales_9_month','sku']
    df.drop(columns=drop_cols,inplace=True)
    cat_cols=['potential_issue', 'deck_risk', 'oe_constraint', 'ppap_risk', 'stop_auto_buy', 'rev_stop']
    num_cols=['national_inv', 'lead_time', 'in_transit_qty', 'forecast_3_month', 'sales_1_month', 'min_bank', 'pieces_past_due', 'perf_6_month_avg', 'local_bo_qty']
    num_pipeline=Pipeline(
        steps=[
            ('imputer',SimpleImputer(strategy="median")),
            ('scaler',StandardScaler())
        ])
    potential_issue=['No', 'Yes']
    deck_risk=['No', 'Yes']
    oe_constraint=['No', 'Yes']
    ppap_risk=['No', 'Yes']
    stop_auto_buy=['Yes', 'No']
    rev_stop= ['No', 'Yes']
    cat_pipeline = Pipeline(
        steps=[
            ('imputer', SimpleImputer(strategy="most_frequent")),
            ('labelencoder',OrdinalEncoder(categories=[potential_issue,deck_risk,oe_constraint,ppap_risk,stop_auto_buy,rev_stop]))
        ]
    )
    preprocessor=ColumnTransformer([
        ('num_pipeline',num_pipeline,num_cols),
        ('cat_pipeline',cat_pipeline,cat_cols)
    ])
    new_df=preprocessor.fit_transform(df)
    return pd.DataFrame(new_df,columns=df.columns)


    

In [5]:
train.dropna(subset=['went_on_backorder'],inplace=True)

In [6]:
target='went_on_backorder'

In [7]:
new_train=preprocess(train.drop(columns=target))

In [8]:
y=train[target]

In [9]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,precision_score,accuracy_score,recall_score,roc_auc_score

In [10]:
models={
    'LogisticRegression':LogisticRegression(),
    'Randomforest':RandomForestClassifier(),
    'xgboost':XGBClassifier(),
    'knn':KNeighborsClassifier()
}

In [11]:
def evaluate_model(true,pred,probs):
    accuracy=accuracy_score(true,pred)
    precision=precision_score(true,pred,pos_label='Yes')
    recall=recall_score(true,pred,pos_label='Yes')
    auc=roc_auc_score(true,probs[::,1])
    return accuracy,precision,recall,auc

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
y.isnull().sum()

0

In [14]:
maps={"No":0,"Yes":1}
y.unique()

array(['No', 'Yes'], dtype=object)

In [15]:
y=y.map(maps)
y.unique()

array([0, 1], dtype=int64)

In [16]:
from imblearn.over_sampling import SMOTE

In [17]:
smote=SMOTE(random_state=1)

In [18]:
X_train,X_test,y_train,y_test=train_test_split(new_train,y,test_size=0.2,random_state=1)

In [19]:
X_smote,y_smote=smote.fit_resample(X_train,y_train)

In [20]:
print(y_smote.value_counts())
print(y_train.value_counts())

0    1341276
1    1341276
Name: went_on_backorder, dtype: int64
0    1341276
1       9012
Name: went_on_backorder, dtype: int64


In [30]:
from sklearn.metrics import classification_report

for i in range(len(list(models))):
    model=list(models.values())[i]
    
    model.fit(X_smote,y_smote)
    preds=model.predict(X_test)
    print(list(models.keys())[i])
    print(classification_report(y_test,preds))

LogisticRegression
              precision    recall  f1-score   support

           0       1.00      0.54      0.70    335291
           1       0.01      0.77      0.02      2281

    accuracy                           0.54    337572
   macro avg       0.50      0.66      0.36    337572
weighted avg       0.99      0.54      0.70    337572

Randomforest
              precision    recall  f1-score   support

           0       1.00      0.99      0.99    335291
           1       0.26      0.45      0.33      2281

    accuracy                           0.99    337572
   macro avg       0.63      0.72      0.66    337572
weighted avg       0.99      0.99      0.99    337572

xgboost
              precision    recall  f1-score   support

           0       1.00      0.96      0.98    335291
           1       0.10      0.63      0.18      2281

    accuracy                           0.96    337572
   macro avg       0.55      0.80      0.58    337572
weighted avg       0.99      0.96 

In [20]:
#F1 score for Random FOrest Knn and Xg boost looks quite well 
# We will test SVM with different kernels for tis data mow

In [21]:
from imblearn.under_sampling import RandomUnderSampler

In [47]:

rus = RandomUnderSampler(sampling_strategy='not minority', random_state=42)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)


In [48]:
y_resampled.value_counts()

0    9012
1    9012
Name: went_on_backorder, dtype: int64

In [49]:
y_train.value_counts()

0    1341276
1       9012
Name: went_on_backorder, dtype: int64

In [54]:
from sklearn.svm import SVC

In [57]:
svc=SVC(kernel="poly",C=5,degree=3,coef0=1)

In [58]:
svc.fit(X_resampled,y_resampled)

In [59]:
sv_preds=svc.predict(X_test)

In [60]:
print(classification_report(y_test,sv_preds))

              precision    recall  f1-score   support

           0       1.00      0.69      0.81    335291
           1       0.01      0.63      0.03      2281

    accuracy                           0.69    337572
   macro avg       0.50      0.66      0.42    337572
weighted avg       0.99      0.69      0.81    337572



In [21]:
rf=RandomForestClassifier()

In [22]:
rf.fit(X_resampled,y_resampled)

In [34]:
from sklearn.metrics import classification_report

In [24]:

preds=rf.predict(X_test)
print(classification_report(y_test,preds))

              precision    recall  f1-score   support

           0       1.00      0.88      0.93    335291
           1       0.05      0.92      0.09      2281

    accuracy                           0.88    337572
   macro avg       0.52      0.90      0.51    337572
weighted avg       0.99      0.88      0.93    337572



In [23]:
over = SMOTE(sampling_strategy=0.1,random_state=1)
under = RandomUnderSampler(sampling_strategy=0.5,random_state=1)

In [27]:
x_smoted,y_smoted=over.fit_resample(X_train,y_train)

In [30]:
x_combined,y_combined=under.fit_resample(x_smoted,y_smoted)

In [31]:
print(y_smoted.value_counts())
print(y_combined.value_counts())
print(y_train.value_counts())

0    1341276
1     134127
Name: went_on_backorder, dtype: int64
0    268254
1    134127
Name: went_on_backorder, dtype: int64
0    1341276
1       9012
Name: went_on_backorder, dtype: int64


In [32]:
rf=RandomForestClassifier()
rf.fit(x_combined,y_combined)
preds=rf.predict(X_test)


In [35]:
print(classification_report(y_test,preds))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99    335291
           1       0.20      0.62      0.30      2281

    accuracy                           0.98    337572
   macro avg       0.60      0.80      0.65    337572
weighted avg       0.99      0.98      0.99    337572



In [37]:
from sklearn.model_selection import cross_val_score

In [40]:
scores=cross_val_score(rf,X_test,y_test,cv=3,scoring="roc_auc",n_jobs=1)


NameError: name 'mean' is not defined

In [44]:
print('Mean ROC AUC: %.3f' % scores.mean())

Mean ROC AUC: 0.872


In [None]:
#Before Trying to get a good accuracy on the minority class we have to consider the trade off between recall and precision
#Here we would be concere

In [46]:
from sklearn.naive_bayes import GaussianNB
naive_bayes_classifier = GaussianNB()
naive_bayes_classifier.fit(x_combined,y_combined)
preds2=naive_bayes_classifier.predict(X_test)
print(classification_report(y_test,preds2))

              precision    recall  f1-score   support

           0       1.00      0.06      0.11    335291
           1       0.01      0.99      0.01      2281

    accuracy                           0.07    337572
   macro avg       0.50      0.52      0.06    337572
weighted avg       0.99      0.07      0.11    337572



In [61]:
#Final Thoughts  : RandomForest Performed the best  -Hyper parameter Tuning Required for finding optimal params        

In [66]:
over = SMOTE(sampling_strategy=0.9,random_state=1)
under = RandomUnderSampler(sampling_strategy=0.9,random_state=1)

In [67]:
x_smoted2,y_smoted2=over.fit_resample(X_train,y_train)
x_combined2,y_combined2=under.fit_resample(x_smoted,y_smoted)
print(y_combined2.value_counts())

0    149030
1    134127
Name: went_on_backorder, dtype: int64


In [68]:
rf3=RandomForestClassifier()
rf3.fit(x_combined2,y_combined2)

In [69]:
rf_pred=rf3.predict(X_test)
print(classification_report(y_test,rf_pred))

              precision    recall  f1-score   support

           0       1.00      0.97      0.98    335291
           1       0.14      0.71      0.24      2281

    accuracy                           0.97    337572
   macro avg       0.57      0.84      0.61    337572
weighted avg       0.99      0.97      0.98    337572



In [72]:
rf3.feature_importances_.round(2)

array([0.28, 0.04, 0.05, 0.3 , 0.14, 0.05, 0.02, 0.09, 0.03, 0.  , 0.01,
       0.  , 0.01, 0.  , 0.  ])

In [73]:
rf3.feature_names_in_

array(['national_inv', 'lead_time', 'in_transit_qty', 'forecast_3_month',
       'sales_1_month', 'min_bank', 'potential_issue', 'pieces_past_due',
       'perf_6_month_avg', 'local_bo_qty', 'deck_risk', 'oe_constraint',
       'ppap_risk', 'stop_auto_buy', 'rev_stop'], dtype=object)

In [74]:
from sklearn.feature_selection import mutual_info_classif

In [75]:
mutual_info=mutual_info_classif(x_combined2,y_combined2)

In [76]:
mutual_info=pd.Series(mutual_info)

In [77]:
mutual_info.index=x_combined2.columns

In [78]:
mutual_info.sort_values(ascending=False)

national_inv        0.455010
forecast_3_month    0.353816
sales_1_month       0.270687
pieces_past_due     0.259943
min_bank            0.161947
in_transit_qty      0.093195
perf_6_month_avg    0.034282
potential_issue     0.028884
lead_time           0.026627
deck_risk           0.008275
ppap_risk           0.003293
local_bo_qty        0.002113
stop_auto_buy       0.000254
oe_constraint       0.000244
rev_stop            0.000000
dtype: float64