In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from numpy import unique
from imblearn.under_sampling import RandomUnderSampler
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from imblearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from numpy import mean
from numpy import std
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.metrics import plot_confusion_matrix
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import validation_curve
import mlxtend
import pandas as pd
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.neighbors import KNeighborsClassifier



In [None]:
# Load the raw dataset.
df = pd.read_csv('data.csv')

# Encode every Yes/No column as 1/0; the target 'went_on_backorder' is encoded
# the same way. Values other than 'Yes'/'No' become NaN under Series.map.
yes_no_codes = {'Yes': 1, 'No': 0}
for binary_col in ('stop_auto_buy', 'potential_issue', 'oe_constraint',
                   'ppap_risk', 'deck_risk', 'rev_stop', 'went_on_backorder'):
    df[binary_col] = df[binary_col].map(yes_no_codes)

# Discard every row that holds at least one missing value.
df = df.dropna(axis=0, how='any')

# Pie chart of the share of each target class.
# value_counts() orders its result by frequency, not by class value, so the
# wedge labels are derived from its index instead of being hard-coded; a fixed
# ['Class:0', 'Class:1'] list mislabels the wedges whenever class 1 is the
# majority and breaks plt.pie when only one class is present.
b = df['went_on_backorder'].value_counts()
total_values = b.tolist()
went_on_backorder = ['Class:%d' % cls for cls in b.index]
plt.pie(total_values, labels=went_on_backorder, autopct='%0.f%%', shadow=True,
        startangle=90)
plt.title('Share of each class in went_on_backorder')
plt.show()

# Target variable as a NumPy array (taken before the column is removed from df).
y = df['went_on_backorder'].to_numpy()

# Annotated heatmaps of the pairwise correlations between df's columns:
# Pearson first, then Spearman. Each banner is printed just before its figure
# is shown.
for corr_method, banner in (
        ('pearson', 'Correlation Matrix (Pearsons Correlation)'),
        ('spearman', 'Correlation Matrix (Spearman Correlation)')):
    fig, ax = plt.subplots(figsize=(13, 13))
    corr = df.corr(method=corr_method)
    sns.heatmap(corr, vmax=1.0, center=0, fmt='.2f', square=True,
                linewidths=.6, annot=True, cbar_kws={"shrink": .60})
    print(banner)
    plt.show()

# Report, for every distinct target value, how many samples carry it and what
# percentage of all samples that represents.
classes = unique(y)
total = len(y)
for label in classes:
    count = np.count_nonzero(y == label)
    share = count / total * 100
    print('> Class=%d: %d/%d (%.1f%%)' % (label, count, total, share))

# Remove the target column so that df holds predictor columns only
# (y was extracted from it earlier).
del df['went_on_backorder']

# Clean negative values one column at a time.
# NOTE: `df[mask] = value` with a boolean Series assigns to *whole rows*, which
# would overwrite every feature of the matching samples. `.loc[mask, column]`
# restricts the assignment to the single column being cleaned.

# Negative inventory levels are clipped to zero.
df.loc[df['national_inv'] < 0, 'national_inv'] = 0

# Negative performance averages are turned into NaN so that the imputer used
# later can fill them in.
for perf_col in ('perf_6_month_avg', 'perf_12_month_avg'):
    df.loc[df[perf_col] < 0, perf_col] = np.nan

# Feature matrix as a float NumPy array.
X = df.to_numpy(dtype=float)

In [None]:
# Split the data into train (80%) and test (20%) partitions.
# stratify=y keeps the class ratio of the imbalanced target the same in both
# partitions; the fixed random_state makes the split repeatable after a kernel
# restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)


def print_class_distribution(labels, split_name):
    """Print the count and percentage of every class found in `labels`.

    Parameters
    ----------
    labels : array of class labels.
    split_name : text inserted in the message (e.g. 'train' or 'test').
    """
    n_total = len(labels)
    for cls, n_examples in zip(*unique(labels, return_counts=True)):
        percent = n_examples / n_total * 100
        print('> Class=%d in %s: %d/%d (%.1f%%)'
              % (cls, split_name, n_examples, n_total, percent))


def fit_and_report(model, title, X_fit, y_fit, X_eval, y_eval):
    """Fit `model`, print its evaluation on the held-out data and plot it.

    Prints a separator, a heading, the confusion matrix and the classification
    report, then shows the confusion-matrix plot and the precision-recall
    curve.

    Parameters
    ----------
    model : unfitted estimator; it is fitted in place.
    title : model name used in the printed heading.
    X_fit, y_fit : data the model is trained on.
    X_eval, y_eval : data the predictions are evaluated on.

    Returns
    -------
    The predictions of the fitted model for `X_eval`.
    """
    model.fit(X_fit, y_fit.ravel())
    y_pred = model.predict(X_eval)
    print('-----------------------------------------')
    print('Prediction results Of %s' % title)
    print(confusion_matrix(y_eval, y_pred))
    print(classification_report(y_eval, y_pred))
    plot_confusion_matrix(model, X_eval, y_eval, cmap=plt.cm.Blues)
    plt.show()
    # High precision relates to a low false positive rate,
    # high recall relates to a low false negative rate.
    plot_precision_recall_curve(model, X_eval, y_eval)
    plt.show()  # flush the curve so it appears next to this model's results
    return y_pred


# Summarize the class balance of both partitions.
print_class_distribution(y_train, 'train')
print_class_distribution(y_test, 'test')

# Imputation and scaling statistics are learned from the training partition
# only and then re-applied unchanged to the test partition.
imputation = SimpleImputer(strategy='mean')
scaling = StandardScaler()
X_train = imputation.fit_transform(X_train)
X_train = scaling.fit_transform(X_train)

# Oversample the minority class of the training partition only (the test
# partition keeps its original distribution). Seeded for repeatability.
sm = SMOTE(sampling_strategy=0.4, random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)

X_test = imputation.transform(X_test)
X_test = scaling.transform(X_test)

# Fit and evaluate the three baseline classifiers on identical data.
model = SVC()
y_pred_svm = fit_and_report(model, 'SVM', X_train, y_train, X_test, y_test)

model = RandomForestClassifier(random_state=42)
y_pred_RF = fit_and_report(model, 'Random Forest', X_train, y_train, X_test, y_test)

model = KNeighborsClassifier()
y_pred_KNN = fit_and_report(model, 'KNN', X_train, y_train, X_test, y_test)

In [None]:
# Hyperparameter tuning for Random Forest

# Split the dataset into train and test partitions. X and y are split afresh
# here so that the pipeline below receives raw (un-imputed, un-scaled,
# un-resampled) training data. stratify=y preserves the class ratio of the
# imbalanced target; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

# Steps of the pipeline. Because imputation, scaling and SMOTE are pipeline
# steps, they are re-fitted on the training part of every CV fold.
estimators_RF = [('imputer', SimpleImputer(strategy='mean')),
                 ('scale', StandardScaler()),
                 ('sm', SMOTE(sampling_strategy=0.4, random_state=42)),
                 ('RF', RandomForestClassifier(random_state=42))]

# Grid of candidate parameters.
# For RandomForestClassifier 'auto' is only an alias of 'sqrt', so listing both
# fitted every configuration twice without exploring anything new. 'log2' is
# used as the second candidate so the grid keeps its size but covers two
# genuinely different settings.
grid_RF = [{'RF__criterion': ['gini', 'entropy'],
            'RF__n_estimators': [100, 110, 120, 130],
            'RF__max_depth': [1, 5, 10, 15, 20, 25],
            'RF__max_features': ['sqrt', 'log2']}]

pipeline_RF = Pipeline(estimators_RF)
kfold = StratifiedKFold(n_splits=5)

# Exhaustive search over the grid, scored by F1 on each validation fold.
grid_search_RF = GridSearchCV(pipeline_RF, grid_RF, cv=kfold, scoring='f1')
grid_search_RF.fit(X_train, y_train.ravel())

print('Best parameters for Random Forest')
print(grid_search_RF.best_params_)
print('---------------------------')

In [None]:
# Hyperparameter tuning for SVM
# NOTE: this cell defines neither the data nor the CV splitter; it expects
# X_train, y_train and kfold to exist already in the kernel.

# Candidate values explored for the SVC step of the pipeline.
grid_svm = [{
    'svc__kernel': ['rbf'],
    'svc__gamma': ['auto', 'scale', 0.2],
    'svc__C': [1, 10, 20],
}]

# Pipeline: mean imputation -> standardisation -> SMOTE oversampling -> SVC.
estimators_svm = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler()),
    ('sm', SMOTE(sampling_strategy=0.4)),
    ('svc', SVC()),
]
pipeline_svm = Pipeline(steps=estimators_svm)

# Exhaustive search over the grid, scored by F1 on each fold of kfold.
grid_search_svm = GridSearchCV(estimator=pipeline_svm, param_grid=grid_svm,
                               scoring='f1', cv=kfold)
grid_search_svm.fit(X_train, y_train.ravel())

for report_line in ('Best parameters for SVM',
                    grid_search_svm.best_params_,
                    '---------------------------'):
    print(report_line)

In [None]:
# Outlier detection
# `stats` is not imported by the notebook's import cell, so without this line
# the cell fails with NameError on a fresh kernel.
from scipy import stats

# NOTE: this cell overwrites X and y in place, so running it a second time
# re-standardises the already filtered data and removes further rows.
imputation = SimpleImputer(strategy='mean')
scaling = StandardScaler()
X = imputation.fit_transform(X)
X = scaling.fit_transform(X)

# Absolute per-column z-score of every value.
z = np.abs(stats.zscore(X))

# A sample is marked as an outlier when at least one of its features has an
# absolute z-score above the threshold.
threshold = 3
new = np.where(z > threshold)
indices = unique(new[0])  # row numbers of the outlier samples

# Class distribution among the outlier samples.
how_many_classes = np.take(y, indices)
classes_outliers = unique(how_many_classes)
total_outliers = len(how_many_classes)
for c in classes_outliers:
    n_examples = len(how_many_classes[how_many_classes == c])
    percent = (n_examples / total_outliers) * 100
    print('> Class=%d has: %d/%d (%.1f%%) outliers' % (c, n_examples, total_outliers, percent))

# Drop the outlier samples from both the features and the target.
X = np.delete(X, indices, axis=0)
y = np.delete(y, indices, axis=0)

# Class distribution of what is left.
classes_rem_outliers = unique(y)
total = len(y)
for c in classes_rem_outliers:
    n_examples = len(y[y == c])
    percent = (n_examples / total) * 100
    print('> Class=%d after removing outliers: %d/%d (%.1f%%)' % (c, n_examples, total, percent))