# Business Problem: Predict product failure for a transmission device based harware company.

In [1]:
#Importing the libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

In [2]:
# Importing the dataset
df=pd.read_csv('DeviceFailure.csv')

In [3]:
# Checking the dataset loaded correctly
df.head()

Unnamed: 0,device,failure,attribute1,attribute2,attribute4,attribute5,attribute7,attribute9
0,S1F01085,0,215630672,56,52,6,0,7
1,S1F01085,0,1650864,56,52,6,0,7
2,S1F01085,0,124017368,56,52,6,0,7
3,S1F01085,0,128073224,56,52,6,0,7
4,S1F01085,0,97393448,56,52,6,0,7


In [4]:
df.tail()

Unnamed: 0,device,failure,attribute1,attribute2,attribute4,attribute5,attribute7,attribute9
124489,Z1F2PBHX,0,180917784,0,0,5,0,0
124490,Z1F2PBHX,0,33952520,0,0,5,0,0
124491,Z1F2PBHX,0,59053184,0,0,5,0,0
124492,Z1F2PBHX,0,110545440,0,0,5,0,0
124493,Z1F2PBHX,0,130522432,0,0,5,0,0


In [5]:
df['failure'].value_counts()

0    124388
1       106
Name: failure, dtype: int64

In [6]:
# Data Pre processing
x = df.loc[:, df.columns != "failure"]
y = df['failure']

In [7]:
x.head()

Unnamed: 0,device,attribute1,attribute2,attribute4,attribute5,attribute7,attribute9
0,S1F01085,215630672,56,52,6,0,7
1,S1F01085,1650864,56,52,6,0,7
2,S1F01085,124017368,56,52,6,0,7
3,S1F01085,128073224,56,52,6,0,7
4,S1F01085,97393448,56,52,6,0,7


In [8]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: failure, dtype: int64

In [9]:
# Splitting the Data
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test= train_test_split(x,y,test_size=0.2,random_state=0)

In [10]:
# Removing the device column as it is not a real feature.
# but at the end we will use this data associating the prediction from user it came from

train_identifier = x_train['device']
x_train = x_train.drop(columns=['device'])

test_identifier = x_test['device']
x_test = x_test.drop(columns=['device'])

In [11]:
#Feature Scaling
from sklearn.preprocessing import StandardScaler
sc= StandardScaler()

In [12]:
# scale the x_train and create a new variable for scaled x_train
#Standard scaler returns a numpy array so we need to convert to DataFrame
#In array form 'index' and 'column names' are lost we need it so convert to Df

x_train_scaled= pd.DataFrame(sc.fit_transform(x_train))
x_test_scaled = pd.DataFrame(sc.transform(x_test))

In [13]:
# put x_train_scaled have the columns of original x_train set
x_train_scaled.columns = x_train.columns.values

# put x_test_scaled have the columns of original x_test set
x_test_scaled.columns = x_test.columns.values

# take the indexes also
x_train_scaled.index = x_train.index.values
x_test_scaled.index = x_test.index.values

## Balanicing the Dataset.

In [14]:
# Balanicing the Dataset.
# create the training df by remerging X_train and y_train
df_train = x_train_scaled.join(y_train)
df_train.sample(10)

Unnamed: 0,attribute1,attribute2,attribute4,attribute5,attribute7,attribute9,failure
12239,0.68292,-0.073271,-0.077923,-0.454198,-0.041268,-0.065921,0
93561,1.408937,-0.073271,-0.077923,-0.202428,-0.041268,-0.065921,0
83776,1.089498,-0.073271,-0.077923,-0.202428,-0.041268,-0.065921,0
35276,0.959044,-0.073271,-0.077923,-0.391255,-0.041268,-0.060674,0
28461,-1.670694,-0.073271,-0.077923,-0.202428,-0.041268,-0.065921,0
96033,1.521889,-0.073271,-0.077923,-0.391255,-0.041268,-0.065921,0
96488,0.444899,-0.073271,-0.077923,-0.202428,-0.041268,-0.065921,0
5253,-0.241317,-0.073271,-0.077923,-0.26537,-0.041268,-0.065921,0
34415,-1.374782,-0.073271,-0.077923,-0.26537,-0.041268,-0.065921,0
66059,-0.482727,-0.073271,-0.077923,-0.51714,-0.041268,-0.060674,0


In [15]:
from sklearn.utils import resample

# Separate majority and minority classes
df_majority = df_train[df_train.failure==0]
df_minority = df_train[df_train.failure==1]

print(df_majority.failure.count())
print("-----------")
print(df_minority.failure.count())
print("-----------")
print(df_train.failure.value_counts())

99512
-----------
83
-----------
0    99512
1       83
Name: failure, dtype: int64


<b> UP-Sampling the Data

In [16]:
# UPSAMPLING
# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=99512,    # to match majority class
                                 random_state=587) # reproducible results
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
# Display new class counts
df_upsampled.failure.value_counts()

1    99512
0    99512
Name: failure, dtype: int64

In [17]:
x_upsampled = df_upsampled.drop(['failure'], axis= 1)
y_upsampled = df_upsampled.failure

In [18]:
# Splitting the Data
from sklearn.model_selection import train_test_split

x_up_train,x_up_test,y_up_train,y_up_test= train_test_split(x_upsampled,y_upsampled,test_size=0.2,random_state=0)

In [19]:
x_up_train.head()

Unnamed: 0,attribute1,attribute2,attribute4,attribute5,attribute7,attribute9
64496,0.473767,-0.073271,-0.077923,-0.0136,-0.041268,-0.065921
23861,0.417016,-0.073271,-0.077923,-0.391255,10.918911,-0.065921
10716,0.293393,-0.073271,-0.077923,-0.0136,-0.041268,-0.065921
35081,-0.790088,-0.073271,-0.077923,-0.202428,-0.041268,-0.034435
22640,-1.024272,-0.073271,-0.077923,-0.51714,-0.041268,-0.065921


<b># Applying machine learning algorithms

In [20]:
# Applying machine learning algorithms
# Creating a function that will give the Following output when any model is run

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def print_score(clf, x_up_train,x_up_test,y_up_train,y_up_test, train=True):
    if train:
        pred = clf.predict(x_up_train)
        clf_report = pd.DataFrame(classification_report(y_up_train, pred, output_dict=True))
        print("Train Result:\n================================================")
        print(f"Accuracy Score: {accuracy_score(y_up_train, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_up_train, pred)}\n")
        
    elif train==False:
        pred = clf.predict(x_up_test)
        clf_report = pd.DataFrame(classification_report(y_up_test, pred, output_dict=True))
        print("Test Result:\n================================================")        
        print(f"Accuracy Score: {accuracy_score(y_up_test, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_up_test, pred)}\n")

In [21]:
# LOGISTIC REGRESSION

from sklearn.linear_model import LogisticRegression
clf_lr = LogisticRegression(random_state=0)

clf_lr.fit(x_up_train ,y_up_train)
print_score(clf_lr, x_up_train,x_up_test,y_up_train,y_up_test, train=True)
print_score(clf_lr, x_up_train,x_up_test,y_up_train,y_up_test, train=False)

Train Result:
Accuracy Score: 76.83%
_______________________________________________
CLASSIFICATION REPORT:
                      0             1  accuracy      macro avg   weighted avg
precision      0.695323      0.929328  0.768338       0.812325       0.812228
recall         0.955934      0.580428  0.768338       0.768181       0.768338
f1-score       0.805063      0.714563  0.768338       0.759813       0.759851
support    79676.000000  79543.000000  0.768338  159219.000000  159219.000000
_______________________________________________
Confusion Matrix: 
 [[76165  3511]
 [33374 46169]]

Test Result:
Accuracy Score: 76.94%
_______________________________________________
CLASSIFICATION REPORT:
                      0             1  accuracy     macro avg  weighted avg
precision      0.694555      0.934230  0.769351      0.814392      0.814793
recall         0.958812      0.581151  0.769351      0.769982      0.769351
f1-score       0.805566      0.716557  0.769351      0.761061      

In [22]:
# DECISION TREE

from sklearn.tree import DecisionTreeClassifier
clf_tree= DecisionTreeClassifier(random_state=0)

clf_tree.fit(x_up_train ,y_up_train)
print_score(clf_tree, x_up_train,x_up_test,y_up_train,y_up_test, train=True)
print_score(clf_tree, x_up_train,x_up_test,y_up_train,y_up_test, train=False)

Train Result:
Accuracy Score: 100.00%
_______________________________________________
CLASSIFICATION REPORT:
                      0             1  accuracy      macro avg   weighted avg
precision      1.000000      0.999987  0.999994       0.999994       0.999994
recall         0.999987      1.000000  0.999994       0.999994       0.999994
f1-score       0.999994      0.999994  0.999994       0.999994       0.999994
support    79676.000000  79543.000000  0.999994  159219.000000  159219.000000
_______________________________________________
Confusion Matrix: 
 [[79675     1]
 [    0 79543]]

Test Result:
Accuracy Score: 99.96%
_______________________________________________
CLASSIFICATION REPORT:
                      0             1  accuracy     macro avg  weighted avg
precision      1.000000      0.999299  0.999648      0.999650      0.999649
recall         0.999294      1.000000  0.999648      0.999647      0.999648
f1-score       0.999647      0.999650  0.999648      0.999648     

In [23]:
# Random Forest
# Random Forest
from sklearn.ensemble import RandomForestClassifier
clf_rf= RandomForestClassifier(random_state=0)

clf_rf.fit(x_up_train,y_up_train)
print_score(clf_rf, x_up_train,x_up_test,y_up_train,y_up_test, train=True)
print_score(clf_rf, x_up_train,x_up_test,y_up_train,y_up_test, train=False)

Train Result:
Accuracy Score: 100.00%
_______________________________________________
CLASSIFICATION REPORT:
                      0             1  accuracy      macro avg   weighted avg
precision      1.000000      0.999987  0.999994       0.999994       0.999994
recall         0.999987      1.000000  0.999994       0.999994       0.999994
f1-score       0.999994      0.999994  0.999994       0.999994       0.999994
support    79676.000000  79543.000000  0.999994  159219.000000  159219.000000
_______________________________________________
Confusion Matrix: 
 [[79675     1]
 [    0 79543]]

Test Result:
Accuracy Score: 99.96%
_______________________________________________
CLASSIFICATION REPORT:
                      0             1  accuracy     macro avg  weighted avg
precision      1.000000      0.999249  0.999623      0.999625      0.999623
recall         0.999244      1.000000  0.999623      0.999622      0.999623
f1-score       0.999622      0.999625  0.999623      0.999623     

In [32]:
# SVM
from sklearn.svm import SVC

print("=======================Linear Kernel SVM==========================")
model = SVC(kernel='linear')
model.fit(x_up_train,y_up_train)
print_score(model, x_up_train,x_up_test,y_up_train,y_up_test, train=True)
print_score(model, x_up_train,x_up_test,y_up_train,y_up_test, train=False)



KeyboardInterrupt: 

# Down-Sampling the Data

In [24]:
# Down-Sampling the Data
# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=83,     # to match minority class
                                 random_state=24) # reproducible results
# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])
# Display new class counts
df_downsampled.failure.value_counts()

1    83
0    83
Name: failure, dtype: int64

In [25]:
x_downsampled = df_downsampled.drop(['failure'], axis = 1)
y_downsampled = df_downsampled.failure

In [26]:
# Splitting the Data
from sklearn.model_selection import train_test_split

x_down_train,x_down_test,y_down_train,y_down_test= train_test_split(x_downsampled,y_downsampled,test_size=0.2,random_state=0)

<b># Applying machine learning algorithms

In [27]:
# Applying machine learning algorithms
# Creating a function that will give the Following output when any model is run

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def print_score(clf, x_down_train,x_down_test,y_down_train,y_down_test, train=True):
    if train:
        pred = clf.predict(x_down_train)
        clf_report = pd.DataFrame(classification_report(y_down_train, pred, output_dict=True))
        print("Train Result:\n================================================")
        print(f"Accuracy Score: {accuracy_score(y_down_train, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_down_train, pred)}\n")
        
    elif train==False:
        pred = clf.predict(x_down_test)
        clf_report = pd.DataFrame(classification_report(y_down_test, pred, output_dict=True))
        print("Test Result:\n================================================")        
        print(f"Accuracy Score: {accuracy_score(y_down_test, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_down_test, pred)}\n")

In [28]:
# LOGISTIC REGRESSION

from sklearn.linear_model import LogisticRegression
clf_lr = LogisticRegression(random_state=0)

clf_lr.fit(x_down_train, y_down_train)
print_score(clf_lr, x_down_train,x_down_test,y_down_train,y_down_test, train=True)
print_score(clf_lr, x_down_train,x_down_test,y_down_train,y_down_test, train=False)

Train Result:
Accuracy Score: 77.27%
_______________________________________________
CLASSIFICATION REPORT:
                   0          1  accuracy   macro avg  weighted avg
precision   0.711111   0.904762  0.772727    0.807937      0.805002
recall      0.941176   0.593750  0.772727    0.767463      0.772727
f1-score    0.810127   0.716981  0.772727    0.763554      0.764965
support    68.000000  64.000000  0.772727  132.000000    132.000000
_______________________________________________
Confusion Matrix: 
 [[64  4]
 [26 38]]

Test Result:
Accuracy Score: 67.65%
_______________________________________________
CLASSIFICATION REPORT:
                   0          1  accuracy  macro avg  weighted avg
precision   0.583333   0.900000  0.676471   0.741667      0.760294
recall      0.933333   0.473684  0.676471   0.703509      0.676471
f1-score    0.717949   0.620690  0.676471   0.669319      0.663598
support    15.000000  19.000000  0.676471  34.000000     34.000000
______________________

In [29]:
# DECISION TREE

from sklearn.tree import DecisionTreeClassifier
clf_tree= DecisionTreeClassifier(random_state=0)

clf_tree.fit(x_down_train,y_down_train)
print_score(clf_tree, x_down_train,x_down_test,y_down_train,y_down_test, train=True)
print_score(clf_tree, x_down_train,x_down_test,y_down_train,y_down_test, train=False)

Train Result:
Accuracy Score: 100.00%
_______________________________________________
CLASSIFICATION REPORT:
              0     1  accuracy  macro avg  weighted avg
precision   1.0   1.0       1.0        1.0           1.0
recall      1.0   1.0       1.0        1.0           1.0
f1-score    1.0   1.0       1.0        1.0           1.0
support    68.0  64.0       1.0      132.0         132.0
_______________________________________________
Confusion Matrix: 
 [[68  0]
 [ 0 64]]

Test Result:
Accuracy Score: 76.47%
_______________________________________________
CLASSIFICATION REPORT:
                   0          1  accuracy  macro avg  weighted avg
precision   0.705882   0.823529  0.764706   0.764706      0.771626
recall      0.800000   0.736842  0.764706   0.768421      0.764706
f1-score    0.750000   0.777778  0.764706   0.763889      0.765523
support    15.000000  19.000000  0.764706  34.000000     34.000000
_______________________________________________
Confusion Matrix: 
 [[12  3]

In [30]:
# Random Forest
# Random Forest
from sklearn.ensemble import RandomForestClassifier
clf_rf= RandomForestClassifier(random_state=0)

clf_rf.fit(x_down_train,y_down_train)
print_score(clf_rf, x_down_train,x_down_test,y_down_train,y_down_test, train=True)
print_score(clf_rf, x_down_train,x_down_test,y_down_train,y_down_test, train=False)

Train Result:
Accuracy Score: 100.00%
_______________________________________________
CLASSIFICATION REPORT:
              0     1  accuracy  macro avg  weighted avg
precision   1.0   1.0       1.0        1.0           1.0
recall      1.0   1.0       1.0        1.0           1.0
f1-score    1.0   1.0       1.0        1.0           1.0
support    68.0  64.0       1.0      132.0         132.0
_______________________________________________
Confusion Matrix: 
 [[68  0]
 [ 0 64]]

Test Result:
Accuracy Score: 73.53%
_______________________________________________
CLASSIFICATION REPORT:
                   0          1  accuracy  macro avg  weighted avg
precision   0.666667   0.812500  0.735294   0.739583      0.748162
recall      0.800000   0.684211  0.735294   0.742105      0.735294
f1-score    0.727273   0.742857  0.735294   0.735065      0.735982
support    15.000000  19.000000  0.735294  34.000000     34.000000
_______________________________________________
Confusion Matrix: 
 [[12  3]

In [None]:
# SVM
from sklearn.svm import SVC

print("=======================Linear Kernel SVM==========================")
model = SVC(kernel='linear')
model.fit(x_down_train,y_down_train)
print_score(model, x_down_train,x_down_test,y_down_train,y_down_test, train=True)
print_score(model, x_down_train,x_down_test,y_down_train,y_down_test, train=False)

### Performing Hyper Parameter tuning for DT for UP Sampled Data

In [33]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

params = {
    "criterion":("gini", "entropy"), 
    "splitter":("best", "random"), 
    "max_depth":(list(range(1,10))), 
    "min_samples_split":[2, 3, 4], 
    "min_samples_leaf":list(range(1, 20)), 
}


tree_clf = DecisionTreeClassifier(random_state=42)
tree_cv = GridSearchCV(tree_clf, params, scoring="precision", n_jobs=-1, verbose=1, cv=3)
tree_cv.fit(x_up_train ,y_up_train)
best_params = tree_cv.best_params_
print(f"Best paramters: {best_params})")

tree_clf = DecisionTreeClassifier(**best_params)
tree_clf.fit(x_up_train ,y_up_train)
print_score(clf_tree, x_up_train,x_up_test,y_up_train,y_up_test, train=True)
print_score(clf_tree, x_up_train,x_up_test,y_up_train,y_up_test, train=False)


Fitting 3 folds for each of 2052 candidates, totalling 6156 fits
Best paramters: {'criterion': 'entropy', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'random'})
Train Result:
Accuracy Score: 85.12%
_______________________________________________
CLASSIFICATION REPORT:
                      0             1  accuracy      macro avg   weighted avg
precision      0.926015      0.798702  0.851249       0.862358       0.862411
recall         0.763768      0.938876  0.851249       0.851322       0.851249
f1-score       0.837102      0.863135  0.851249       0.850119       0.850108
support    79676.000000  79543.000000  0.851249  159219.000000  159219.000000
_______________________________________________
Confusion Matrix: 
 [[60854 18822]
 [ 4862 74681]]

Test Result:
Accuracy Score: 85.03%
_______________________________________________
CLASSIFICATION REPORT:
                      0             1  accuracy     macro avg  weighted avg
precision      0.925184    

### Performing Hyper Parameter tuning for DT for DOWN Sampled Data

In [34]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

params = {
    "criterion":("gini", "entropy"), 
    "splitter":("best", "random"), 
    "max_depth":(list(range(1,10))), 
    "min_samples_split":[2, 3, 4], 
    "min_samples_leaf":list(range(1, 20)), 
}


tree_clf = DecisionTreeClassifier(random_state=42)
tree_cv = GridSearchCV(tree_clf, params, scoring="precision", n_jobs=-1, verbose=1, cv=3)
tree_cv.fit(x_down_train,y_down_train)
best_params = tree_cv.best_params_
print(f"Best paramters: {best_params})")

clf_tree = DecisionTreeClassifier(**best_params)
clf_tree.fit(x_down_train,y_down_train)
print_score(clf_tree, x_down_train,x_down_test,y_down_train,y_down_test, train=True)
print_score(clf_tree, x_down_train,x_down_test,y_down_train,y_down_test, train=False)

Fitting 3 folds for each of 2052 candidates, totalling 6156 fits
Best paramters: {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'random'})
Train Result:
Accuracy Score: 54.55%
_______________________________________________
CLASSIFICATION REPORT:
                   0          1  accuracy   macro avg  weighted avg
precision   0.531250   1.000000  0.545455    0.765625      0.758523
recall      1.000000   0.062500  0.545455    0.531250      0.545455
f1-score    0.693878   0.117647  0.545455    0.405762      0.414493
support    68.000000  64.000000  0.545455  132.000000    132.000000
_______________________________________________
Confusion Matrix: 
 [[68  0]
 [60  4]]

Test Result:
Accuracy Score: 50.00%
_______________________________________________
CLASSIFICATION REPORT:
                   0          1  accuracy  macro avg  weighted avg
precision   0.468750   1.000000       0.5   0.734375      0.765625
recall      1.000000   0.105263   

### Random Forest hyperparameter tuning for UP Sampled Data

In [None]:
n_estimators = [100, 500, 1000, 1500]
max_features = ['auto', 'sqrt']
max_depth = [2, 3, 5]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4, 10]


params_grid = {'n_estimators': n_estimators, 'max_features': max_features,
               'max_depth': max_depth, 'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}


rf_clf = RandomForestClassifier(random_state=42)

rf_cv = GridSearchCV(rf_clf, params_grid, scoring="precision", cv=3, verbose=1, n_jobs=-1)


rf_cv.fit(x_up_train,y_up_train)
best_params = rf_cv.best_params_
print(f"Best parameters: {best_params}")

rf_clf = RandomForestClassifier(**best_params)
rf_clf.fit(x_up_train,y_up_train)

print_score(clf_rf, x_up_train,x_up_test,y_up_train,y_up_test, train=True)
print_score(clf_rf, x_up_train,x_up_test,y_up_train,y_up_test, train=False)


Fitting 3 folds for each of 384 candidates, totalling 1152 fits


### Performing Hyper Parameter tuning for DT for DOWN Sampled Data

In [None]:
n_estimators = [100, 500, 1000, 1500]
max_features = ['auto', 'sqrt']
max_depth = [2, 3, 5]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4, 10]


params_grid = {'n_estimators': n_estimators, 'max_features': max_features,
               'max_depth': max_depth, 'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}


rf_clf = RandomForestClassifier(random_state=42)

rf_cv = GridSearchCV(rf_clf, params_grid, scoring="precision", cv=3, verbose=1, n_jobs=-1)


rf_cv.fit(x_down_train,y_down_train)
best_params = rf_cv.best_params_
print(f"Best parameters: {best_params}")

rf_clf = RandomForestClassifier(**best_params)
rf_clf.fit(x_down_train,y_down_train)

print_score(clf_rf, x_down_train,x_down_test,y_down_train,y_down_test, train=True)
print_score(clf_rf, x_down_train,x_down_test,y_down_train,y_down_test, train=False)