# Business Problem: Predict product failure for a transmission device based harware company.

In [1]:
#Importing the libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

In [2]:
# Importing the dataset
df=pd.read_csv('DeviceFailure.csv')

In [3]:
# Checking the dataset loaded correctly
df.head()

Unnamed: 0,device,failure,attribute1,attribute2,attribute4,attribute5,attribute7,attribute9
0,S1F01085,0,215630672,56,52,6,0,7
1,S1F01085,0,1650864,56,52,6,0,7
2,S1F01085,0,124017368,56,52,6,0,7
3,S1F01085,0,128073224,56,52,6,0,7
4,S1F01085,0,97393448,56,52,6,0,7


In [4]:
df.tail()

Unnamed: 0,device,failure,attribute1,attribute2,attribute4,attribute5,attribute7,attribute9
124489,Z1F2PBHX,0,180917784,0,0,5,0,0
124490,Z1F2PBHX,0,33952520,0,0,5,0,0
124491,Z1F2PBHX,0,59053184,0,0,5,0,0
124492,Z1F2PBHX,0,110545440,0,0,5,0,0
124493,Z1F2PBHX,0,130522432,0,0,5,0,0


In [5]:
# Data Pre processing
x = df.loc[:, df.columns != "failure"]
y = df['failure']

In [6]:
x.head()

Unnamed: 0,device,attribute1,attribute2,attribute4,attribute5,attribute7,attribute9
0,S1F01085,215630672,56,52,6,0,7
1,S1F01085,1650864,56,52,6,0,7
2,S1F01085,124017368,56,52,6,0,7
3,S1F01085,128073224,56,52,6,0,7
4,S1F01085,97393448,56,52,6,0,7


In [7]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: failure, dtype: int64

In [None]:
y.mean()

In [8]:
# Splitting the Data
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test= train_test_split(x,y,test_size=0.2,random_state=0)

In [9]:
# Removing the device column as it is not a real feature.
# but at the end we will use this data associating the prediction from user it came from

train_identifier = x_train['device']
x_train = x_train.drop(columns=['device'])

test_identifier = x_test['device']
x_test = x_test.drop(columns=['device'])

In [10]:
#Feature Scaling
from sklearn.preprocessing import StandardScaler
sc= StandardScaler()

In [11]:
# scale the x_train and create a new variable for scaled x_train
#Standard scaler returns a numpy array so we need to convert to DataFrame
#In array form 'index' and 'column names' are lost we need it so convert to Df

x_train_scaled= pd.DataFrame(sc.fit_transform(x_train))
x_test_scaled = pd.DataFrame(sc.transform(x_test))

In [12]:
# put x_train_scaled have the columns of original x_train set
x_train_scaled.columns = x_train.columns.values

# put x_test_scaled have the columns of original x_test set
x_test_scaled.columns = x_test.columns.values

# take the indexes also
x_train_scaled.index = x_train.index.values
x_test_scaled.index = x_test.index.values

In [None]:
# Applying machine learning algorithms
# Creating a function that will give the Following output when any model is run

In [13]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def print_score(clf, x_train, y_train, x_test, y_test, train=True):
    if train:
        pred = clf.predict(x_train)
        clf_report = pd.DataFrame(classification_report(y_train, pred, output_dict=True))
        print("Train Result:\n================================================")
        print(f"Accuracy Score: {accuracy_score(y_train, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_train, pred)}\n")
        
    elif train==False:
        pred = clf.predict(x_test)
        clf_report = pd.DataFrame(classification_report(y_test, pred, output_dict=True))
        print("Test Result:\n================================================")        
        print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

In [14]:
# LOGISTIC REGRESSION

from sklearn.linear_model import LogisticRegression
clf_lr = LogisticRegression(random_state=0)

clf_lr.fit(x_train_scaled,y_train)
print_score(clf_lr, x_train_scaled, y_train, x_test_scaled, y_test, train=True)
print_score(clf_lr, x_train_scaled, y_train, x_test_scaled, y_test, train=False)

Train Result:
Accuracy Score: 99.92%
_______________________________________________
CLASSIFICATION REPORT:
                      0          1  accuracy     macro avg  weighted avg
precision      0.999187   0.500000  0.999167      0.749593      0.998771
recall         0.999980   0.024096  0.999167      0.512038      0.999167
f1-score       0.999583   0.045977  0.999167      0.522780      0.998788
support    99512.000000  83.000000  0.999167  99595.000000  99595.000000
_______________________________________________
Confusion Matrix: 
 [[99510     2]
 [   81     2]]

Test Result:
Accuracy Score: 99.90%
_______________________________________________
CLASSIFICATION REPORT:
                      0     1  accuracy     macro avg  weighted avg
precision      0.999076   0.0  0.998956      0.499538      0.998153
recall         0.999879   0.0  0.998956      0.499940      0.998956
f1-score       0.999478   0.0  0.998956      0.499739      0.998554
support    24876.000000  23.0  0.998956  24899.0

In [15]:
# DECISION TREE

from sklearn.tree import DecisionTreeClassifier
clf_tree= DecisionTreeClassifier(random_state=0)

clf_tree.fit(x_train_scaled,y_train)
print_score(clf_tree, x_train_scaled, y_train, x_test_scaled, y_test, train=True)
print_score(clf_tree, x_train_scaled, y_train, x_test_scaled, y_test, train=False)

Train Result:
Accuracy Score: 100.00%
_______________________________________________
CLASSIFICATION REPORT:
                      0          1  accuracy     macro avg  weighted avg
precision      0.999990   1.000000   0.99999      0.999995       0.99999
recall         1.000000   0.987952   0.99999      0.993976       0.99999
f1-score       0.999995   0.993939   0.99999      0.996967       0.99999
support    99512.000000  83.000000   0.99999  99595.000000   99595.00000
_______________________________________________
Confusion Matrix: 
 [[99512     0]
 [    1    82]]

Test Result:
Accuracy Score: 99.82%
_______________________________________________
CLASSIFICATION REPORT:
                      0          1  accuracy     macro avg  weighted avg
precision      0.999116   0.043478  0.998233      0.521297      0.998233
recall         0.999116   0.043478  0.998233      0.521297      0.998233
f1-score       0.999116   0.043478  0.998233      0.521297      0.998233
support    24876.000000  23

In [16]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
clf_rf= RandomForestClassifier(random_state=0)

clf_rf.fit(x_train_scaled,y_train)
print_score(clf_rf, x_train_scaled, y_train, x_test_scaled, y_test, train=True)
print_score(clf_rf, x_train_scaled, y_train, x_test_scaled, y_test, train=False)

Train Result:
Accuracy Score: 100.00%
_______________________________________________
CLASSIFICATION REPORT:
                     0          1  accuracy     macro avg  weighted avg
precision      0.99998   1.000000   0.99998      0.999990       0.99998
recall         1.00000   0.975904   0.99998      0.987952       0.99998
f1-score       0.99999   0.987805   0.99998      0.993897       0.99998
support    99512.00000  83.000000   0.99998  99595.000000   99595.00000
_______________________________________________
Confusion Matrix: 
 [[99512     0]
 [    2    81]]

Test Result:
Accuracy Score: 99.88%
_______________________________________________
CLASSIFICATION REPORT:
                      0     1  accuracy     macro avg  weighted avg
precision      0.999076   0.0  0.998795      0.499538      0.998153
recall         0.999719   0.0  0.998795      0.499859      0.998795
f1-score       0.999397   0.0  0.998795      0.499699      0.998474
support    24876.000000  23.0  0.998795  24899.00000

In [17]:
# SVM
from sklearn.svm import SVC

print("=======================Linear Kernel SVM==========================")
model = SVC(kernel='linear')
model.fit(x_train_scaled, y_train)
print_score(model, x_train_scaled, y_train, x_test_scaled, y_test, train=True)
print_score(model, x_train_scaled, y_train, x_test_scaled, y_test, train=False)



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train Result:
Accuracy Score: 99.92%
_______________________________________________
CLASSIFICATION REPORT:
                      0     1  accuracy     macro avg  weighted avg
precision      0.999167   0.0  0.999167      0.499583      0.998334
recall         1.000000   0.0  0.999167      0.500000      0.999167
f1-score       0.999583   0.0  0.999167      0.499792      0.998750
support    99512.000000  83.0  0.999167  99595.000000  99595.000000
_______________________________________________
Confusion Matrix: 
 [[99512     0]
 [   83     0]]

Test Result:
Accuracy Score: 99.91%
_______________________________________________
CLASSIFICATION REPORT:
                      0     1  accuracy     macro avg  weighted avg
precision      0.999076   0.0  0.999076      0.499538      0.998153
recall         1.000000   0.0  0.999076      0.500000      0.999076
f1-score       0.999538   0.0  0.999076      0.499769      0.998615
support    24876.000000  23.0  0.999076  24899.000000  24899.000000
_____

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
