In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

%matplotlib inline

In [None]:
# Read the maintenance dataset from disk, print its schema, and preview
# the first rows as the cell output.
df = pd.read_csv(filepath_or_buffer='predictive_maintenance.csv')
df.info()
df.head(n=5)

In [None]:
# Keep an independent copy of the raw frame (DataFrame.copy is deep by default)
# so later in-place reassignments of `df` do not affect it.
deep_df = df.copy()

In [None]:
# Names of the int64/float64 columns (numeric, but not necessarily continuous).
numerical_cols = [
    name
    for name, dtype in df.dtypes.items()
    if dtype in ('int64', 'float64')
]

In [None]:
# Summary statistics for the numeric columns, restricted to the four rows
# used to compare spread (min/max) against centre (mean/median).
df[numerical_cols].describe().loc[['min', 'max', 'mean', '50%']]

In [None]:
# Remove columns judged not useful for modelling:
#   - metrics whose mean and median differ significantly because many
#     points sit close to 0
#   - columns that carry no useful data
# NOTE: this cell is not idempotent; re-running it raises KeyError because
# the columns are already gone.
cols_to_drop = [
    'date', 'device',
    'metric2', 'metric3', 'metric4',
    'metric7', 'metric8', 'metric9',
]
df = df.drop(columns=cols_to_drop)

In [None]:
# Preview the frame after the column drop.
df.head(n=5)

In [None]:
# Distribution of metric1 (first of the three retained metrics: 1, 5 and 6).
# Author's reading of the plot: metric1 shows little variation.
plt.hist(df['metric1'].to_numpy(), bins=30)
plt.xlabel('metric1')
plt.ylabel('Frequency')

In [None]:
# Distribution of metric5 over 30 equal-width bins.
plt.hist(df['metric5'].to_numpy(), bins=30)
plt.xlabel('metric5')
plt.ylabel('Frequency')

In [None]:
# Distribution of metric6 over 30 equal-width bins.
plt.hist(df['metric6'].to_numpy(), bins=30)
plt.xlabel('metric6')
plt.ylabel('Frequency')

In [None]:
# Scatter of metric1 for the failure rows only (failure == 1), plotted
# against their running position among those rows.
# Author's conclusion from the plot: metric1 is not a good source of information.
metric1_failures = df.loc[df['failure'] == 1, 'metric1']
plt.scatter(x=range(len(metric1_failures)), y=metric1_failures, s=1)
# The x axis enumerates failure rows, so it is labelled as failures.
plt.xlabel('Failures (rows)')
plt.ylabel('metric1')
plt.title('Metric1 values with failure (failure = 1)')

In [None]:
# Scatter of metric5 for the failure rows only (failure == 1), plotted
# against their running position among those rows.
# Author's observation: most failures happened with metric5 values between 0 and 20.
metric5_failures = df.loc[df['failure'] == 1, 'metric5']
plt.scatter(x=range(len(metric5_failures)), y=metric5_failures, s=1)
# The x axis enumerates failure rows, so it is labelled as failures.
plt.xlabel('Failures (rows)')
plt.ylabel('metric5')
plt.title('Metric5 values with failure (failure = 1)')

In [None]:
# Scatter of metric6 for the failure rows only (failure == 1), plotted
# against their running position among those rows.
# Author's observation: most failures happened with metric6 values between
# 200000 and 350000.
metric6_failures = df.loc[df['failure'] == 1, 'metric6']
plt.scatter(x=range(len(metric6_failures)), y=metric6_failures, s=1)
# The x axis enumerates failure rows, so it is labelled as failures.
plt.xlabel('Failures (rows)')
plt.ylabel('metric6')
plt.title('Metric6 values with failure (failure = 1)')

In [None]:
# Failure rate (%) per metric1 bin: failures in the bin / rows in the bin * 100.
# pd.cut bins are right-closed by default, so a value of exactly 0 (the lowest
# edge) falls into no bin; rows outside every bin are excluded from the groupby.
# NOTE(review): the edges assume metric1 lies within (0, 2.5] - confirm against
# the actual value range of the column.
metric1_failure_by_bin = df.groupby(
    pd.cut(df['metric1'], bins=[0, .5, 1, 1.5, 2, 2.5]),
    observed=False,  # keep empty bins so every interval appears in the result
)['failure']
# Select the 'failure' column before aggregating instead of summing every column.
metric1_bucket = (
    metric1_failure_by_bin.sum() / metric1_failure_by_bin.size() * 100
).round(2)
metric1_bucket

In [None]:
# Failure rate (%) per metric5 bin: failures in the bin / rows in the bin * 100.
# Bin edges are 0, 10, ..., 100 (ten right-closed bins). pd.cut excludes the
# lowest edge by default, so a value of exactly 0 falls into no bin.
metric5_failure_by_bin = df.groupby(
    pd.cut(df['metric5'], bins=list(range(0, 110, 10))),
    # Keep empty bins: the plotting cell pairs this result with exactly
    # ten tick labels, so the length must not depend on the data.
    observed=False,
)['failure']
# Select the 'failure' column before aggregating instead of summing every column.
metric5_bucket = (
    metric5_failure_by_bin.sum() / metric5_failure_by_bin.size() * 100
).round(2)
metric5_bucket

In [None]:
# Failure rate (%) per metric6 bin: failures in the bin / rows in the bin * 100.
# Bin edges are 0, 70000, ..., 700000 (ten right-closed bins). pd.cut excludes
# the lowest edge by default, so a value of exactly 0 falls into no bin.
metric6_failure_by_bin = df.groupby(
    pd.cut(df['metric6'], bins=list(range(0, 770000, 70000))),
    # Keep empty bins: the plotting cell pairs this result with exactly
    # ten tick labels, so the length must not depend on the data.
    observed=False,
)['failure']
# Select the 'failure' column before aggregating instead of summing every column.
metric6_bucket = (
    metric6_failure_by_bin.sum() / metric6_failure_by_bin.size() * 100
).round(2)
metric6_bucket

In [None]:
# Line plot of the per-bin failure rate for metric5.
# Tick labels "0-10", "10-20", ..., "90-100" mirror the ten bin intervals.
x = [f'{lo}-{lo + 10}' for lo in range(0, 100, 10)]
plt.plot(x, metric5_bucket.to_numpy())
plt.xlabel('metric5 group')
plt.ylabel('% failure')
plt.title('% of devices in metric5 group that failed')

In [None]:
# Line plot of the per-bin failure rate for metric6.
# Tick labels are the bin edges divided by 10,000: "0-7" stands for the
# (0, 70000] bin, "63-70" for (630000, 700000].
x = [f'{lo}-{lo + 7}' for lo in range(0, 70, 7)]
plt.plot(x, metric6_bucket.values)
# The unit in the axis label matches the tick labels (edges / 10,000).
plt.xlabel('metric6 group (in 10,000s)')
plt.ylabel('% failure')
plt.title('% of devices in metric6 group that failed')

In [None]:
# Overwrite metric5 and metric6 with their bin interval (categorical), using
# the same edges as the failure-rate buckets.
# Values outside the edges (including exactly 0, since the bins are
# right-closed) become NaN.
# NOTE: not idempotent; re-running would try to cut an already-binned column.
df['metric5'] = pd.cut(df['metric5'], bins=list(range(0, 110, 10)))
df['metric6'] = pd.cut(df['metric6'], bins=list(range(0, 770000, 70000)))

In [None]:
# One-hot encode the binned metric5/metric6 columns; the resulting column
# names have the form '<column>_<interval>'.
df = pd.get_dummies(data=df)
df.head(n=5)

In [None]:
# Drop one dummy column per encoded variable: the first interval of each is
# implied when all the remaining dummies of that variable are 0.
# NOTE(review): rows whose original value fell outside every bin also have
# all-zero dummies and so become indistinguishable from the dropped
# interval - confirm that this is acceptable.
cols_to_drop3 = [
    'metric5_(0, 10]',
    'metric6_(0, 70000]',
]
df = df.drop(columns=cols_to_drop3)

In [None]:
# Persist the cleaned, encoded frame to disk.
# The row index is written as the first (unnamed) column, which is the
# to_csv default; readers of this file must account for it.
df.to_csv(path_or_buf='Clean_data.csv', index=True)

In [None]:
# Print the schema (columns, dtypes, non-null counts) of the cleaned frame.
df.info()

In [None]:
# Split the frame into the target ('failure') and the feature matrix
# (every remaining column).
label = df['failure']
feat = df.drop(columns='failure')

In [None]:
# Hold out 30% of the rows for testing.
#   random_state - fixes the shuffle so results are reproducible on re-run
#   stratify     - keeps the failure/non-failure ratio equal in both splits,
#                  so the test set is guaranteed to contain both classes
X_train, X_test, y_train, y_test = train_test_split(
    feat, label, test_size=0.3, random_state=42, stratify=label,
)

In [None]:
# Standardize the features. The scaler's mean/variance are learned from the
# training set only and then applied unchanged to the test set; fitting on
# the test set would leak test statistics and put the two splits on
# different scales.
sc_x = StandardScaler()
X_train = sc_x.fit_transform(X_train)
X_test = sc_x.transform(X_test)

In [None]:
# Train an RBF-kernel support vector classifier on the scaled training data
# (fit returns the estimator itself), then predict labels for the test set.
support_vector_classifier = SVC(kernel='rbf').fit(X_train, y_train)
y_pred_svc = support_vector_classifier.predict(X_test)

In [None]:
# Confusion matrix of the SVC predictions: rows are true classes, columns
# are predicted classes.
cm_support_vector_classifier = confusion_matrix(y_true=y_test, y_pred=y_pred_svc)

In [None]:
# Show the confusion matrix, followed by a blank line.
print(cm_support_vector_classifier, end='\n\n')

In [None]:
# Accuracy (%) = correctly classified samples / all samples.
# The diagonal of the confusion matrix holds the correct predictions; using
# trace/sum works for any number of classes instead of assuming a 2x2 matrix.
# NOTE(review): if failures are rare, accuracy alone can look high even when
# no failure is detected - consider also reporting recall/precision.
num = np.trace(cm_support_vector_classifier)
den = cm_support_vector_classifier.sum()
acc_svc = (num/den)*100
print('Accuracy: ', round(acc_svc, 2), '%')

In [None]:
# Re-check the model with 10-fold cross validation on the training split,
# using all available cores, and report the mean fold accuracy.
# NOTE(review): X_train was scaled before the folds are formed, so each
# validation fold has influenced the scaler - a Pipeline would avoid this.
cross_val_svc = cross_val_score(SVC(kernel='rbf'), X_train, y_train, cv=10, n_jobs=-1)

print('Cross Validation Accruacy:', round(np.mean(cross_val_svc) * 100, 2), '%')