In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import numpy as np
from pylab import rcParams

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_recall_curve
from sklearn.metrics import recall_score, classification_report, auc, roc_curve
from sklearn.metrics import precision_recall_fscore_support, f1_score

from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from numpy import mean

from imblearn.pipeline import Pipeline

from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.over_sampling import ADASYN

from sklearn.preprocessing import LabelEncoder

from numpy.random import seed
seed(7)

from sklearn.model_selection import train_test_split

SEED = 123 #used to help randomly select the data points
DATA_SPLIT_PCT = 0.2

rcParams['figure.figsize'] = 8, 6
LABELS = ["Normal","Break"]

In [2]:
df_normal = pd.read_excel("Normal_bearing_100_mat.xlsx")
df_normal = df_normal[["DE", "FE"]]
df_normal = df_normal.iloc[:19900]
df_normal['status'] = '0'
df_normal

Unnamed: 0,DE,FE,status
0,0.014603,0.192920,0
1,0.054449,0.164360,0
2,0.107650,0.090811,0
3,0.133720,0.086496,0
4,0.112650,0.099235,0
...,...,...,...
19895,0.042558,-0.068622,0
19896,0.024408,-0.043967,0
19897,0.016898,-0.019313,0
19898,0.003129,0.001644,0


In [3]:
df_fault = pd.read_excel("Normal_bearing_100_mat.xlsx")
df_fault = df_fault[["DE", "FE"]]
df_fault = df_fault.iloc[:100]
df_fault['status'] = '1'
df_fault

Unnamed: 0,DE,FE,status
0,0.014603,0.192920,1
1,0.054449,0.164360,1
2,0.107650,0.090811,1
3,0.133720,0.086496,1
4,0.112650,0.099235,1
...,...,...,...
95,0.028789,0.016642,1
96,-0.042766,0.073347,1
97,-0.077605,0.079305,1
98,-0.061333,0.044789,1


In [4]:
df_combined=pd.concat([df_normal, df_fault], axis=0)
df_combined

Unnamed: 0,DE,FE,status
0,0.014603,0.192920,0
1,0.054449,0.164360,0
2,0.107650,0.090811,0
3,0.133720,0.086496,0
4,0.112650,0.099235,0
...,...,...,...
95,0.028789,0.016642,1
96,-0.042766,0.073347,1
97,-0.077605,0.079305,1
98,-0.061333,0.044789,1


In [5]:
df_combined['status'].value_counts()[1]

100

In [6]:
df_combined['status'].value_counts()[0]

19900

In [7]:
df_combined.to_csv("bearing_dataset_combined_0.5%.csv")

In [8]:
df=df_combined
df

Unnamed: 0,DE,FE,status
0,0.014603,0.192920,0
1,0.054449,0.164360,0
2,0.107650,0.090811,0
3,0.133720,0.086496,0
4,0.112650,0.099235,0
...,...,...,...
95,0.028789,0.016642,1
96,-0.042766,0.073347,1
97,-0.077605,0.079305,1
98,-0.061333,0.044789,1


In [9]:
X = df.drop(['status'], axis=1)

y = df['status']

In [10]:
le = LabelEncoder()
y = le.fit_transform(y)
y

array([0, 0, 0, ..., 1, 1, 1])

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
X_train.shape, X_test.shape

((16000, 2), (4000, 2))

In [12]:
#y_train.value_counts()

In [13]:
#y_test.value_counts()

In [14]:
y_train

array([0, 0, 0, ..., 0, 0, 0])

In [15]:
# Feature scaling

In [16]:
cols = X_train.columns

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train = pd.DataFrame(X_train, columns=[cols])
X_test = pd.DataFrame(X_test, columns=[cols])


In [17]:
# define model
model = XGBClassifier()
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
# summarize performance
print('Mean ROC AUC: %.5f' % mean(scores))

Mean ROC AUC: 0.56822


In [18]:
model = XGBClassifier()

clf_0 = model.fit(X_train, y_train)

In [19]:
pred_y_0 = clf_0.predict(X_test)

print(accuracy_score(pred_y_0, y_test))

0.99525


In [20]:
target_names = ['class 0', 'class 1']
print(classification_report(y_test, pred_y_0, target_names=target_names))

# View confusion matrix for test data and predictions
confusion_matrix(y_test, pred_y_0)

              precision    recall  f1-score   support

     class 0       1.00      1.00      1.00      3983
     class 1       0.00      0.00      0.00        17

    accuracy                           1.00      4000
   macro avg       0.50      0.50      0.50      4000
weighted avg       0.99      1.00      0.99      4000



array([[3981,    2],
       [  17,    0]])

# Weighted XGBoost for Class Imbalance


In [22]:
# define model
#scale_pos_weight=total_negative_examples / total_positive_examples
#total number of examples in the majority class / total number of examples in the minority class.
#19900/100=199
model = XGBClassifier(scale_pos_weight=199)

# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
# summarize performance
print('Mean ROC AUC: %.5f' % mean(scores))

Mean ROC AUC: 0.56582


In [23]:
clf_1 = model.fit(X_train, y_train)
pred_y_1 = clf_1.predict(X_test)

print(accuracy_score(pred_y_1, y_test))

0.9845


In [24]:
target_names = ['class 0', 'class 1']
print(classification_report(y_test, pred_y_1, target_names=target_names))

# View confusion matrix for test data and predictions
confusion_matrix(y_test, pred_y_1)

              precision    recall  f1-score   support

     class 0       1.00      0.99      0.99      3983
     class 1       0.00      0.00      0.00        17

    accuracy                           0.98      4000
   macro avg       0.50      0.49      0.50      4000
weighted avg       0.99      0.98      0.99      4000



array([[3938,   45],
       [  17,    0]])

# Tune with GridSearch CV

In [25]:
# define grid
weights = [1, 10, 25, 50, 75, 99, 100, 1000]
param_grid = dict(scale_pos_weight=weights)

In [26]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define grid search
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')

In [33]:
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))



Best: 0.931436 using {'scale_pos_weight': 1}
0.931436 (0.040774) with: {'scale_pos_weight': 1}
0.927670 (0.048305) with: {'scale_pos_weight': 10}
0.929479 (0.045071) with: {'scale_pos_weight': 25}
0.922069 (0.046362) with: {'scale_pos_weight': 50}
0.923126 (0.051366) with: {'scale_pos_weight': 75}
0.922714 (0.053929) with: {'scale_pos_weight': 99}
0.922413 (0.053952) with: {'scale_pos_weight': 100}
0.905774 (0.058887) with: {'scale_pos_weight': 1000}


In [48]:
# define model
model = XGBClassifier(scale_pos_weight=7000)

# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
# summarize performance
print('Mean ROC AUC: %.5f' % mean(scores))

Mean ROC AUC: 0.53541


In [49]:
clf_2 = model.fit(X_train, y_train)
pred_y_2 = clf_2.predict(X_test)

print(accuracy_score(pred_y_2, y_test))

0.97125


In [50]:
target_names = ['class 0', 'class 1']
print(classification_report(y_test, pred_y_2, target_names=target_names))

# View confusion matrix for test data and predictions
confusion_matrix(y_test, pred_y_2)

              precision    recall  f1-score   support

     class 0       1.00      0.98      0.99      3983
     class 1       0.00      0.00      0.00        17

    accuracy                           0.97      4000
   macro avg       0.50      0.49      0.49      4000
weighted avg       0.99      0.97      0.98      4000



array([[3885,   98],
       [  17,    0]])

# sampling

In [15]:
model = XGBClassifier(scale_pos_weight=199)
X_train=X_train.fillna(0)
# Define SMOTE-Tomek Links
resample=SMOTETomek(tomek=TomekLinks(sampling_strategy='all'))
# Define pipeline
pipeline=Pipeline(steps=[('r', resample), ('m', model)])

model = XGBClassifier()

clf_0 = pipeline.fit(X_train, y_train)
pred_y_0 = clf_0.predict(X_test)

print(accuracy_score(pred_y_0, y_test))

target_names = ['class 0', 'class 1']
print(classification_report(y_test, pred_y_0, target_names=target_names))

# View confusion matrix for test data and predictions
confusion_matrix(y_test, pred_y_0)


#scale_pos_weight=199

0.6135
              precision    recall  f1-score   support

     class 0       1.00      0.61      0.76      3983
     class 1       0.01      0.47      0.01        17

    accuracy                           0.61      4000
   macro avg       0.50      0.54      0.39      4000
weighted avg       0.99      0.61      0.76      4000



array([[2446, 1537],
       [   9,    8]])

In [16]:
# define model
model = XGBClassifier(scale_pos_weight=199)

X_train=X_train.fillna(0)

#Define SMOTE-ENN
resample=SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='majority'))
#Define pipeline
pipeline=Pipeline(steps=[('r', resample), ('m', model)])

clf_2 = pipeline.fit(X_train, y_train)
pred_y_2 = clf_2.predict(X_test)

print(accuracy_score(pred_y_2, y_test))

target_names = ['class 0', 'class 1']
print(classification_report(y_test, pred_y_2, target_names=target_names))

# View confusion matrix for test data and predictions
confusion_matrix(y_test, pred_y_2)

0.59375
              precision    recall  f1-score   support

     class 0       1.00      0.59      0.74      3983
     class 1       0.00      0.41      0.01        17

    accuracy                           0.59      4000
   macro avg       0.50      0.50      0.38      4000
weighted avg       0.99      0.59      0.74      4000



array([[2368, 1615],
       [  10,    7]])

In [17]:
#ADASYN
ada = ADASYN(random_state=42)
X_res, y_res = ada.fit_resample(X, y)
model = XGBClassifier(scale_pos_weight=199)

#Define ADASYN
resample=ADASYN(random_state=42)
clf_2 = pipeline.fit(X_train, y_train)
pred_y_2 = clf_2.predict(X_test)

print(accuracy_score(pred_y_2, y_test))

pred_y_train = clf_2.predict(X_train)
print(accuracy_score(pred_y_train, y_train))


target_names = ['class 0', 'class 1']
print(classification_report(y_test, pred_y_2, target_names=target_names))

# View confusion matrix for test data and predictions
confusion_matrix(y_test, pred_y_2)

0.58725
0.621125
              precision    recall  f1-score   support

     class 0       1.00      0.59      0.74      3983
     class 1       0.00      0.47      0.01        17

    accuracy                           0.59      4000
   macro avg       0.50      0.53      0.37      4000
weighted avg       0.99      0.59      0.74      4000



array([[2341, 1642],
       [   9,    8]])