In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import numpy as np
from pylab import rcParams

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_recall_curve
from sklearn.metrics import recall_score, classification_report, auc, roc_curve
from sklearn.metrics import precision_recall_fscore_support, f1_score

from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from numpy import mean

from imblearn.pipeline import Pipeline

from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.over_sampling import ADASYN

from sklearn.preprocessing import LabelEncoder

from numpy.random import seed
seed(7)

from sklearn.model_selection import train_test_split

SEED = 123 #used to help randomly select the data points
DATA_SPLIT_PCT = 0.2

rcParams['figure.figsize'] = 8, 6
LABELS = ["Normal","Break"]

In [2]:
df_normal = pd.read_excel("Normal_bearing_100_mat.xlsx")
df_normal = df_normal[["DE", "FE"]]
df_normal = df_normal.iloc[:19000]
df_normal['status'] = '0'
df_normal

Unnamed: 0,DE,FE,status
0,0.014603,0.192920,0
1,0.054449,0.164360,0
2,0.107650,0.090811,0
3,0.133720,0.086496,0
4,0.112650,0.099235,0
...,...,...,...
18995,0.030458,-0.042324,0
18996,0.053197,-0.020135,0
18997,0.050485,0.020135,0
18998,0.015646,0.043762,0


In [3]:
df_fault = pd.read_excel("Fault_bearning_130_mat.xlsx")
df_fault = df_fault[["DE", "FE"]]
df_fault = df_fault.iloc[:1000]
df_fault['status'] = '1'
df_fault

Unnamed: 0,DE,FE,status
0,0.008528,-0.40701,1
1,0.423550,0.26278,1
2,0.012995,0.49515,1
3,-0.265180,-0.42344,1
4,0.237160,-0.30715,1
...,...,...,...
995,-0.058071,0.10273,1
996,0.088121,-0.23566,1
997,0.019898,0.28825,1
998,-0.140100,0.19169,1


In [3]:
df_fault = pd.read_excel("Normal_bearing_100_mat.xlsx")
df_fault = df_fault[["DE", "FE"]]
df_fault = df_fault.iloc[:1000]
df_fault['status'] = '1'
df_fault

Unnamed: 0,DE,FE,status
0,0.014603,0.192920,1
1,0.054449,0.164360,1
2,0.107650,0.090811,1
3,0.133720,0.086496,1
4,0.112650,0.099235,1
...,...,...,...
995,-0.044435,-0.108480,1
996,-0.015020,-0.141350,1
997,-0.032961,-0.115470,1
998,-0.084698,-0.076018,1


In [4]:
df_combined=pd.concat([df_normal, df_fault], axis=0)
df_combined

Unnamed: 0,DE,FE,status
0,0.014603,0.192920,0
1,0.054449,0.164360,0
2,0.107650,0.090811,0
3,0.133720,0.086496,0
4,0.112650,0.099235,0
...,...,...,...
995,-0.058071,0.102730,1
996,0.088121,-0.235660,1
997,0.019898,0.288250,1
998,-0.140100,0.191690,1


In [5]:
df_combined['status'].value_counts()[1]

1000

In [6]:
df_combined['status'].value_counts()[0]

19000

In [5]:
df_combined.to_csv("bearing_dataset_combined_5%.csv")

In [6]:
df_normal = pd.read_excel("Normal_bearing_100_mat.xlsx")
df_normal = df_normal[["DE", "FE"]]
df_normal = df_normal.iloc[:19900]
df_normal['status'] = '0'
df_normal

df_fault = pd.read_excel("Normal_bearing_100_mat.xlsx")
df_fault = df_fault[["DE", "FE"]]
df_fault = df_fault.iloc[:100]
df_fault['status'] = '1'
df_fault


df_combined=pd.concat([df_normal, df_fault], axis=0)
df_combined


df_combined.to_csv("bearing_dataset_combined_0.5%.csv")

In [53]:
df=df_combined
df

Unnamed: 0,DE,FE,status
0,0.014603,0.192920,0
1,0.054449,0.164360,0
2,0.107650,0.090811,0
3,0.133720,0.086496,0
4,0.112650,0.099235,0
...,...,...,...
995,-0.044435,-0.108480,1
996,-0.015020,-0.141350,1
997,-0.032961,-0.115470,1
998,-0.084698,-0.076018,1


In [54]:
X = df.drop(['status'], axis=1)

y = df['status']

In [55]:
le = LabelEncoder()
y = le.fit_transform(y)
y

array([0, 0, 0, ..., 1, 1, 1])

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
X_train.shape, X_test.shape

((16000, 2), (4000, 2))

In [58]:
#y_train.value_counts()

In [59]:
#y_test.value_counts()

In [60]:
y_train

array([0, 0, 0, ..., 0, 0, 0])

In [61]:
# Feature scaling

In [62]:
cols = X_train.columns

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train = pd.DataFrame(X_train, columns=[cols])
X_test = pd.DataFrame(X_test, columns=[cols])


In [64]:
# define model
model = XGBClassifier()
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
# summarize performance
print('Mean ROC AUC: %.5f' % mean(scores))

Mean ROC AUC: 0.48553


In [65]:
model = XGBClassifier()

clf_0 = model.fit(X_train, y_train)


In [66]:
pred_y_0 = clf_0.predict(X_test)

print(accuracy_score(pred_y_0, y_test))



0.95025


In [67]:
target_names = ['class 0', 'class 1']
print(classification_report(y_test, pred_y_0, target_names=target_names))

# View confusion matrix for test data and predictions
confusion_matrix(y_test, pred_y_0)

              precision    recall  f1-score   support

     class 0       0.95      1.00      0.97      3802
     class 1       0.00      0.00      0.00       198

    accuracy                           0.95      4000
   macro avg       0.48      0.50      0.49      4000
weighted avg       0.90      0.95      0.93      4000



array([[3801,    1],
       [ 198,    0]])

# Weighted XGBoost for Class Imbalance


In [69]:
# define model
#scale_pos_weight=total_negative_examples / total_positive_examples
#total number of examples in the majority class / total number of examples in the minority class.
#19000/1000=19
model = XGBClassifier(scale_pos_weight=19)

# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
# summarize performance
print('Mean ROC AUC: %.5f' % mean(scores))

Mean ROC AUC: 0.46782


In [70]:
clf_1 = model.fit(X_train, y_train)
pred_y_1 = clf_1.predict(X_test)

print(accuracy_score(pred_y_1, y_test))

0.76525


In [71]:
target_names = ['class 0', 'class 1']
print(classification_report(y_test, pred_y_1, target_names=target_names))

# View confusion matrix for test data and predictions
confusion_matrix(y_test, pred_y_1)

              precision    recall  f1-score   support

     class 0       0.95      0.80      0.87      3802
     class 1       0.04      0.17      0.07       198

    accuracy                           0.77      4000
   macro avg       0.49      0.48      0.47      4000
weighted avg       0.90      0.77      0.83      4000



array([[3028,  774],
       [ 165,   33]])

# Tune with GridSearch CV

In [72]:
# define grid
weights = [1, 10, 25, 50, 75, 99, 100, 1000]
param_grid = dict(scale_pos_weight=weights)

In [73]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define grid search
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')

In [33]:
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))



Best: 0.931436 using {'scale_pos_weight': 1}
0.931436 (0.040774) with: {'scale_pos_weight': 1}
0.927670 (0.048305) with: {'scale_pos_weight': 10}
0.929479 (0.045071) with: {'scale_pos_weight': 25}
0.922069 (0.046362) with: {'scale_pos_weight': 50}
0.923126 (0.051366) with: {'scale_pos_weight': 75}
0.922714 (0.053929) with: {'scale_pos_weight': 99}
0.922413 (0.053952) with: {'scale_pos_weight': 100}
0.905774 (0.058887) with: {'scale_pos_weight': 1000}


In [98]:
# define model
model = XGBClassifier(scale_pos_weight=20000)

# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
# summarize performance
print('Mean ROC AUC: %.5f' % mean(scores))

Mean ROC AUC: 0.48653


In [99]:
clf_2 = model.fit(X_train, y_train)
pred_y_2 = clf_2.predict(X_test)

print(accuracy_score(pred_y_2, y_test))



0.4025


In [100]:
target_names = ['class 0', 'class 1']
print(classification_report(y_test, pred_y_2, target_names=target_names))

# View confusion matrix for test data and predictions
confusion_matrix(y_test, pred_y_2)

              precision    recall  f1-score   support

     class 0       0.95      0.39      0.56      3802
     class 1       0.05      0.58      0.09       198

    accuracy                           0.40      4000
   macro avg       0.50      0.48      0.32      4000
weighted avg       0.90      0.40      0.53      4000



array([[1496, 2306],
       [  84,  114]])