In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import numpy as np
from pylab import rcParams

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_recall_curve
from sklearn.metrics import recall_score, classification_report, auc, roc_curve
from sklearn.metrics import precision_recall_fscore_support, f1_score

from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from numpy import mean
from sklearn.metrics import accuracy_score


from numpy.random import seed
seed(7)  # fix NumPy's global random state

from sklearn.model_selection import train_test_split

# Notebook-wide constants.
# NOTE(review): SEED and DATA_SPLIT_PCT are not referenced by the cells
# visible in this notebook (the split cell hard-codes its own values) - confirm.
SEED = 123  # used to help randomly select the data points
DATA_SPLIT_PCT = 0.2

# Default figure size (width, height) and display names for the two classes.
rcParams['figure.figsize'] = (8, 6)
LABELS = ["Normal", "Break"]

In [2]:
# Load the pre-processed dataset and preview its first five rows.
df = pd.read_csv("processed_data19.csv")
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,time,y,x1,x2,x3,x4,x5,x6,...,seasonal_x52,seasonal_x53,seasonal_x54,seasonal_x55,seasonal_x56,seasonal_x57,seasonal_x58,seasonal_x59,seasonal_x60,seasonal_x61
0,0,0,5/1/99 0:00,0,0.376665,-4.596435,-4.095756,13.497687,-0.11883,-20.669883,...,0.000514,0.085976,-0.012507,0.390741,0.150311,-0.025204,9.9e-05,-0.03811,-2.694573e-06,0.002125
1,1,1,5/1/99 0:02,0,0.47572,-4.542502,-4.018359,16.230659,-0.128733,-18.758079,...,0.004109,0.088369,-0.013685,0.456846,0.119423,0.034005,8.8e-05,-0.054529,-2.601313e-06,0.000545
2,2,2,5/1/99 0:04,0,0.363848,-4.681394,-4.353147,14.127997,-0.138636,-17.836632,...,0.005728,0.08814,-0.013378,0.30994,0.172026,0.041005,-1.2e-05,-0.071054,-1.225272e-06,0.000545
3,3,3,5/1/99 0:06,0,0.30159,-4.758934,-4.023612,13.161566,-0.148142,-18.517601,...,0.006392,0.088963,-0.010932,0.33385,-0.213034,0.017433,9e-06,-0.234072,1.490749e-08,-0.001035
4,4,4,5/1/99 0:08,0,0.265578,-4.749928,-4.33315,15.26734,-0.155314,-17.505913,...,0.007396,0.090237,-0.003588,0.441475,-0.263479,-0.00333,-0.000145,-0.226569,1.74485e-06,0.000545


In [3]:
# Summarise the frame: number of rows/columns, dtype counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18398 entries, 0 to 18397
Columns: 674 entries, Unnamed: 0.1 to seasonal_x61
dtypes: float64(661), int64(12), object(1)
memory usage: 94.6+ MB


In [4]:
def sign(x):
    '''Return -1 when x is negative and 1 otherwise (zero counts as positive).'''
    return -1 if x < 0 else 1


def curve_shift(df, shift_by):
    '''
    Shift the binary labels of a dataframe relative to the rows labeled 1.

    For a negative shift, e.g. shift_by = -2, every row n labeled 1 causes:
    - rows (n+shift_by) ... (n-1), i.e. the 2 rows before n, to be labeled 1;
    - row n itself to be removed.
    A positive shift marks the rows after n instead.

    The input dataframe is left untouched; a new dataframe is returned.

    Inputs:
    df       A pandas dataframe with a binary labeled column.
             This labeled column should be named as 'y'.
    shift_by An integer denoting the number of rows to shift.

    Output
    df       A new dataframe with the binary labels shifted by shift_by.
             The label column 'y' is placed first, and the rows that were
             originally labeled 1 are dropped (their index labels are
             therefore missing from the result).
    '''
    labelcol = 'y'
    step = sign(shift_by)

    # Spread each 1 over the neighbouring rows, one row per iteration.
    # shift() leaves NaN at the vacated end; fillna(0) treats it as "no break".
    vector = df[labelcol].copy()
    for _ in range(abs(shift_by)):
        vector = vector + vector.shift(step).fillna(0)

    # Work on a copy so the caller's dataframe is not mutated by insert().
    shifted = df.copy()
    shifted.insert(loc=0, column=labelcol + 'tmp', value=vector)

    # Remove the rows that carried the original positive label.
    shifted = shifted.drop(shifted[shifted[labelcol] == 1].index)

    # Replace the old label column with the shifted one.
    shifted = shifted.drop(labelcol, axis=1)
    shifted = shifted.rename(columns={labelcol + 'tmp': labelcol})

    # Overlapping shifts can accumulate values above 1; make the label binary.
    shifted.loc[shifted[labelcol] > 0, labelcol] = 1

    return shifted

In [5]:
# Shift the data by 2 units, equal to 4 minutes, and check visually that
# the shift happened correctly around the first positive label.
# NOTE: this cell rebinds `df`, so running it twice shifts the labels twice.

# Index labels of the positive rows before shifting.
one_indexes = df.index[df['y'] == 1]
first_break = one_indexes[0]

print('Before shifting')
display(df.iloc[first_break - 3:first_break + 2, 0:5].head(5))

# Shift the response column y by 2 rows to do a 4-min ahead prediction.
df = curve_shift(df, shift_by=-2)

# The window uses positions (iloc) computed from an index label; that lines
# up here because no row before the first break has been removed.
print('After shifting')
display(df.iloc[first_break - 4:first_break + 1, 0:5].head(5))

Before shifting


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,time,y,x1
256,256,256,5/1/99 8:32,0,1.016235
257,257,257,5/1/99 8:34,0,1.005602
258,258,258,5/1/99 8:36,0,0.933933
259,259,259,5/1/99 8:38,1,0.892311
260,260,260,5/1/99 10:50,0,0.020062


After shifting


Unnamed: 0.2,y,Unnamed: 0.1,Unnamed: 0,time,x1
255,0.0,255,255,5/1/99 8:30,0.997107
256,0.0,256,256,5/1/99 8:32,1.016235
257,1.0,257,257,5/1/99 8:34,1.005602
258,1.0,258,258,5/1/99 8:36,0.933933
260,0.0,260,260,5/1/99 10:50,0.020062


In [6]:
# The raw timestamp string is not used as a feature; drop it.
df = df.drop(columns=['time'])

In [7]:
# Separate the target (shifted break label) from the feature columns.
y = df['y']
X = df.drop(columns=['y'])

In [8]:
# Replace missing feature values with 0, then display the result.
X = X.fillna(value=0)
X

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,...,seasonal_x52,seasonal_x53,seasonal_x54,seasonal_x55,seasonal_x56,seasonal_x57,seasonal_x58,seasonal_x59,seasonal_x60,seasonal_x61
0,0,0,0.376665,-4.596435,-4.095756,13.497687,-0.118830,-20.669883,0.000732,-0.061114,...,0.000514,0.085976,-0.012507,0.390741,0.150311,-0.025204,0.000099,-0.038110,-2.694573e-06,0.002125
1,1,1,0.475720,-4.542502,-4.018359,16.230659,-0.128733,-18.758079,0.000732,-0.061114,...,0.004109,0.088369,-0.013685,0.456846,0.119423,0.034005,0.000088,-0.054529,-2.601313e-06,0.000545
2,2,2,0.363848,-4.681394,-4.353147,14.127997,-0.138636,-17.836632,0.010803,-0.061114,...,0.005728,0.088140,-0.013378,0.309940,0.172026,0.041005,-0.000012,-0.071054,-1.225272e-06,0.000545
3,3,3,0.301590,-4.758934,-4.023612,13.161566,-0.148142,-18.517601,0.002075,-0.061114,...,0.006392,0.088963,-0.010932,0.333850,-0.213034,0.017433,0.000009,-0.234072,1.490749e-08,-0.001035
4,4,4,0.265578,-4.749928,-4.333150,15.267340,-0.155314,-17.505913,0.000732,-0.061114,...,0.007396,0.090237,-0.003588,0.441475,-0.263479,-0.003330,-0.000145,-0.226569,1.744850e-06,0.000545
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18393,18393,18393,-0.877441,0.786430,0.406426,135.301215,0.112295,26.300392,-0.159185,0.058823,...,0.010495,0.093876,0.010204,0.414806,0.595085,-0.006900,-0.000075,0.088078,6.136818e-06,-0.001035
18394,18394,18394,-0.843988,0.633086,0.561918,133.228949,0.141332,25.678597,-0.159185,0.058823,...,0.003199,0.097729,0.008643,0.043938,0.774631,-0.006855,-0.000066,0.412652,-1.190088e-06,-0.001035
18395,18395,18395,-0.826547,0.450126,0.334582,134.977973,0.170370,25.056801,-0.159185,0.048752,...,0.003568,0.100886,0.005261,0.257265,0.552608,-0.013819,-0.000040,0.250467,-5.753947e-07,-0.001035
18396,18396,18396,-0.822843,0.419383,0.387263,135.658942,0.199422,24.435005,-0.159185,0.048752,...,0.008677,0.102600,0.007895,0.123481,0.951946,0.016317,-0.000039,0.282800,5.020488e-06,0.002125


In [9]:
# 'Unnamed: 0' is a leftover row counter from an earlier CSV export.
# NOTE: re-running this cell raises KeyError because the column is already gone.
X = X.drop(columns=['Unnamed: 0'])
X

Unnamed: 0,Unnamed: 0.1,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,seasonal_x52,seasonal_x53,seasonal_x54,seasonal_x55,seasonal_x56,seasonal_x57,seasonal_x58,seasonal_x59,seasonal_x60,seasonal_x61
0,0,0.376665,-4.596435,-4.095756,13.497687,-0.118830,-20.669883,0.000732,-0.061114,-0.059966,...,0.000514,0.085976,-0.012507,0.390741,0.150311,-0.025204,0.000099,-0.038110,-2.694573e-06,0.002125
1,1,0.475720,-4.542502,-4.018359,16.230659,-0.128733,-18.758079,0.000732,-0.061114,-0.059966,...,0.004109,0.088369,-0.013685,0.456846,0.119423,0.034005,0.000088,-0.054529,-2.601313e-06,0.000545
2,2,0.363848,-4.681394,-4.353147,14.127997,-0.138636,-17.836632,0.010803,-0.061114,-0.030057,...,0.005728,0.088140,-0.013378,0.309940,0.172026,0.041005,-0.000012,-0.071054,-1.225272e-06,0.000545
3,3,0.301590,-4.758934,-4.023612,13.161566,-0.148142,-18.517601,0.002075,-0.061114,-0.019986,...,0.006392,0.088963,-0.010932,0.333850,-0.213034,0.017433,0.000009,-0.234072,1.490749e-08,-0.001035
4,4,0.265578,-4.749928,-4.333150,15.267340,-0.155314,-17.505913,0.000732,-0.061114,-0.030057,...,0.007396,0.090237,-0.003588,0.441475,-0.263479,-0.003330,-0.000145,-0.226569,1.744850e-06,0.000545
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18393,18393,-0.877441,0.786430,0.406426,135.301215,0.112295,26.300392,-0.159185,0.058823,-0.080108,...,0.010495,0.093876,0.010204,0.414806,0.595085,-0.006900,-0.000075,0.088078,6.136818e-06,-0.001035
18394,18394,-0.843988,0.633086,0.561918,133.228949,0.141332,25.678597,-0.159185,0.058823,-0.080108,...,0.003199,0.097729,0.008643,0.043938,0.774631,-0.006855,-0.000066,0.412652,-1.190088e-06,-0.001035
18395,18395,-0.826547,0.450126,0.334582,134.977973,0.170370,25.056801,-0.159185,0.048752,-0.080108,...,0.003568,0.100886,0.005261,0.257265,0.552608,-0.013819,-0.000040,0.250467,-5.753947e-07,-0.001035
18396,18396,-0.822843,0.419383,0.387263,135.658942,0.199422,24.435005,-0.159185,0.048752,-0.080108,...,0.008677,0.102600,0.007895,0.123481,0.951946,0.016317,-0.000039,0.282800,5.020488e-06,0.002125


In [10]:
# Remove the 'drift_x61' feature.
# NOTE(review): the reason for excluding this particular column is not
# visible in the notebook - confirm.
X = X.drop(columns=['drift_x61'])
X

Unnamed: 0,Unnamed: 0.1,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,seasonal_x52,seasonal_x53,seasonal_x54,seasonal_x55,seasonal_x56,seasonal_x57,seasonal_x58,seasonal_x59,seasonal_x60,seasonal_x61
0,0,0.376665,-4.596435,-4.095756,13.497687,-0.118830,-20.669883,0.000732,-0.061114,-0.059966,...,0.000514,0.085976,-0.012507,0.390741,0.150311,-0.025204,0.000099,-0.038110,-2.694573e-06,0.002125
1,1,0.475720,-4.542502,-4.018359,16.230659,-0.128733,-18.758079,0.000732,-0.061114,-0.059966,...,0.004109,0.088369,-0.013685,0.456846,0.119423,0.034005,0.000088,-0.054529,-2.601313e-06,0.000545
2,2,0.363848,-4.681394,-4.353147,14.127997,-0.138636,-17.836632,0.010803,-0.061114,-0.030057,...,0.005728,0.088140,-0.013378,0.309940,0.172026,0.041005,-0.000012,-0.071054,-1.225272e-06,0.000545
3,3,0.301590,-4.758934,-4.023612,13.161566,-0.148142,-18.517601,0.002075,-0.061114,-0.019986,...,0.006392,0.088963,-0.010932,0.333850,-0.213034,0.017433,0.000009,-0.234072,1.490749e-08,-0.001035
4,4,0.265578,-4.749928,-4.333150,15.267340,-0.155314,-17.505913,0.000732,-0.061114,-0.030057,...,0.007396,0.090237,-0.003588,0.441475,-0.263479,-0.003330,-0.000145,-0.226569,1.744850e-06,0.000545
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18393,18393,-0.877441,0.786430,0.406426,135.301215,0.112295,26.300392,-0.159185,0.058823,-0.080108,...,0.010495,0.093876,0.010204,0.414806,0.595085,-0.006900,-0.000075,0.088078,6.136818e-06,-0.001035
18394,18394,-0.843988,0.633086,0.561918,133.228949,0.141332,25.678597,-0.159185,0.058823,-0.080108,...,0.003199,0.097729,0.008643,0.043938,0.774631,-0.006855,-0.000066,0.412652,-1.190088e-06,-0.001035
18395,18395,-0.826547,0.450126,0.334582,134.977973,0.170370,25.056801,-0.159185,0.048752,-0.080108,...,0.003568,0.100886,0.005261,0.257265,0.552608,-0.013819,-0.000040,0.250467,-5.753947e-07,-0.001035
18396,18396,-0.822843,0.419383,0.387263,135.658942,0.199422,24.435005,-0.159185,0.048752,-0.080108,...,0.008677,0.102600,0.007895,0.123481,0.951946,0.016317,-0.000039,0.282800,5.020488e-06,0.002125


In [14]:
# 'Unnamed: 0.1' is a second leftover row counter; drop it as well.
X = X.drop(columns=['Unnamed: 0.1'])
X

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,seasonal_x52,seasonal_x53,seasonal_x54,seasonal_x55,seasonal_x56,seasonal_x57,seasonal_x58,seasonal_x59,seasonal_x60,seasonal_x61
0,0.376665,-4.596435,-4.095756,13.497687,-0.118830,-20.669883,0.000732,-0.061114,-0.059966,-0.038189,...,0.000514,0.085976,-0.012507,0.390741,0.150311,-0.025204,0.000099,-0.038110,-2.694573e-06,0.002125
1,0.475720,-4.542502,-4.018359,16.230659,-0.128733,-18.758079,0.000732,-0.061114,-0.059966,-0.038189,...,0.004109,0.088369,-0.013685,0.456846,0.119423,0.034005,0.000088,-0.054529,-2.601313e-06,0.000545
2,0.363848,-4.681394,-4.353147,14.127997,-0.138636,-17.836632,0.010803,-0.061114,-0.030057,-0.018352,...,0.005728,0.088140,-0.013378,0.309940,0.172026,0.041005,-0.000012,-0.071054,-1.225272e-06,0.000545
3,0.301590,-4.758934,-4.023612,13.161566,-0.148142,-18.517601,0.002075,-0.061114,-0.019986,-0.008280,...,0.006392,0.088963,-0.010932,0.333850,-0.213034,0.017433,0.000009,-0.234072,1.490749e-08,-0.001035
4,0.265578,-4.749928,-4.333150,15.267340,-0.155314,-17.505913,0.000732,-0.061114,-0.030057,-0.008280,...,0.007396,0.090237,-0.003588,0.441475,-0.263479,-0.003330,-0.000145,-0.226569,1.744850e-06,0.000545
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18393,-0.877441,0.786430,0.406426,135.301215,0.112295,26.300392,-0.159185,0.058823,-0.080108,-0.038189,...,0.010495,0.093876,0.010204,0.414806,0.595085,-0.006900,-0.000075,0.088078,6.136818e-06,-0.001035
18394,-0.843988,0.633086,0.561918,133.228949,0.141332,25.678597,-0.159185,0.058823,-0.080108,-0.038189,...,0.003199,0.097729,0.008643,0.043938,0.774631,-0.006855,-0.000066,0.412652,-1.190088e-06,-0.001035
18395,-0.826547,0.450126,0.334582,134.977973,0.170370,25.056801,-0.159185,0.048752,-0.080108,-0.038189,...,0.003568,0.100886,0.005261,0.257265,0.552608,-0.013819,-0.000040,0.250467,-5.753947e-07,-0.001035
18396,-0.822843,0.419383,0.387263,135.658942,0.199422,24.435005,-0.159185,0.048752,-0.080108,-0.038189,...,0.008677,0.102600,0.007895,0.123481,0.951946,0.016317,-0.000039,0.282800,5.020488e-06,0.002125


In [12]:
#X=X[['x3','x2','lag_1_x2','lag_1_x3','x2_pct_change','x18','x17','x3_pct_change','x16','rolling_mean_x16','lag_1_x16']]

In [15]:
# Hold out 20% of the rows for testing (shuffled split, fixed random_state).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
(X_train.shape, X_test.shape)

((14619, 669), (3655, 669))

In [16]:
# Feature scaling

In [17]:
# Standardise the features: fit the scaler on the training split only and
# apply the same transform to the test split.
cols = X_train.columns
train_index = X_train.index
test_index = X_test.index

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Re-wrap the scaled values as DataFrames.
# - Pass `cols` itself: `columns=[cols]` (a list holding one Index) makes
#   pandas build a one-level MultiIndex instead of plain column names.
# - Restore the original row index so rows stay aligned with y_train / y_test.
X_train = pd.DataFrame(X_train, columns=cols, index=train_index)
X_test = pd.DataFrame(X_test, columns=cols, index=test_index)


In [18]:
# Baseline: default XGBoost scored by ROC AUC with repeated stratified
# 10-fold cross-validation (3 repeats) on the full, unscaled X and y.
model = XGBClassifier()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(
    model,
    X,
    y,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
)
print('Mean ROC AUC: %.5f' % np.mean(scores))

Mean ROC AUC: 0.97395


In [19]:
# Fit a fresh default XGBoost model on the scaled training split.
model = XGBClassifier()
clf_0 = model.fit(X_train, y_train)


In [20]:
# Predict the held-out split with the baseline model.
pred_y_0 = clf_0.predict(X_test)

# accuracy_score expects (y_true, y_pred); keep that order so this call is
# consistent with the other metric calls in the notebook.
# Accuracy alone is not informative here: the test split has 3608 negatives
# and only 47 positives, so predicting all-negative already scores ~0.987.
print(accuracy_score(y_test, pred_y_0))



0.9906976744186047


In [21]:
# Per-class precision / recall / F1 for the baseline model.
target_names = ['class 0', 'class 1']
report = classification_report(y_test, pred_y_0, target_names=target_names)
print(report)

# Confusion matrix for the test data (rows: true class, columns: predicted).
confusion_matrix(y_test, pred_y_0)

              precision    recall  f1-score   support

     class 0       0.99      1.00      1.00      3608
     class 1       1.00      0.28      0.43        47

    accuracy                           0.99      3655
   macro avg       1.00      0.64      0.71      3655
weighted avg       0.99      0.99      0.99      3655



array([[3608,    0],
       [  34,   13]])

# Weighted XGBoost for Class Imbalance


In [22]:
# Weighted model. scale_pos_weight is set to
#   (examples in the majority class) / (examples in the minority class)
#   = 18274 / 124 = 147.37, rounded to 147.
model = XGBClassifier(scale_pos_weight=147)

# Same evaluation procedure as the baseline: repeated stratified 10-fold CV,
# scored by ROC AUC on the full X and y.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(
    model,
    X,
    y,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
)
print('Mean ROC AUC: %.5f' % np.mean(scores))

Mean ROC AUC: 0.97714


In [23]:
# Fit the weighted model (defined in the previous cell) on the training
# split and predict the held-out split.
clf_1 = model.fit(X_train, y_train)
pred_y_1 = clf_1.predict(X_test)

# accuracy_score expects (y_true, y_pred); keep that order for consistency
# with the other metric calls.
print(accuracy_score(y_test, pred_y_1))

0.9928864569083448


In [24]:
# Per-class precision / recall / F1 for the weighted model.
target_names = ['class 0', 'class 1']
report = classification_report(y_test, pred_y_1, target_names=target_names)
print(report)

# Confusion matrix for the test data (rows: true class, columns: predicted).
confusion_matrix(y_test, pred_y_1)

              precision    recall  f1-score   support

     class 0       0.99      1.00      1.00      3608
     class 1       0.96      0.47      0.63        47

    accuracy                           0.99      3655
   macro avg       0.97      0.73      0.81      3655
weighted avg       0.99      0.99      0.99      3655



array([[3607,    1],
       [  25,   22]])

# Tune with GridSearch CV

In [25]:
# Candidate positive-class weights to try in the grid search.
weights = [1, 10, 25, 50, 75, 99, 100, 700, 800, 1000, 2000]
param_grid = {'scale_pos_weight': weights}

In [26]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
# Evaluation procedure: repeated stratified 10-fold CV (3 repeats).
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Grid search over param_grid, scored by ROC AUC, using all CPU cores.
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
)

In [None]:
# execute the grid search
grid_result = grid.fit(X, y)

# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
# The loop variable must not be called `mean`: that would rebind the `mean`
# function imported from numpy and break every later `mean(scores)` call.
for mean_score, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean_score, stdev, param))

In [45]:
# define model
# NOTE(review): 3000 is outside the grid of weights searched above; the
# reason for choosing it is not shown in the notebook - confirm.
model = XGBClassifier(scale_pos_weight=3000)
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
# summarize performance
# Use np.mean rather than the bare `mean` name: the grid-search cell reuses
# `mean` as a loop variable, so after it runs `mean` is no longer a function.
print('Mean ROC AUC: %.5f' % np.mean(scores))

Mean ROC AUC: 0.97523


In [46]:
# Fit the scale_pos_weight=3000 model on the training split and predict the
# held-out split.
clf_2 = model.fit(X_train, y_train)
pred_y_2 = clf_2.predict(X_test)

# accuracy_score expects (y_true, y_pred); keep that order for consistency
# with the other metric calls.
print(accuracy_score(y_test, pred_y_2))

0.9920656634746922


In [47]:
# Per-class precision / recall / F1 for the scale_pos_weight=3000 model.
target_names = ['class 0', 'class 1']
report = classification_report(y_test, pred_y_2, target_names=target_names)
print(report)

# Confusion matrix for the test data (rows: true class, columns: predicted).
confusion_matrix(y_test, pred_y_2)

              precision    recall  f1-score   support

     class 0       0.99      1.00      1.00      3608
     class 1       0.75      0.57      0.65        47

    accuracy                           0.99      3655
   macro avg       0.87      0.79      0.82      3655
weighted avg       0.99      0.99      0.99      3655



array([[3599,    9],
       [  20,   27]])