In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import numpy as np
from pylab import rcParams

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_recall_curve
from sklearn.metrics import recall_score, classification_report, auc, roc_curve
from sklearn.metrics import precision_recall_fscore_support, f1_score

from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from numpy import mean

from sklearn.metrics import accuracy_score

from numpy.random import seed
seed(7)

from sklearn.model_selection import train_test_split

# Seed constant intended for reproducible random selection.
# NOTE(review): no cell visible in this notebook references SEED; the
# train/test split passes random_state=0 and the CV objects random_state=1.
SEED = 123 #used to help randomly select the data points
# Fraction of rows to hold out as the test split.
DATA_SPLIT_PCT = 0.2

# Default matplotlib figure size as (width, height).
rcParams['figure.figsize'] = 8, 6
# Human-readable class names; presumably index 0 = normal, 1 = break -- confirm against usage.
LABELS = ["Normal","Break"]

In [2]:
# Load the pre-processed dataset from the working directory. The preview below
# shows a saved row counter ('Unnamed: 0'), a 'time' column, the label 'y'
# and the x* feature columns (including engineered *_pct_change_* columns).
df = pd.read_csv("processed_data10.csv") 
df.head(n=5)  # visualize the data.

Unnamed: 0.1,Unnamed: 0,time,y,x1,x2,x3,x4,x5,x6,x7,...,x52_pct_change_5,x53_pct_change_5,x54_pct_change_5,x55_pct_change_5,x56_pct_change_5,x57_pct_change_5,x58_pct_change_5,x59_pct_change_5,x60_pct_change_5,x61_pct_change_5
0,0,5/1/99 0:00,0,0.376665,-4.596435,-4.095756,13.497687,-0.11883,-20.669883,0.000732,...,,,,,,,,,,
1,1,5/1/99 0:02,0,0.47572,-4.542502,-4.018359,16.230659,-0.128733,-18.758079,0.000732,...,,,,,,,,,,
2,2,5/1/99 0:04,0,0.363848,-4.681394,-4.353147,14.127997,-0.138636,-17.836632,0.010803,...,,,,,,,,,,
3,3,5/1/99 0:06,0,0.30159,-4.758934,-4.023612,13.161566,-0.148142,-18.517601,0.002075,...,,,,,,,,,,
4,4,5/1/99 0:08,0,0.265578,-4.749928,-4.33315,15.26734,-0.155314,-17.505913,0.000732,...,,,,,,,,,,


In [3]:
# Print the frame summary: row count, column count, dtype breakdown and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18398 entries, 0 to 18397
Columns: 308 entries, Unnamed: 0 to x61_pct_change_5
dtypes: float64(303), int64(4), object(1)
memory usage: 43.2+ MB


In [4]:
def sign(x):
    '''Return -1 when x is negative and 1 otherwise (zero counts as positive).'''
    return -1 if x < 0 else 1


def curve_shift(df, shift_by):
    '''
    Shift the positive labels of a binary-labelled dataframe.

    For every row n whose label is 1:
    - with a negative shift_by, rows (n + shift_by) .. (n - 1) are labelled 1;
      with a positive shift_by, rows (n + 1) .. (n + shift_by) are labelled 1.
    - row n itself is removed from the result.
    E.g. shift_by = -2 marks the two rows preceding each positive row and
    drops the positive row.

    The input dataframe is NOT modified; a new dataframe is returned.
    Rows are selected with a boolean mask, so the function does not rely on
    the index labels being unique.

    Inputs:
    df       A pandas dataframe with a binary labeled column.
             This labeled column should be named as 'y'.
    shift_by An integer denoting the number of rows to shift
             (negative = towards earlier rows, positive = towards later rows).

    Output
    df       A new dataframe with the shifted labels in a column 'y' placed
             first, followed by the remaining columns in their original order.
             The labels are float whenever shift_by != 0 (shifting introduces
             NaN, which upcasts the column).
    '''
    labelcol = 'y'
    step = sign(shift_by)

    # Accumulate the label with its neighbours, one row per iteration. Each
    # pass spreads every non-zero entry one more row in the shift direction.
    vector = df[labelcol].copy()
    for _ in range(abs(shift_by)):
        vector = vector + vector.shift(step).fillna(0)

    # Keep every row that was not originally positive.
    keep = df[labelcol] != 1
    shifted = df.loc[keep].drop(columns=labelcol)

    # Collapse the accumulated counts back to a binary label (anything > 0 -> 1).
    new_labels = vector[keep]
    new_labels = new_labels.mask(new_labels > 0, 1)

    # Insert positionally (plain array) so no index alignment is involved.
    shifted.insert(loc=0, column=labelcol, value=new_labels.to_numpy())

    return shifted

In [5]:
'''
Shift the data by 1 unit, equal to 2 minutes.

Test: Testing whether the shift happened correctly.
'''
# Show the neighbourhood of the first positive row before the labels move.
print('Before shifting')
one_indexes = df.index[df['y'] == 1]
first_break = one_indexes[0]
display(df.iloc[first_break - 3:first_break + 2, :5].head(n=5))

# Move each positive label one row earlier (and drop the original positive
# row) so the model predicts the event one step ahead.
df = curve_shift(df, shift_by=-1)

# Same neighbourhood after the shift: the row just before the original
# positive should now carry the label, and the positive row should be gone.
print('After shifting')
display(df.iloc[first_break - 4:first_break + 1, :5].head(n=5))

Before shifting


Unnamed: 0.1,Unnamed: 0,time,y,x1,x2
256,256,5/1/99 8:32,0,1.016235,-4.058394
257,257,5/1/99 8:34,0,1.005602,-3.876199
258,258,5/1/99 8:36,0,0.933933,-3.868467
259,259,5/1/99 8:38,1,0.892311,-13.332664
260,260,5/1/99 10:50,0,0.020062,-3.987897


After shifting


Unnamed: 0.1,y,Unnamed: 0,time,x1,x2
255,0.0,255,5/1/99 8:30,0.997107,-3.86572
256,0.0,256,5/1/99 8:32,1.016235,-4.058394
257,0.0,257,5/1/99 8:34,1.005602,-3.876199
258,1.0,258,5/1/99 8:36,0.933933,-3.868467
260,0.0,260,5/1/99 10:50,0.020062,-3.987897


In [6]:
# The timestamp column is discarded before the feature matrix is built.
df = df.drop(columns=['time'])

In [7]:
# Target vector (shifted labels) and predictor matrix (everything except 'y').
y = df['y']
X = df.drop(columns=['y'])

In [8]:
# 'Unnamed: 0' is a saved row counter (it matches the row index in the
# previews), so it is removed from the predictors.
X = X.drop(columns=['Unnamed: 0'])

In [9]:
# The engineered *_pct_change_* columns are NaN in their first rows (see the
# df.head() preview); replace every missing value in X with 0.
# (The bare `X` that used to precede this line displayed nothing, because only
# the last expression of a cell is rendered, so it has been removed.)
X = X.fillna(0)

In [12]:
# Display the feature matrix after the NaN -> 0 replacement.
X

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x52_pct_change_5,x53_pct_change_5,x54_pct_change_5,x55_pct_change_5,x56_pct_change_5,x57_pct_change_5,x58_pct_change_5,x59_pct_change_5,x60_pct_change_5,x61_pct_change_5
0,0.376665,-4.596435,-4.095756,13.497687,-0.118830,-20.669883,0.000732,-0.061114,-0.059966,-0.038189,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
1,0.475720,-4.542502,-4.018359,16.230659,-0.128733,-18.758079,0.000732,-0.061114,-0.059966,-0.038189,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
2,0.363848,-4.681394,-4.353147,14.127997,-0.138636,-17.836632,0.010803,-0.061114,-0.030057,-0.018352,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
3,0.301590,-4.758934,-4.023612,13.161566,-0.148142,-18.517601,0.002075,-0.061114,-0.019986,-0.008280,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
4,0.265578,-4.749928,-4.333150,15.267340,-0.155314,-17.505913,0.000732,-0.061114,-0.030057,-0.008280,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18393,-0.877441,0.786430,0.406426,135.301215,0.112295,26.300392,-0.159185,0.058823,-0.080108,-0.038189,...,0.0,0.0,0.001728,6.892115,-0.050004,-0.295126,-0.130506,0.151387,-0.003192,0.0
18394,-0.843988,0.633086,0.561918,133.228949,0.141332,25.678597,-0.159185,0.058823,-0.080108,-0.038189,...,0.0,0.0,0.001727,-0.792506,-0.007770,0.949981,-0.096476,-0.864725,-0.003194,0.0
18395,-0.826547,0.450126,0.334582,134.977973,0.170370,25.056801,-0.159185,0.048752,-0.080108,-0.038189,...,0.0,0.0,0.001726,-0.825591,-0.004766,0.001084,0.133484,1.322395,-0.003196,0.0
18396,-0.822843,0.419383,0.387263,135.658942,0.199422,24.435005,-0.159185,0.048752,-0.080108,-0.038189,...,0.0,0.0,0.001726,-0.614630,-0.001745,2.921698,0.096476,2.987735,-0.003198,0.0


In [13]:
# Hold out DATA_SPLIT_PCT (0.2) of the rows for testing; the fraction comes
# from the configuration cell instead of a repeated literal.
# NOTE(review): no `stratify=` is passed, and the positive class is very rare
# (27 positives among 3655 test rows in the reports below) -- consider
# stratify=y, or a time-ordered split since the rows are a time series.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=DATA_SPLIT_PCT, random_state=0
)
X_train.shape, X_test.shape

((14619, 305), (3655, 305))

In [14]:
# Feature scaling

In [15]:
# Remember the labels of the unscaled frames: StandardScaler returns plain
# arrays, so column names and row index have to be re-attached afterwards.
cols = X_train.columns
train_index, test_index = X_train.index, X_test.index

# Fit the scaler on the training split only, then apply the same
# transformation to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Pass `cols` directly: wrapping it in a list (`columns=[cols]`) makes pandas
# build a one-level MultiIndex instead of flat column names. Restoring the
# original index keeps X_train/X_test aligned with y_train/y_test.
X_train = pd.DataFrame(X_train, columns=cols, index=train_index)
X_test = pd.DataFrame(X_test, columns=cols, index=test_index)


In [16]:
# Baseline estimate: default XGBoost scored by ROC AUC with repeated
# stratified 10-fold cross-validation (3 repeats) on the full X, y.
model = XGBClassifier()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(
    model,
    X,
    y,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
)
# Average over all 30 fold scores.
mean_auc = mean(scores)
print('Mean ROC AUC: %.5f' % mean_auc)

Mean ROC AUC: 0.62019


In [17]:
# Baseline classifier: XGBoost with default parameters (no class weighting).
model = XGBClassifier()

# Train on the standardised training split; the result is kept as clf_0.
clf_0 = model.fit(X_train, y_train)


In [18]:
# Predict the held-out split with the baseline model.
pred_y_0 = clf_0.predict(X_test)

# Arguments follow the (y_true, y_pred) order used by the other metric calls.
# Accuracy is dominated by the majority class here (27 positives in 3655 test
# rows), so read it together with the per-class report below.
print(accuracy_score(y_test, pred_y_0))


0.9926128590971273


In [19]:
target_names = ['class 0', 'class 1']
# The baseline never predicts class 1 (second column of the confusion matrix
# is all zeros), so its precision is 0/0. zero_division=0 reports that as 0.0
# explicitly instead of emitting an UndefinedMetricWarning per average.
print(classification_report(y_test, pred_y_0, target_names=target_names, zero_division=0))

# View confusion matrix for test data and predictions
confusion_matrix(y_test, pred_y_0)

              precision    recall  f1-score   support

     class 0       0.99      1.00      1.00      3628
     class 1       0.00      0.00      0.00        27

    accuracy                           0.99      3655
   macro avg       0.50      0.50      0.50      3655
weighted avg       0.99      0.99      0.99      3655



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


array([[3628,    0],
       [  27,    0]])

# Weighted XGBoost for Class Imbalance


In [20]:
# define model
# Class-weighted variant: scale_pos_weight is set to re-weight the positive class.
# NOTE(review): 90 is hard-coded; confirm how it was chosen (e.g. against the
# negative/positive ratio of y_train).
model = XGBClassifier(scale_pos_weight=90)

In [21]:
# Train the weighted model and predict the held-out split.
clf_1 = model.fit(X_train, y_train)
pred_y_1 = clf_1.predict(X_test)

# Arguments follow the (y_true, y_pred) order used by the other metric calls.
print(accuracy_score(y_test, pred_y_1))

0.9920656634746922


In [22]:
# Per-class precision / recall / F1 for the weighted model (scale_pos_weight=90).
target_names = ['class 0', 'class 1']
report_1 = classification_report(
    y_test,
    pred_y_1,
    target_names=target_names,
)
print(report_1)

# Rows are true classes, columns are predicted classes.
confusion_matrix(y_test, pred_y_1)

              precision    recall  f1-score   support

     class 0       0.99      1.00      1.00      3628
     class 1       0.00      0.00      0.00        27

    accuracy                           0.99      3655
   macro avg       0.50      0.50      0.50      3655
weighted avg       0.99      0.99      0.99      3655



array([[3626,    2],
       [  27,    0]])

# Tune scale_pos_weight with GridSearchCV

In [23]:
# Candidate values for XGBoost's scale_pos_weight, searched exhaustively.
weights = [1, 10, 25, 50, 75, 99, 100, 1000]
param_grid = {'scale_pos_weight': weights}

In [24]:
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold

# Same evaluation protocol as the baseline: repeated stratified 10-fold CV, 3 repeats.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Search over param_grid with the current `model` as the estimator,
# scoring each candidate by ROC AUC.
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    n_jobs=-1,
    cv=cv,
    scoring='roc_auc',
)

In [None]:
# execute the grid search
# NOTE(review): the search is fitted on the full X, y, which includes the rows
# later used as X_test; fit on the training split if the test scores are
# meant to stay untouched by tuning.
grid_result = grid.fit(X, y)

# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
# The loop variable is deliberately not called `mean`: that name is the
# numpy function imported at the top of the notebook, and rebinding it to a
# float would break any later call such as `mean(scores)`.
for mean_score, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean_score, stdev, param))

In [25]:
# define model
# NOTE(review): 855 is hard-coded. It is not one of the candidate weights in
# the grid defined earlier and is not read from grid_result.best_params_ --
# confirm where this value comes from.
model = XGBClassifier(scale_pos_weight=855)

In [26]:
# Train the scale_pos_weight=855 model and predict the held-out split.
clf_2 = model.fit(X_train, y_train)
pred_y_2 = clf_2.predict(X_test)

# Arguments follow the (y_true, y_pred) order used by the other metric calls.
print(accuracy_score(y_test, pred_y_2))

0.9912448700410397


In [27]:
# Per-class precision / recall / F1 for the scale_pos_weight=855 model.
target_names = ['class 0', 'class 1']
report_2 = classification_report(
    y_test,
    pred_y_2,
    target_names=target_names,
)
print(report_2)

# Rows are true classes, columns are predicted classes.
confusion_matrix(y_test, pred_y_2)

              precision    recall  f1-score   support

     class 0       0.99      1.00      1.00      3628
     class 1       0.00      0.00      0.00        27

    accuracy                           0.99      3655
   macro avg       0.50      0.50      0.50      3655
weighted avg       0.99      0.99      0.99      3655



array([[3623,    5],
       [  27,    0]])