In [52]:
# libraries importing
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import numpy as np
import os

from data_processing.process_data import process_data, add_rolling_stats

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn import metrics
from sklearn.svm import SVR, SVC
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier

from feature_engine.selection import SmartCorrelatedSelection, DropDuplicateFeatures, DropCorrelatedFeatures, SelectBySingleFeaturePerformance, RecursiveFeatureAddition
from feature_engine.outliers import Winsorizer

## Data loading

In [53]:
datasets = process_data()

valve1_X =  datasets["valve1_X"]
valve1_y = datasets["valve1_y"]
valve2_X = datasets["valve2_X"]
valve2_y = datasets["valve2_y"]
other_anomaly_X = datasets["other_anomaly_X"]
other_anomaly_y = datasets["other_anomaly_y"]

In [54]:
model = lgb.LGBMClassifier()

In [55]:
def test_train_split(df_X, df_y):
    size_train = int(df_X.shape[0]*0.8)
    size_test = df_X.shape[0] - size_train
    x_train = df_X[:size_train]
    y_train = df_y[:size_train].anomaly
    x_test = df_X[-size_test:]
    y_test = df_y[-size_test:].anomaly
    return x_train, y_train, x_test, y_test

In [56]:
x_train_valve1, y_train_valve1, x_test_valve1, y_test_valve1 = test_train_split(valve1_X, valve1_y)
x_train_valve2, y_train_valve2, x_test_valve2, y_test_valve2 = test_train_split(valve2_X, valve2_y)
x_train_other_anomaly, y_train_other_anomaly, x_test_other_anomaly, y_test_other_anomaly = test_train_split(other_anomaly_X, other_anomaly_y)


## Base models
#### Test model for valve 1

In [57]:
model.fit(x_train_valve1,y_train_valve1,eval_set=[(x_test_valve1,y_test_valve1),(x_train_valve1,y_train_valve1)],eval_metric=['rmse', 'l2', 'loggloss', 'mape'])

[LightGBM] [Info] Number of positive: 5074, number of negative: 9455
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000309 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1752
[LightGBM] [Info] Number of data points in the train set: 14529, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.349233 -> initscore=-0.622414
[LightGBM] [Info] Start training from score -0.622414


In [58]:
print('Training accuracy {:.4f}'.format(model.score(x_train_valve1,y_train_valve1)))
print('Testing accuracy {:.4f}'.format(model.score(x_test_valve1,y_test_valve1)))

Training accuracy 0.9791
Testing accuracy 0.9317


In [59]:
yhat = model.predict(x_test_valve1)

accuracy = accuracy_score(y_test_valve1, yhat)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(y_test_valve1, yhat)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(y_test_valve1, yhat)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_test_valve1, yhat)
print('F1 score: %f' % f1)

Accuracy: 0.931737
Precision: 0.997982
Recall: 0.800810
F1 score: 0.888589


In [60]:
conf_matrix = metrics.confusion_matrix(y_test_valve1, yhat)

TN, FP, FN, TP = conf_matrix.ravel()

# Sensitivity, hit rate, recall, or true positive rate
TPR = TP/(TP+FN)
# Specificity or true negative rate
TNR = TN/(TN+FP) 
# Precision or positive predictive value
PPV = TP/(TP+FP)
# Negative predictive value
NPV = TN/(TN+FN)
# Fall out or false positive rate FAR false alarm rate
FPR = FP/(FP+TN)
# False negative rate MAR missing alarm rate
FNR = FN/(TP+FN)
# False discovery rate
FDR = FP/(TP+FP)

# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)

print(f'False Alarm Rate: {FPR}')
print(f'Missing Alarm Rate: {FNR}')

False Alarm Rate: 0.0008340283569641367
Missing Alarm Rate: 0.19919028340080971


In [61]:
print(metrics.classification_report(y_test_valve1,model.predict(x_test_valve1)))

              precision    recall  f1-score   support

         0.0       0.91      1.00      0.95      2398
         1.0       1.00      0.80      0.89      1235

    accuracy                           0.93      3633
   macro avg       0.95      0.90      0.92      3633
weighted avg       0.94      0.93      0.93      3633



#### Test model for valve 2

In [62]:
model.fit(x_train_valve2,y_train_valve2,eval_set=[(x_test_valve2,y_test_valve2),(x_train_valve2,y_train_valve2)],eval_metric=['rmse', 'l2', 'loggloss', 'mape'])

[LightGBM] [Info] Number of positive: 1122, number of negative: 2327
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000438 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1668
[LightGBM] [Info] Number of data points in the train set: 3449, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.325312 -> initscore=-0.729467
[LightGBM] [Info] Start training from score -0.729467


In [63]:
print('Training accuracy {:.4f}'.format(model.score(x_train_valve2,y_train_valve2)))
print('Testing accuracy {:.4f}'.format(model.score(x_test_valve2,y_test_valve2)))

Training accuracy 1.0000
Testing accuracy 0.8795


In [64]:
yhat = model.predict(x_test_valve2)

accuracy = accuracy_score(y_test_valve2, yhat)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(y_test_valve2, yhat)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(y_test_valve2, yhat)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_test_valve2, yhat)
print('F1 score: %f' % f1)

Accuracy: 0.879490
Precision: 0.967846
Recall: 0.762025
F1 score: 0.852691


In [65]:
conf_matrix = metrics.confusion_matrix(y_test_valve2, yhat)

TN, FP, FN, TP = conf_matrix.ravel()

# Sensitivity, hit rate, recall, or true positive rate
TPR = TP/(TP+FN)
# Specificity or true negative rate
TNR = TN/(TN+FP) 
# Precision or positive predictive value
PPV = TP/(TP+FP)
# Negative predictive value
NPV = TN/(TN+FN)
# Fall out or false positive rate FAR false alarm rate
FPR = FP/(FP+TN)
# False negative rate MAR missing alarm rate
FNR = FN/(TP+FN)
# False discovery rate
FDR = FP/(TP+FP)

# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)

print(f'False Alarm Rate: {FPR}')
print(f'Missing Alarm Rate: {FNR}')

False Alarm Rate: 0.021367521367521368
Missing Alarm Rate: 0.2379746835443038


In [66]:
print(metrics.classification_report(y_test_valve2,model.predict(x_test_valve2)))

              precision    recall  f1-score   support

         0.0       0.83      0.98      0.90       468
         1.0       0.97      0.76      0.85       395

    accuracy                           0.88       863
   macro avg       0.90      0.87      0.88       863
weighted avg       0.89      0.88      0.88       863



#### Test model for other anomalies

In [67]:
model.fit(x_train_other_anomaly,y_train_other_anomaly,eval_set=[(x_test_other_anomaly,y_test_other_anomaly),(x_train_other_anomaly,y_train_other_anomaly)],eval_metric=['rmse', 'l2', 'loggloss', 'mape'])

[LightGBM] [Info] Number of positive: 4046, number of negative: 7790
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000294 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1767
[LightGBM] [Info] Number of data points in the train set: 11836, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.341838 -> initscore=-0.655112
[LightGBM] [Info] Start training from score -0.655112


In [68]:
print('Training accuracy {:.4f}'.format(model.score(x_train_other_anomaly,y_train_other_anomaly)))
print('Testing accuracy {:.4f}'.format(model.score(x_test_other_anomaly,y_test_other_anomaly)))

Training accuracy 0.9292
Testing accuracy 0.5377


In [69]:
yhat = model.predict(x_test_other_anomaly)

accuracy = accuracy_score(y_test_other_anomaly, yhat)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(y_test_other_anomaly, yhat)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(y_test_other_anomaly, yhat)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_test_other_anomaly, yhat)
print('F1 score: %f' % f1)

Accuracy: 0.537682
Precision: 1.000000
Recall: 0.000730
F1 score: 0.001460


In [70]:
conf_matrix = metrics.confusion_matrix(y_test_other_anomaly, yhat)

TN, FP, FN, TP = conf_matrix.ravel()

# Sensitivity, hit rate, recall, or true positive rate
TPR = TP/(TP+FN)
# Specificity or true negative rate
TNR = TN/(TN+FP) 
# Precision or positive predictive value
PPV = TP/(TP+FP)
# Negative predictive value
NPV = TN/(TN+FN)
# Fall out or false positive rate FAR false alarm rate
FPR = FP/(FP+TN)
# False negative rate MAR missing alarm rate
FNR = FN/(TP+FN)
# False discovery rate
FDR = FP/(TP+FP)

# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)

print(f'False Alarm Rate: {FPR}')
print(f'Missing Alarm Rate: {FNR}')

False Alarm Rate: 0.0
Missing Alarm Rate: 0.9992695398100804


In [71]:
print(metrics.classification_report(y_test_other_anomaly,model.predict(x_test_other_anomaly)))

              precision    recall  f1-score   support

         0.0       0.54      1.00      0.70      1590
         1.0       1.00      0.00      0.00      1369

    accuracy                           0.54      2959
   macro avg       0.77      0.50      0.35      2959
weighted avg       0.75      0.54      0.38      2959



## Apply steps on datasets

__Drop duplicates__

In [72]:
# print(f'valve 1 dataset shape: {x_train_valve1.shape}')
# print(f'valve 2 dataset shape: {x_train_valve2.shape}')
# print(f'other anomalies dataset shape: {x_train_other_anomaly.shape}')

# valve1_duplicates = x_train_valve1.duplicated()
# x_train_valve1 = x_train_valve1.loc[~valve1_duplicates, :]
# y_train_valve1 = y_train_valve1[~valve1_duplicates]

# valve2_duplicates = x_train_valve2.duplicated()
# x_train_valve2 = x_train_valve2.loc[~valve2_duplicates, :]
# y_train_valve2 = y_train_valve2[~valve2_duplicates]

# other_anomaly_duplicates = x_train_other_anomaly.duplicated()
# x_train_other_anomaly = x_train_other_anomaly.loc[~other_anomaly_duplicates, :]
# y_train_other_anomaly = y_train_other_anomaly[~other_anomaly_duplicates]

# print('********************* Drop Duplicates *********************')
# print(f'valve 1 dataset shape: {x_train_valve1.shape}')
# print(f'valve 2 dataset shape: {x_train_valve2.shape}')
# print(f'other anomalies dataset shape: {x_train_other_anomaly.shape}')

__Add rolling stats__

In [73]:
# x_train_valve1 = add_rolling_stats(x_train_valve1, '1min', x_train_valve1.columns)
# x_test_valve1 = add_rolling_stats(x_test_valve1, '1min', x_test_valve1.columns)
# x_train_valve2 = add_rolling_stats(x_train_valve2, '1min', x_train_valve2.columns)
# x_test_valve2 = add_rolling_stats(x_test_valve2, '1min', x_test_valve2.columns)
# x_train_other_anomaly = add_rolling_stats(x_train_other_anomaly, '1min', x_train_other_anomaly.columns)
# x_test_other_anomaly = add_rolling_stats(x_test_other_anomaly, '1min', x_test_other_anomaly.columns)


__Winsorizer__

In [74]:
wz = Winsorizer(capping_method='quantiles', tail='both', fold=3)

In [75]:
wz.fit(x_train_valve1)
x_train_valve1 = wz.transform(x_train_valve1)
x_test_valve1 = wz.transform(x_test_valve1)

wz.fit(x_train_valve2)
x_train_valve2 = wz.transform(x_train_valve2)
x_test_valve2 = wz.transform(x_test_valve2)

wz.fit(x_train_other_anomaly)
x_train_other_anomaly = wz.transform(x_train_other_anomaly)
x_test_other_anomaly = wz.transform(x_test_other_anomaly)

__Standard scaler__

In [76]:
sc = StandardScaler()

In [77]:
# sc.fit(x_train_valve1, y_train_valve1)
# x_train_valve1 = sc.transform(x_train_valve1)
# x_test_valve1 = sc.transform(x_test_valve1)

# sc.fit(x_train_valve2, y_train_valve2)
# x_train_valve2 = sc.transform(x_train_valve2)
# x_test_valve2 = sc.transform(x_test_valve2)

# sc.fit(x_train_other_anomaly, y_train_other_anomaly)
# x_train_other_anomaly = sc.transform(x_train_other_anomaly)
# x_test_other_anomaly = sc.transform(x_test_other_anomaly)

__PCA__

In [78]:
pca = PCA(n_components='mle', svd_solver='full')

In [79]:
# pca.fit(x_train_valve1, y_train_valve1)
# x_train_valve1 = pca.transform(x_train_valve1)
# x_test_valve1 = pca.transform(x_test_valve1)

# pca.fit(x_train_valve2, y_train_valve2)
# x_train_valve2 = pca.transform(x_train_valve2)
# x_test_valve2 = pca.transform(x_test_valve2)

# pca.fit(x_train_other_anomaly, y_train_other_anomaly)
# x_train_other_anomaly = pca.transform(x_train_other_anomaly)
# x_test_other_anomaly = pca.transform(x_test_other_anomaly)

__RFE based on SVM__

In [80]:
estimator = SVR(kernel="linear")
rfe = RFE(estimator, n_features_to_select=3, step=1)

In [81]:
rfe.fit(x_train_valve1, y_train_valve1)
x_train_valve1 = rfe.transform(x_train_valve1)
x_test_valve1 = rfe.transform(x_test_valve1)

rfe.fit(x_train_valve2, y_train_valve2)
x_train_valve2 = rfe.transform(x_train_valve2)
x_test_valve2 = rfe.transform(x_test_valve2)

rfe.fit(x_train_other_anomaly, y_train_other_anomaly)
x_train_other_anomaly = rfe.transform(x_train_other_anomaly)
x_test_other_anomaly = rfe.transform(x_test_other_anomaly)

__Feature selection by single feature performance using random forest estimator__

In [82]:
sfp = SelectBySingleFeaturePerformance(
                    RandomForestClassifier(random_state=42),
                    cv=2)

In [83]:
# sfp.fit(x_train_valve1, y_train_valve1)
# x_train_valve1 = sfp.transform(x_train_valve1)
# x_test_valve1 = sfp.transform(x_test_valve1)

# sfp.fit(x_train_valve2, y_train_valve2)
# x_train_valve2 = sfp.transform(x_train_valve2)
# x_test_valve2 = sfp.transform(x_test_valve2)

# sfp.fit(x_train_other_anomaly, y_train_other_anomaly)
# x_train_other_anomaly = sfp.transform(x_train_other_anomaly)
# x_test_other_anomaly = sfp.transform(x_test_other_anomaly)

__Feature selection by information value__

In [84]:
rfa = RecursiveFeatureAddition(RandomForestClassifier(random_state=42), cv=3)

In [85]:
# rfa.fit(x_train_valve1, y_train_valve1)
# x_train_valve1 = rfa.transform(x_train_valve1)
# x_test_valve1 = rfa.transform(x_test_valve1)

# rfa.fit(x_train_valve2, y_train_valve2)
# x_train_valve2 = rfa.transform(x_train_valve2)
# x_test_valve2 = rfa.transform(x_test_valve2)

# rfa.fit(x_train_other_anomaly, y_train_other_anomaly)
# x_train_other_anomaly = rfa.transform(x_train_other_anomaly)
# x_test_other_anomaly = rfa.transform(x_test_other_anomaly)

__Smart correlated features__

In [86]:
scs = SmartCorrelatedSelection(threshold=0.8)

In [87]:
# print(f'valve 1 dataset number of columns: {x_train_valve1.shape[1]}')
# print(f'valve 2 dataset number of columns: {x_train_valve2.shape[1]}')
# print(f'other animalies dataset number of columns: {x_train_other_anomaly.shape[1]}')

# scs.fit(x_train_valve1, y_train_valve1)
# x_train_valve1 = scs.transform(x_train_valve1)
# x_test_valve1 = scs.transform(x_test_valve1)

# scs.fit(x_train_valve2, y_train_valve2)
# x_train_valve2 = scs.transform(x_train_valve2)
# x_test_valve2 = scs.transform(x_test_valve2)

# scs.fit(x_train_other_anomaly, y_train_other_anomaly)
# x_train_other_anomaly = scs.transform(x_train_other_anomaly)
# x_test_other_anomaly = scs.transform(x_test_other_anomaly)

# print('********************* Drop correlated columns *********************')
# print(f'valve 1 dataset number of columns: {x_train_valve1.shape[1]}')
# print(f'valve 2 dataset number of columns: {x_train_valve2.shape[1]}')
# print(f'other animalies dataset number of columns: {x_train_other_anomaly.shape[1]}')

## Retest models
#### Test model for valve 1

In [88]:
model.fit(x_train_valve1,y_train_valve1,eval_set=[(x_test_valve1,y_test_valve1),(x_train_valve1,y_train_valve1)],eval_metric=['rmse', 'l2', 'loggloss', 'mape'])

[LightGBM] [Info] Number of positive: 5074, number of negative: 9455
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000169 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 707
[LightGBM] [Info] Number of data points in the train set: 14529, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.349233 -> initscore=-0.622414
[LightGBM] [Info] Start training from score -0.622414


In [89]:
print('Training accuracy {:.4f}'.format(model.score(x_train_valve1,y_train_valve1)))
print('Testing accuracy {:.4f}'.format(model.score(x_test_valve1,y_test_valve1)))

Training accuracy 0.9364
Testing accuracy 0.9317


In [90]:
yhat = model.predict(x_test_valve1)

accuracy = accuracy_score(y_test_valve1, yhat)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(y_test_valve1, yhat)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(y_test_valve1, yhat)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_test_valve1, yhat)
print('F1 score: %f' % f1)

Accuracy: 0.931737
Precision: 0.997982
Recall: 0.800810
F1 score: 0.888589


In [91]:
conf_matrix = metrics.confusion_matrix(y_test_valve1, yhat)

TN, FP, FN, TP = conf_matrix.ravel()

# Sensitivity, hit rate, recall, or true positive rate
TPR = TP/(TP+FN)
# Specificity or true negative rate
TNR = TN/(TN+FP) 
# Precision or positive predictive value
PPV = TP/(TP+FP)
# Negative predictive value
NPV = TN/(TN+FN)
# Fall out or false positive rate FAR false alarm rate
FPR = FP/(FP+TN)
# False negative rate MAR missing alarm rate
FNR = FN/(TP+FN)
# False discovery rate
FDR = FP/(TP+FP)

# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)

print(f'False Alarm Rate: {FPR}')
print(f'Missing Alarm Rate: {FNR}')

False Alarm Rate: 0.0008340283569641367
Missing Alarm Rate: 0.19919028340080971


In [92]:
print(metrics.classification_report(y_test_valve1,model.predict(x_test_valve1)))

              precision    recall  f1-score   support

         0.0       0.91      1.00      0.95      2398
         1.0       1.00      0.80      0.89      1235

    accuracy                           0.93      3633
   macro avg       0.95      0.90      0.92      3633
weighted avg       0.94      0.93      0.93      3633



#### Test model for valve 2

In [93]:
model.fit(x_train_valve2,y_train_valve2,eval_set=[(x_test_valve2,y_test_valve2),(x_train_valve2,y_train_valve2)],eval_metric=['rmse', 'l2', 'loggloss', 'mape'])

[LightGBM] [Info] Number of positive: 1122, number of negative: 2327
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000252 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 628
[LightGBM] [Info] Number of data points in the train set: 3449, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.325312 -> initscore=-0.729467
[LightGBM] [Info] Start training from score -0.729467


In [94]:
print('Training accuracy {:.4f}'.format(model.score(x_train_valve2,y_train_valve2)))
print('Testing accuracy {:.4f}'.format(model.score(x_test_valve2,y_test_valve2)))

Training accuracy 0.9736
Testing accuracy 0.8841


In [95]:
yhat = model.predict(x_test_valve2)

accuracy = accuracy_score(y_test_valve2, yhat)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(y_test_valve2, yhat)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(y_test_valve2, yhat)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_test_valve2, yhat)
print('F1 score: %f' % f1)

Accuracy: 0.884125
Precision: 0.980456
Recall: 0.762025
F1 score: 0.857550


In [96]:
conf_matrix = metrics.confusion_matrix(y_test_valve2, yhat)

TN, FP, FN, TP = conf_matrix.ravel()

# Sensitivity, hit rate, recall, or true positive rate
TPR = TP/(TP+FN)
# Specificity or true negative rate
TNR = TN/(TN+FP) 
# Precision or positive predictive value
PPV = TP/(TP+FP)
# Negative predictive value
NPV = TN/(TN+FN)
# Fall out or false positive rate FAR false alarm rate
FPR = FP/(FP+TN)
# False negative rate MAR missing alarm rate
FNR = FN/(TP+FN)
# False discovery rate
FDR = FP/(TP+FP)

# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)

print(f'False Alarm Rate: {FPR}')
print(f'Missing Alarm Rate: {FNR}')

False Alarm Rate: 0.01282051282051282
Missing Alarm Rate: 0.2379746835443038


In [97]:
print(metrics.classification_report(y_test_valve2,model.predict(x_test_valve2)))

              precision    recall  f1-score   support

         0.0       0.83      0.99      0.90       468
         1.0       0.98      0.76      0.86       395

    accuracy                           0.88       863
   macro avg       0.91      0.87      0.88       863
weighted avg       0.90      0.88      0.88       863



#### Test model for other anomalies

In [98]:
model.fit(x_train_other_anomaly,y_train_other_anomaly,eval_set=[(x_test_other_anomaly,y_test_other_anomaly),(x_train_other_anomaly,y_train_other_anomaly)],eval_metric=['rmse', 'l2', 'loggloss', 'mape'])

[LightGBM] [Info] Number of positive: 4046, number of negative: 7790
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000206 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 11836, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.341838 -> initscore=-0.655112
[LightGBM] [Info] Start training from score -0.655112


In [99]:
print('Training accuracy {:.4f}'.format(model.score(x_train_other_anomaly,y_train_other_anomaly)))
print('Testing accuracy {:.4f}'.format(model.score(x_test_other_anomaly,y_test_other_anomaly)))

Training accuracy 0.7589
Testing accuracy 0.5377


In [100]:
yhat = model.predict(x_test_other_anomaly)

accuracy = accuracy_score(y_test_other_anomaly, yhat)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(y_test_other_anomaly, yhat)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(y_test_other_anomaly, yhat)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_test_other_anomaly, yhat)
print('F1 score: %f' % f1)

Accuracy: 0.537682
Precision: 1.000000
Recall: 0.000730
F1 score: 0.001460


In [101]:
conf_matrix = metrics.confusion_matrix(y_test_other_anomaly, yhat)

TN, FP, FN, TP = conf_matrix.ravel()

# Sensitivity, hit rate, recall, or true positive rate
TPR = TP/(TP+FN)
# Specificity or true negative rate
TNR = TN/(TN+FP) 
# Precision or positive predictive value
PPV = TP/(TP+FP)
# Negative predictive value
NPV = TN/(TN+FN)
# Fall out or false positive rate FAR false alarm rate
FPR = FP/(FP+TN)
# False negative rate MAR missing alarm rate
FNR = FN/(TP+FN)
# False discovery rate
FDR = FP/(TP+FP)

# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)

print(f'False Alarm Rate: {FPR}')
print(f'Missing Alarm Rate: {FNR}')

False Alarm Rate: 0.0
Missing Alarm Rate: 0.9992695398100804


In [102]:
print(metrics.classification_report(y_test_other_anomaly,model.predict(x_test_other_anomaly)))

              precision    recall  f1-score   support

         0.0       0.54      1.00      0.70      1590
         1.0       1.00      0.00      0.00      1369

    accuracy                           0.54      2959
   macro avg       0.77      0.50      0.35      2959
weighted avg       0.75      0.54      0.38      2959

