In [2]:
#Citations for the work
# Alam, Mahbubul. “Supervised Machine Learning Technique for Anomaly Detection: 
# Logistic Regression.” Medium, Towards Data Science, 1 Nov. 2020, 
# https://towardsdatascience.com/supervised-machine-learning-technique-for-anomaly-detection-logistic-regression-97fc7a9cacd4


# https://scikit-learn.org/stable/modules/preprocessing.html

In [3]:
#data wrangling
import pandas as pd
import numpy as np

#inputs data preperation
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# modeling
from sklearn.linear_model import LogisticRegression 

# model validation
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix 

In [4]:
# import data
df = pd.read_csv("ecg.csv", header=None)

In [5]:
#view column names
df.columns

Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            131, 132, 133, 134, 135, 136, 137, 138, 139, 140],
           dtype='int64', length=141)

In [6]:
#view table 
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,131,132,133,134,135,136,137,138,139,140
0,-0.112522,-2.827204,-3.773897,-4.349751,-4.376041,-3.474986,-2.181408,-1.818286,-1.250522,-0.477492,...,0.792168,0.933541,0.796958,0.578621,0.257740,0.228077,0.123431,0.925286,0.193137,1.0
1,-1.100878,-3.996840,-4.285843,-4.506579,-4.022377,-3.234368,-1.566126,-0.992258,-0.754680,0.042321,...,0.538356,0.656881,0.787490,0.724046,0.555784,0.476333,0.773820,1.119621,-1.436250,1.0
2,-0.567088,-2.593450,-3.874230,-4.584095,-4.187449,-3.151462,-1.742940,-1.490659,-1.183580,-0.394229,...,0.886073,0.531452,0.311377,-0.021919,-0.713683,-0.532197,0.321097,0.904227,-0.421797,1.0
3,0.490473,-1.914407,-3.616364,-4.318823,-4.268016,-3.881110,-2.993280,-1.671131,-1.333884,-0.965629,...,0.350816,0.499111,0.600345,0.842069,0.952074,0.990133,1.086798,1.403011,-0.383564,1.0
4,0.800232,-0.874252,-2.384761,-3.973292,-4.338224,-3.802422,-2.534510,-1.783423,-1.594450,-0.753199,...,1.148884,0.958434,1.059025,1.371682,1.277392,0.960304,0.971020,1.614392,1.421456,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4993,0.608558,-0.335651,-0.990948,-1.784153,-2.626145,-2.957065,-2.931897,-2.664816,-2.090137,-1.461841,...,1.757705,2.291923,2.704595,2.451519,2.017396,1.704358,1.688542,1.629593,1.342651,0.0
4994,-2.060402,-2.860116,-3.405074,-3.748719,-3.513561,-3.006545,-2.234850,-1.593270,-1.075279,-0.976047,...,1.388947,2.079675,2.433375,2.159484,1.819747,1.534767,1.696818,1.483832,1.047612,0.0
4995,-1.122969,-2.252925,-2.867628,-3.358605,-3.167849,-2.638360,-1.664162,-0.935655,-0.866953,-0.645363,...,-0.472419,-1.310147,-2.029521,-3.221294,-4.176790,-4.009720,-2.874136,-2.008369,-1.808334,0.0
4996,-0.547705,-1.889545,-2.839779,-3.457912,-3.929149,-3.966026,-3.492560,-2.695270,-1.849691,-1.374321,...,1.258419,1.907530,2.280888,1.895242,1.437702,1.193433,1.261335,1.150449,0.804932,0.0


In [7]:
# number of anomaly cases and non-anomaly cases
anomaly = len(df[df.iloc[: , -1] == 0])
non_anomaly = len(df[df.iloc[: , -1] == 1])
totalCases = anomaly + non_anomaly

In [8]:
print("anomaly: ", anomaly)
print("non-anomaly: ", non_anomaly)
print("Total Cases: ", totalCases)

anomaly:  2079
non-anomaly:  2919
Total Cases:  4998


In [9]:
print("Percentage of anomaly cases: ", round(anomaly / totalCases, 2))
# Because the data-set is not un-even we do not have to do under-sampling or over- sampling

Percentage of anomaly cases:  0.42


In [10]:
# define X and Y variables
x = df.iloc[:, 0:-1]
y = df.iloc[:, -1]

In [11]:
#Scaling the data and instantiating the model
x, y = make_classification(random_state = 42)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)
pipe = make_pipeline(StandardScaler(), LogisticRegression())

#fit model
pipe.fit(x_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression())])

In [12]:
#predict
y_pred = pipe.predict(x_test)

In [13]:
#model evaluation
classification_report = classification_report(y_test, y_pred)
confusion_matrix = confusion_matrix(y_test, y_pred)

print("CLASSIFICATION REPORT")
print(classification_report)

print("CONFUSION MATRIX")
print(confusion_matrix)

CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       1.00      0.93      0.97        15
           1       0.91      1.00      0.95        10

    accuracy                           0.96        25
   macro avg       0.95      0.97      0.96        25
weighted avg       0.96      0.96      0.96        25

CONFUSION MATRIX
[[14  1]
 [ 0 10]]
