In [1]:
import numpy as np
import pandas as pd

from joblib import dump, load
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [2]:
# Load the labelled training set; the path is relative to the working directory.
train_csv_path = "./data/train.csv"
data = pd.read_csv(train_csv_path)

In [3]:
# Preview the first five rows to check which columns were loaded.
data.head(n=5)

Unnamed: 0,IP,Country,class
0,1.10.195.126,TH,normal
1,1.1.217.211,TH,normal
2,1.115.198.107,JP,anomaly
3,1.121.152.143,AU,normal
4,1.123.135.213,AU,normal


In [4]:
# Target vector: the 'class' label of every row.
y = data.loc[:, 'class']
y.head(n=5)

0     normal
1     normal
2    anomaly
3     normal
4     normal
Name: class, dtype: object

In [5]:
# Feature matrix: every column of the loaded frame except the target label.
X = data.drop(columns=['class'])
X.head(n=5)

Unnamed: 0,IP,Country
0,1.10.195.126,TH
1,1.1.217.211,TH
2,1.115.198.107,JP
3,1.121.152.143,AU
4,1.123.135.213,AU


In [6]:
# Integer-encode the string-valued feature columns so the estimator can consume them.
# NOTE(review): label-encoding 'IP' gives every distinct address an arbitrary
# integer rank -- confirm that this is the intended feature representation.
categ = ['IP', 'Country']

# One encoder per column. LabelEncoder.fit_transform re-fits on every call, so
# sharing a single instance across columns would leave it holding only the
# classes of the last column; the other columns could then neither be decoded
# nor encoded consistently for new rows.
encoders = {col: LabelEncoder() for col in categ}

# assign() returns a new frame rather than writing into the existing one in place.
X = X.assign(**{col: encoders[col].fit_transform(X[col]) for col in categ})

# `le` is kept as an alias of the last column's encoder for any code that
# still refers to that name.
le = encoders[categ[-1]]

X.head()

Unnamed: 0,IP,Country
0,1,147
1,0,147
2,2,81
3,3,9
4,4,9


In [7]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
split_options = {"test_size": 0.25, "random_state": 0}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [8]:
# Train a logistic-regression classifier with default hyper-parameters.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LogisticRegression().fit(X_train, y_train)
model

LogisticRegression()

In [9]:
# Predicted class label for every row of the held-out split.
predictions = model.predict(X=X_test)

In [10]:
# Evaluate the held-out predictions with a 2x2 confusion matrix.
#
# The class order is pinned explicitly: the first label is the negative class,
# the second the positive one. With this order 'normal' is the positive class,
# which matches the alphabetical ordering scikit-learn applies by default, so
# the reported numbers are unchanged. Pinning the labels also guarantees a 2x2
# matrix even when only one class shows up in y_test/predictions; otherwise the
# four-way unpacking below would raise.
class_order = ['anomaly', 'normal']
cm = confusion_matrix(y_test, predictions, labels=class_order)

# Rows are true labels and columns are predicted labels, so the flattened
# matrix reads TN, FP, FN, TP.
TN, FP, FN, TP = cm.ravel()

print('True Positive(TP)  = ', TP)
print('False Positive(FP) = ', FP)
print('True Negative(TN)  = ', TN)
print('False Negative(FN) = ', FN)

# Accuracy = correctly classified rows / all evaluated rows.
accuracy =  (TP+TN) /(TP+FP+TN+FN)

print('Accuracy of the binary classification = {:0.3f}'.format(accuracy))

True Positive(TP)  =  2731
False Positive(FP) =  2251
True Negative(TN)  =  0
False Negative(FN) =  0
Accuracy of the binary classification = 0.548


In [11]:
# Persist the fitted classifier to 'model.joblib' in the working directory.
# NOTE(review): only the classifier is saved here; the IP/Country encoding used
# to build X is not written by this cell -- confirm it is persisted elsewhere
# before this file is loaded for inference on new rows.
dump(model, 'model.joblib')

['model.joblib']