# Multivariate logit regression

In [1]:
# data wrangling:
import pandas as pd
import numpy as np

# data_preparation
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# modeling
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE


# model validation
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

### Dataset

In [4]:
data = pd.read_csv("labelled.csv", low_memory=False)

In [6]:
data = data.drop(columns=["Unnamed: 0"])
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11170084 entries, 0 to 11170083
Data columns (total 14 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   block_id        object
 1   first_digits    int64 
 2   second_digits   int64 
 3   third_digits    int64 
 4   message         object
 5   event_type      object
 6   block_asterisk  object
 7   add_ver         object
 8   dru             object
 9   trans_fail      object
 10  rrswe           object
 11  cerj            object
 12  pack_rec        object
 13  Label           object
dtypes: int64(3), object(11)
memory usage: 1.2+ GB


In [8]:
# Count the number of anormaly, and normal.
normal = data[data.Label == "Normal"].index
anomaly = data[data.Label == "Anomaly"].index

print("normal: ", len(normal), "anomaly:", len(anomaly))

normal:  10887157 anomaly: 282927


In [9]:
# Avoiding find a needle in the haystack

# from all normal observations, randomly select observations equal to number of anomaly observations
random_normal = np.random.choice(normal, len(anomaly), replace = False)
random_normal = np.array(random_normal)

under_sample_indices = np.concatenate([anomaly, random_normal])

under_sample_data = data.iloc[under_sample_indices,:]
under_sample_data.head()

Unnamed: 0,block_id,first_digits,second_digits,third_digits,message,event_type,block_asterisk,add_ver,dru,trans_fail,rrswe,cerj,pack_rec,Label
291,blk_-3544583377289625738,81109,203521,29,INFO,dfs.FSNamesystem:,NameSystem.allocateBlock:,,,,,,,Anomaly
292,blk_-3544583377289625738,81109,203523,29,INFO,dfs.FSNamesystem:,NameSystem.addStoredBlock:,,,,,,,Anomaly
293,blk_-3544583377289625738,81109,203523,33,INFO,dfs.FSNamesystem:,NameSystem.addStoredBlock:,,,,,,,Anomaly
294,blk_-3544583377289625738,81109,203523,33,INFO,dfs.FSNamesystem:,NameSystem.addStoredBlock:,,,,,,,Anomaly
295,blk_-3544583377289625738,81109,213809,32,INFO,dfs.FSNamesystem:,NameSystem.delete:,,,,,,,Anomaly


In [10]:
X = under_sample_data.drop(columns = ["Label", "first_digits", "second_digits", "third_digits"])
Y = under_sample_data.Label

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

In [11]:
# One hot encoding, to deal with the categorical data

encoder = OneHotEncoder()
encoder.fit(X)
X_train = encoder.transform(X_train)
X_test = encoder.transform(X_test)

In [12]:
# Label encoding, now the data are set for model-fitting.

label_encoder = LabelEncoder()
label_encoder.fit(Y)
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

### Model implementation and training

In [13]:
# Model-fitting

model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

### Model evaluation

In [14]:
# Use the model to predict.

yhat = model.predict(X_test)

In [15]:
accuracy = accuracy_score(y_test, yhat)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 97.20


In [18]:
# Confusion matrix:
tn, fp, fn, tp = confusion_matrix(y_test, yhat).ravel()
(tn, fp, fn, tp)

(79898, 4689, 62, 85108)

In [17]:
# calssification report
c_r = classification_report(y_test, yhat)
print(c_r)

              precision    recall  f1-score   support

           0       1.00      0.94      0.97     84587
           1       0.95      1.00      0.97     85170

    accuracy                           0.97    169757
   macro avg       0.97      0.97      0.97    169757
weighted avg       0.97      0.97      0.97    169757

