# Multivariate logit regression

In [68]:
# data wrangling:
import pandas as pd
import numpy as np

# data_preparation
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# modeling
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE


# model validation
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


## Dataset

In [8]:
data = pd.read_csv("encoded_labelled.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11170084 entries, 0 to 11170083
Data columns (total 15 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   Unnamed: 0      int64 
 1   block_id        object
 2   first_digits    int64 
 3   second_digits   int64 
 4   third_digits    int64 
 5   message         int64 
 6   event_type      int64 
 7   block_asterisk  int64 
 8   add_ver         int64 
 9   dru             int64 
 10  trans_fail      int64 
 11  rrswe           int64 
 12  cerj            int64 
 13  pack_rec        int64 
 14  Label           int64 
dtypes: int64(14), object(1)
memory usage: 1.2+ GB


In [4]:
data.head(5)

Unnamed: 0.1,Unnamed: 0,block_id,first_digits,second_digits,third_digits,message,event_type,block_asterisk,add_ver,dru,trans_fail,rrswe,cerj,pack_rec,Label
0,0,blk_-1608999687919862906,81109,203518,35,0,7,1,-1,-1,-1,-1,-1,-1,1
1,1,blk_-1608999687919862906,81109,203519,29,0,7,0,-1,-1,-1,-1,-1,-1,1
2,2,blk_-1608999687919862906,81109,203519,30,0,7,0,-1,-1,-1,-1,-1,-1,1
3,3,blk_-1608999687919862906,81109,203519,31,0,7,0,-1,-1,-1,-1,-1,-1,1
4,4,blk_-1608999687919862906,81109,203521,19,0,7,4,-1,-1,-1,-1,-1,-1,1


In [9]:
data = data.drop(columns=["Unnamed: 0"])
data.head(5)

Unnamed: 0,block_id,first_digits,second_digits,third_digits,message,event_type,block_asterisk,add_ver,dru,trans_fail,rrswe,cerj,pack_rec,Label
0,blk_-1608999687919862906,81109,203518,35,0,7,1,-1,-1,-1,-1,-1,-1,1
1,blk_-1608999687919862906,81109,203519,29,0,7,0,-1,-1,-1,-1,-1,-1,1
2,blk_-1608999687919862906,81109,203519,30,0,7,0,-1,-1,-1,-1,-1,-1,1
3,blk_-1608999687919862906,81109,203519,31,0,7,0,-1,-1,-1,-1,-1,-1,1
4,blk_-1608999687919862906,81109,203521,19,0,7,4,-1,-1,-1,-1,-1,-1,1


In [13]:
# Count the number of anormaly, and normal.
normal = data[data.Label == 1].index
anomaly = data[data.Label == 0].index

print("normal: ", len(normal), "anomaly:", len(anomaly))

normal:  10887157 anomaly: 282927


In [106]:
# Avoiding find a needle in the haystack

# from all normal observations, randomly select observations equal to number of anomaly observations
random_normal = np.random.choice(normal, len(anomaly), replace = False)
random_normal = np.array(random_normal)

under_sample_indices = np.concatenate([anomaly, random_normal])

under_sample_data = data.iloc[under_sample_indices,:]
under_sample_data

Unnamed: 0,block_id,first_digits,second_digits,third_digits,message,event_type,block_asterisk,add_ver,dru,trans_fail,rrswe,cerj,pack_rec,Label
291,blk_-3544583377289625738,81109,203521,29,0,7,1,-1,-1,-1,-1,-1,-1,0
292,blk_-3544583377289625738,81109,203523,29,0,7,0,-1,-1,-1,-1,-1,-1,0
293,blk_-3544583377289625738,81109,203523,33,0,7,0,-1,-1,-1,-1,-1,-1,0
294,blk_-3544583377289625738,81109,203523,33,0,7,0,-1,-1,-1,-1,-1,-1,0
295,blk_-3544583377289625738,81109,213809,32,0,7,2,-1,-1,-1,-1,-1,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6917734,blk_-1758301402364512991,81111,44322,35,0,7,2,-1,-1,-1,-1,-1,-1,1
3440456,blk_-3625523464086656995,81110,165426,13650,0,3,-1,-1,-1,-1,2,-1,-1,1
89749,blk_3167974888615449726,81109,204159,368,0,4,-1,-1,-1,-1,-1,-1,1,1
5287081,blk_4342855459971297868,81110,220928,19,0,6,-1,-1,0,-1,-1,-1,-1,1


In [107]:
rob_scaler = RobustScaler()
under_sample_data["scaled_event_type"] = rob_scaler.fit_transform(under_sample_data["event_type"].values.reshape(-1,1))
under_sample_data.drop("event_type", axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  under_sample_data["scaled_event_type"] = rob_scaler.fit_transform(under_sample_data["event_type"].values.reshape(-1,1))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [122]:
# Split the data into training parameters and the labels
X = under_sample_data.drop(columns = ["block_id", "Label", "first_digits", "second_digits", "third_digits"])
Y = under_sample_data.Label

In [123]:
# Split the data into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

## Model implementation & training

In [124]:
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [125]:
data.drop(columns = ["block_id", "Label", "first_digits", "second_digits", "third_digits"])
y_predict = model.predict(X_test)

## Model evaluation

In [126]:
# Accuracy
accuracy = accuracy_score(y_test, yhat)
print(accuracy)

0.5222465052987506


In [127]:
# calssification report
c_r = classification_report(y_test, y_predict)
print(c_r)

              precision    recall  f1-score   support

           0       0.55      0.50      0.52     84587
           1       0.54      0.59      0.56     85170

    accuracy                           0.54    169757
   macro avg       0.54      0.54      0.54    169757
weighted avg       0.54      0.54      0.54    169757



In [121]:
# Confusion matrix:
c_m = confusion_matrix(y_test, y_predict)
c_m

array([[47725, 36862],
       [44401, 40769]], dtype=int64)

It seems like our model's predictability is only a little bit better guessing randomly. Since we used ordinal encoding in the data wrangling, let's try if using one-hot encoding instead will enhance the performance of our model. 

In [85]:
df1 = pd.read_csv("labelled.csv", low_memory=False)

In [86]:
df1.head(5)

Unnamed: 0.1,Unnamed: 0,block_id,first_digits,second_digits,third_digits,message,event_type,block_asterisk,add_ver,dru,trans_fail,rrswe,cerj,pack_rec,Label
0,0,blk_-1608999687919862906,81109,203518,35,INFO,dfs.FSNamesystem:,NameSystem.allocateBlock:,,,,,,,Normal
1,1,blk_-1608999687919862906,81109,203519,29,INFO,dfs.FSNamesystem:,NameSystem.addStoredBlock:,,,,,,,Normal
2,2,blk_-1608999687919862906,81109,203519,30,INFO,dfs.FSNamesystem:,NameSystem.addStoredBlock:,,,,,,,Normal
3,3,blk_-1608999687919862906,81109,203519,31,INFO,dfs.FSNamesystem:,NameSystem.addStoredBlock:,,,,,,,Normal
4,4,blk_-1608999687919862906,81109,203521,19,INFO,dfs.FSNamesystem:,ask,,,,,,,Normal


In [77]:
# labeled_df.loc[labeled_df.Label == 'Normal', 'Label'] = 1
# labeled_df.loc[labeled_df.Label == 'Anomaly', 'Label'] = 0
# labeled_df.head()

Unnamed: 0.1,Unnamed: 0,block_id,first_digits,second_digits,third_digits,message,event_type,cerj,Label
0,0,blk_-3842070622043972712,81109,204530,526,INFO,dfs.DataNode$BlockReceiver:,Receiving,0
1,1,blk_-3842070622043972712,81109,204530,536,INFO,dfs.DataNode$BlockReceiver:,Receiving,0
2,2,blk_-3842070622043972712,81109,204530,543,INFO,dfs.DataNode$BlockReceiver:,Receiving,0
3,3,blk_7621437832633701631,81109,204553,525,INFO,dfs.DataNode$BlockReceiver:,Receiving,0
4,4,blk_7621437832633701631,81109,204553,532,INFO,dfs.DataNode$BlockReceiver:,Receiving,0


In [87]:
df1 = df1.drop(columns=["Unnamed: 0"])
df1.head()

Unnamed: 0,block_id,first_digits,second_digits,third_digits,message,event_type,block_asterisk,add_ver,dru,trans_fail,rrswe,cerj,pack_rec,Label
0,blk_-1608999687919862906,81109,203518,35,INFO,dfs.FSNamesystem:,NameSystem.allocateBlock:,,,,,,,Normal
1,blk_-1608999687919862906,81109,203519,29,INFO,dfs.FSNamesystem:,NameSystem.addStoredBlock:,,,,,,,Normal
2,blk_-1608999687919862906,81109,203519,30,INFO,dfs.FSNamesystem:,NameSystem.addStoredBlock:,,,,,,,Normal
3,blk_-1608999687919862906,81109,203519,31,INFO,dfs.FSNamesystem:,NameSystem.addStoredBlock:,,,,,,,Normal
4,blk_-1608999687919862906,81109,203521,19,INFO,dfs.FSNamesystem:,ask,,,,,,,Normal


In [89]:
# Count the number of anormaly, and normal.
normal = df1[df1.Label == "Normal"].index
anomaly = df1[df1.Label == "Anomaly"].index

print("normal: ", len(normal), "anomaly:", len(anomaly))

normal:  10887157 anomaly: 282927


In [91]:
# Avoiding find a needle in the haystack

# from all normal observations, randomly select observations equal to number of anomaly observations
random_normal = np.random.choice(normal, len(anomaly), replace = False)
random_normal = np.array(random_normal)

under_sample_indices = np.concatenate([anomaly, random_normal])

under_sample_data = df1.iloc[under_sample_indices,:]
under_sample_data.head()

Unnamed: 0,block_id,first_digits,second_digits,third_digits,message,event_type,block_asterisk,add_ver,dru,trans_fail,rrswe,cerj,pack_rec,Label
291,blk_-3544583377289625738,81109,203521,29,INFO,dfs.FSNamesystem:,NameSystem.allocateBlock:,,,,,,,Anomaly
292,blk_-3544583377289625738,81109,203523,29,INFO,dfs.FSNamesystem:,NameSystem.addStoredBlock:,,,,,,,Anomaly
293,blk_-3544583377289625738,81109,203523,33,INFO,dfs.FSNamesystem:,NameSystem.addStoredBlock:,,,,,,,Anomaly
294,blk_-3544583377289625738,81109,203523,33,INFO,dfs.FSNamesystem:,NameSystem.addStoredBlock:,,,,,,,Anomaly
295,blk_-3544583377289625738,81109,213809,32,INFO,dfs.FSNamesystem:,NameSystem.delete:,,,,,,,Anomaly


In [98]:
X = under_sample_data.drop(columns = ["Label", "first_digits", "second_digits", "third_digits"])
Y = under_sample_data.Label

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

In [99]:
encoder = OneHotEncoder()
encoder.fit(X)
X_train = encoder.transform(X_train)

In [100]:
X_test = encoder.transform(X_test)

In [101]:
label_encoder = LabelEncoder()
label_encoder.fit(Y)

LabelEncoder()

In [102]:
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

In [103]:
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [104]:
yhat = model.predict(X_test)

In [105]:
accuracy = accuracy_score(y_test, yhat)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 97.25


In [108]:
# Confusion matrix:
c_m = confusion_matrix(y_test, yhat)
c_m

array([[80004,  4583],
       [   82, 85088]], dtype=int64)

In [109]:
# calssification report
c_r = classification_report(y_test, yhat)
print(c_r)

              precision    recall  f1-score   support

           0       1.00      0.95      0.97     84587
           1       0.95      1.00      0.97     85170

    accuracy                           0.97    169757
   macro avg       0.97      0.97      0.97    169757
weighted avg       0.97      0.97      0.97    169757



In [110]:
tn, fp, fn, tp = confusion_matrix(y_test, yhat).ravel()
(tn, fp, fn, tp)

(80004, 4583, 82, 85088)

In [137]:
X = df1.drop(columns = ["Label","block_id", "first_digits", "second_digits", "third_digits"])
Y = df1.Label

In [138]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

In [139]:
encoder = OneHotEncoder()
encoder = encoder.fit(X_train)

X_train = encoder.transform(X_train)
X_test = encoder.transform(X_test)

label_encoder = LabelEncoder()
label_encoder.fit(y_train)

y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.fit_transform(y_train)

In [140]:
model = LogisticRegression()

In [141]:
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [142]:
yhat = model.predict(X_test)

In [143]:
accuracy = accuracy_score(y_test, yhat)
print('Accuracy: %.2f' % (accuracy*100))

ValueError: Found input variables with inconsistent numbers of samples: [7819058, 3351026]

In [131]:
# Confusion matrix:
c_m = confusion_matrix(y_test, yhat)
c_m

ValueError: Found input variables with inconsistent numbers of samples: [7819058, 3351026]

In [None]:
tn, fp, fn, tp = confusion_matrix(y_test, yhat).ravel()
(tn, fp, fn, tp)

In [None]:
# calssification report
c_r = classification_report(y_test, yhat)
print(c_r)