# Multivariate logit regression

In [47]:
# data wrangling:
import pandas as pd
import numpy as np

# data_preparation
from sklearn.model_selection import train_test_split

# modeling
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE


# model validation
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


## Dataset

In [8]:
# Read the encoded, labelled dataset from disk and print a summary of its
# columns, dtypes and memory footprint.
data = pd.read_csv(filepath_or_buffer="encoded_labelled.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11170084 entries, 0 to 11170083
Data columns (total 15 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   Unnamed: 0      int64 
 1   block_id        object
 2   first_digits    int64 
 3   second_digits   int64 
 4   third_digits    int64 
 5   message         int64 
 6   event_type      int64 
 7   block_asterisk  int64 
 8   add_ver         int64 
 9   dru             int64 
 10  trans_fail      int64 
 11  rrswe           int64 
 12  cerj            int64 
 13  pack_rec        int64 
 14  Label           int64 
dtypes: int64(14), object(1)
memory usage: 1.2+ GB


In [4]:
# Peek at the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,block_id,first_digits,second_digits,third_digits,message,event_type,block_asterisk,add_ver,dru,trans_fail,rrswe,cerj,pack_rec,Label
0,0,blk_-1608999687919862906,81109,203518,35,0,7,1,-1,-1,-1,-1,-1,-1,1
1,1,blk_-1608999687919862906,81109,203519,29,0,7,0,-1,-1,-1,-1,-1,-1,1
2,2,blk_-1608999687919862906,81109,203519,30,0,7,0,-1,-1,-1,-1,-1,-1,1
3,3,blk_-1608999687919862906,81109,203519,31,0,7,0,-1,-1,-1,-1,-1,-1,1
4,4,blk_-1608999687919862906,81109,203521,19,0,7,4,-1,-1,-1,-1,-1,-1,1


In [9]:
# Remove the "Unnamed: 0" column, which carries no feature information.
# errors="ignore" keeps this cell idempotent: re-running it after the column
# is already gone no longer raises a KeyError.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")
data.head(5)

Unnamed: 0,block_id,first_digits,second_digits,third_digits,message,event_type,block_asterisk,add_ver,dru,trans_fail,rrswe,cerj,pack_rec,Label
0,blk_-1608999687919862906,81109,203518,35,0,7,1,-1,-1,-1,-1,-1,-1,1
1,blk_-1608999687919862906,81109,203519,29,0,7,0,-1,-1,-1,-1,-1,-1,1
2,blk_-1608999687919862906,81109,203519,30,0,7,0,-1,-1,-1,-1,-1,-1,1
3,blk_-1608999687919862906,81109,203519,31,0,7,0,-1,-1,-1,-1,-1,-1,1
4,blk_-1608999687919862906,81109,203521,19,0,7,4,-1,-1,-1,-1,-1,-1,1


In [13]:
# Gather the index labels of each class (Label == 1 -> normal,
# Label == 0 -> anomaly) and report how many rows each class has.
is_normal = data["Label"] == 1
is_anomaly = data["Label"] == 0

normal = data.loc[is_normal].index
anomaly = data.loc[is_anomaly].index

print("normal: ", len(normal), "anomaly:", len(anomaly))

normal:  10887157 anomaly: 282927


In [35]:
# Avoid looking for a needle in a haystack: balance the two classes by
# under-sampling the normal observations down to the number of anomalies.

# Seeded generator so the same normal rows are drawn on every run.
rng = np.random.default_rng(seed=0)

# From all normal observations, draw (without replacement) as many index
# labels as there are anomaly observations.
random_normal = rng.choice(normal.to_numpy(), size=len(anomaly), replace=False)

under_sample_indices = np.concatenate([anomaly, random_normal])

# `normal` and `anomaly` were taken from `data.index`, i.e. they are index
# labels, so select rows by label (.loc) rather than by position (.iloc).
under_sample_data = data.loc[under_sample_indices, :]
under_sample_data

Unnamed: 0,block_id,first_digits,second_digits,third_digits,message,event_type,block_asterisk,add_ver,dru,trans_fail,rrswe,cerj,pack_rec,Label
291,blk_-3544583377289625738,81109,203521,29,0,7,1,-1,-1,-1,-1,-1,-1,0
292,blk_-3544583377289625738,81109,203523,29,0,7,0,-1,-1,-1,-1,-1,-1,0
293,blk_-3544583377289625738,81109,203523,33,0,7,0,-1,-1,-1,-1,-1,-1,0
294,blk_-3544583377289625738,81109,203523,33,0,7,0,-1,-1,-1,-1,-1,-1,0
295,blk_-3544583377289625738,81109,213809,32,0,7,2,-1,-1,-1,-1,-1,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2685554,blk_-2132225618977982593,81110,104413,34,0,7,0,-1,-1,-1,-1,-1,-1,1
8252708,blk_-6911468655968473335,81111,65346,19,0,6,-1,-1,0,-1,-1,-1,-1,1
10682817,blk_-6674274565364658925,81111,101840,26550,0,3,-1,-1,-1,-1,1,-1,-1,1
7325903,blk_329657882818696792,81111,42348,19574,0,4,-1,-1,-1,-1,-1,-1,0,1


In [36]:
# Separate the target column from the predictors. "block_id" is an
# identifier string and "Label" is the target, so neither is kept in X.
Y = under_sample_data["Label"]
X = under_sample_data.drop(columns=["block_id", "Label"])

In [37]:
# Hold out 30% of the under-sampled rows for testing; the fixed random_state
# makes the split repeatable.
# NOTE(review): the split is row-wise, so rows sharing a block_id can land in
# both the training and the test set — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)

## Model implementation & training

In [38]:
# Fit a logistic regression with scikit-learn's default settings on the
# training split.
# NOTE(review): the features are on very different scales (e.g. first_digits
# around 81109 next to flag columns around -1..1) — consider standardising
# them before fitting; TODO confirm whether this affects convergence.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

LogisticRegression()

In [44]:
# Predict the class label of every held-out observation.
y_predict = model.predict(X=X_test)

## Model evaluation

In [45]:
# Accuracy: fraction of test observations whose predicted label matches the
# true label. Uses `y_predict`, the predictions computed from `X_test`
# (the previous `yhat` is not defined anywhere in this notebook).
accuracy = accuracy_score(y_test, y_predict)
print(accuracy)

0.5222465052987506


In [49]:
# Classification report: per-class precision, recall, F1 and support.
c_r = classification_report(y_true=y_test, y_pred=y_predict)
print(c_r)

              precision    recall  f1-score   support

           0       0.52      0.57      0.54     84587
           1       0.53      0.48      0.50     85170

    accuracy                           0.52    169757
   macro avg       0.52      0.52      0.52    169757
weighted avg       0.52      0.52      0.52    169757



In [50]:
# Confusion matrix: rows are the true labels, columns the predicted labels.
c_m = confusion_matrix(y_true=y_test, y_pred=y_predict)
c_m

array([[47820, 36767],
       [44335, 40835]], dtype=int64)