<a href="https://colab.research.google.com/github/CatarinaL/ddosLogs/blob/master/ddos_logs_logit_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install category_encoders

import numpy as np
import pandas as pd
from scipy.io import arff
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import category_encoders as ce
from category_encoders.wrapper import PolynomialWrapper

Collecting category_encoders
[?25l  Downloading https://files.pythonhosted.org/packages/44/57/fcef41c248701ee62e8325026b90c432adea35555cbc870aff9cfba23727/category_encoders-2.2.2-py2.py3-none-any.whl (80kB)
[K     |████                            | 10kB 15.2MB/s eta 0:00:01[K     |████████▏                       | 20kB 11.6MB/s eta 0:00:01[K     |████████████▏                   | 30kB 7.7MB/s eta 0:00:01[K     |████████████████▎               | 40kB 7.5MB/s eta 0:00:01[K     |████████████████████▎           | 51kB 4.4MB/s eta 0:00:01[K     |████████████████████████▍       | 61kB 4.9MB/s eta 0:00:01[K     |████████████████████████████▍   | 71kB 5.1MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 3.7MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.2.2


  import pandas.util.testing as tm


In [2]:
#/content/drive/MyDrive/datasets/ddosLogs/final-dataset.arff
np.random.seed(123)
file = "/content/drive/MyDrive/datasets/ddosLogs/final-dataset.arff"

data, meta = arff.loadarff(file)
df = pd.DataFrame(data=data)
df.info

<bound method DataFrame.info of          SRC_ADD  DES_ADD  ...  LAST_PKT_RESEVED     PKT_CLASS
0           3.00    24.30  ...         50.021920     b'Normal'
1          15.00    24.15  ...         50.030211     b'Normal'
2          24.15    15.00  ...         50.060221  b'UDP-Flood'
3          24.90     9.00  ...         50.060098     b'Normal'
4          24.80     8.00  ...         50.061864     b'Normal'
...          ...      ...  ...               ...           ...
2160663    24.10    10.00  ...         50.040562     b'Normal'
2160664    24.12    12.00  ...         50.051067     b'Normal'
2160665     1.00    24.10  ...         50.013418     b'Normal'
2160666     5.00    24.50  ...         50.027339     b'Normal'
2160667    15.00    24.15  ...         50.030211     b'Normal'

[2160668 rows x 28 columns]>

In [3]:
df.describe()
df.isnull().values.any()

False

In [4]:
#decode byte strings to remove the "b" before the srting values at cols PKT_CLASS, NODE_NAME_FROM, NODE_NAME_TO, FLAGS, PKT_TYPE 
categoricals = ["NODE_NAME_FROM", "NODE_NAME_TO", "FLAGS", "PKT_TYPE", "PKT_CLASS"]

for categorical in categoricals:
  df[categorical] = df[categorical].apply(lambda x: x.decode('utf-8')).astype("category") 


In [6]:
intgs = ["SRC_ADD", "DES_ADD", "PKT_ID", "FROM_NODE", "TO_NODE", "FID"]
for intg in intgs:
  df[intg] = df[intg].astype('int64')


In [7]:
df.dtypes


SRC_ADD                int64
DES_ADD                int64
PKT_ID                 int64
FROM_NODE              int64
TO_NODE                int64
PKT_TYPE            category
PKT_SIZE             float64
FLAGS               category
FID                    int64
SEQ_NUMBER           float64
NUMBER_OF_PKT        float64
NUMBER_OF_BYTE       float64
NODE_NAME_FROM      category
NODE_NAME_TO        category
PKT_IN               float64
PKT_OUT              float64
PKT_R                float64
PKT_DELAY_NODE       float64
PKT_RATE             float64
BYTE_RATE            float64
PKT_AVG_SIZE         float64
UTILIZATION          float64
PKT_DELAY            float64
PKT_SEND_TIME        float64
PKT_RESEVED_TIME     float64
FIRST_PKT_SENT       float64
LAST_PKT_RESEVED     float64
PKT_CLASS           category
dtype: object

copying the df to encode transform categoricals into dummy variables so I can use them in a logistical regression model, leaving original intact 

In [9]:
df2 = df.copy()

X = df2.iloc[:, 0:-1]
y = df2.iloc[:, [-1]]

In [52]:
X

Unnamed: 0,SRC_ADD,DES_ADD,PKT_ID,FROM_NODE,TO_NODE,PKT_TYPE,PKT_SIZE,FLAGS,FID,SEQ_NUMBER,NUMBER_OF_PKT,NUMBER_OF_BYTE,NODE_NAME_FROM,NODE_NAME_TO,PKT_IN,PKT_OUT,PKT_R,PKT_DELAY_NODE,PKT_RATE,BYTE_RATE,PKT_AVG_SIZE,UTILIZATION,PKT_DELAY,PKT_SEND_TIME,PKT_RESEVED_TIME,FIRST_PKT_SENT,LAST_PKT_RESEVED
0,3,24,389693,21,23,tcp,1540.0,-------,4,11339.0,16091.0,24780100.0,Switch1,Router,35.529786,35.529786,35.539909,0.0,328.240918,505490.0,1540.0,0.236321,0.0,35.519662,35.550032,1.000000,50.021920
1,15,24,201196,23,24,tcp,1540.0,-------,16,6274.0,16092.0,24781700.0,Router,server1,20.176725,20.176725,20.186848,0.0,328.205808,505437.0,1540.0,0.236337,0.0,20.156478,20.186848,1.000000,50.030211
2,24,15,61905,23,22,ack,55.0,-------,16,1930.0,16092.0,885060.0,Router,Switch2,7.049955,7.049955,7.059958,0.0,328.206042,18051.3,55.0,0.008441,0.0,7.039952,7.069962,1.030045,50.060221
3,24,9,443135,23,21,ack,55.0,-------,10,12670.0,16085.0,884675.0,Router,Switch1,39.627970,39.627970,39.637973,0.0,328.064183,18043.5,55.0,0.008437,0.0,39.617967,39.647976,1.030058,50.060098
4,24,8,157335,23,21,ack,55.0,-------,9,4901.0,16088.0,884840.0,Router,Switch1,16.039806,16.039806,16.049810,0.0,328.113525,18046.2,55.0,0.008438,0.0,16.029803,16.059813,1.030054,50.061864
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2160663,24,10,43184,22,10,ack,55.0,-------,11,1343.0,16103.0,885665.0,Switch2,clien-10,5.295040,5.295040,5.305043,0.0,328.561918,18070.9,55.0,0.008446,0.0,5.275034,5.305043,1.030013,50.040562
2160664,24,12,376386,23,22,ack,55.0,-------,13,10991.0,16103.0,885665.0,Router,Switch2,34.508649,34.508649,34.518652,0.0,328.491596,18067.0,55.0,0.008446,0.0,34.498646,34.528655,1.030026,50.051067
2160665,1,24,140121,21,23,tcp,1540.0,-------,2,4374.0,16091.0,24780100.0,Switch1,Router,14.418011,14.418011,14.428134,0.0,328.297855,505578.0,1540.0,0.236321,0.0,14.407888,14.438258,1.000000,50.013418
2160666,5,24,26800,5,21,tcp,1540.0,-------,6,832.0,16091.0,24780100.0,clien-5,Switch1,3.732227,3.732227,3.742350,0.0,328.204637,505434.0,1540.0,0.236321,0.0,3.732227,3.762597,1.000000,50.027339


In [53]:
y

Unnamed: 0,PKT_CLASS
0,Normal
1,Normal
2,UDP-Flood
3,Normal
4,Normal
...,...
2160663,Normal
2160664,Normal
2160665,Normal
2160666,Normal


In [10]:
#split into test and training sets
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)

#dummy variables w/ WoE
# polynomial encoder wrapper for selected categorical variables - weight of evidence encoder
enc = PolynomialWrapper(ce.WOEEncoder(cols=["NODE_NAME_FROM", "NODE_NAME_TO", "FLAGS", "PKT_TYPE"]))

# transform the datasets
X_train_enc = enc.fit_transform(X_train, y_train)
X_test_enc = enc.transform(X_test)

X_train_enc.info()

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1512467 entries, 303819 to 305711
Data columns (total 39 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   SRC_ADD                    1512467 non-null  int64  
 1   DES_ADD                    1512467 non-null  int64  
 2   PKT_ID                     1512467 non-null  int64  
 3   FROM_NODE                  1512467 non-null  int64  
 4   TO_NODE                    1512467 non-null  int64  
 5   PKT_SIZE                   1512467 non-null  float64
 6   FID                        1512467 non-null  int64  
 7   SEQ_NUMBER                 1512467 non-null  float64
 8   NUMBER_OF_PKT              1512467 non-null  float64
 9   NUMBER_OF_BYTE             1512467 non-null  float64
 10  PKT_IN                     1512467 non-null  float64
 11  PKT_OUT                    1512467 non-null  float64
 12  PKT_R                      1512467 non-null  float64
 13  PKT_DELA

In [13]:
#TODO try without scaling, try different scalers
#normalize 

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_enc)
X_test_scaled = scaler.fit_transform(X_test_enc)

In [14]:
X_test_scaled

array([[ 1.11952814, -1.19607929,  1.27748827, ..., -0.24040455,
        -0.019877  , -0.6964335 ],
       [-0.49851185,  0.71197053, -0.87051728, ..., -0.22056381,
        -0.019877  ,  1.28836133],
       [-0.38293756,  0.71197053,  0.35977584, ..., -0.25674982,
        -0.019877  ,  1.28836133],
       ...,
       [-1.6542547 ,  0.71197053,  0.28308765, ..., -0.24040455,
        -0.019877  ,  1.28836133],
       [ 1.11952814, -1.43458551, -0.88417198, ..., -0.24040455,
        -0.019877  , -0.6964335 ],
       [-1.6542547 ,  0.71197053, -0.64767624, ..., -0.25674982,
        -0.019877  ,  1.28836133]])

The “lbfgs”, “sag” and “newton-cg” solvers only support 
 regularization or no regularization, and are found to converge faster for some high-dimensional data. Setting multi_class to “multinomial” with these solvers learns a true multinomial logistic regression model, which means that its probability estimates should be better calibrated than the default “one-vs-rest” setting.

The “sag” solver uses Stochastic Average Gradient descent (see https://hal.inria.fr/hal-00860051/document). It is faster than other solvers for large datasets, when both the number of samples and the number of features are large.

In [15]:
#instantiate the model - multinomial w/ sag solver #TODO: fix, model is not converging
'''
result with c= 0.05 and max_iter =100
array([[  1207,      0,     75,      0,      0],
       [    13, 581075,     60,      0,      0],
       [     0,    117,   1860,      0,      0],
       [     9,   2490,     88,   1222,      0],
       [     0,   5939,      0,      0,  54046]]) 
'''

logreg = LogisticRegression(solver='sag', C=0.5, multi_class='multinomial', max_iter = 250,
                          random_state=42)

# fit the model with data
logreg.fit(X_train_scaled, np.ravel(y_train, order = 'C')) #np.ravel is used to flatten the target afeature array, to avoid a DataConversionWarning

#
y_pred=logreg.predict(X_test_scaled)

# import the metrics class
from sklearn import metrics
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

  y = column_or_1d(y, warn=True)


array([[  1207,      0,     75,      0,      0],
       [    13, 581075,     60,      0,      0],
       [     0,    117,   1860,      0,      0],
       [     9,   2490,     88,   1222,      0],
       [     0,   5939,      0,      0,  54046]])

In [23]:
target_classes = np.unique(y_test, return_counts=True)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=target_classes[0]))

              precision    recall  f1-score   support

  HTTP-FLOOD       0.98      0.94      0.96      1282
      Normal       0.99      1.00      0.99    581148
      SIDDOS       0.89      0.94      0.92      1977
       Smurf       1.00      0.32      0.49      3809
   UDP-Flood       1.00      0.90      0.95     59985

    accuracy                           0.99    648201
   macro avg       0.97      0.82      0.86    648201
weighted avg       0.99      0.99      0.99    648201



Intuitively, precision is the ability of the classifier not to label as positive a sample that is negative, and recall is the ability of the classifier to find all the positive samples.

The F-measure (Fbeta and F1 measures) can be interpreted as a weighted harmonic mean of the precision and recall. A Fbeta measure reaches its best value at 1 and its worst score at 0. With beta=0, Fbeta and F1 are equivalent, and the recall and the precision are equally important.

The precision_recall_curve computes a precision-recall curve from the ground truth label and a score given by the classifier by varying a decision threshold.

The average_precision_score function computes the average precision (AP) from prediction scores. The value is between 0 and 1 and higher is better. With random predictions, the AP is the fraction of positive samples.

References [Manning2008] and [Everingham2010] present alternative variants of AP that interpolate the precision-recall curve. Currently, average_precision_score does not implement any interpolated variant. References [Davis2006] and [Flach2015] describe why a linear interpolation of points on the precision-recall curve provides an overly-optimistic measure of classifier performance. This linear interpolation is used when computing area under the curve with the trapezoidal rule in auc.

See also https://scikit-learn.org/stable/modules/model_evaluation.html#multiclass-and-multilabel-classification

In [20]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred, average="weighted"))
print("Recall:",metrics.recall_score(y_test, y_pred, average = "weighted"))

Accuracy: 0.9864378487537044
Precision: 0.986643352071877
Recall: 0.9864378487537044


Precision-Recall Trade-off
When building a classification model, we need to consider both precision and recall. It is always possible to increase one value at the expense of the other (recall-focussed model/precision-focussed model)

In [31]:
logreg.predict_log_proba(X_test_scaled)

array([[-12.86770013,  -0.01483056,  -8.94870411,  -5.51344323,
         -4.55101892],
       [-12.12136262,  -0.01447118,  -9.0132526 ,  -5.53980764,
         -4.57440147],
       [-12.43695299,  -0.01464863,  -8.70426895,  -5.51956874,
         -4.56937469],
       ...,
       [-10.58954292,  -0.01452914,  -8.47362062,  -5.5314572 ,
         -4.5824503 ],
       [-13.2414347 ,  -0.0145437 ,  -8.76370039,  -5.49550592,
         -4.58780264],
       [-10.32031095,  -0.01486031,  -8.10861068,  -5.44525921,
         -4.5952371 ]])

In [None]:
#todo: variable analysis to selet new set of features

In [None]:
#todo:
#fit different models and compare through a loop

In [None]:
#todo visualizations