## Model creation for prediction
### Libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Data and model creation

In [4]:
# Read the dataset from a local CSV file and preview its first row.
# NOTE(review): the path is absolute and machine-specific, so this cell only runs where that
# F: drive layout exists — consider a configurable data directory.
df = pd.read_csv(r'F:\Programacion\1.BOOTCAMP\data\trabajo_interdisciplinar\df_ready\whole_encoded.csv')
df.head(1)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,date,time,duration,source,src_port,destination,dest_port,protocol,length,info,attack,flag,lb_protocol
0,0,1,"Jul 13, 2020 15:28:31.152545818 CEST",60.924132,49.202835,10.0.2.15,37232.0,10.0.2.13,80.0,TCP,74,37232 > 80 [SYN] Seq=0 Win=64240 Len=0 MSS=1...,1,SYN,60


In [5]:
# Remove the two 'Unnamed' columns (presumably index columns saved by earlier CSV exports — verify).
# errors='ignore' keeps the cell re-runnable: df is rebound here, so a second run would otherwise
# raise KeyError because the columns are already gone.
df = df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')
df.head(1)

Unnamed: 0,date,time,duration,source,src_port,destination,dest_port,protocol,length,info,attack,flag,lb_protocol
0,"Jul 13, 2020 15:28:31.152545818 CEST",60.924132,49.202835,10.0.2.15,37232.0,10.0.2.13,80.0,TCP,74,37232 > 80 [SYN] Seq=0 Win=64240 Len=0 MSS=1...,1,SYN,60


In [54]:
def counter(counter):
    """
    ---What it does---
    Counter system to show progress of function.
    Adds 1 to the value received and rewrites the current output line with " <value> %".
    ---What it needs---
        + counter: a number; the value displayed is counter + 1.
    ---What it returns---
    Nothing. The text is written to standard output.
    """
    # Imported here so the function works on a fresh kernel: the notebook's import cells
    # do not bring `sys` into scope.
    import sys

    counter += 1
    # The leading '\r' moves the cursor back to the start of the line so each call overwrites the last.
    sys.stdout.write("\r {0} %".format(counter))
    sys.stdout.flush()

In [6]:
def to_ML (df, features=('time', 'duration', 'length', 'lb_protocol'), target='attack',
           test_size=0.33, random_state=42, learning_rate=0.5):
    """
    ---What it does---
    Creates a XGBClassifier model. Prints the class balance, a classification report and the overall
    accuracy (both measured on a held-out test split) to assess performance.
    ---What it needs---
        + df: a df object with the numerical feature columns and the label column.
        + features: names of the feature columns
          (by default 'time', 'duration', 'length' and 'lb_protocol').
        + target: name of the label column (by default 'attack').
        + test_size, random_state: passed to train_test_split (by default 0.33 and 42).
        + learning_rate: passed to XGBClassifier (by default 0.5).
        + The necessary libraries for XGBClassifier to work
            * confusion_matrix (sklearn)
            * train_test_split (sklearn)
            * classification_report (sklearn)
            * XGBClassifier (xgboost)
    ---What it returns---
    The fitted XGBClassifier model. It is trained on the raw (unscaled) feature values passed as a
    plain array, so predict() must receive the same columns, in the same order, also unscaled.
    """

    feature_names = list(features)

    # Splitting of data. The features are handed over as a plain float array, so the fitted model
    # carries no column names and callers can predict from arrays.
    X = df[feature_names].to_numpy(dtype='float64')
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Class balance. .get() and the guarded division keep the report working when class 1
    # (or class 0) is absent from the data.
    balance_ratio = y.value_counts().to_dict()
    total = sum(balance_ratio.values())
    class_1_share = balance_ratio.get(1, 0) * 100 / total if total else 0.0
    print(f'Classes in dataframe {balance_ratio}')
    print(f"Class 1 represents {round(class_1_share, 2)} % of the whole data\n")

    # No feature scaling on purpose: boosted trees split on per-feature thresholds, so min-max
    # scaling gains nothing, and a scaler fitted inside this function would be lost on return,
    # leaving callers unable to prepare new rows the way the model was trained.

    # Creation of the model
    xgb_model = XGBClassifier(learning_rate=learning_rate)
    xgb_model.fit(X_train, y_train)

    # Reporting on performance
    y_pred = xgb_model.predict(X_test)
    m = confusion_matrix(y_test, y_pred)
    # Correct predictions sit on the diagonal; using the trace works for any number of classes.
    accuracy = m.trace() / m.sum()

    print("Classification Report")
    print(classification_report(y_test, y_pred))
    print(f"Overall accuracy: {round(accuracy*100, 2)}%")

    return xgb_model

In [7]:
# Train the classifier on the loaded dataframe (to_ML prints its own performance report);
# the bare name on the last line displays the returned model.
xgb_model = to_ML(df)
xgb_model

Classes in dataframe {0: 377720, 1: 315579}
Class 1 represents 45.52 % of the whole data

Classification Report
              precision    recall  f1-score   support

           0       0.92      0.88      0.90    124691
           1       0.86      0.91      0.89    104098

    accuracy                           0.89    228789
   macro avg       0.89      0.90      0.89    228789
weighted avg       0.90      0.89      0.89    228789

Overall accuracy: 89.43%


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.5, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

## Data for prediction

In [34]:
# Work on a copy; the sampling loop further down reads its rows from df_copy.
df_copy = df.copy()
# Disabled alternative: replace the 'attack' column with an all-NaN column appended at the end.
# Left off because the sampling loop reads the true 'attack' value of each row.
# df_copy = df_copy.drop('attack', axis = 1)
# df_copy.insert(df_copy.shape[1], 'attack', np.nan)
df_copy.columns

Index(['date', 'time', 'duration', 'source', 'src_port', 'destination',
       'dest_port', 'protocol', 'length', 'info', 'attack', 'flag',
       'lb_protocol'],
      dtype='object')

In [55]:
# Draw random rows and classify them one by one, stopping at the first row the model flags as an
# attack (or when the trial budget is used up), then report on the last row drawn.
FEATURE_COLUMNS = ['time', 'duration', 'length', 'lb_protocol']
n_pruebas = 100000                   # maximum number of rows to try
rng = np.random.default_rng(42)      # seeded so a fresh run draws the same rows

t = 0            # predictions so far that matched the true label
row_pos = None   # position of the last sampled row
y_pred = None    # last predicted class; reset so a stale value from an earlier run is never reported

for e in range(n_pruebas):
    # Sample a position within df_copy itself and look it up by position, so this does not
    # depend on the index labels running 0..n-1.
    row_pos = int(rng.integers(len(df_copy)))
    to_predict = df_copy.iloc[row_pos]
    real_class = to_predict['attack']

    # predict() takes a 2-D (n_samples, n_features) input; shape the single row as (1, n_features)
    # rather than passing a 1-D Series.
    sample_X = to_predict[FEATURE_COLUMNS].to_numpy(dtype='float64').reshape(1, -1)
    y_pred = int(xgb_model.predict(sample_X)[0])

    if y_pred == real_class:
        t += 1
    # Shows the share of correct predictions relative to n_pruebas; counter() adds 1 to the
    # value it is given before printing it.
    counter(t*100/n_pruebas)
    if y_pred == 1:
        break

print('\n')
if y_pred == 1:
    print(f'{row_pos} is an attack')
else:
    print(f'{row_pos} is nothing to worry about')

1387 %

553533 is an attack
