# Model creation for prediction

## Libraries

In [1]:
import pandas as pd
import numpy as np

In [3]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.1.1-py3-none-win_amd64.whl (54.4 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.1.1


In [4]:
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

In [5]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Data and model creation

In [6]:
# Read the prepared capture data (path is relative to the notebook) and preview the first rows.
csv_path = 'data_final_categorical.csv'
df = pd.read_csv(csv_path)
df.head(5)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0.1,Unnamed: 0,date,time,duration,source,src_port,destination,dest_port,protocol,length,info,type_of_attack,machine,exito,attack,flag,lb_protocol,lb_type_of_attack,lb_machine,lb_flag
0,0,"Jul 13, 2020 15:28:31.152545818 CEST",60.924132,49.202835,10.0.2.15,37232,10.0.2.13,80,TCP,74,37232 > 80 [SYN] Seq=0 Win=64240 Len=0 MSS=1...,Dirbuster,DVL,1,1,SYN,60,1,1,53
1,1,"Jul 13, 2020 15:28:31.153114217 CEST",60.924701,0.000568,10.0.2.13,80,10.0.2.15,37232,TCP,74,"80 > 37232 [SYN, ACK] Seq=0 Ack=1 Win=5792 L...",Dirbuster,DVL,1,1,"SYN, ACK",60,1,1,54
2,2,"Jul 13, 2020 15:28:31.153145026 CEST",60.924731,3.1e-05,10.0.2.15,37232,10.0.2.13,80,TCP,66,37232 > 80 [ACK] Seq=1 Ack=1 Win=64256 Len=0...,Dirbuster,DVL,1,1,ACK,60,1,1,14
3,3,"Jul 13, 2020 15:28:31.153450428 CEST",60.925037,0.000305,10.0.2.15,37232,10.0.2.13,80,TCP,66,"37232 > 80 [FIN, ACK] Seq=1 Ack=1 Win=64256 ...",Dirbuster,DVL,1,1,"FIN, ACK",60,1,1,31
4,4,"Jul 13, 2020 15:28:31.154054557 CEST",60.925641,0.000604,10.0.2.13,80,10.0.2.15,37232,TCP,66,"80 > 37232 [FIN, ACK] Seq=1 Ack=2 Win=5792 L...",Dirbuster,DVL,1,1,"FIN, ACK",60,1,1,31


In [7]:
# Remove the 'Unnamed: 0' column (presumably an index saved by an earlier export - confirm).
# errors='ignore' keeps this cell re-runnable: without it a second run raises KeyError
# because the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.head(1)

Unnamed: 0,date,time,duration,source,src_port,destination,dest_port,protocol,length,info,type_of_attack,machine,exito,attack,flag,lb_protocol,lb_type_of_attack,lb_machine,lb_flag
0,"Jul 13, 2020 15:28:31.152545818 CEST",60.924132,49.202835,10.0.2.15,37232,10.0.2.13,80,TCP,74,37232 > 80 [SYN] Seq=0 Win=64240 Len=0 MSS=1...,Dirbuster,DVL,1,1,SYN,60,1,1,53


In [8]:
def counter(counter):
    """
    ---What it does---
    Counter system to show progress of function.
    Writes the received value plus one, followed by ' %', to stdout. The text starts with a
    carriage return, so consecutive calls overwrite the same output line.
    ---What it needs---
        + counter: a number (anything that supports `+ 1` and str.format).
    ---What it returns---
    Nothing.
    """
    # Imported locally: none of the visible import cells brings `sys` into scope, so the
    # original body failed with NameError on every call.
    import sys

    # NOTE(review): the displayed value is one higher than the argument - confirm this
    # offset is wanted when the caller already passes a percentage.
    counter += 1
    sys.stdout.write("\r {0} %".format(counter))
    sys.stdout.flush()

In [9]:
def to_ML (df):
    """
    ---What it does---
    Creates a XGBClassifier model that predicts the 'exito' column, using only the rows of df
    where attack == 1. Prints the class balance, a classification report and the overall
    accuracy on a held-out test split to assess performance.
    ---What it needs---
        + A df object with the numerical columns 'time', 'duration', 'length', 'lb_protocol',
          'lb_type_of_attack', 'lb_machine' and 'lb_flag', plus the columns 'attack' and 'exito'.
        + The necessary libraries for XGBClassifier to work
            * confusion_matrix (sklearn)
            * MinMaxScaler (sklearn)
            * train_test_split (sklearn)
            * classification_report (sklearn)
            * XGBClassifier (xgboost)
            * numpy (as np)
    ---What it returns---
    The XGBClassifier model with learning_rate = 0.5.
    The model is fitted on MinMax-scaled features and the scaler is not returned, so data
    passed to its predict() later has to be scaled the same way.
    """

    # Splitting of data
    df = df.loc[df.attack ==1]
    # NOTE(review): 'lb_type_of_attack' and 'lb_machine' may reveal the outcome directly;
    # worth checking for target leakage if the reported scores look too good.
    X = df[['time', 'duration', 'length', 'lb_protocol', "lb_type_of_attack", "lb_machine", "lb_flag"]]
    y = df['exito']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    balance_ratio = dict(y.value_counts())
    print(f'Classes in dataframe {balance_ratio}')
    # .get() avoids a KeyError when one of the two classes is absent from the data.
    positives = balance_ratio.get(1, 0)
    binary_total = balance_ratio.get(0, 0) + positives
    share = round(positives*100/binary_total, 2) if binary_total else 0.0
    print(f"Class 1 represents {share} % of the whole data\n")

    # Scaling data (the scaler is fitted on the training split only)
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Creation of the model
    xgb_model = XGBClassifier(learning_rate=0.5)
    xgb_model.fit(X_train, y_train)

    # Reporting on performance
    y_pred = xgb_model.predict(X_test)
    m = confusion_matrix(y_test, y_pred)
    # Accuracy = correctly classified samples (the diagonal) / all samples. Using the trace
    # gives the same value as m[0][0] + m[1][1] for two classes and also works when the
    # matrix is not 2x2.
    accuracy = np.trace(m) / m.sum()

    print("Classification Report")
    print(classification_report(y_test, y_pred))
    print(f"Overall accuracy: {round(accuracy*100, 2)}%")

    return xgb_model

In [11]:
# Train the classifier on the loaded frame; the bare name on the last line makes the
# notebook display the fitted estimator and its parameters.
xgb_model = to_ML(df)
xgb_model

Classes in dataframe {1: 253250, 0: 62329}
Class 1 represents 80.25 % of the whole data

Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     20585
           1       1.00      1.00      1.00     83557

    accuracy                           1.00    104142
   macro avg       1.00      1.00      1.00    104142
weighted avg       1.00      1.00      1.00    104142

Overall accuracy: 99.95%


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.5, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

## Data for prediction

In [None]:
# df.copy() returns an independent frame, so changes to df_copy leave df untouched.
df_copy = df.copy()
# Disabled: these two lines would remove the 'attack' column and re-insert it as the
# last column filled with NaN.
# df_copy = df_copy.drop('attack', axis = 1)
# df_copy.insert(df_copy.shape[1], 'attack', np.nan)
df_copy.columns

In [None]:
# The feature columns must match, in name and order, the ones to_ML trains the model on.
feature_cols = ['time', 'duration', 'length', 'lb_protocol',
                'lb_type_of_attack', 'lb_machine', 'lb_flag']

# to_ML fits a MinMaxScaler on its training split but does not return it. The split depends
# only on the number of rows, test_size and random_state, so the same scaler is rebuilt here
# from the same rows (attack == 1, test_size=0.33, random_state=42).
attack_rows = df_copy.loc[df_copy['attack'] == 1, feature_cols]
train_rows, _ = train_test_split(attack_rows, test_size=0.33, random_state=42)
scaler = MinMaxScaler().fit(train_rows)

t = 0
n_pruebas = 100000
y_pred = None
for e in range (n_pruebas):
    # Positional draw, paired with .iloc below so it works whatever the index labels are.
    random = np.random.randint(len(df_copy.index))

    # Double brackets keep the sample as a one-row frame: predict() needs 2-D input.
    to_predict = df_copy.iloc[[random]]
    # NOTE(review): the model is trained to predict 'exito', so that column is used as the
    # ground truth here (the previous code compared against 'attack') - confirm intent.
    real_class = to_predict['exito'].iloc[0]
    features = scaler.transform(to_predict[feature_cols].astype('double'))

    # predict() returns an array with one element per row; take the single prediction.
    y_pred = xgb_model.predict(features)[0]
    if y_pred == real_class:
        t+=1
    counter(t*100/n_pruebas)
    # Stop at the first sample predicted as class 1; it is reported below.
    if y_pred == 1:
        break

print('\n')
if y_pred == 1:
    print(f'{random} is predicted to be a successful attack')
else:
    print(f'{random} is nothing to worry about')