## Load the dataset using pandas

In [37]:
import os

import pandas as pd

# Path to the logs CSV. Defaults to the original author's location; set the
# LOGS_DATASET_PATH environment variable to run the notebook on another machine
# without editing this cell.
LOGS_DATASET_PATH = os.environ.get(
    "LOGS_DATASET_PATH",
    "/home/akallada/Desktop/projects/sec/logs_dataset.csv",
)

# Load the full web-server log dataset into a DataFrame.
logs_dataset = pd.read_csv(LOGS_DATASET_PATH)

In [38]:
# Dimensions of the full dataset as a (rows, columns) tuple.
logs_dataset.shape

(95695, 7)

In [39]:
# Work with only the first 10,000 rows out of the ~95k data points.
dataset = logs_dataset.iloc[:10000]

In [40]:
# Check the label distribution of the target column.

dataset['attack'].value_counts()

# Highly imbalanced: in the recorded run only 110 of the 10,000 rows are labelled 1.

0    9890
1     110
Name: attack, dtype: int64

## Use the `sample` function to split the dataset into modeling (train) and unseen (test) sets

In [41]:
# Randomly pick 95% of the rows for modeling (seeded so the split is repeatable).
data = dataset.sample(frac=0.95, random_state=786)
# Everything that was not sampled becomes the unseen set; renumber its rows from 0.
data_unseen = dataset.drop(data.index).reset_index(drop=True)
# Renumber the modeling rows from 0 as well, now that their old index has been used.
data = data.reset_index(drop=True)
print(f'Data for Modeling: {data.shape}')
print(f'Unseen Data For Predictions: {data_unseen.shape}')

Data for Modeling: (9500, 7)
Unseen Data For Predictions: (500, 7)


In [42]:
# Import only the PyCaret classification functions this notebook uses, rather
# than a wildcard import, so it is clear where each name comes from.
from pycaret.classification import (
    compare_models,
    finalize_model,
    load_model,
    predict_model,
    save_model,
    setup,
)

# setup() registers the modeling data and the target column for classification.
# session_id seeds PyCaret's functions for later reproducibility.
# NOTE(review): the data appears to carry an 'Unnamed: 0' column (visible in the
# prediction outputs), presumably a row index saved into the CSV. If so it is
# being used as a feature; consider excluding it (e.g. setup's ignore_features)
# — TODO confirm.
exp_name = setup(data=data, target='attack', session_id=123)


Unnamed: 0,Description,Value
0,session_id,123
1,Target,attack
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(9500, 7)"
5,Missing Values,False
6,Numeric Features,3
7,Categorical Features,3
8,Ordinal Features,False
9,High Cardinality Features,False


In [43]:
# Compare all available models; each model's performance is evaluated with
# 10-fold cross-validation.
# Rank by F1 instead of the default "Accuracy": with roughly 1% positives,
# accuracy is dominated by the majority class (a model with zero recall still
# scores ~0.99), so it is a poor criterion for choosing an attack detector.
best_model = compare_models(sort='F1')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.9995,0.0,0.9548,1.0,0.9755,0.9753,0.9762,0.115
ada,Ada Boost Classifier,0.9994,1.0,0.9548,0.9875,0.9689,0.9686,0.9697,0.857
et,Extra Trees Classifier,0.9992,0.9999,0.9238,1.0,0.9587,0.9584,0.96,1.045
dt,Decision Tree Classifier,0.9991,0.9772,0.9548,0.9607,0.9556,0.9551,0.9562,0.067
gbc,Gradient Boosting Classifier,0.9991,0.9928,0.9548,0.9607,0.9556,0.9551,0.9562,3.123
rf,Random Forest Classifier,0.9989,1.0,0.8929,1.0,0.9401,0.9396,0.9428,0.494
lightgbm,Light Gradient Boosting Machine,0.9983,0.9998,0.8738,0.9653,0.9031,0.9023,0.9105,0.098
knn,K Neighbors Classifier,0.9943,0.8322,0.569,0.8321,0.6677,0.665,0.6812,0.371
lda,Linear Discriminant Analysis,0.9941,0.6008,0.5262,0.86,0.6368,0.6342,0.6609,2.484
lr,Logistic Regression,0.9898,0.1478,0.0,0.0,0.0,0.0,0.0,0.85


In [44]:
# The best model is highlighted in yellow in the table above.
# We can also print it to check which model was selected and its parameters.
print(best_model)


RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=123, solver='auto',
                tol=0.001)


In [45]:
# Score best_model with predict_model(); with no `data` argument PyCaret
# evaluates on the hold-out split it set aside during setup() (per the PyCaret docs).
# A specific model can also be built with create_model() and further tuned with tune_model().
predict_model(best_model);


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Ridge Classifier,0.9989,0.9531,0.9062,1.0,0.9508,0.9503,0.9515


In [46]:
# finalize_model() refits the model on the complete dataset, including the
# test/hold-out sample. This assignment must run: later cells use `final_model`
# for prediction and saving, and would raise NameError on a fresh kernel if it
# were left commented out.
final_model = finalize_model(best_model)


In [47]:
# print(final_model)

## Predict on unseen data with the finalized model

In [49]:
# Predict on the 5% of rows held back from modeling (data_unseen) using final_model.
unseen_predictions = predict_model(final_model, data=data_unseen)
# Preview the first rows; the recorded output shows the predicted class in a 'Label' column.
unseen_predictions.head()

Unnamed: 0.1,Unnamed: 0,size,param_number,length,return_code,url,attack,Label
0,5,295,1,26,404,GET /wp-login.php HTTP/1.1,0,0
1,18,295,1,26,404,GET /wp-login.php HTTP/1.1,0,0
2,37,295,1,26,404,GET /wp-login.php HTTP/1.1,0,0
3,39,518,1,24,301,GET /robots.txt HTTP/1.1,0,0
4,57,1432,1,47,200,GET /self.logs/error.log.2015-12-20.gz HTTP/1.1,0,0


In [50]:
# check_metric() evaluates the predictions against the true labels using the specified metric.
# NOTE(review): the classes are heavily imbalanced (see the value_counts output),
# so Accuracy alone can look perfect while attacks are missed; consider also
# checking 'Recall' or 'F1' — TODO confirm the metric names against the PyCaret docs.

from pycaret.utils import check_metric
check_metric(unseen_predictions['attack'], unseen_predictions['Label'], metric = 'Accuracy')

1.0

## Saving the model

In [51]:
# Persist the finalized model under the name 'final_model_web_attacks';
# the recorded output shows the transformation pipeline is saved along with it.
save_model(final_model,'final_model_web_attacks')


Transformation Pipeline and Model Succesfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[], target='attack',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_strat...
                 ('fix_perfect', Remove_100(target='attack')),
                 ('clean_names', Clean_Colum_Names()),
                 ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'),
                 ('dfs', 'passthrough'), ('pca', 'passthrough'),
         

## Loading the model

In [52]:
# Reload the pipeline and model saved as 'final_model_web_attacks'.
# NOTE(review): despite the `_rf` suffix, the best model in the recorded run was a
# RidgeClassifier, not a random forest — consider renaming this variable.
saved_final_rf = load_model('final_model_web_attacks')


Transformation Pipeline and Model Successfully Loaded


In [53]:
# Run the reloaded model on the same unseen rows to check it still predicts after the save/load round trip.
new_prediction = predict_model(saved_final_rf, data=data_unseen)


In [54]:
# Preview the first rows of the reloaded model's predictions.
new_prediction.head()


Unnamed: 0.1,Unnamed: 0,size,param_number,length,return_code,url,attack,Label
0,5,295,1,26,404,GET /wp-login.php HTTP/1.1,0,0
1,18,295,1,26,404,GET /wp-login.php HTTP/1.1,0,0
2,37,295,1,26,404,GET /wp-login.php HTTP/1.1,0,0
3,39,518,1,24,301,GET /robots.txt HTTP/1.1,0,0
4,57,1432,1,47,200,GET /self.logs/error.log.2015-12-20.gz HTTP/1.1,0,0


In [55]:
# Accuracy of the reloaded model's predictions against the true 'attack' labels.
# (check_metric was already imported in an earlier cell; this re-import is redundant but harmless.)
from pycaret.utils import check_metric
check_metric(new_prediction['attack'], new_prediction['Label'], metric = 'Accuracy')

1.0