In [1]:
import pandas as pd

## Revisar y limpiar los datos de KDD Cup 99

In [2]:
# Load the KDD Cup 99 CSV (the file name suggests it was already cleaned upstream)
# from a path relative to this notebook.
kdd_csv_path = '../Datasets/KDDCup99_Limpio.csv'
df = pd.read_csv(kdd_csv_path)

Revisamos los datos iniciales

In [3]:
# Preview the first five rows to sanity-check the columns and their encoded values.
df.head(n=5)

Unnamed: 0,duration,protocol_type,service,flag,logged_in,num_outbound_cmds,is_host_login,count,srv_count,serror_rate,...,dst_host_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,attack,Attack Type,Is attack
0,0,1,22,9,1,0,0,8,8,0.0,...,9,1.0,0.0,0.11,0.0,0.0,0.0,0,0,0
1,0,1,22,9,1,0,0,8,8,0.0,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0,0,0
2,0,1,22,9,1,0,0,8,8,0.0,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0,0,0
3,0,1,22,9,1,0,0,6,6,0.0,...,39,1.0,0.0,0.03,0.0,0.0,0.0,0,0,0
4,0,1,22,9,1,0,0,6,6,0.0,...,49,1.0,0.0,0.02,0.0,0.0,0.0,0,0,0


Vamos a revisar los valores nulos

In [4]:
# Count missing values per column; every count should be 0 before modelling.
df.isnull().sum()

duration                       0
protocol_type                  0
service                        0
flag                           0
logged_in                      0
num_outbound_cmds              0
is_host_login                  0
count                          0
srv_count                      0
serror_rate                    0
srv_serror_rate                0
same_srv_rate                  0
srv_diff_host_rate             0
dst_host_count                 0
dst_host_same_srv_rate         0
dst_host_diff_srv_rate         0
dst_host_same_src_port_rate    0
dst_host_srv_diff_host_rate    0
dst_host_serror_rate           0
dst_host_srv_serror_rate       0
attack                         0
Attack Type                    0
Is attack                      0
dtype: int64

Vamos a revisar la distribución de clases y la correlación de cada atributo con la variable objetivo

In [11]:
# Class balance of the binary target column (presumably 1 = attack, 0 = normal — confirm).
is_attack_labels = df['Is attack']
is_attack_labels.value_counts()

1    396743
0     97278
Name: Is attack, dtype: int64

In [10]:
# Frequency of each encoded attack category in the multi-class label column.
attack_type_labels = df['Attack Type']
attack_type_labels.value_counts()

1    391458
0     97278
2      4107
3      1126
4        52
Name: Attack Type, dtype: int64

In [15]:
# Number of columns currently in the frame.
len(df.columns)

23

In [16]:
# Absolute correlation (pandas default method) of every column with the binary
# target, strongest first. The number of rows requested is derived from the frame
# width instead of a hard-coded 23, so the full ranking is still shown if the
# column set changes.
df.corrwith(df['Is attack']).abs().nlargest(len(df.columns))

Is attack                      1.000000
Attack Type                    0.948661
attack                         0.879475
logged_in                      0.795282
count                          0.752978
dst_host_count                 0.642110
protocol_type                  0.616601
srv_count                      0.566829
dst_host_same_src_port_rate    0.481458
srv_diff_host_rate             0.364687
same_srv_rate                  0.247405
dst_host_srv_serror_rate       0.227975
serror_rate                    0.227739
dst_host_serror_rate           0.227205
srv_serror_rate                0.227189
dst_host_srv_diff_host_rate    0.204958
flag                           0.155672
service                        0.131723
duration                       0.118014
dst_host_diff_srv_rate         0.115901
dst_host_same_srv_rate         0.109950
num_outbound_cmds                   NaN
is_host_login                       NaN
dtype: float64

Eliminamos los atributos cuya correlación con la variable objetivo es NaN (columnas constantes, sin varianza, que no aportan información)

In [17]:
# Remove the two columns whose correlation with the target came out as NaN in the
# ranking above; they carry no usable signal.
# NOTE: this rebinds `df`, so re-running the cell without reloading raises KeyError.
df = df.drop(columns=['num_outbound_cmds', 'is_host_login'])

In [18]:
# Column count after the drop (two fewer than before).
len(df.columns)

21

## Ahora, vamos a empezar el entrenamiento con Pycaret

In [19]:
!pip install pycaret



In [20]:
from pycaret.classification import *

### 2.1 Intento de Setup y Modelado sin normalización

#### Temporalmente eliminamos la columna `Attack Type` para que no influya en la predicción de `Is attack`

In [22]:
# Feature frame for the first experiment: everything except the multi-class label.
# NOTE(review): the 'attack' column (|corr| ≈ 0.88 with the target in the ranking
# above) is still present here — likely label leakage; confirm before trusting scores.
final_df = df.drop(columns=['Attack Type'])

In [23]:
# Column count of the modelling frame (features plus the target).
len(final_df.columns)

20

In [26]:
# Configure the PyCaret classification experiment with 'Is attack' as the target.
# session_id fixes the random seed so the train/test split and model scores are
# reproducible; 3349 is the id this run originally drew at random (see the
# recorded setup summary).
setup(final_df, target='Is attack', session_id=3349)

Unnamed: 0,Description,Value
0,Session id,3349
1,Target,Is attack
2,Target type,Binary
3,Original data shape,"(494021, 20)"
4,Transformed data shape,"(494021, 20)"
5,Transformed train set shape,"(345814, 20)"
6,Transformed test set shape,"(148207, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple


<pycaret.classification.oop.ClassificationExperiment at 0x1ce0283e310>

In [27]:
# Run PyCaret's model comparison on the experiment configured by the preceding
# setup() call; the cell output is a leaderboard with one row of metrics per model.
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.917
dt,Decision Tree Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.236
rf,Random Forest Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.218
ada,Ada Boost Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.286
gbc,Gradient Boosting Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,7.616
et,Extra Trees Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.49
lightgbm,Light Gradient Boosting Machine,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.123
knn,K Neighbors Classifier,0.9983,0.9995,0.9981,0.9998,0.999,0.9947,0.9947,21.061
nb,Naive Bayes,0.9982,0.9974,1.0,0.9978,0.9989,0.9944,0.9944,0.268
ridge,Ridge Classifier,0.9953,0.0,0.9942,1.0,0.9971,0.9853,0.9854,0.258


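#### Como vimos, el accuracy es sospechosamente alto (1.0 en varios modelos), lo que sugiere fuga de información de la etiqueta; hacemos otro setup quitando además las columnas con correlación más alta (`attack` y `logged_in`)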

In [34]:
# Build the reduced feature frame used by the second experiment. The original
# notebook never defined `other_setup_df` (NameError on a fresh kernel); the column
# set is reconstructed from the recorded output below: `df` without the multi-class
# label and the two columns most correlated with the target.
other_setup_df = df.drop(columns=['Attack Type', 'attack', 'logged_in'])

# Absolute correlation of the remaining columns with the target, strongest first.
other_setup_df.corrwith(other_setup_df['Is attack']).abs().nlargest(len(other_setup_df.columns))

Is attack                      1.000000
count                          0.752978
dst_host_count                 0.642110
protocol_type                  0.616601
srv_count                      0.566829
dst_host_same_src_port_rate    0.481458
srv_diff_host_rate             0.364687
same_srv_rate                  0.247405
dst_host_srv_serror_rate       0.227975
serror_rate                    0.227739
dst_host_serror_rate           0.227205
srv_serror_rate                0.227189
dst_host_srv_diff_host_rate    0.204958
flag                           0.155672
service                        0.131723
duration                       0.118014
dst_host_diff_srv_rate         0.115901
dst_host_same_srv_rate         0.109950
dtype: float64

In [35]:
# Configure a second PyCaret classification experiment on the reduced frame.
# session_id fixes the random seed so the split and scores are reproducible;
# 6816 is the id this run originally drew at random (see the recorded setup summary).
setup(other_setup_df, target='Is attack', session_id=6816)

Unnamed: 0,Description,Value
0,Session id,6816
1,Target,Is attack
2,Target type,Binary
3,Original data shape,"(494021, 18)"
4,Transformed data shape,"(494021, 18)"
5,Transformed train set shape,"(345814, 18)"
6,Transformed test set shape,"(148207, 18)"
7,Numeric features,17
8,Preprocess,True
9,Imputation type,simple


<pycaret.classification.oop.ClassificationExperiment at 0x1ce1e2d2e50>

In [36]:
# Re-run PyCaret's model comparison, now against the reduced-feature experiment
# configured by the preceding setup() call; the output is the per-model leaderboard.
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9966,0.9999,0.9967,0.9991,0.9979,0.9892,0.9892,1.973
rf,Random Forest Classifier,0.9965,0.9998,0.9964,0.9992,0.9978,0.9889,0.9889,7.015
et,Extra Trees Classifier,0.9965,0.9995,0.9964,0.9992,0.9978,0.989,0.989,6.255
dt,Decision Tree Classifier,0.9961,0.9978,0.9963,0.9989,0.9976,0.9879,0.9879,0.422
knn,K Neighbors Classifier,0.9943,0.998,0.994,0.9988,0.9964,0.982,0.982,25.471
gbc,Gradient Boosting Classifier,0.9939,0.9998,0.9929,0.9995,0.9962,0.9808,0.9809,15.003
ada,Ada Boost Classifier,0.9907,0.9995,0.9915,0.997,0.9942,0.9709,0.971,4.049
lr,Logistic Regression,0.9886,0.9965,0.9876,0.9982,0.9929,0.9645,0.9649,9.085
ridge,Ridge Classifier,0.9858,0.0,0.9852,0.997,0.9911,0.9558,0.9563,0.245
lda,Linear Discriminant Analysis,0.9858,0.9959,0.9852,0.9971,0.9911,0.956,0.9565,0.628
