# Dataset Feature Extraction

In this stage we extract the features that contribute the most to the characterization of the dataset.

In [1]:
import pandas as pd
import pickle as pk
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [2]:
# Read the normalized train/test splits from the pickle file.
# The pickle is expected to hold exactly four objects, in the order unpacked below.
with open("data-normalized.pkl", mode='rb') as pkl_file:
    splits = pk.load(pkl_file)

X_train, y_train, X_test, y_test = splits

In [3]:
# Sanity check: display the shape of every split in load order.
tuple(split.shape for split in (X_train, y_train, X_test, y_test))

((3549974, 47), (3549974,), (508003, 47), (508003,))

Selecting the most important features of the dataset.

In [4]:
%%time

param_grid = {
    "criterion": ['gini', 'entropy'],
    "max_depth": [15, 20, 25],
    "n_estimators": [60, 70, 75],
    "min_samples_split":[0,1,2],
    "min_samples_leaf":[0.2,0.5,1]
}

cf = RandomForestClassifier()
cf_cv = GridSearchCV(estimator=cf, param_grid=param_grid, scoring="f1", cv=5)
cf_cv.fit(X_train, y_train)

cf_cv.best_params_

540 fits failed out of a total of 810.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
270 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\lipey\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\lipey\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 450, in fit
    trees = Parallel(
  File "C:\Users\lipey\anaconda3\lib\site-packages\joblib\parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "C:\Users\lipey\anaconda3\lib\site-packages\joblib\parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Users\lipey\ana

CPU times: total: 8h 24min 30s
Wall time: 8h 48min 31s


{'criterion': 'gini',
 'max_depth': 25,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 70}

In [5]:
# Build the final classifier from the grid-search winner instead of retyping
# the values by hand: the hard-coded n_estimators=75 did not match the best
# value reported by the search (70). best_params_ contains exactly the keys of
# param_grid (criterion, max_depth, min_samples_leaf, min_samples_split,
# n_estimators), so it can be splatted straight into the constructor.
rfc = RandomForestClassifier(verbose=1, random_state=0, **cf_cv.best_params_)

In [None]:
# Train on the training split, then label the held-out test split.
# fit() returns the estimator itself, so the two calls can be chained.
y_pred = rfc.fit(X_train, y_train).predict(X_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [None]:
# classification_report returns a pre-aligned multi-line table. Printing the
# label on the same line shifts the header row and misaligns it with the data
# rows, so the label goes on its own line.
print('Matrix:')
print(classification_report(y_test, y_pred))

In [None]:
# Feature names used to label the importances computed by the forest.
# The list holds 47 entries; position i is paired with feature column i when
# the importance table is built.
# NOTE(review): 'ct_src_ ltm' contains an embedded space -- presumably it
# mirrors the header of the source dataset; confirm before normalizing it.
col_names = ['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur', 'sbytes',
       'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'Sload', 'Dload',
       'Spkts', 'Dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz',
       'dmeansz', 'trans_depth', 'res_bdy_len', 'Sjit', 'Djit', 'Stime',
       'Ltime', 'Sintpkt', 'Dintpkt', 'tcprtt', 'synack', 'ackdat',
       'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login',
       'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm']

In [None]:
# Pair every feature name with its importance score, then rank the rows from
# the most to the least important feature.
importance_table = pd.DataFrame({'feature': col_names, 'importance': rfc.feature_importances_})
fi = importance_table.sort_values(by='importance', ascending=False)

# Show the seven highest-ranked features.
fi.head(7)

In [None]:
from matplotlib import pyplot

# Bar chart of the importance of each feature, indexed by column position.
importance = rfc.feature_importances_
positions = range(len(importance))
pyplot.bar(positions, importance)
pyplot.xlabel('Característica')
pyplot.ylabel('Importância')
pyplot.show()

In [None]:
# Keep only the selected feature columns in both splits. According to the
# positions in col_names the indices map to:
#   0 -> srcip, 36 -> ct_state_ttl, 9 -> sttl, 23 -> dmeansz,
#   8 -> dbytes, 14 -> Sload, 10 -> dttl
selected_columns = [0, 36, 9, 23, 8, 14, 10]
X_train = X_train[:, selected_columns].copy()
X_test = X_test[:, selected_columns].copy()

In [None]:
# Persist the reduced splits for the next stage of the pipeline.
# The pickle module is imported under the alias `pk` at the top of the
# notebook; the bare name `pickle` is undefined and raised a NameError here.
with open('dataset-nids.pkl', mode='wb') as f:
    pk.dump([X_train, y_train, X_test, y_test], f)