In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, classification_report, accuracy_score

In [2]:
df = pd.read_csv("dataset/clean_df.csv")

In [3]:
df.columns

Index(['Unnamed: 0', 'DIRECTION', 'IN_PKTS', 'LABEL', 'PROTOCOL_MAP_gre',
       'PROTOCOL_MAP_icmp', 'PROTOCOL_MAP_ipv6_icmp', 'PROTOCOL_MAP_tcp',
       'PROTOCOL_MAP_udp', 'L7_PROTO_NAME_AFP',
       ...
       'L7_PROTO_NAME_WEBEX', 'L7_PROTO_NAME_WHATSAPP',
       'L7_PROTO_NAME_WHATSAPPFILES', 'L7_PROTO_NAME_WHOIS_DAS',
       'L7_PROTO_NAME_WIREGUARD', 'L7_PROTO_NAME_WSD', 'L7_PROTO_NAME_XBOX',
       'L7_PROTO_NAME_XDMCP', 'L7_PROTO_NAME_ZABBIX', 'L7_PROTO_NAME_ZOOM'],
      dtype='object', length=141)

In [4]:
len(df.columns)

141

In [5]:
# Drop the column unnamed
df = df.drop(["Unnamed: 0"], axis = 1)
len(df.columns)

140

In [6]:
df.describe()

Unnamed: 0,DIRECTION,IN_PKTS,LABEL,PROTOCOL_MAP_gre,PROTOCOL_MAP_icmp,PROTOCOL_MAP_ipv6_icmp,PROTOCOL_MAP_tcp,PROTOCOL_MAP_udp,L7_PROTO_NAME_AFP,L7_PROTO_NAME_AJP,...,L7_PROTO_NAME_WEBEX,L7_PROTO_NAME_WHATSAPP,L7_PROTO_NAME_WHATSAPPFILES,L7_PROTO_NAME_WHOIS_DAS,L7_PROTO_NAME_WIREGUARD,L7_PROTO_NAME_WSD,L7_PROTO_NAME_XBOX,L7_PROTO_NAME_XDMCP,L7_PROTO_NAME_ZABBIX,L7_PROTO_NAME_ZOOM
count,488370.0,488370.0,488370.0,488370.0,488370.0,488370.0,488370.0,488370.0,488370.0,488370.0,...,488370.0,488370.0,488370.0,488370.0,488370.0,488370.0,488370.0,488370.0,488370.0,488370.0
mean,0.000958,1.333597,1.488744,0.0,0.018173,4e-06,0.872916,0.108907,1.6e-05,1.2e-05,...,0.0,0.0,3.3e-05,6e-06,2e-06,1.2e-05,2.5e-05,8e-06,4e-06,0.0
std,0.030941,1.22363,1.128002,0.0,0.133576,0.002024,0.333067,0.311523,0.004047,0.003505,...,0.0,0.0,0.005724,0.002478,0.001431,0.003505,0.004957,0.002862,0.002024,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,1.386294,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,2.397895,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,12.47505,3.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [7]:
def scale_x(data):
    min_max_scaler = MinMaxScaler()
    return min_max_scaler.fit_transform(data)
    

In [8]:
def store_df(name, X, y):
    compression_opts = dict(method='zip',
                            archive_name=name+'_X.csv')  
    X.to_csv(name+'_X.zip', index=True,
              compression=compression_opts)  
    compression_opts = dict(method='zip',
                            archive_name=name+'_y.csv')  
    y.to_csv(name+'_y.zip', index=True,
              compression=compression_opts)  

In [9]:
random_forest_df = df.copy()

# Get representative sampling

We will get 100k sampling form each label right now each one has ~2M so we will get 5% of each one

In [10]:
# sample_df = df.groupby('LABEL', group_keys=False).apply(lambda x: x.sample(frac=0.05))
sample_df = df.copy()

In [11]:
sample_df.describe()

Unnamed: 0,DIRECTION,IN_PKTS,LABEL,PROTOCOL_MAP_gre,PROTOCOL_MAP_icmp,PROTOCOL_MAP_ipv6_icmp,PROTOCOL_MAP_tcp,PROTOCOL_MAP_udp,L7_PROTO_NAME_AFP,L7_PROTO_NAME_AJP,...,L7_PROTO_NAME_WEBEX,L7_PROTO_NAME_WHATSAPP,L7_PROTO_NAME_WHATSAPPFILES,L7_PROTO_NAME_WHOIS_DAS,L7_PROTO_NAME_WIREGUARD,L7_PROTO_NAME_WSD,L7_PROTO_NAME_XBOX,L7_PROTO_NAME_XDMCP,L7_PROTO_NAME_ZABBIX,L7_PROTO_NAME_ZOOM
count,488370.0,488370.0,488370.0,488370.0,488370.0,488370.0,488370.0,488370.0,488370.0,488370.0,...,488370.0,488370.0,488370.0,488370.0,488370.0,488370.0,488370.0,488370.0,488370.0,488370.0
mean,0.000958,1.333597,1.488744,0.0,0.018173,4e-06,0.872916,0.108907,1.6e-05,1.2e-05,...,0.0,0.0,3.3e-05,6e-06,2e-06,1.2e-05,2.5e-05,8e-06,4e-06,0.0
std,0.030941,1.22363,1.128002,0.0,0.133576,0.002024,0.333067,0.311523,0.004047,0.003505,...,0.0,0.0,0.005724,0.002478,0.001431,0.003505,0.004957,0.002862,0.002024,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,1.386294,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,2.397895,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,12.47505,3.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [12]:
for c in sample_df.columns:
    print(c)

DIRECTION
IN_PKTS
LABEL
PROTOCOL_MAP_gre
PROTOCOL_MAP_icmp
PROTOCOL_MAP_ipv6_icmp
PROTOCOL_MAP_tcp
PROTOCOL_MAP_udp
L7_PROTO_NAME_AFP
L7_PROTO_NAME_AJP
L7_PROTO_NAME_AMAZON
L7_PROTO_NAME_AMONGUS
L7_PROTO_NAME_APPLE
L7_PROTO_NAME_AYIYA
L7_PROTO_NAME_BGP
L7_PROTO_NAME_BITTORRENT
L7_PROTO_NAME_BJNP
L7_PROTO_NAME_BLOOMBERG
L7_PROTO_NAME_CAPWAP
L7_PROTO_NAME_CHECKMK
L7_PROTO_NAME_CISCOSKINNY
L7_PROTO_NAME_CISCOVPN
L7_PROTO_NAME_CITRIX
L7_PROTO_NAME_CLOUDFLARE
L7_PROTO_NAME_COAP
L7_PROTO_NAME_COLLECTD
L7_PROTO_NAME_CORBA
L7_PROTO_NAME_DCE_RPC
L7_PROTO_NAME_DIAMETER
L7_PROTO_NAME_DNP3
L7_PROTO_NAME_DNS
L7_PROTO_NAME_DOH_DOT
L7_PROTO_NAME_DROPBOX
L7_PROTO_NAME_DTLS
L7_PROTO_NAME_EAQ
L7_PROTO_NAME_FACEBOOK
L7_PROTO_NAME_FTP_CONTROL
L7_PROTO_NAME_FTP_DATA
L7_PROTO_NAME_GIT
L7_PROTO_NAME_GOOGLE
L7_PROTO_NAME_GRE
L7_PROTO_NAME_GTP
L7_PROTO_NAME_H323
L7_PROTO_NAME_HOTSPOTSHIELD
L7_PROTO_NAME_HTTP
L7_PROTO_NAME_HTTP_PROXY
L7_PROTO_NAME_IAX
L7_PROTO_NAME_ICMP
L7_PROTO_NAME_ICMPV6
L7_PROTO_NAME_IEC608

In [13]:
sample_df.describe()

Unnamed: 0,DIRECTION,IN_PKTS,LABEL,PROTOCOL_MAP_gre,PROTOCOL_MAP_icmp,PROTOCOL_MAP_ipv6_icmp,PROTOCOL_MAP_tcp,PROTOCOL_MAP_udp,L7_PROTO_NAME_AFP,L7_PROTO_NAME_AJP,...,L7_PROTO_NAME_WEBEX,L7_PROTO_NAME_WHATSAPP,L7_PROTO_NAME_WHATSAPPFILES,L7_PROTO_NAME_WHOIS_DAS,L7_PROTO_NAME_WIREGUARD,L7_PROTO_NAME_WSD,L7_PROTO_NAME_XBOX,L7_PROTO_NAME_XDMCP,L7_PROTO_NAME_ZABBIX,L7_PROTO_NAME_ZOOM
count,488370.0,488370.0,488370.0,488370.0,488370.0,488370.0,488370.0,488370.0,488370.0,488370.0,...,488370.0,488370.0,488370.0,488370.0,488370.0,488370.0,488370.0,488370.0,488370.0,488370.0
mean,0.000958,1.333597,1.488744,0.0,0.018173,4e-06,0.872916,0.108907,1.6e-05,1.2e-05,...,0.0,0.0,3.3e-05,6e-06,2e-06,1.2e-05,2.5e-05,8e-06,4e-06,0.0
std,0.030941,1.22363,1.128002,0.0,0.133576,0.002024,0.333067,0.311523,0.004047,0.003505,...,0.0,0.0,0.005724,0.002478,0.001431,0.003505,0.004957,0.002862,0.002024,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,1.386294,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,2.397895,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,12.47505,3.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [15]:
len(sample_df)
print(len(sample_df)*0.55)
print(len(sample_df)*0.15)
print(len(sample_df)*0.3)

268603.5
73255.5
146511.0


### Train, Test, Validation Split

In [17]:
y = sample_df["LABEL"]
X = sample_df.drop(["LABEL"], axis = 1)

In [25]:
from sklearn.model_selection import train_test_split
X_train_beta, X_test, y_train_beta, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
validation_rate = ((15*70)/100)/100
X_train, X_validation, y_train, y_validation = train_test_split(X_train_beta, y_train_beta, test_size=validation_rate, random_state=42)

In [26]:
print(len(X_test))

146511


## Random Forest Model

In [27]:
rf = RandomForestClassifier(n_estimators = 1000, random_state = 42)

In [28]:
# Entrenamos el modelo
rf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=1000, random_state=42)

In [29]:
# Realizamos predicciones con las metricas de validación
print("We predict validation")
predictions = rf.predict(X_validation)

We predict validation


#### Validación

In [30]:
# Get the results
validation_confusion_matrix = confusion_matrix(y_validation, predictions)
validation_recall_score = recall_score(y_validation, predictions, average=None)
validation_precision_score = precision_score(y_validation, predictions, average=None)
validation_f1_score = f1_score(y_validation, predictions, average=None)
print("Matrix de confusión: ", validation_confusion_matrix)
target_names =[    "Normal flow",
    "SYN Scan - aggressive",
    "Denial of Service R-U-Dead-Yet",
    "Denial of Service Slowloris"]
print(classification_report(y_validation, predictions, target_names = target_names))
print("recall_score: ",recall_score)
print("precision_score: ", precision_score)
print("f1_score: ", f1_score)
print("accuracy: ", accuracy_score(y_validation, predictions))

Matrix de confusión:  [[8019 1077   60   64]
 [  24 9113    1    1]
 [   3    0 7878  420]
 [   0    0  962 8274]]
                                precision    recall  f1-score   support

                   Normal flow       1.00      0.87      0.93      9220
         SYN Scan - aggressive       0.89      1.00      0.94      9139
Denial of Service R-U-Dead-Yet       0.89      0.95      0.92      8301
   Denial of Service Slowloris       0.94      0.90      0.92      9236

                      accuracy                           0.93     35896
                     macro avg       0.93      0.93      0.93     35896
                  weighted avg       0.93      0.93      0.93     35896

recall_score:  <function recall_score at 0x158589ca0>
precision_score:  <function precision_score at 0x158589c10>
f1_score:  <function f1_score at 0x1585898b0>
accuracy:  0.9272342322264319


### Real results

In [184]:
# Realizamos predicciones con las metricas de validación
print("We predict validation")
predictions_real = rf.predict(X_test)

We predict validation


In [185]:
# Get the results
validation_confusion_matrix = confusion_matrix(y_test, predictions_real)
validation_recall_score = recall_score(y_test, predictions_real, average=None)
validation_precision_score = precision_score(y_test, predictions_real, average=None)
validation_f1_score = f1_score(y_test, predictions_real, average=None)
print("Matrix de confusión: ", validation_confusion_matrix)
target_names =[    "Normal flow",
    "SYN Scan - aggressive",
    "Denial of Service R-U-Dead-Yet",
    "Denial of Service Slowloris"]
print(classification_report(y_test, predictions_real, target_names = target_names))
print("recall_score: ",recall_score)
print("precision_score: ", precision_score)
print("f1_score: ", f1_score)
print("accuracy: ", accuracy_score(y_test, predictions_real))

Matrix de confusión:  [[32683  4308   254   221]
 [  101 37440     2     3]
 [   12     0 32145  1689]
 [    0     0  4085 33568]]
                                precision    recall  f1-score   support

                   Normal flow       1.00      0.87      0.93     37466
         SYN Scan - aggressive       0.90      1.00      0.94     37546
Denial of Service R-U-Dead-Yet       0.88      0.95      0.91     33846
   Denial of Service Slowloris       0.95      0.89      0.92     37653

                      accuracy                           0.93    146511
                     macro avg       0.93      0.93      0.93    146511
                  weighted avg       0.93      0.93      0.93    146511

recall_score:  <function recall_score at 0x1515c3ca0>
precision_score:  <function precision_score at 0x1515c3c10>
f1_score:  <function f1_score at 0x1515c38b0>
accuracy:  0.9271385766256458
