In [4]:
import random
from random import randint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import train_test_split

%matplotlib inline

In [7]:
def random_position_string(x, maxlen=75):
    """Return a window of at most ``maxlen`` characters taken from ``x``.

    Parameters
    ----------
    x : str (or any sliceable sequence supporting ``len``)
        The sequence to sample from.
    maxlen : int, optional
        Maximum length of the returned window. Defaults to 75, the value
        that was previously hard-coded.

    Returns
    -------
    ``x`` itself when it already fits in ``maxlen``; otherwise the slice
    ``x[start:start + maxlen]`` where ``start`` is drawn with ``randint``
    from every position that leaves room for a full-length window.

    Raises
    ------
    ValueError
        If ``maxlen`` is negative.
    """
    if maxlen < 0:
        raise ValueError("maxlen must be non-negative")
    if len(x) <= maxlen:
        return x
    # randint is inclusive on both ends, so the last valid start is
    # len(x) - maxlen (the window then ends exactly at the end of x).
    start = randint(0, len(x) - maxlen)
    return x[start:start + maxlen]
    
def change_class(x):
    """Map a textual class label to its numeric code.

    "botnet" becomes 1 and "normal" becomes 0. Any other value yields
    ``None``.
    """
    label_codes = (("botnet", 1), ("normal", 0))
    for label, code in label_codes:
        if x == label:
            return code
    return None

In [8]:
# Base seed for the random sub-string selection, the train/test split and the
# forest. Change it to obtain a different, but still repeatable, set of runs.
SEED = 42
N_RUNS = 10

# Regex character classes counted in "State", one per feature column.
# The order here is the order in which the columns are appended to the frame.
STATE_FEATURE_PATTERNS = (
    # Periodicity
    ("strong_p", "[a-i]"),
    ("weak_p", "[A-I]"),
    ("weak_np", "[r-z]"),
    ("strong_np", "[R-Z]"),
    # Duration
    ("duration_s", "[aArR1dDuU4gGxX7]"),
    ("duration_m", "[bBsS2eEvV5hHyY8]"),
    ("duration_l", "[cCtT3fFwW6iIzZ9]"),
    # Size
    ("size_s", "[a-cA-Cr-tR-T1-3]"),
    ("size_m", "[d-fD-Fu-wU-W4-6]"),
    ("size_l", "[g-iG-Ix-zX-Z7-9]"),
)

# Read the raw dataset once; it does not change between runs, so every
# iteration works on its own copy instead of re-reading the file.
ctu_13_raw = pd.read_csv("../Datasets/ctu-13.labeled.cleaned.csv")

# Seed the module-level generator used by randint so the sub-string
# selection (and therefore every printed report) is repeatable.
random.seed(SEED)

for i in range(N_RUNS):
    ctu_13 = ctu_13_raw.copy()

    # Remove the first 5 characters, then select a sub-string from a random position
    ctu_13["State"] = ctu_13["State"].apply(
        lambda state: random_position_string(state[5:])
    )

    # FEATURE VECTOR: number of matching characters in "State" divided by "modelsize".
    # NOTE(review): the counts come from the truncated "State" while "modelsize"
    # is read unchanged from the file - presumably the length of the original
    # sequence; confirm that this normalisation is intended.
    for column, pattern in STATE_FEATURE_PATTERNS:
        ctu_13[column] = ctu_13["State"].str.count(pattern) / ctu_13["modelsize"]

    # Clean dataset: numeric class labels, drop the non-feature columns, then
    # drop rows with missing values (change_class returns None for labels
    # other than "botnet"/"normal", so those rows are removed here too).
    ctu_13["class"] = ctu_13["class"].apply(change_class)
    ctu_13 = ctu_13.drop(
        ["src","dst","port","proto","label","modelsize","State"], axis=1
    ).dropna()

    # Divide train-test dataset (a distinct but fixed seed per run)
    x = ctu_13.drop('class', axis=1)
    y = ctu_13['class']
    X_train, X_test, y_train, y_test = train_test_split(
        x, y, test_size=0.3, random_state=SEED + i
    )

    # Training
    rfc = RandomForestClassifier(n_estimators=100, random_state=SEED + i)
    rfc.fit(X_train, y_train)

    # Testing
    rfc_pred = rfc.predict(X_test)
    print(confusion_matrix(y_test, rfc_pred))
    print(classification_report(y_test, rfc_pred))

    # Optional: persist the feature table of this run. The run index must be
    # interpolated (str(i)), otherwise every run overwrites the same file.
    #name = "../Datasets/Random_pos/ctu_13_without_first_5_charc_and_rand_pos_maxlen75_run"+str(i)+".csv"
    #ctu_13.to_csv(name, index=False)

[[ 606  244]
 [  99 1771]]
              precision    recall  f1-score   support

           0       0.86      0.71      0.78       850
           1       0.88      0.95      0.91      1870

    accuracy                           0.87      2720
   macro avg       0.87      0.83      0.85      2720
weighted avg       0.87      0.87      0.87      2720

[[ 604  222]
 [  94 1800]]
              precision    recall  f1-score   support

           0       0.87      0.73      0.79       826
           1       0.89      0.95      0.92      1894

    accuracy                           0.88      2720
   macro avg       0.88      0.84      0.86      2720
weighted avg       0.88      0.88      0.88      2720

[[ 598  232]
 [ 120 1770]]
              precision    recall  f1-score   support

           0       0.83      0.72      0.77       830
           1       0.88      0.94      0.91      1890

    accuracy                           0.87      2720
   macro avg       0.86      0.83      0.84    