In [9]:
import os
import random
from random import randint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline

In [10]:
def remove_no_used_col(df):
    """Return ``df`` without the columns that the later steps do not use.

    The dropped columns are "Unnamed: 0", "InitialIp", "EndIP", "Port" and
    "Proto"; every other column is kept as is.
    """
    unused_columns = ["Unnamed: 0", "InitialIp", "EndIP", "Port", "Proto"]
    return df.drop(columns=unused_columns)


def change_class(x):
    """Translate a label string into a numeric class.

    Returns 1 when ``x`` contains "Botnet", 0 when it contains "Normal"
    ("Botnet" is checked first), and None when it contains neither.
    """
    for marker, code in (("Botnet", 1), ("Normal", 0)):
        if marker in x:
            return code
    return None

    
def rename_labels(df):
    """Return a new frame where "LabelName" is replaced by a numeric "class" column.

    Each "LabelName" value is converted with ``change_class``; the resulting
    "class" column is appended after the existing columns and "LabelName" is
    removed. The frame passed in is not modified.
    """
    numeric_labels = df["LabelName"].apply(change_class)
    # "class" is a Python keyword, so it has to be passed to assign() through a
    # dict rather than as a plain keyword argument.
    return df.assign(**{"class": numeric_labels}).drop(columns=["LabelName"])


def rename_string_column(df):
    """Return ``df`` with its "State" column renamed to "domain"."""
    column_map = {"State": "domain"}
    return df.rename(columns=column_map)


def remove_chars_until(df, k):
    """Return a new frame whose "domain" strings have their first ``k`` characters removed.

    Strings shorter than ``k`` become the empty string. The frame passed in is
    not modified; a copy with the replaced "domain" column is returned.
    """
    # .str[k:] slices every string in one vectorised call and leaves missing
    # values as missing instead of failing on them.
    return df.assign(domain=df["domain"].str[k:])


def drop_rows_empty_string(df):
    """Return only the rows of ``df`` whose "domain" value is not the empty string.

    The original index labels of the kept rows are preserved.
    """
    has_content = df["domain"].ne("")
    return df.loc[has_content]


def save_df_as(df, file_name="test.csv", output_dir="../Datasets/change_of_context/"):
    """Write ``df`` to a CSV file without the index column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to write.
    file_name : str
        Name of the CSV file inside ``output_dir``.
    output_dir : str
        Directory that receives the file. Defaults to the notebook's
        "../Datasets/change_of_context/" folder; the directory is not created
        by this function.

    Returns
    -------
    None
    """
    # os.path.join copes with output_dir given with or without a trailing
    # separator; note that an absolute file_name makes it ignore output_dir.
    target_path = os.path.join(output_dir, file_name)
    df.to_csv(target_path, index=False)
    
    
def calculate_modelsize(df):
    """Return a new frame with a "modelsize" column: the length of each "domain" string.

    The frame passed in is not modified; the new column is appended after the
    existing ones in the returned copy.
    """
    # Vectorised length via the .str accessor instead of a per-row lambda.
    return df.assign(modelsize=df["domain"].str.len())


def deal_empty_string(df):
    """Return a new frame without the rows whose "domain" is the empty string.

    Empty "domain" strings are first turned into NaN and then every row that
    contains a missing value is dropped. Because ``dropna`` looks at all
    columns, rows with a missing value in any other column are removed too.
    The frame passed in is not modified and the kept rows retain their index
    labels.
    """
    domain = df["domain"]
    blanked = domain.mask(domain == "", np.nan)
    return df.assign(domain=blanked).dropna()


def random_position_string(x, maxlen):
    """Return a window of ``maxlen`` consecutive characters taken from ``x``.

    When ``x`` has at most ``maxlen`` characters it is returned unchanged.
    Otherwise the window start is drawn with ``randint`` from every position
    that still leaves room for ``maxlen`` characters.
    """
    surplus = len(x) - maxlen
    if surplus <= 0:
        return x
    start = randint(0, surplus)
    return x[start:start + maxlen]


def select_random_string_size_of(df, maxlen):
    """Return a new frame whose "domain" strings are cut to at most ``maxlen`` characters.

    Every "domain" value is passed through ``random_position_string`` with the
    given ``maxlen``. The frame passed in is not modified; a copy with the
    replaced "domain" column is returned.
    """
    sampled = df["domain"].apply(random_position_string, maxlen=maxlen)
    return df.assign(domain=sampled)

In [11]:
# Load the raw CSV; the cells below only ever work on copies of this frame.
ctu19 = pd.read_csv("../Datasets/ctu19_result.csv")

In [4]:
# Preview the first 10 rows of the raw frame.
ctu19.head(10)

Unnamed: 0.1,Unnamed: 0,InitialIp,EndIP,Port,Proto,State,LabelName
0,0,0.0.0.0,10.0.2.112,,arp,2,Botnet
1,1,00:00:00:00:00:00,00:00:00:00:00:00,,llc,1,Botnet
2,2,10.0.2.112,1.143.41.48,4451.0,udp,1,Botnet
3,3,10.0.2.112,1.161.56.125,4692.0,udp,960F0f0f0f0c0,Botnet
4,4,10.0.2.112,1.34.140.54,1016.0,udp,1,Botnet
5,5,10.0.2.112,1.34.176.189,25.0,tcp,"88*Y,Y0y*y*y*H*y*Y*H*Y+Y*y*y*y*y*Y0Y*Y+Y*y*y*y...",Botnet
6,6,10.0.2.112,1.34.176.189,80.0,tcp,"22,b,S*S,b,S*S,b,S*S,b,S*S,b,S*S,b,S*S,b,S*S,b...",Botnet
7,7,10.0.2.112,1.34.231.77,25.0,tcp,"88,Z*y00Y.Y*Y0H0h0Y+Y000Y*Y0y*Y,Y*Y*y*Y0y*y*h*...",Botnet
8,8,10.0.2.112,1.34.231.77,80.0,tcp,"93,z+w,I,w,S*Z,z,Z00000Z,Z0Z,z,Z*Z,I,Z0W,z,z,W...",Botnet
9,9,10.0.2.112,1.4.173.240,7661.0,udp,4,Botnet


In [5]:
# Column dtypes and non-null counts of the raw frame.
ctu19.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72463 entries, 0 to 72462
Data columns (total 7 columns):
Unnamed: 0    72463 non-null int64
InitialIp     72463 non-null object
EndIP         72463 non-null object
Port          72229 non-null object
Proto         72463 non-null object
State         72463 non-null object
LabelName     72463 non-null object
dtypes: int64(1), object(6)
memory usage: 3.9+ MB


In [6]:
# Number of rows per distinct "LabelName" value.
ctu19["LabelName"].value_counts()

Botnet    67551
Normal     4912
Name: LabelName, dtype: int64

In [7]:
# Shape of the subset whose "State" string is longer than 3 characters, i.e.
# the rows that still have content once the first 3 characters are removed.
ctu19[ctu19["State"].str.len() > 3].shape

(27545, 7)

In [10]:
# Build and save "ctu19_to_test_RF.csv" from a copy of the raw frame:
# drop the unused columns, convert "LabelName" into "class", rename "State" to
# "domain", remove the first 3 characters of every "domain" string, drop the
# rows left with an empty string, add the "modelsize" column and write the
# result to disk.
(ctu19.copy()
     .pipe(remove_no_used_col)
     .pipe(rename_labels)
     .pipe(rename_string_column)
     .pipe(remove_chars_until, 3)
     .pipe(drop_rows_empty_string)
     .pipe(calculate_modelsize)
     .pipe(save_df_as, "ctu19_to_test_RF.csv"))

(27545, 3)

In [14]:
# Seed the generator behind randint so the exported random windows are the
# same on every run of this cell.
SEED = 42
random.seed(SEED)

sizes_string = [10, 25, 75]

# The cleaning steps do not depend on the window size, so run them once
# instead of once per size.
lstm_base = (ctu19.copy()
     .pipe(remove_no_used_col)
     .pipe(rename_labels)
     .pipe(rename_string_column)
     .pipe(remove_chars_until, 3)
     .pipe(drop_rows_empty_string))

for maxlen in sizes_string:
    # Start every size from its own copy of the cleaned frame, so each export
    # samples from the full-length strings even if a helper modifies the frame
    # it receives.
    (lstm_base.copy()
         .pipe(select_random_string_size_of, maxlen)
         .pipe(save_df_as, "ctu19_for_LSTM_random_pos_size_" + str(maxlen) + ".csv"))