In [1]:
import time
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from pcap_parse import *

In [2]:
normal_dir = "D:\\Notebooks\\Datasets\\Network_dataset_pcaps\\normal_pcaps"
mal_dir = "D:\\Notebooks\\Datasets\\Network_dataset_pcaps\\normal_attack_pcaps"
truth_dir = "D:\\Notebooks\\Datasets\\truth"

# Create ground truth dataframe

In [3]:
files = os.listdir(truth_dir)
truth_df = pd.DataFrame()
for dataset in files:
    cur = pd.read_csv(truth_dir+"\\"+dataset)
    truth_df = pd.concat([truth_df, cur], axis=0)
truth_df.sort_values(by=['ts','src_ip','dst_ip','src_port','dst_port'], inplace=True)
truth_df.reset_index(drop=True, inplace=True)

In [4]:
truth_df.head(3)

Unnamed: 0,ts,src_ip,src_port,dst_ip,dst_port,proto,type
0,1556021410,192.168.1.30,42908,192.168.1.103,2046,tcp,scanning
1,1556021410,192.168.1.30,42909,192.168.1.103,2046,tcp,scanning
2,1556021410,192.168.1.30,50567,192.168.1.169,1106,tcp,scanning


# Create dataframes for all malicious categories

In [5]:
skip_to = 0

for x in range(len(os.listdir(mal_dir))-skip_to):
    folder = os.listdir(mal_dir)[x+skip_to]
    df = pd.DataFrame(columns=["p_bytes","label","ts","src_ip","dst_ip","src_port","dst_port","pcap_ver"])
    i = 1
    print("Parsing "+folder)
    for file_name in os.listdir(mal_dir+"\\"+folder):
        print(str(i)+"/"+str(len(os.listdir(mal_dir+"\\"+folder))), end=" : ")
        df = parse_pcap(mal_dir+"\\"+folder+"\\"+file_name, df, 1)
        i += 1
    print("Sorting Dataframe...")
    df.sort_values(by=['ts','src_ip','dst_ip','src_port','dst_port'], inplace=True)
    df.reset_index(drop=True, inplace=True)
    print("Matching ground truths...")
    truths = compare_truth(truth_df[truth_df['type']==folder].reset_index(drop=True), df)
    df['val_label'] = truths
    #print(df.head(3))
    print("Saving Dataframe...")
    confirmed = df[df['val_label']==1].reset_index(drop=True)
    confirmed.to_pickle('D:\\Notebooks\\Datasets\\final\\'+folder+'_final.pkl')

Parsing backdoor
1/1 : Using v1.0
Sorting Dataframe...
Matching ground truths...
Comparing row 0 out of 188864
Comparing row 18886 out of 188864
Comparing row 37772 out of 188864
Comparing row 56658 out of 188864
Comparing row 75544 out of 188864
Comparing row 94430 out of 188864
Comparing row 113316 out of 188864
Comparing row 132202 out of 188864
Comparing row 151088 out of 188864
Comparing row 169974 out of 188864
Comparing row 188860 out of 188864
Saving Dataframe...
Parsing ddos
1/12 : Using v2.4
2/12 : Using v1.0
3/12 : Using v1.0
4/12 : Using v1.0
5/12 : Using v2.4
6/12 : Using v1.0
7/12 : Using v1.0
8/12 : Using v1.0
9/12 : Using v1.0
10/12 : Using v1.0
11/12 : Using v1.0
12/12 : Using v1.0
Sorting Dataframe...
Matching ground truths...
Comparing row 0 out of 6031012
Comparing row 603101 out of 6031012
Comparing row 1206202 out of 6031012
Comparing row 1809303 out of 6031012
Comparing row 2412404 out of 6031012
Comparing row 3015505 out of 6031012
Comparing row 3618606 out of 6

In [13]:
sum = 0
for file in os.listdir('D:\\Notebooks\\Datasets\\final'):
    temp = pd.read_pickle('D:\\Notebooks\\Datasets\\final\\'+file)
    print(file)
    sum+=len(temp)
    #print(temp['val_label'].value_counts())
print(sum)

backdoor_final.pkl
ddos_final.pkl
dos_final.pkl
injection_final.pkl
mitm_final.pkl
password_final.pkl
ransomware_final.pkl
scanning_final.pkl
xss_final.pkl
17470536


In [14]:
test = pd.read_pickle('D:\\Notebooks\\Datasets\\final\\mitm_final.pkl')

In [15]:
test['val_label'].value_counts()

1.0    944
Name: val_label, dtype: int64

# Create dataframe for normal category

In [16]:
df = pd.DataFrame(columns=["p_bytes","label","ts","src_ip","dst_ip","src_port","dst_port","pcap_ver"])
i = 1
for file_name in os.listdir(normal_dir):
    print(str(i)+"/"+str(len(os.listdir(normal_dir))), end=" : ")
    df = parse_pcap(normal_dir+"\\"+file_name, df, 0)
    i+=1

df.sort_values(by=['ts','src_ip','dst_ip','src_port','dst_port'], inplace=True)
df.reset_index(drop=True, inplace=True)

1/15 : Using v1.0
2/15 : Using v1.0
3/15 : Using v1.0
4/15 : Using v1.0
5/15 : Using v1.0
6/15 : Using v1.0
7/15 : Using v1.0
8/15 : Using v1.0
9/15 : Using v1.0
10/15 : Using v1.0
11/15 : Using v1.0
12/15 : Using v1.0
13/15 : Using v1.0
14/15 : Using v2.4
15/15 : Using v2.4


In [17]:
temp = df[df['dst_port']!=0]

In [18]:
temp.reset_index(drop=True, inplace=True)

In [19]:
temp.to_pickle('D:\\Notebooks\\Datasets\\final\\normal_final.pkl')

# Create tokenize function

In [9]:
def tok_150(data):
    toks = list(data[0:150])
    toks += [0] * (150 - len(toks)) # Padding to 150
    #return tf.convert_to_tensor(np.asarray(toks))
    return np.array(toks, dtype=np.int32)

# Create final dataframe

In [4]:
final = pd.DataFrame(columns=["p_bytes","label","ts","src_ip","dst_ip","src_port","dst_port","pcap_ver"])

for file in os.listdir('D:\\Notebooks\\Datasets\\final'):
    temp = pd.read_pickle('D:\\Notebooks\\Datasets\\final\\'+file)
    final = pd.concat([final, temp])

final.reset_index(drop=True, inplace=True)


In [5]:
final.drop('val_label', axis=1, inplace=True)

In [6]:
final.head(3)

Unnamed: 0,p_bytes,label,ts,src_ip,dst_ip,src_port,dst_port,pcap_ver
0,b'\xa4\x91\xb1\x1eW\x90\x00\x0c)[s[\x08\x00E\x...,1,1556436599,192.168.1.37,121.0.0.42,47975,123,1
1,b'\x00\x0c)\xa4\x03\xf4\x00\x0c)[s[\x08\x00E\x...,1,1556436603,192.168.1.37,192.168.1.193,4444,49178,1
2,b'\x00\x0c)[s[\x00\x0c)\xa4\x03\xf4\x08\x00E\x...,1,1556436611,192.168.1.193,192.168.1.37,49180,8080,1


In [7]:
final.to_pickle('D:\\Notebooks\\Datasets\\complete_final.pkl')

In [7]:
final['label'].value_counts()

1    17470536
0     9565467
Name: label, dtype: int64

In [8]:
final.drop(final[final['label'] == 1].sample(n=12470536).index, inplace=True)
final.drop(final[final['label'] == 0].sample(n=4565467).index, inplace=True)
final.reset_index(drop=True, inplace=True)

In [9]:
final['label'].value_counts()

1    5000000
0    5000000
Name: label, dtype: int64

In [11]:
final.to_pickle('D:\\Notebooks\\Datasets\\10m_final.pkl')

In [10]:
final['tok150'] = final['p_bytes'].apply(tok_150)

In [11]:
final.to_pickle('D:\\Notebooks\\Datasets\\10m_final_tok.pkl')

# Create deep learning model

In [32]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [13]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [16]:
#final = pd.read_pickle('D:\\Notebooks\\Datasets\\10m_final_tok.pkl')

In [51]:
X_train, X_test, y_train, y_test = train_test_split(final['tok150'], final['label'], test_size=0.2)
X_train = np.stack(X_train.to_numpy())
X_test = np.stack(X_test.to_numpy())
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

In [58]:
y_train = y_train.astype(np.int32)
y_test = y_test.astype(np.int32)

In [60]:
model = keras.Sequential()

model.add(layers.Embedding(input_dim=256, output_dim=8, input_length=150))
model.add(layers.Flatten())

#model.add(layers.Dense(256, activation="relu", input_shape=(len(imp_cols)-1,)))
#model.add(layers.Dense(256, activation="relu"))
model.add(layers.Dense(1, activation="sigmoid"))

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 150, 8)            2048      
_________________________________________________________________
flatten_2 (Flatten)          (None, 1200)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 1201      
Total params: 3,249
Trainable params: 3,249
Non-trainable params: 0
_________________________________________________________________


In [61]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [63]:
hist = model.fit(x=X_train, y=y_train, batch_size=32, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [48]:
#hist = model.fit(x=X_train.astype(np.float32), y=y_train, batch_size=32, epochs=10, validation_data=(X_test.astype(np.float32), y_test))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  hist = model.fit(x=X_train.astype(np.float), y=y_train, batch_size=32, epochs=10, validation_data=(X_test.astype(np.float), y_test))


ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).

In [64]:
score = model.evaluate(X_test, y_test)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: 2.02811829830285e-11
Test accuracy: 1.0


In [66]:
y_pred = model.predict(X_test)

In [70]:
y_pred

array([[1.0000000e+00],
       [1.0000000e+00],
       [1.0000000e+00],
       ...,
       [5.6603840e-19],
       [9.2418814e-26],
       [1.0000000e+00]], dtype=float32)

In [71]:
y_pred_binary = []
for pred in y_pred:
    if pred >= .5:
        y_pred_binary.append(1)
    else:
        y_pred_binary.append(0)    

In [68]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [77]:
confusion_matrix(y_test, y_pred_binary)

array([[ 999598,       0],
       [      0, 1000402]], dtype=int64)

### Since accuracy is 100% maybe there is some identifier in the data that should not be included, such as the bytes indicating the ip addresses

I will change pcap_parse to be deindentify ip addresses

In [5]:
# Remaking the dataframes with no ip addresses in the data

skip_to = 0
for x in range(len(os.listdir(mal_dir))-skip_to):
    folder = os.listdir(mal_dir)[x+skip_to]
    df = pd.DataFrame(columns=["p_bytes","label","ts","src_ip","dst_ip","src_port","dst_port","pcap_ver"])
    i = 1
    print("Parsing "+folder)
    for file_name in os.listdir(mal_dir+"\\"+folder):
        print(str(i)+"/"+str(len(os.listdir(mal_dir+"\\"+folder))), end=" : ")
        df = parse_pcap(mal_dir+"\\"+folder+"\\"+file_name, df, 1, hide_identifiers=True)
        i += 1
    print("Sorting Dataframe...")
    df.sort_values(by=['ts','src_ip','dst_ip','src_port','dst_port'], inplace=True)
    df.reset_index(drop=True, inplace=True)
    print("Matching ground truths...")
    truths = compare_truth(truth_df[truth_df['type']==folder].reset_index(drop=True), df)
    df['val_label'] = truths
    #print(df.head(3))
    print("Saving Dataframe...")
    confirmed = df[df['val_label']==1].reset_index(drop=True)
    confirmed.to_pickle('D:\\Notebooks\\Datasets\\final\\'+folder+'_final_2.pkl')

Parsing backdoor
1/1 : Using v1.0
Sorting Dataframe...
Matching ground truths...
Comparing row 0 out of 188864
Comparing row 18886 out of 188864
Comparing row 37772 out of 188864
Comparing row 56658 out of 188864
Comparing row 75544 out of 188864
Comparing row 94430 out of 188864
Comparing row 113316 out of 188864
Comparing row 132202 out of 188864
Comparing row 151088 out of 188864
Comparing row 169974 out of 188864
Comparing row 188860 out of 188864
Saving Dataframe...
Parsing ddos
1/12 : Using v2.4
2/12 : Using v1.0
3/12 : Using v1.0
4/12 : Using v1.0
5/12 : Using v2.4
6/12 : Using v1.0
7/12 : Using v1.0
8/12 : Using v1.0
9/12 : Using v1.0
10/12 : Using v1.0
11/12 : Using v1.0
12/12 : Using v1.0
Sorting Dataframe...
Matching ground truths...
Comparing row 0 out of 6031012
Comparing row 603101 out of 6031012
Comparing row 1206202 out of 6031012
Comparing row 1809303 out of 6031012
Comparing row 2412404 out of 6031012
Comparing row 3015505 out of 6031012
Comparing row 3618606 out of 6

In [6]:
df = pd.DataFrame(columns=["p_bytes","label","ts","src_ip","dst_ip","src_port","dst_port","pcap_ver"])
i = 1
for file_name in os.listdir(normal_dir):
    print(str(i)+"/"+str(len(os.listdir(normal_dir))), end=" : ")
    df = parse_pcap(normal_dir+"\\"+file_name, df, 0, hide_identifiers=True)
    i+=1
df.sort_values(by=['ts','src_ip','dst_ip','src_port','dst_port'], inplace=True)
df.reset_index(drop=True, inplace=True)

temp = df[df['dst_port']!=0]
temp.reset_index(drop=True, inplace=True)
temp.to_pickle('D:\\Notebooks\\Datasets\\final\\normal_final_2.pkl')

1/15 : Using v1.0
2/15 : Using v1.0
3/15 : Using v1.0
4/15 : Using v1.0
5/15 : Using v1.0
6/15 : Using v1.0
7/15 : Using v1.0
8/15 : Using v1.0
9/15 : Using v1.0
10/15 : Using v1.0
11/15 : Using v1.0
12/15 : Using v1.0
13/15 : Using v1.0
14/15 : Using v2.4
15/15 : Using v2.4


In [10]:
final = pd.DataFrame(columns=["p_bytes","label","ts","src_ip","dst_ip","src_port","dst_port","pcap_ver"])

for file in os.listdir('D:\\Notebooks\\Datasets\\final'):
    temp = pd.read_pickle('D:\\Notebooks\\Datasets\\final\\'+file)
    final = pd.concat([final, temp])

final.reset_index(drop=True, inplace=True)
final.drop('val_label', axis=1, inplace=True)
final.drop(final[final['label'] == 1].sample(n=12470536).index, inplace=True)
final.drop(final[final['label'] == 0].sample(n=4565467).index, inplace=True)
final.reset_index(drop=True, inplace=True)
final['tok150'] = final['p_bytes'].apply(tok_150)
final.to_pickle('D:\\Notebooks\\Datasets\\10m_final_tok_2.pkl')

In [15]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [12]:
X_train, X_test, y_train, y_test = train_test_split(final['tok150'], final['label'], test_size=0.2)
X_train = np.stack(X_train.to_numpy())
X_test = np.stack(X_test.to_numpy())
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

In [13]:
y_train = y_train.astype(np.int32)
y_test = y_test.astype(np.int32)

In [29]:
model2 = keras.Sequential()

model2.add(layers.Embedding(input_dim=256, output_dim=8, input_length=150))
model2.add(layers.Flatten())

#model.add(layers.Dense(256, activation="relu", input_shape=(len(imp_cols)-1,)))
#model.add(layers.Dense(256, activation="relu"))
model2.add(layers.Dense(1, activation="sigmoid"))

model2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 150, 8)            2048      
_________________________________________________________________
flatten_1 (Flatten)          (None, 1200)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 1201      
Total params: 3,249
Trainable params: 3,249
Non-trainable params: 0
_________________________________________________________________


In [30]:
model2.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [31]:
hist2 = model2.fit(x=X_train, y=y_train, batch_size=32, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [32]:
score = model2.evaluate(X_test, y_test)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: 5.2700920605275314e-06
Test accuracy: 0.9999989867210388


In [34]:
y_pred = model2.predict(X_test)
y_pred_binary = []
for pred in y_pred:
    if pred >= .5:
        y_pred_binary.append(1)
    else:
        y_pred_binary.append(0)   

In [35]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [36]:
confusion_matrix(y_test, y_pred_binary)

array([[ 999664,       0],
       [      2, 1000334]], dtype=int64)

In [37]:
X_train[0:10]

array([[  0,   0,   3, ...,   0,   0,   0],
       [  0,   3,   0, ...,   0,   0,   0],
       [  0,   3,   0, ...,   0,   0,   0],
       ...,
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,  12,  41, ...,   0,   0,   0],
       [  0,   4,   0, ...,  58,  34, 112]])

In [38]:
y_train[0:10]

array([0, 1, 1, 1, 1, 0, 1, 0, 1, 0])

In [45]:
X_train[0]

array([  0,   0,   3,   4,   0,   6,   0,   0,   0,   0,   0,   0,   0,
         0,   8,   0,  69,   0,   0,  52, 247, 234,  64,   0,  64,   6,
       190,  88,   0,   0,   0,   0,   0,   0,   0,   0, 202,  70,   7,
        88, 248, 217, 239, 146, 191, 117, 172,  82, 128,  16,  14,  53,
       132, 167,   0,   0,   1,   1,   8,  10,  70,  14,  70, 184,  70,
        14,  70, 184,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0])

In [44]:
X_train[3]

array([  0,   3,   0,   1,   0,   6,   0,  12,  41,  91, 115,  91,  61,
        34,   8,   0,  69,   0,   0,  60, 105, 238,  64,   0,  64,   6,
        76, 191,   0,   0,   0,   0,   0,   0,   0,   0, 167,  70,   0,
        21, 178, 244, 241, 218,   0,   0,   0,   0, 160,   2, 114,  16,
       215,  22,   0,   0,   2,   4,   5, 180,   4,   2,   8,  10,  67,
        59, 235,  99,   0,   0,   0,   0,   1,   3,   3,   7,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0])

In [83]:
df = pd.DataFrame(columns=["p_bytes","label","ts","src_ip","dst_ip","src_port","dst_port","pcap_ver"])
df = parse_pcap("D:\\Notebooks\\Datasets\\testing.pcapng", df, 0, hide_identifiers=True)

Using v1.0


In [84]:
df.drop(df[df['src_ip']=="0.0.0.0"].index, inplace=True)

In [57]:
#df.drop(853, inplace=True)

In [85]:
df.reset_index(drop=True, inplace=True)
df['tok150'] = df['p_bytes'].apply(tok_150)

In [86]:
test_x = df['tok150']

In [87]:
test_x = np.stack(test_x.to_numpy())

In [88]:
y_pred2 = model2.predict(test_x)
y_pred_binary2 = []
for pred in y_pred2:
    if pred >= .5:
        y_pred_binary2.append(1)
    else:
        y_pred_binary2.append(0)  

In [89]:
np.unique(y_pred_binary2, return_counts=True)

(array([0, 1]), array([58, 80], dtype=int64))

In [82]:
179/232

0.771551724137931

## Added my own packets to normal dataset

In [93]:
df = pd.DataFrame(columns=["p_bytes","label","ts","src_ip","dst_ip","src_port","dst_port","pcap_ver"])
i = 1
for file_name in os.listdir(normal_dir):
    print(str(i)+"/"+str(len(os.listdir(normal_dir))), end=" : ")
    df = parse_pcap(normal_dir+"\\"+file_name, df, 0)
    i+=1

df.sort_values(by=['ts','src_ip','dst_ip','src_port','dst_port'], inplace=True)
df.reset_index(drop=True, inplace=True)
temp = df[df['dst_port']!=0]
temp = temp[temp['src_ip']!="127.0.0.1"]
temp.reset_index(drop=True, inplace=True)
temp.to_pickle('D:\\Notebooks\\Datasets\\final\\normal_final_3.pkl')

1/12 : Using v1.0
2/12 : Using v1.0
3/12 : Using v1.0
4/12 : Using v1.0
5/12 : Using v1.0
6/12 : Using v1.0
7/12 : Using v1.0
8/12 : Using v1.0
9/12 : Using v1.0
10/12 : Using v1.0
11/12 : Using v2.4
12/12 : Using v1.0


In [94]:
temp

Unnamed: 0,p_bytes,label,ts,src_ip,dst_ip,src_port,dst_port,pcap_ver
0,b'\x00\x00\x00\x01\x00\x06\x00\x0c)\xd2\xb0\x0...,0,1554220325,192.168.1.152,192.168.1.190,1880,43539,1
1,b'\x00\x00\x00\x01\x00\x06\x00\x0c)\xd2\xb0\x0...,0,1554220325,192.168.1.152,192.168.1.190,1880,43539,1
2,b'\x00\x00\x00\x01\x00\x06\x00\x0c)\xd2\xb0\x0...,0,1554220325,192.168.1.152,192.168.1.190,1880,43539,1
3,b'\x00\x00\x00\x01\x00\x06\x00\x0c)\xd2\xb0\x0...,0,1554220325,192.168.1.152,192.168.1.190,1880,43539,1
4,b'\x00\x00\x00\x01\x00\x06\x00\x0c)\xd2\xb0\x0...,0,1554220325,192.168.1.152,192.168.1.190,1880,43539,1
...,...,...,...,...,...,...,...,...
6050101,b'\x01\x00^\x7f\xff\xfa\x00b\xec\xbb\xf4\xa3\x...,0,1682024250,10.1.96.182,239.255.255.250,56937,1900,1
6050102,b'\xd8^\xd3\x8a3B\x00b\xec\xbb\xf4\xa3\x08\x00...,0,1682024250,162.159.130.234,10.1.80.118,443,7068,1
6050103,b'\xd8^\xd3\x8a3B\x00b\xec\xbb\xf4\xa3\x08\x00...,0,1682024250,162.254.192.75,10.1.80.118,27037,42217,1
6050104,b'\xd8^\xd3\x8a3B\x04\xd6\x0e\x1d\x89x\x08\x00...,0,1682024251,10.1.88.132,10.1.80.118,1900,58112,1


In [None]:
final = pd.DataFrame(columns=["p_bytes","label","ts","src_ip","dst_ip","src_port","dst_port","pcap_ver"])

for file in os.listdir('D:\\Notebooks\\Datasets\\final'):
    temp = pd.read_pickle('D:\\Notebooks\\Datasets\\final\\'+file)
    final = pd.concat([final, temp])

final.reset_index(drop=True, inplace=True)
final.drop('val_label', axis=1, inplace=True)
final.drop(final[final['label'] == 1].sample(n=12470536).index, inplace=True)
final.drop(final[final['label'] == 0].sample(n=1050105).index, inplace=True)
final.reset_index(drop=True, inplace=True)
final['tok150'] = final['p_bytes'].apply(tok_150)
final.to_pickle('D:\\Notebooks\\Datasets\\10m_final_tok_3.pkl')

In [None]:
X_train, X_test, y_train, y_test = train_test_split(final['tok150'], final['label'], test_size=0.2)
X_train = np.stack(X_train.to_numpy())
X_test = np.stack(X_test.to_numpy())
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()
y_train = y_train.astype(np.int32)
y_test = y_test.astype(np.int32)

In [None]:
model3 = keras.Sequential()

model3.add(layers.Embedding(input_dim=256, output_dim=8, input_length=150))
model3.add(layers.Flatten())
#model.add(layers.Dense(256, activation="relu", input_shape=(len(imp_cols)-1,)))
#model.add(layers.Dense(256, activation="relu"))
model3.add(layers.Dense(1, activation="sigmoid"))

model3.summary()

In [None]:
model3.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [None]:
hist3 = model3.fit(x=X_train, y=y_train, batch_size=32, epochs=10)

In [None]:
score = model3.evaluate(X_test, y_test)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

In [None]:
y_pred = model3.predict(X_test)
y_pred_binary = []
for pred in y_pred:
    if pred >= .5:
        y_pred_binary.append(1)
    else:
        y_pred_binary.append(0)

In [None]:
confusion_matrix(y_test, y_pred_binary)