In [4]:
from scapy.all import *
from scapy.layers.http import HTTP
import pandas as pd
import numpy as np
import csv
import os

load_layer("tls")

In [10]:
WEB1 = "web1-background.pcap"
WEB2 = "web2-background.pcap"
WEB3 = "web3-background.pcap"
WEB_TRACES = [WEB1, WEB2, WEB3]
OUT_FOLDER = "traces_out"
CLEAN_TRACES = "traces"
DIRTY_TRACES = "../logs_all"

# Create output folder if it does not exist
if not os.path.exists(OUT_FOLDER):
    os.makedirs(OUT_FOLDER)

In [6]:
with open(os.path.join(OUT_FOLDER, "malwares_dirty.csv"), "w") as out_file:
    csv_writer = csv.writer(out_file)
    csv_writer.writerow(["ipsrc", "ipdst", "proto", "srcport", "dstport", "http_data", "ok", "malware"])
    i = 0
    for trace in os.listdir(CLEAN_TRACES):
        i += 1
        if i % 10 == 0:
            print(f"{i}/{len(os.listdir(CLEAN_TRACES))}")
        trace_path = os.path.join(DIRTY_TRACES, trace.split(".pcap")[0], trace)
        packets = rdpcap(trace_path)
        for packet in packets:
            # empty strings
            ipsrc = ""
            ipdst = ""
            proto = ""
            srcport = ""
            dstport = ""
            http_data = ""
            if packet.haslayer("IP"):
                pass
                ipsrc = packet["IP"].src
                ipdst = packet["IP"].dst
                proto = packet["IP"].proto
            if packet.haslayer("TCP"):
                srcport = packet["TCP"].sport
                dstport = packet["TCP"].dport
            if packet.haslayer("UDP"):
                srcport = packet["UDP"].sport
                dstport = packet["UDP"].dport
            if packet.haslayer(HTTP):
                if packet[HTTP].haslayer("HTTP Request"):
                    http_data = packet[HTTP].Host.decode("utf-8")
            if packet.haslayer("TLS"):
                if packet['TLS'].type == 22:
                    if type(packet['TLS'].msg[0]) == scapy.layers.tls.handshake.TLSClientHello:
                        if packet['TLS'].msg[0].msgtype == 1:
                            http_data = packet['TLS']['TLS_Ext_ServerName'].servernames[0].servername.decode("utf-8")
                            # print(http_data)
            csv_writer.writerow([ipsrc, ipdst, proto, srcport, dstport, http_data, 0, 1])



10/223




20/223




30/223




40/223




50/223




60/223




70/223




80/223




90/223




100/223




110/223




120/223




130/223




140/223




150/223




160/223




170/223




180/223




190/223




200/223




210/223




220/223




In [8]:
malware_df = pd.read_csv(os.path.join(OUT_FOLDER, "malwares_dirty_unique_ip.csv"))
malware_df

Unnamed: 0,ipsrc,ipdst,proto,srcport,dstport,http_data,ok,malware
0,192.168.1.66,192.168.1.1,17,123,123,,0,1
1,192.168.1.1,192.168.1.66,17,123,123,,0,1
2,192.168.1.66,192.168.1.1,6,5000,56070,,0,1
3,192.168.1.1,192.168.1.66,6,56070,5000,,0,1
4,192.168.1.66,192.168.1.1,6,5000,56070,,0,1
...,...,...,...,...,...,...,...,...
4408381,104.85.10.58,192.168.1.66,6,443,49929,,0,1
4408382,104.85.10.58,192.168.1.66,6,443,49929,,0,1
4408383,192.168.1.66,104.85.10.58,6,49930,443,,0,1
4408384,104.85.10.58,192.168.1.66,6,443,49930,,0,1


In [13]:
# shuffle the malware dataframe and cut it in 1/10
malware_df = malware_df.sample(frac=1).reset_index(drop=True)
malware_df = malware_df[:int(len(malware_df)/10)]
malware_df

Unnamed: 0,ipsrc,ipdst,proto,srcport,dstport,http_data,ok,malware
0,199.232.210.172,192.168.1.66,6.0,80.0,49883.0,,0,1
1,8.8.8.8,192.168.1.66,17.0,53.0,63047.0,,0,1
2,95.101.123.80,192.168.1.66,6.0,443.0,49951.0,,0,1
3,69.197.47.17,192.168.1.66,6.0,80.0,49868.0,,0,1
4,192.168.1.66,13.85.23.206,6.0,49892.0,443.0,,0,1
...,...,...,...,...,...,...,...,...
440959,192.168.1.66,193.206.135.8,6.0,49956.0,80.0,,0,1
440960,69.197.47.20,192.168.1.66,6.0,80.0,49891.0,,0,1
440961,69.197.47.17,192.168.1.66,6.0,80.0,49888.0,,0,1
440962,192.168.1.66,20.3.187.198,6.0,49923.0,443.0,,0,1


In [14]:
# join all 3 web traces into one dataframe
web_df = pd.concat([pd.read_csv(os.path.join(OUT_FOLDER, f"{trace}_unique_ip.csv")) for trace in WEB_TRACES])
web_df

Unnamed: 0,ipsrc,ipdst,proto,srcport,dstport,http_data,ok,malware
0,204.79.197.203,192.168.1.66,6.0,443.0,49832.0,,1,0
1,204.79.197.203,192.168.1.66,6.0,443.0,49832.0,,1,0
2,204.79.197.203,192.168.1.66,6.0,443.0,49832.0,,1,0
3,204.79.197.203,192.168.1.66,6.0,443.0,49832.0,,1,0
4,204.79.197.203,192.168.1.66,6.0,443.0,49832.0,,1,0
...,...,...,...,...,...,...,...,...
73479,131.253.33.239,192.168.1.66,6.0,443.0,49891.0,,1,0
73480,192.168.1.66,99.80.43.51,6.0,49890.0,443.0,,1,0
73481,192.168.1.66,99.80.43.51,6.0,49890.0,443.0,,1,0
73482,192.168.1.66,8.8.8.8,17.0,62769.0,53.0,,1,0


In [15]:
# join the two dataframes
malware_df = pd.concat([malware_df, web_df])
malware_df

Unnamed: 0,ipsrc,ipdst,proto,srcport,dstport,http_data,ok,malware
0,199.232.210.172,192.168.1.66,6.0,80.0,49883.0,,0,1
1,8.8.8.8,192.168.1.66,17.0,53.0,63047.0,,0,1
2,95.101.123.80,192.168.1.66,6.0,443.0,49951.0,,0,1
3,69.197.47.17,192.168.1.66,6.0,80.0,49868.0,,0,1
4,192.168.1.66,13.85.23.206,6.0,49892.0,443.0,,0,1
...,...,...,...,...,...,...,...,...
73479,131.253.33.239,192.168.1.66,6.0,443.0,49891.0,,1,0
73480,192.168.1.66,99.80.43.51,6.0,49890.0,443.0,,1,0
73481,192.168.1.66,99.80.43.51,6.0,49890.0,443.0,,1,0
73482,192.168.1.66,8.8.8.8,17.0,62769.0,53.0,,1,0


In [16]:
malware_df.to_csv(os.path.join(OUT_FOLDER, "malware_web_dirty_mixed.csv"), index=False)

In [40]:
malware_df = pd.read_csv(os.path.join(OUT_FOLDER, "malware_web_dirty_mixed.csv"))
malware_df

Unnamed: 0,ipsrc,ipdst,proto,srcport,dstport,http_data,ok,malware
0,199.232.210.172,192.168.1.66,6.0,80.0,49883.0,,0,1
1,8.8.8.8,192.168.1.66,17.0,53.0,63047.0,,0,1
2,95.101.123.80,192.168.1.66,6.0,443.0,49951.0,,0,1
3,69.197.47.17,192.168.1.66,6.0,80.0,49868.0,,0,1
4,192.168.1.66,13.85.23.206,6.0,49892.0,443.0,,0,1
...,...,...,...,...,...,...,...,...
560370,131.253.33.239,192.168.1.66,6.0,443.0,49891.0,,1,0
560371,192.168.1.66,99.80.43.51,6.0,49890.0,443.0,,1,0
560372,192.168.1.66,99.80.43.51,6.0,49890.0,443.0,,1,0
560373,192.168.1.66,8.8.8.8,17.0,62769.0,53.0,,1,0


In [41]:
# split labels and features
labels = malware_df[["ok" ,"malware"]]
features = malware_df.drop(columns=["ok", "malware"])

In [42]:
# expanding ip addr object fields, separating bytes and converting them to float
added_field = features["ipsrc"].str.split(".", expand=True).astype('float64')
features.drop("ipsrc", axis=1, inplace=True)
features = pd.concat([features, added_field], axis=1)

added_field = features["ipdst"].str.split(".", expand=True).astype('float64')
features.drop("ipdst", axis=1, inplace=True)
features = pd.concat([features, added_field], axis=1)

features

Unnamed: 0,proto,srcport,dstport,http_data,0,1,2,3,0.1,1.1,2.1,3.1
0,6.0,80.0,49883.0,,199.0,232.0,210.0,172.0,192.0,168.0,1.0,66.0
1,17.0,53.0,63047.0,,8.0,8.0,8.0,8.0,192.0,168.0,1.0,66.0
2,6.0,443.0,49951.0,,95.0,101.0,123.0,80.0,192.0,168.0,1.0,66.0
3,6.0,80.0,49868.0,,69.0,197.0,47.0,17.0,192.0,168.0,1.0,66.0
4,6.0,49892.0,443.0,,192.0,168.0,1.0,66.0,13.0,85.0,23.0,206.0
...,...,...,...,...,...,...,...,...,...,...,...,...
560370,6.0,443.0,49891.0,,131.0,253.0,33.0,239.0,192.0,168.0,1.0,66.0
560371,6.0,49890.0,443.0,,192.0,168.0,1.0,66.0,99.0,80.0,43.0,51.0
560372,6.0,49890.0,443.0,,192.0,168.0,1.0,66.0,99.0,80.0,43.0,51.0
560373,17.0,62769.0,53.0,,192.0,168.0,1.0,66.0,8.0,8.0,8.0,8.0


In [43]:
features.fillna("", inplace=True)

  features.fillna("", inplace=True)


In [44]:
# expand HTTP data
maxlen = 32

added_field = features["http_data"].str[:maxlen].str.encode('utf-8')

for i in range(len(added_field)):
    seq = [0] * maxlen
    arr = added_field[i]
    arrlen = len(arr)
    for j in range(arrlen):
        seq[maxlen - arrlen + j] = arr[j]
    added_field[i] = seq

for field in added_field:
    if field != list([0] * maxlen):
        print(field)

features.drop("http_data", axis=1, inplace=True)
features = pd.concat([features, pd.DataFrame(added_field.to_list())], axis=1)
features


[0, 0, 0, 0, 0, 0, 0, 0, 0, 99, 116, 108, 100, 108, 46, 119, 105, 110, 100, 111, 119, 115, 117, 112, 100, 97, 116, 101, 46, 99, 111, 109]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 97, 112, 105, 46, 118, 107, 46, 99, 111, 109]
[0, 0, 0, 118, 49, 48, 46, 101, 118, 101, 110, 116, 115, 46, 100, 97, 116, 97, 46, 109, 105, 99, 114, 111, 115, 111, 102, 116, 46, 99, 111, 109]
[115, 101, 114, 118, 101, 114, 46, 101, 118, 101, 110, 116, 115, 46, 100, 97, 116, 97, 46, 109, 105, 99, 114, 111, 115, 111, 102, 116, 46, 99, 111, 109]
[0, 0, 0, 0, 0, 0, 0, 102, 100, 46, 97, 112, 105, 46, 105, 114, 105, 115, 46, 109, 105, 99, 114, 111, 115, 111, 102, 116, 46, 99, 111, 109]
[49, 100, 46, 116, 108, 117, 46, 100, 108, 46, 100, 101, 108, 105, 118, 101, 114, 121, 46, 109, 112, 46, 109, 105, 99, 114, 111, 115, 111, 102, 116, 46]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 54, 57, 46, 49, 57, 55, 46, 52, 55, 46, 49, 50]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

Unnamed: 0,proto,srcport,dstport,0,1,2,3,0.1,1.1,2.1,...,22,23,24,25,26,27,28,29,30,31
0,6.0,80.0,49883.0,199.0,232.0,210.0,172.0,192.0,168.0,1.0,...,0,0,0,0,0,0,0,0,0,0
1,17.0,53.0,63047.0,8.0,8.0,8.0,8.0,192.0,168.0,1.0,...,0,0,0,0,0,0,0,0,0,0
2,6.0,443.0,49951.0,95.0,101.0,123.0,80.0,192.0,168.0,1.0,...,0,0,0,0,0,0,0,0,0,0
3,6.0,80.0,49868.0,69.0,197.0,47.0,17.0,192.0,168.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,6.0,49892.0,443.0,192.0,168.0,1.0,66.0,13.0,85.0,23.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
560370,6.0,443.0,49891.0,131.0,253.0,33.0,239.0,192.0,168.0,1.0,...,0,0,0,0,0,0,0,0,0,0
560371,6.0,49890.0,443.0,192.0,168.0,1.0,66.0,99.0,80.0,43.0,...,0,0,0,0,0,0,0,0,0,0
560372,6.0,49890.0,443.0,192.0,168.0,1.0,66.0,99.0,80.0,43.0,...,0,0,0,0,0,0,0,0,0,0
560373,17.0,62769.0,53.0,192.0,168.0,1.0,66.0,8.0,8.0,8.0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
# convert empty strings to 0
features = features.replace("", 0)
features

  features = features.replace("", 0)


Unnamed: 0,proto,srcport,dstport,0,1,2,3,0.1,1.1,2.1,...,22,23,24,25,26,27,28,29,30,31
0,6.0,80.0,49883.0,199.0,232.0,210.0,172.0,192.0,168.0,1.0,...,0,0,0,0,0,0,0,0,0,0
1,17.0,53.0,63047.0,8.0,8.0,8.0,8.0,192.0,168.0,1.0,...,0,0,0,0,0,0,0,0,0,0
2,6.0,443.0,49951.0,95.0,101.0,123.0,80.0,192.0,168.0,1.0,...,0,0,0,0,0,0,0,0,0,0
3,6.0,80.0,49868.0,69.0,197.0,47.0,17.0,192.0,168.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,6.0,49892.0,443.0,192.0,168.0,1.0,66.0,13.0,85.0,23.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
560370,6.0,443.0,49891.0,131.0,253.0,33.0,239.0,192.0,168.0,1.0,...,0,0,0,0,0,0,0,0,0,0
560371,6.0,49890.0,443.0,192.0,168.0,1.0,66.0,99.0,80.0,43.0,...,0,0,0,0,0,0,0,0,0,0
560372,6.0,49890.0,443.0,192.0,168.0,1.0,66.0,99.0,80.0,43.0,...,0,0,0,0,0,0,0,0,0,0
560373,17.0,62769.0,53.0,192.0,168.0,1.0,66.0,8.0,8.0,8.0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
# normalize data
for column in features.columns:
    print(column)
    features[column] = features[column] / features[column].abs().max()
features

proto
srcport
dstport
0
1
2
3
0
1
2
3
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31


Unnamed: 0,proto,srcport,dstport,0,1,2,3,0.1,1.1,2.1,...,22,23,24,25,26,27,28,29,30,31
0,0.352941,0.001221,0.761177,0.904545,0.909804,0.823529,0.674510,0.872727,0.658824,0.003922,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.000000,0.000809,0.962050,0.036364,0.031373,0.031373,0.031373,0.872727,0.658824,0.003922,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.352941,0.006760,0.762215,0.431818,0.396078,0.482353,0.313725,0.872727,0.658824,0.003922,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.352941,0.001221,0.760949,0.313636,0.772549,0.184314,0.066667,0.872727,0.658824,0.003922,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.352941,0.761303,0.006760,0.872727,0.658824,0.003922,0.258824,0.059091,0.333333,0.090196,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
560370,0.352941,0.006760,0.761299,0.595455,0.992157,0.129412,0.937255,0.872727,0.658824,0.003922,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
560371,0.352941,0.761273,0.006760,0.872727,0.658824,0.003922,0.258824,0.450000,0.313725,0.168627,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
560372,0.352941,0.761273,0.006760,0.872727,0.658824,0.003922,0.258824,0.450000,0.313725,0.168627,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
560373,1.000000,0.957794,0.000809,0.872727,0.658824,0.003922,0.258824,0.036364,0.031373,0.031373,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
# add labels back
final_df = pd.concat([features, labels], axis=1)
final_df

Unnamed: 0,proto,srcport,dstport,0,1,2,3,0.1,1.1,2.1,...,24,25,26,27,28,29,30,31,ok,malware
0,0.352941,0.001221,0.761177,0.904545,0.909804,0.823529,0.674510,0.872727,0.658824,0.003922,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
1,1.000000,0.000809,0.962050,0.036364,0.031373,0.031373,0.031373,0.872727,0.658824,0.003922,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
2,0.352941,0.006760,0.762215,0.431818,0.396078,0.482353,0.313725,0.872727,0.658824,0.003922,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
3,0.352941,0.001221,0.760949,0.313636,0.772549,0.184314,0.066667,0.872727,0.658824,0.003922,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
4,0.352941,0.761303,0.006760,0.872727,0.658824,0.003922,0.258824,0.059091,0.333333,0.090196,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
560370,0.352941,0.006760,0.761299,0.595455,0.992157,0.129412,0.937255,0.872727,0.658824,0.003922,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
560371,0.352941,0.761273,0.006760,0.872727,0.658824,0.003922,0.258824,0.450000,0.313725,0.168627,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
560372,0.352941,0.761273,0.006760,0.872727,0.658824,0.003922,0.258824,0.450000,0.313725,0.168627,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
560373,1.000000,0.957794,0.000809,0.872727,0.658824,0.003922,0.258824,0.036364,0.031373,0.031373,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0


In [48]:
# write to file
final_df.to_csv(os.path.join(OUT_FOLDER, "malware_web_dirty_mixed_cleaned.csv"), index=False, header=False)