# EDA for network dataframe: Traffic Data from Kyoto University's Honeypots

## Load libreraries

In [1]:
import ipaddress
import os
import shutil
from zipfile import ZipFile

import pandas as pd
import requests as req
from tqdm import tqdm
from ydata_profiling import ProfileReport
from pycaret.classification import setup, compare_models, create_model, evaluate_model

### Reading and working with data

In [2]:
# Variables to data
FOLDER_RAW:str = '../data/raw/'
YEAR:int = 2007
MONTH:int = 1
DAYS:list[int] = [1, 31]

In [3]:
# Download file
FILE = f"{YEAR}{MONTH:02d}.zip"
URL_PATH = f"http://www.takakura.com/Kyoto_data/new_data201704/{YEAR}/{FILE}"
PATH = "../data/raw"

# Check if file exists
if not os.path.exists(f"{PATH}/{FILE}"):
    print("Download files:")
    resp=req.get(URL_PATH, stream=True)
    save_path=f"{PATH}/{FILE}"
    with open(save_path, "wb") as handle:
        for data in tqdm(resp.iter_content(),
                        desc=FILE,
                        postfix=f"save to {save_path}",
                        total=int(resp.headers["Content-Length"])):
            handle.write(data)

    # Unzip file
    with ZipFile(f"{PATH}/{FILE}", 'r') as zip_ref:
        zip_ref.extractall(f"{PATH}")
        os.remove(f"{PATH}/{FILE}")

Download files:


200701.zip: 100%|██████████| 43382104/43382104 [08:14<00:00, 87777.33it/s, save to ../data/raw/200701.zip]


In [4]:
# Move file to the correct folder
if os.path.exists(f"{PATH}/Kyoto2016"):
    print("Move files:")
    shutil.move(f"{PATH}/Kyoto2016/{YEAR}", f"{PATH}/{YEAR}")
    os.removedirs(f"{PATH}/Kyoto2016/")

Move files:


In [5]:
# Load dataset
headers = [
    "duration",
    "service",
    "source_bytes",
    "destination_bytes",
    "count",
    "same_srv_rate",
    "serror_rate",
    "srv_serror_rate",
    "dst_host_count",
    "dst_host_srv_count",
    "dst_host_same_src_port_rate",
    "dst_host_serror_rate",
    "dst_host_srv_serror_rate",
    "flag",
    "ids_detection",
    "malware_detection",
    "ashula_detection",
    "label",
    "source_ip_address",
    "source_port_number",
    "destination_ip_address",
    "destination_port_number",
    "start_time",
    "protocol",
]

df_list = []

for DAY in DAYS:
    path: str = f"{FOLDER_RAW}{YEAR:04d}/{MONTH:02d}/{YEAR:04d}{MONTH:02d}{DAY:02d}.txt"

    if not os.path.exists(path):
        continue

    df_dirty = pd.read_csv(path, sep="\t", header=None)
    df_list.append(df_dirty)

df = pd.concat(df_list)
print(len(headers), len(df.columns))
df.columns = headers

with pd.option_context(
    'display.max_colwidth', None,
    'display.max_columns', None,
    'display.max_rows', None):
    display(df.head())

# Preclean data
del df["ids_detection"]

24 24


Unnamed: 0,duration,service,source_bytes,destination_bytes,count,same_srv_rate,serror_rate,srv_serror_rate,dst_host_count,dst_host_srv_count,dst_host_same_src_port_rate,dst_host_serror_rate,dst_host_srv_serror_rate,flag,ids_detection,malware_detection,ashula_detection,label,source_ip_address,source_port_number,destination_ip_address,destination_port_number,start_time,protocol
0,86364.573924,other,240680,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,0,0,0,-1,fda2:69aa:1f1a:2d57:7da5:27fc:07e8:2808,32770,fda2:69aa:1f1a:425e:1046:01b0:02d4:2adb,8649,00:00:18,udp
1,0.0,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,0,0,60(2),-2,fda2:69aa:1f1a:509a:0b19:590a:0528:2375,1050,fda2:69aa:1f1a:f505:7df6:2782:60e4:44d6,1434,00:00:27,udp
2,0.00334,other,48,48,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,SF,0,0,0,-1,fda2:69aa:1f1a:232a:7a25:0083:5f86:3cc0,123,fda2:69aa:1f1a:f820:7d99:2701:0ff4:1570,123,00:00:53,udp
3,0.0,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,0,0,0,-1,fda2:69aa:1f1a:a757:7d73:278f:61f1:0f3f,138,fda2:69aa:1f1a:1499:7d6b:27b7:6172:002c,138,00:00:57,udp
4,0.311797,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,RSTO,0,0,0,-1,fda2:69aa:1f1a:9113:3c52:037e:52b3:2742,11810,fda2:69aa:1f1a:ec01:7d38:2763:0f17:1b37,139,00:01:17,tcp


In [6]:
# Size of the dataset
df.shape

(98581, 23)

In [7]:
# Group of label
df.groupby(['label'])['label'].count()

label
-2     5231
-1    71487
 1    21863
Name: label, dtype: int64

In [8]:
df.dtypes

duration                       float64
service                         object
source_bytes                     int64
destination_bytes                int64
count                            int64
same_srv_rate                  float64
serror_rate                    float64
srv_serror_rate                float64
dst_host_count                   int64
dst_host_srv_count               int64
dst_host_same_src_port_rate    float64
dst_host_serror_rate           float64
dst_host_srv_serror_rate       float64
flag                            object
malware_detection               object
ashula_detection                object
label                            int64
source_ip_address               object
source_port_number               int64
destination_ip_address          object
destination_port_number          int64
start_time                      object
protocol                        object
dtype: object

### First steps with dataframe

#### Helper funtions

In [9]:
def transform_ipv6_to_ipv4(ip:str)->str:
    ipv6 = ipaddress.IPv6Address(ip)
    prefix = "fda2:"

    if not ipv6.exploded.startswith(prefix):
        return "Failed"

    ipv6_suffix = ipv6.exploded[len(prefix):].replace(":", "")

    ipv4_hex = bytes.fromhex(ipv6_suffix)
    ipv4 = ipaddress.IPv4Address(ipv4_hex)

    return str(ipv4)

In [10]:
# Categorical columns
cat_cols = [
    "service",
    "flag",
    "ashula_detection",
    "label",
    # "source_ip_address",
    # "destination_ip_address",
    "protocol",
]

In [11]:
# Show values of categoricals
for col in cat_cols:
    print(f"{col}: {df[col].unique()}")

service: ['other' 'ssl' 'smtp' 'dns' 'ssh' 'smtp,ssl' 'http' 'ftp']
flag: ['S0' 'SF' 'RSTO' 'REJ' 'OTH' 'S3' 'SHR' 'RSTOS0' 'RSTRH' 'RSTR' 'SH' 'S1'
 'S2']
ashula_detection: ['0' '60(2)' '60(1)' '130(1),131(2)' '129(4)' '151(1)' '119(1)' '80(1)'
 '129(2)' '41(1)' '56(2)' '152(1),153(1)' '56(1)' '63(1)' '163(2),164(4)']
label: [-1 -2  1]
protocol: ['udp' 'tcp' 'icmp']


#### Transforming variables

In [16]:
df_transform = df.copy()

# transform source_ip_address and destination_ip_address
df_transform["source_ip_address"] = df_transform["source_ip_address"].apply(transform_ipv6_to_ipv4)
df_transform["destination_ip_address"] = df_transform["destination_ip_address"].apply(transform_ipv6_to_ipv4)

# transform service
df_transform["service_split"] = df_transform["service"].str.split(" ")
df_transform = df_transform.explode("service_split")
freq_service = df_transform["service_split"].value_counts().to_dict()
df_transform["freq_service"] = df_transform["service_split"].map(freq_service)

# transform flag
freq_flag = df_transform["flag"].value_counts().to_dict()
df_transform["freq_flag"] = df_transform["flag"].map(freq_flag)

# transform ashula_detection
df_transform["ashula_split"] = df_transform["ashula_detection"].str.split(",")
df_transform = df_transform.explode("ashula_split")
df_transform["ashula_code"] = df_transform["ashula_split"].str.extract('(\d+)')
freq_ashula = df_transform["ashula_code"].value_counts().to_dict()
df_transform["freq_ashula"] = df_transform["ashula_code"].map(freq_ashula)

# transform label
df_transform["label"] = df_transform["label"].replace({-1: 1, -2: 1, 1:0})

# transform protocol
freq_protocol = df_transform["protocol"].value_counts().to_dict()
df_transform["freq_protocol"] = df_transform["protocol"].map(freq_protocol)

# Show Results
with pd.option_context(
    'display.max_colwidth', None,
    'display.max_columns', None,
    'display.max_rows', None):
    display(df_transform.head())

Unnamed: 0,duration,service,source_bytes,destination_bytes,count,same_srv_rate,serror_rate,srv_serror_rate,dst_host_count,dst_host_srv_count,dst_host_same_src_port_rate,dst_host_serror_rate,dst_host_srv_serror_rate,flag,malware_detection,ashula_detection,label,source_ip_address,source_port_number,destination_ip_address,destination_port_number,start_time,protocol,service_split,freq_service,freq_flag,ashula_split,ashula_code,freq_ashula,freq_protocol
0,86364.573924,other,240680,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,0,0,1,Failed,32770,Failed,8649,00:00:18,udp,other,74678,20122,0,0,92419,10920
1,0.0,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,0,60(2),1,Failed,1050,Failed,1434,00:00:27,udp,other,74678,20122,60(2),60,5715,10920
2,0.00334,other,48,48,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,SF,0,0,1,Failed,123,Failed,123,00:00:53,udp,other,74678,24219,0,0,92419,10920
3,0.0,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,0,0,1,Failed,138,Failed,138,00:00:57,udp,other,74678,20122,0,0,92419,10920
4,0.311797,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,RSTO,0,0,1,Failed,11810,Failed,139,00:01:17,tcp,other,74678,11303,0,0,92419,56847


In [None]:
# Delete columns
df_transform = df_transform.drop(columns=["service", "flag", "ashula_detection", "ashula_split", "ashula_code", "service_split", "protocol"])

In [13]:
profile = ProfileReport(
    df, title='Dataset Network profiling', explorative=True
)
profile.to_file("../reports/profile.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [14]:
df.corr()

ValueError: could not convert string to float: 'other'