# EDA for network dataframe: Traffic Data from Kyoto University's Honeypots

## Load libreraries

In [1]:
import os
import shutil
from zipfile import ZipFile

import pandas as pd
import requests as req
from tqdm import tqdm
from ydata_profiling import ProfileReport
from pycaret.classification import setup, compare_models, create_model, evaluate_model

### Reading and working with data

In [2]:
# Variables to data
FOLDER_RAW:str = '../data/raw/'
YEAR:int = 2007
MONTH:int = 1
DAY:int = 1

In [3]:
# Download file
FILE = f"{YEAR}{MONTH:02d}.zip"
URL_PATH = f"http://www.takakura.com/Kyoto_data/new_data201704/{YEAR}/{FILE}"
PATH = "../data/raw"

# Check if file exists
if not os.path.exists(f"{PATH}/{FILE}"):
    print("Download files:")
    resp=req.get(URL_PATH, stream=True)
    save_path=f"{PATH}/{FILE}"
    with open(save_path, "wb") as handle:
        for data in tqdm(resp.iter_content(),
                        desc=FILE,
                        postfix=f"save to {save_path}",
                        total=int(resp.headers["Content-Length"])):
            handle.write(data)

# Unzip file
with ZipFile(f"{PATH}/{FILE}", 'r') as zip_ref:
    zip_ref.extractall(f"{PATH}")

Download files:


200701.zip: 100%|██████████| 43382104/43382104 [08:01<00:00, 90040.79it/s, save to ../data/raw/200701.zip]


In [4]:
# Move file to the correct folder
if os.path.exists(f"{PATH}/Kyoto2016"):
    print("Move files:")
    shutil.move(f"{PATH}/Kyoto2016/{YEAR}", f"{PATH}/{YEAR}")
    os.removedirs(f"{PATH}/Kyoto2016/")

Move files:


In [6]:
# Load dataset
path: str = f"{FOLDER_RAW}{YEAR:04d}/{MONTH:02d}/{YEAR:04d}{MONTH:02d}{DAY:02d}.txt"
headers = [
    "duration",
    "service",
    "source_bytes",
    "destination_bytes",
    "count",
    "same_srv_rate",
    "serror_rate",
    "srv_serror_rate",
    "dst_host_count",
    "dst_host_srv_count",
    "dst_host_same_srv_rate",
    "dst_host_same_src_port_rate",
    "dst_host_serror_rate",
    "dst_host_srv_serror_rate",
    "flag",
    "ids_detection",
    "malware_detection",
    "ashula_detection",
    "label",
    "source_ip_address",
    "source_port_number",
    "destination_ip_address",
    "destination_port_number",
    "start_time",
    "protocol",
]
df = pd.read_csv(path, sep='\t', header=None)
print(len(headers), len(df.columns))
df.columns = headers

with pd.option_context(
    'display.max_colwidth', None,
    'display.max_columns', None,
    'display.max_rows', None):
    display(df.head())

# Preclean data
del df["ids_detection"]

25 24


ValueError: Length mismatch: Expected axis has 24 elements, new values have 25 elements

### First steps with dataframe

In [None]:
profile = ProfileReport(
    df, title='Dataset Network profiling', explorative=True
)
profile

In [None]:
clf_setup = setup(df, session_id=42)