In [1]:
%load_ext autoreload
%autoreload 2

https://pdfs.semanticscholar.org/2342/9b5be933e16e6988da9a322ad95dfdc8c4b0.pdf

In [3]:
import os
import io
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from datetime import datetime
from collections import Counter


In [4]:
os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [3]:
def read_conn_log(dpth: str) -> pd.DataFrame:
    cols_to_drop = [
        "local_orig",
        "local_resp",
        "missed_bytes",
        "history",
        "orig_ip_bytes",
        "resp_ip_bytes",
        "tunnel_parents",
    ]
    separator = "\t"
    set_separator = ","

    with open(os.path.join(dpth, "conn.log")) as f:
        conn_log_lines = f.readlines()

    # Find the line starting with #fields
    fields_line = [i for i in conn_log_lines if i.startswith("#fields")][0].strip()
    fields = fields_line.split(separator)[1:]

    # Remove all lines starting with #
    conn_log_lines = [i.strip() for i in conn_log_lines if not i.startswith("#")]

    # Create dataframe from lines, with fields as column names
    df = pd.DataFrame(
        [i.strip().split(separator) for i in conn_log_lines], columns=fields
    )

    # Drop columns
    df = df.drop(columns=cols_to_drop)

    return df


## Botnet task

In [4]:
# Change this to point to the dataset directory
base_path = ""
base_dir = f"{base_path}/cic-ids-2018/friday_02-03-2018_zeek/"

In [5]:
tot_dirs = os.listdir(base_dir)
print("Found {} directories".format(len(tot_dirs)))

Found 442 directories


In [6]:
all_dfs = []

for d in tot_dirs:
    # Load the conn.log file in the directory
    df = read_conn_log(os.path.join(base_dir, d))
    all_dfs.append(df)


In [7]:
all_dfs = pd.concat(all_dfs)
display(all_dfs)
print("Total number of connections: {}".format(len(all_dfs)))

Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,resp_bytes,conn_state,orig_pkts,resp_pkts
0,1519995037.811992,Cxii6q3yo7eMpIW6ue,117.149.164.184,54425,172.31.69.15,3389,tcp,-,0.303909,0,0,REJ,2,1
1,1519995043.238823,Cy6XaS3okrzpsOQgSk,77.72.82.96,56855,172.31.69.15,8162,tcp,-,0.110472,0,0,REJ,2,1
2,1519995039.095337,Cl9FnK2EIophqZMNq4,172.31.69.15,45052,91.189.94.4,123,udp,ntp,0.091543,48,48,SF,1,1
3,1519995111.101158,Cza4X02UiNI6OpFw6a,86.105.196.101,5818,172.31.69.15,10669,tcp,-,0.000022,0,0,RSTRH,1,1
4,1519995124.936724,ClsjeN15EDx6dDu0ba,5.188.9.25,50099,172.31.69.15,20020,tcp,-,0.113692,0,0,REJ,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8291,1520026660.021127,CTqtbB1NEqhoTYRws,213.202.230.223,14061,172.31.67.98,3389,tcp,-,0.323888,43,0,S1,3,2
8292,1520026653.394961,CTEn3x1TiNh3yTgoW,5.101.40.43,53815,172.31.67.98,3389,tcp,ssl,11.132096,1144,1581,RSTO,8,8
8293,1520026613.201921,C6rlto2mAmgDroEXf6,172.31.67.98,50832,52.10.55.151,443,tcp,ssl,50.010975,6344,3188,S1,13,13
8294,1520026605.691789,CcxoNC2CiV9Htx5lEe,172.31.67.98,50831,169.254.169.254,80,tcp,http,60.024012,97,231,SF,6,5


Total number of connections: 6244607


In [8]:
# Remove all rows where id.orig_h == id.resp_h
all_dfs = all_dfs[all_dfs["id.orig_h"] != all_dfs["id.resp_h"]]
print("Remaining connections: {}".format(len(all_dfs)))
# Remove all rows where either id.orig_h or id.resp_h is an IPV6 address
all_dfs = all_dfs[
    ~all_dfs["id.orig_h"].str.contains(":") & ~all_dfs["id.resp_h"].str.contains(":")
]
print("Remaining connections: {}".format(len(all_dfs)))

Remaining connections: 6244607
Remaining connections: 6203903


In [9]:
attacker_ip = "18.219.211.138"

In [10]:
# Create a vector of 1s and 0s, where 1 indicates that the connection is either from or to the attacker
is_attacker = (all_dfs["id.orig_h"] == attacker_ip) | (all_dfs["id.resp_h"] == attacker_ip)
print("Number of connections from/to attacker: {}".format(sum(is_attacker)))

all_dfs["label"] = is_attacker.astype(int)

# Print the percentage of connections from/to the attacker
print("Percentage of connections from/to attacker: {:.2f}%".format(100 * sum(is_attacker) / len(all_dfs)))

Number of connections from/to attacker: 143183
Percentage of connections from/to attacker: 2.31%


In [11]:
# Convert the timestamp to a datetime object
dates = [datetime.fromtimestamp(float(i)) for i in all_dfs["ts"]]

In [12]:
# Find earliest and latest date
print("Earliest date: {}".format(min(dates)))
print("Latest date: {}".format(max(dates)))
# Count the number of dates before 12.00 and after 12.00
before_12 = sum([i.hour < 12 for i in dates])
after_12 = sum([i.hour >= 12 for i in dates])
print("Number of connections before 12.00: {}".format(before_12))
print("Number of connections after 12.00: {}".format(after_12))

# Count the percentage of label 1s before 12.00 and after 12.00
before_12_attacker = sum(
    (all_dfs["label"] == 1) & ([i.hour < 12 for i in dates])
)
after_12_attacker = sum(
    (all_dfs["label"] == 1) & ([i.hour >= 12 for i in dates])
)
print("Percentage of connections from/to attacker before 12.00: {:.2f}%".format(100 * before_12_attacker / before_12))
print("Percentage of connections from/to attacker after 12.00: {:.2f}%".format(100 * after_12_attacker / after_12))


Earliest date: 2018-02-27 07:18:26.348188
Latest date: 2018-03-02 19:39:51.398516
Number of connections before 12.00: 3005968
Number of connections after 12.00: 3197935
Percentage of connections from/to attacker before 12.00: 2.41%
Percentage of connections from/to attacker after 12.00: 2.21%


### Basic classification test

In [13]:
# Split the data into train and test by selecting connections before and after 12.00
train = all_dfs[[i.hour < 12 for i in dates]]
test = all_dfs[[i.hour >= 12 for i in dates]]

print("Number of connections in train: {}".format(len(train)))
print("Number of connections in test: {}".format(len(test)))

Number of connections in train: 3005968
Number of connections in test: 3197935


In [14]:
y_trn = train["label"]
y_tst = test["label"]

x_trn = train.drop(columns=["label", "ts", "uid", "id.orig_h", "id.resp_h", "service"])
x_tst = test.drop(columns=["label", "ts", "uid", "id.orig_h", "id.resp_h", "service"])
print("Initial number of features: {}".format(len(x_trn.columns)))
assert len(x_trn.columns) == len(x_tst.columns)

# Convert all '-' to 0
x_trn = x_trn.replace("-", 0)
x_tst = x_tst.replace("-", 0)

# One hot encode the categorical features
# categorical_features = ["proto", "service", "conn_state"]
categorical_features = ["proto", "conn_state"]
x_trn = pd.get_dummies(x_trn, columns=categorical_features)
x_tst = pd.get_dummies(x_tst, columns=categorical_features)
print("Number of features after one hot encoding: {}".format(len(x_trn.columns)))
assert len(x_trn.columns) == len(x_tst.columns)

Initial number of features: 9
Number of features after one hot encoding: 23


In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

In [16]:
model = RandomForestClassifier(n_estimators=100, n_jobs=-1)
model.fit(x_trn, y_trn)

In [17]:
preds = model.predict(x_tst)
print("Accuracy: {:.2f}%".format(100 * accuracy_score(y_tst, preds)))
print("F1 score: {:.2f}%".format(100 * f1_score(y_tst, preds)))
print("Confusion matrix:")
print(confusion_matrix(y_tst, preds))
print("Classification report:")
print(classification_report(y_tst, preds))

Accuracy: 99.99%
F1 score: 99.81%
Confusion matrix:
[[3127139       0]
 [    262   70534]]
Classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   3127139
           1       1.00      1.00      1.00     70796

    accuracy                           1.00   3197935
   macro avg       1.00      1.00      1.00   3197935
weighted avg       1.00      1.00      1.00   3197935



In [18]:
# Show the relative importance of each feature
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
for f in range(x_trn.shape[1]):
    print(
        "{:3}. feature {:20} {:5}".format(
            f + 1, x_trn.columns[indices[f]], importances[indices[f]]
        )
    )

Feature ranking:
  1. feature id.resp_p            0.3395010131447603
  2. feature resp_bytes           0.19752568100270224
  3. feature orig_bytes           0.12982895422095259
  4. feature resp_pkts            0.1000632226807775
  5. feature duration             0.09045377443538868
  6. feature orig_pkts            0.06281122706959101
  7. feature id.orig_p            0.037939579377925636
  8. feature conn_state_SF        0.016382729517927658
  9. feature proto_udp            0.00899408157562793
 10. feature conn_state_S0        0.0070575817545198045
 11. feature proto_tcp            0.006151088565793543
 12. feature conn_state_RSTR      0.0016898300303897925
 13. feature conn_state_RSTO      0.0006684459434866043
 14. feature conn_state_SHR       0.0005398960410000203
 15. feature conn_state_OTH       0.0002741171753579982
 16. feature conn_state_REJ       0.00011705769475004322
 17. feature conn_state_RSTRH     6.174512820872116e-07
 18. feature proto_icmp           4.9990820589351

In [19]:
# Distribution of ports used by the attacker
attacker_ports = all_dfs[all_dfs["label"] == 1]["id.resp_p"]
print("Number of unique ports used by the attacker: {}".format(len(attacker_ports.unique())))
print("Most common ports used by the attacker: {}".format(attacker_ports.value_counts().head(10)))

Number of unique ports used by the attacker: 1
Most common ports used by the attacker: 8080    143183
Name: id.resp_p, dtype: int64


In [20]:
other_ports = all_dfs[all_dfs["label"] == 0]["id.resp_p"]
print("Number of unique ports used by other hosts: {}".format(len(other_ports.unique())))


Number of unique ports used by other hosts: 12543


In [21]:
# Number of times non attacker hosts used port 8080
print("Number of times non attacker hosts used port 8080: {}".format(sum(other_ports == "8080")))

Number of times non attacker hosts used port 8080: 3711


## Infiltration task

In [22]:
base_dir = f"{base_path}/cic-ids-2018/thursday_01-03-2018_zeek"

In [23]:
tot_dirs = os.listdir(base_dir)
print("Found {} directories".format(len(tot_dirs)))

Found 443 directories


In [24]:
all_dfs = []

for d in tot_dirs:
    # Load the conn.log file in the directory
    df = read_conn_log(os.path.join(base_dir, d))
    all_dfs.append(df)


In [25]:
all_dfs = pd.concat(all_dfs)
display(all_dfs)
print("Total number of connections: {}".format(len(all_dfs)))

Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,resp_bytes,conn_state,orig_pkts,resp_pkts
0,1519906857.491276,CIzFxH2nx1XUxB9kKf,5.188.11.111,57580,172.31.69.15,4010,tcp,-,0.101403,0,0,REJ,2,1
1,1519906870.368963,CGATlmcUF8O9KObL,79.107.241.121,27485,172.31.69.15,23,tcp,-,0.000019,0,0,REJ,1,1
2,1519906899.453201,CG7sSd20gKfsNBvo49,197.50.232.198,54948,172.31.69.15,445,tcp,-,0.000023,0,0,REJ,1,1
3,1519906900.130016,C90Aiq9qL8NlfTcLf,197.50.232.198,54948,172.31.69.15,445,tcp,-,0.000021,0,0,REJ,1,1
4,1519906928.566989,CNJsHP2rxPibF2t5ij,139.215.216.36,31911,172.31.69.15,23,tcp,-,0.000019,0,0,REJ,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7249,1519940226.793272,C5zsxL1DAB064ujx5,176.99.158.199,54672,172.31.67.98,3389,tcp,ssl,14.068993,1148,1544,S1,9,8
7250,1519940223.972522,CokJSM3kYi2KUY79D6,69.16.232.48,61971,172.31.67.98,3389,tcp,ssl,12.944852,1132,1581,RSTO,9,8
7251,1519940215.796293,CtLwWv2ZwPLKn3Bz5j,172.31.67.98,51019,169.254.169.254,80,tcp,http,5.765294,97,231,S3,5,4
7252,1519940185.740850,CvA3Ab3nvDun4I6cza,172.31.67.98,51011,34.211.99.53,443,tcp,ssl,48.995335,863,3275,S1,12,11


Total number of connections: 6662591


In [26]:
# Remove all rows where id.orig_h == id.resp_h
all_dfs = all_dfs[all_dfs["id.orig_h"] != all_dfs["id.resp_h"]]
print("Remaining connections: {}".format(len(all_dfs)))
# Remove all rows where either id.orig_h or id.resp_h is an IPV6 address
all_dfs = all_dfs[
    ~all_dfs["id.orig_h"].str.contains(":") & ~all_dfs["id.resp_h"].str.contains(":")
]
print("Remaining connections: {}".format(len(all_dfs)))

Remaining connections: 6662591
Remaining connections: 6621362


In [27]:
attacker_ip = "13.58.225.34"

In [28]:
# Create a vector of 1s and 0s, where 1 indicates that the connection is either from or to the attacker
is_attacker = (all_dfs["id.orig_h"] == attacker_ip) | (all_dfs["id.resp_h"] == attacker_ip)
print("Number of connections from/to attacker: {}".format(sum(is_attacker)))

all_dfs["label"] = is_attacker.astype(int)

# Print the percentage of connections from/to the attacker
print("Percentage of connections from/to attacker: {:.2f}%".format(100 * sum(is_attacker) / len(all_dfs)))

Number of connections from/to attacker: 4
Percentage of connections from/to attacker: 0.00%


There are only 4 connections to and from the attacker host. So we will include the compromised machines

In [29]:
additional_attacker_ips = ["18.216.254.154", "172.31.69.13"]
attacker_ips = [attacker_ip] + additional_attacker_ips
print("Attacker IPs: {}".format(attacker_ips))

Attacker IPs: ['13.58.225.34', '18.216.254.154', '172.31.69.13']


In [30]:
# Create a vector of 1s and 0s, where 1 indicates that the connection is either from or to one of the attacker IPs
is_attacker = all_dfs["id.orig_h"].isin(attacker_ips) | all_dfs["id.resp_h"].isin(attacker_ips)
print("Number of connections from/to attacker: {}".format(sum(is_attacker)))

all_dfs["label"] = is_attacker.astype(int)

# Print the percentage of connections from/to the attacker
print("Percentage of connections from/to attacker: {:.2f}%".format(100 * sum(is_attacker) / len(all_dfs)))


Number of connections from/to attacker: 272004
Percentage of connections from/to attacker: 4.11%


In [31]:
# Convert the timestamp to a datetime object
dates = [datetime.fromtimestamp(float(i)) for i in all_dfs["ts"]]

In [32]:
# Find earliest and latest date
print("Earliest date: {}".format(min(dates)))
print("Latest date: {}".format(max(dates)))
# Count the number of dates before 12.00 and after 12.00
before_12 = sum([i.hour < 12 for i in dates])
after_12 = sum([i.hour >= 12 for i in dates])
print("Number of connections before 12.00: {}".format(before_12))
print("Number of connections after 12.00: {}".format(after_12))

# Count the percentage of label 1s before 12.00 and after 12.00
before_12_attacker = sum(
    (all_dfs["label"] == 1) & ([i.hour < 12 for i in dates])
)
after_12_attacker = sum(
    (all_dfs["label"] == 1) & ([i.hour >= 12 for i in dates])
)
print("Percentage of connections from/to attacker before 12.00: {:.2f}%".format(100 * before_12_attacker / before_12))
print("Percentage of connections from/to attacker after 12.00: {:.2f}%".format(100 * after_12_attacker / after_12))


Earliest date: 2018-02-27 07:18:26.348188
Latest date: 2018-03-01 18:40:14.412141
Number of connections before 12.00: 3446980
Number of connections after 12.00: 3174382
Percentage of connections from/to attacker before 12.00: 4.45%
Percentage of connections from/to attacker after 12.00: 3.73%


### Basic classification test

In [33]:
# Split the data into train and test by selecting connections before and after 12.00
train = all_dfs[[i.hour < 12 for i in dates]]
test = all_dfs[[i.hour >= 12 for i in dates]]

print("Number of connections in train: {}".format(len(train)))
print("Number of connections in test: {}".format(len(test)))

Number of connections in train: 3446980
Number of connections in test: 3174382


In [34]:
y_trn = train["label"]
y_tst = test["label"]

x_trn = train.drop(columns=["label", "ts", "uid", "id.orig_h", "id.resp_h", "service"])
x_tst = test.drop(columns=["label", "ts", "uid", "id.orig_h", "id.resp_h", "service"])
print("Initial number of features: {}".format(len(x_trn.columns)))
assert len(x_trn.columns) == len(x_tst.columns)

# Convert all '-' to 0
x_trn = x_trn.replace("-", 0)
x_tst = x_tst.replace("-", 0)

# One hot encode the categorical features
# categorical_features = ["proto", "service", "conn_state"]
categorical_features = ["proto", "conn_state"]
x_trn = pd.get_dummies(x_trn, columns=categorical_features)
x_tst = pd.get_dummies(x_tst, columns=categorical_features)
print("Number of features after one hot encoding: {}".format(len(x_trn.columns)))
assert len(x_trn.columns) == len(x_tst.columns)

Initial number of features: 9
Number of features after one hot encoding: 23


In [35]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

In [36]:
model = RandomForestClassifier(n_estimators=100, n_jobs=-1)
model.fit(x_trn, y_trn)

In [37]:
preds = model.predict(x_tst)
print("Accuracy: {:.2f}%".format(100 * accuracy_score(y_tst, preds)))
print("F1 score: {:.2f}%".format(100 * f1_score(y_tst, preds)))
print("Confusion matrix:")
print(confusion_matrix(y_tst, preds))
print("Classification report:")
print(classification_report(y_tst, preds))

Accuracy: 97.60%
F1 score: 55.58%
Confusion matrix:
[[3050512    5423]
 [  70778   47669]]
Classification report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99   3055935
           1       0.90      0.40      0.56    118447

    accuracy                           0.98   3174382
   macro avg       0.94      0.70      0.77   3174382
weighted avg       0.97      0.98      0.97   3174382



In [38]:
# Show the relative importance of each feature
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
for f in range(x_trn.shape[1]):
    print(
        "{:3}. feature {:20} {:5}".format(
            f + 1, x_trn.columns[indices[f]], importances[indices[f]]
        )
    )

Feature ranking:
  1. feature id.orig_p            0.3304765026789903
  2. feature id.resp_p            0.23358243069753423
  3. feature duration             0.13804234174519098
  4. feature orig_bytes           0.05588194396147436
  5. feature conn_state_S0        0.054853079992168804
  6. feature resp_pkts            0.053724113731849855
  7. feature resp_bytes           0.038037449766766145
  8. feature orig_pkts            0.027337015114869884
  9. feature proto_udp            0.017048351207917496
 10. feature conn_state_REJ       0.014956929612303186
 11. feature conn_state_SF        0.013976175018295093
 12. feature proto_tcp            0.011923758074839657
 13. feature conn_state_RSTR      0.0030629280609513134
 14. feature conn_state_OTH       0.002466631721097827
 15. feature conn_state_RSTO      0.0013162574936578322
 16. feature conn_state_SHR       0.0011328213145896464
 17. feature conn_state_S1        0.001103605255861409
 18. feature conn_state_SH        0.00078058029400

## Models

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from poisnet import constants, data_utils, utils

In [12]:
scenario_tag = constants.cicids_botnet_tag

BOTNET_IPS = {
    "friday_02-03-2018_morning": ("18.219.211.138"),
    "friday_02-03-2018_afternoon": ("18.219.211.138"),
}

INTERNAL = ("172.31.", "18.219.211.138")
model_types = ["GradientBoosting", "FFNN"]


In [10]:
train_captures = constants.subscenarios[scenario_tag]["train"]
test_captures = constants.subscenarios[scenario_tag]["test"]
train_name = "friday_02-03-2018_morning"
test_name = "friday_02-03-2018_afternoon"

In [11]:
train_cl = pd.read_csv(
    os.path.join(constants.cicids_base_pth, train_name, "conn_log.csv")
)
test_cl = pd.read_csv(
    os.path.join(constants.cicids_base_pth, test_name, "conn_log.csv")
)
train_cl.sort_values(by="ts", inplace=True)
test_cl.sort_values(by="ts", inplace=True)
print(f"{train_name} shape:", train_cl.shape)
print(f"{test_name} shape:", test_cl.shape)
train_cl_reference = train_cl.copy()
test_cl_reference = test_cl.copy()
all_cl = {
    "friday_02-03-2018_morning": train_cl,
    "friday_02-03-2018_afternoon": test_cl,
}

# Extract aggregated features from the conn.log files
df_train, labels_train, rows_train = data_utils.process_zeek_csv(
    train_cl,
    internal_prefixes=INTERNAL,
    attacker_ips=BOTNET_IPS[train_name],
    t_window=30,
    remove_int_int=False,
)
print("\ntrain shape: {}".format(df_train.shape))
print("train labels: {}".format(labels_train.shape))
print("train labels: {}".format(np.unique(labels_train, return_counts=True)))
print("train rows: {}".format(rows_train.shape))

df_test, labels_test, rows_test = data_utils.process_zeek_csv(
    test_cl,
    internal_prefixes=INTERNAL,
    attacker_ips=BOTNET_IPS[test_name],
    t_window=30,
    remove_int_int=False,
)
print("\ntest shape: {}".format(df_test.shape))
print("test labels: {}".format(labels_test.shape))
print("test labels: {}".format(np.unique(labels_test, return_counts=True)))
print("test rows: {}".format(rows_test.shape))
assert np.array_equal(df_train.columns.to_numpy(), df_test.columns.to_numpy())


all_df = {
    "friday_02-03-2018_morning": df_train,
    "friday_02-03-2018_afternoon": df_test,
}
all_labels = {
    "friday_02-03-2018_morning": labels_train,
    "friday_02-03-2018_afternoon": labels_test,
}
all_rows = {
    "friday_02-03-2018_morning": rows_train,
    "friday_02-03-2018_afternoon": rows_test,
}

friday_02-03-2018_morning shape: (3005968, 14)
friday_02-03-2018_afternoon shape: (3197935, 14)
Current conn log shape: (3005968, 14)
Added orig_row column: (3005968, 15)
Removed NaN values: (3005968, 15)
Removed IPV6 addresses: (3005968, 15)
Removed external connections: (3005858, 15)
src_df shape:  (1685404, 21)
dst_df shape:  (1320454, 21)
src_agg shape:  (196442, 37)
dst_agg shape:  (294750, 37)

train shape: (213059, 1152)
train labels: (213059,)
train labels: (array([0, 1]), array([212725,    334]))
train rows: (213059,)
Current conn log shape: (3197935, 14)
Added orig_row column: (3197935, 15)
Removed NaN values: (3197935, 15)
Removed IPV6 addresses: (3197935, 15)
Removed external connections: (3197848, 15)
src_df shape:  (1672708, 21)
dst_df shape:  (1525140, 21)
src_agg shape:  (205075, 37)
dst_agg shape:  (316142, 37)

test shape: (229253, 1152)
test labels: (229253,)
test labels: (array([0, 1]), array([228903,    350]))
test rows: (229253,)


In [17]:
from collections import defaultdict
from sklearn.model_selection import train_test_split

trn_accs = defaultdict(list)
trn_f1s = defaultdict(list)
trn_precs = defaultdict(list)
trn_recalls = defaultdict(list)
tst_accs = defaultdict(list)
tst_f1s = defaultdict(list)
tst_precs = defaultdict(list)
tst_recalls = defaultdict(list)

for seed in range(5):
    for model_type in model_types:

        orig_x_train = np.concatenate([all_df[tc].values for tc in train_captures])
        orig_y_train = np.concatenate([all_labels[tc] for tc in train_captures])
        orig_rows_train = np.concatenate([all_rows[tc] for tc in train_captures])
        orig_src_train = np.concatenate(
            [np.full(all_df[tc].shape[0], tc) for tc in train_captures]
        )

        # Test and adversarial datasets
        tst_cp = test_captures[0]  # There is only one test capture
        tst_indices, adv_indices, = train_test_split(
            np.arange(all_df[tst_cp].values.shape[0]),
            test_size=0.15,
            random_state=seed,
            stratify=all_labels[tst_cp],
        )
        tst_indices = np.sort(tst_indices)
        adv_indices = np.sort(adv_indices)
        orig_x_test = all_df[tst_cp].values[tst_indices]
        orig_y_test = all_labels[tst_cp][tst_indices]
        x_adv = all_df[tst_cp].values[adv_indices]
        y_adv = all_labels[tst_cp][adv_indices]

        orig_model = utils.train_model(
            model_type=model_type,
            x_trn=orig_x_train,
            y_trn=orig_y_train,
            save_pth=None,
            random_state=seed,
        )

        # Evaluate the original model
        orig_train_pred = orig_model.predict(orig_x_train)
        orig_test_pred = orig_model.predict(orig_x_test)

        orig_train_acc = accuracy_score(orig_y_train, orig_train_pred)
        orig_test_acc = accuracy_score(orig_y_test, orig_test_pred)
        orig_train_f1 = f1_score(orig_y_train, orig_train_pred)
        orig_test_f1 = f1_score(orig_y_test, orig_test_pred)
        orig_train_prec = precision_score(orig_y_train, orig_train_pred)
        orig_test_prec = precision_score(orig_y_test, orig_test_pred)
        orig_train_rec = recall_score(orig_y_train, orig_train_pred)
        orig_test_rec = recall_score(orig_y_test, orig_test_pred)
        
        trn_accs[model_type].append(orig_train_acc)
        trn_f1s[model_type].append(orig_train_f1)
        trn_precs[model_type].append(orig_train_prec)
        trn_recalls[model_type].append(orig_train_rec)
        
        tst_accs[model_type].append(orig_test_acc)
        tst_f1s[model_type].append(orig_test_f1)
        tst_precs[model_type].append(orig_test_prec)
        tst_recalls[model_type].append(orig_test_rec)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [18]:
print("Train Accuracies")
for model_type in model_types:
    print("{}: {}".format(model_type, np.mean(trn_accs[model_type])))
print("Train F1s")
for model_type in model_types:
    print("{}: {}".format(model_type, np.mean(trn_f1s[model_type])))
print("Train Precisions")
for model_type in model_types:
    print("{}: {}".format(model_type, np.mean(trn_precs[model_type])))
print("Train Recalls")
for model_type in model_types:
    print("{}: {}".format(model_type, np.mean(trn_recalls[model_type])))
print()
print("Test Accuracies")
for model_type in model_types:
    print("{}: {}".format(model_type, np.mean(tst_accs[model_type])))
print("Test F1s")
for model_type in model_types:
    print("{}: {}".format(model_type, np.mean(tst_f1s[model_type])))
print("Test Precisions")
for model_type in model_types:
    print("{}: {}".format(model_type, np.mean(tst_precs[model_type])))
print("Test Recalls")
for model_type in model_types:
    print("{}: {}".format(model_type, np.mean(tst_recalls[model_type])))



Train Accuracies
GradientBoosting: 0.9999887355145758
FFNN: 0.9999953064644066
Train F1s
GradientBoosting: 0.9963515958915521
FFNN: 0.9985007496251874
Train Precisions
GradientBoosting: 1.0
FFNN: 1.0
Train Recalls
GradientBoosting: 0.9928143712574851
FFNN: 0.9970059880239521

Test Accuracies
GradientBoosting: 0.9999804993200421
FFNN: 0.9999846047263489
Test F1s
GradientBoosting: 0.9936391135214663
FFNN: 0.9949295626960601
Test Precisions
GradientBoosting: 0.9927148643275959
FFNN: 0.9986486332249044
Test Recalls
GradientBoosting: 0.9946127946127946
FFNN: 0.9912457912457914
