In [0]:
## This file handles the processing of the USTC data
## This implements SMOTE in order to balance the malicious 
## and benign data flows per type of data
## it further makes use of clutering in order to get the 
## correct admixture of data when resampling
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [0]:
# Move to drive
%cd drive
%cd 'My Drive'
%cd Oxford-Thesis

/content/drive
/content/drive/My Drive
/content/drive/My Drive/Oxford-Thesis


In [0]:
# Imports

from __future__ import absolute_import, division, print_function
import numpy as np
import os
import re
import tensorflow as tf
import time
import sys
import pickle
import scipy
import pandas as pd
from imblearn.over_sampling import SMOTE
import numpy as np
import math
import json
import gzip



In [0]:
### LOAD IN BENIGN EMBEDDINGS ## 
continuous_benign_flows = []

import pickle
with open('./embedding/CNN_FULL/time_benign_encodings_CNN', 'rb') as fp:
  benign_flows_timings = pickle.load(fp)
  fp.close()
benign_pkt_times = []
benign_directions = []
benign_pkt_dir_sizes = []
continuous_benign_flows = []

print(len(benign_flows_timings))
with open('./embedding/CNN_FULL/metadata_benign_encodings_CNN', 'rb') as fp:
  benign_flows = pickle.load(fp)
  fp.close()
continuous_mal_flows =[]

for file_index in range(len(benign_flows_timings)):
  for key in benign_flows_timings[file_index]:
    benign_pkt_times.append(benign_flows_timings[file_index][key][0])
    benign_directions.append(benign_flows_timings[file_index][key][1])
    benign_pkt_dir_sizes.append(benign_flows_timings[file_index][key][2])
    continuous_benign_flows.append(benign_flows[file_index][key][1])
print(len(continuous_benign_flows))

14
355956


In [0]:
### LOAD IN BENIGN EMBEDDINGS ## 
import pickle
with open('./embedding/CNN_FULL/time_mal_encodings_CNN', 'rb') as fp:
  malware_flows_timings = pickle.load(fp)
  fp.close()
mal_pkt_times = []
mal_directions = []
mal_pkt_dir_sizes = []
continuous_mal_flows = []
CONT_FEATURE_SIZE = 158
with open('./embedding/CNN_FULL/metadata_mal_encodings_CNN', 'rb') as fp:
  malware_flows = pickle.load(fp)
  fp.close()
continuous_mal_flows =[]

for file_index in range(len(malware_flows_timings)):
  for key in malware_flows_timings[file_index]:
    mal_pkt_times.append(malware_flows_timings[file_index][key][0])
    mal_directions.append(malware_flows_timings[file_index][key][1])
    mal_pkt_dir_sizes.append(malware_flows_timings[file_index][key][2])
    continuous_mal_flows.append(malware_flows[file_index][key][1])

In [0]:
continuous_mal_flows = np.array(continuous_mal_flows)
continuous_benign_flows = np.array(continuous_benign_flows)
continuous_flows = np.concatenate([continuous_benign_flows,continuous_mal_flows])

355956
183787
(539743, 158)


In [0]:
nsis_label = [[1,0,0,0,0,0,0,0,0,0]]
geodo_label = [[0,1,0,0,0,0,0,0,0,0]]
virut_label = [[0,0,1,0,0,0,0,0,0,0]]
shifu_label = [[0,0,0,1,0,0,0,0,0,0]]
cridex_label = [[0,0,0,0,1,0,0,0,0,0]]
tinba_label = [[0,0,0,0,0,1,0,0,0,0]]
miuref_label = [[0,0,0,0,0,0,1,0,0,0]]
zeus_label = [[0,0,0,0,0,0,0,1,0,0]]
htbot_label = [[0,0,0,0,0,0,0,0,1,0]]
neris_label = [[0,0,0,0,0,0,0,0,0,1]]

In [0]:
quantile_values = []

In [0]:
CONT_FEATURE_SIZE = 158
for i in range(0,CONT_FEATURE_SIZE):
  quantile_values.append(np.quantile(continuous_flows[:,i],.999))

In [0]:
NUM_VIRUT = 33559
NUM_NERIS = 34322
NUM_GEODO = 41686
NUM_NSIS = 6431
NUM_MIUREF = 13804
NUM_TINBA = 8997
NUM_ZEUS = 10993
NUM_HTBOT = 7265
NUM_SHIFU = 10345
NUM_CRIDEX = 16385

final_mal_flows = []
final_time_mal_flows = []
final_pkt_mal_flow =[]
final_dir_mal_flows = []
final_mal_flow_labels = []

final_benign_flows = []
final_time_benign_flows = []
final_pkt_benign_flow =[]
final_dir_benign_flows = []

INTER1 = NUM_NSIS +NUM_GEODO+NUM_VIRUT + NUM_SHIFU
INTER2 = INTER1+ NUM_CRIDEX +NUM_TINBA+NUM_MIUREF+NUM_ZEUS

In [0]:
### Adjust Timings ###
for flowset_times in mal_pkt_times:
  base_time = flowset_times[0]
  for index in range(len(flowset_times)):
    flowset_times[index] = flowset_times[index] - base_time
  
for flowset_times in benign_pkt_times:
  base_time = flowset_times[0]
  for index in range(len(flowset_times)):
    flowset_times[index] = flowset_times[index] - base_time

In [0]:
## Normalize ##
max_time = 0 
for flow in mal_pkt_times:
  for time in flow:
    if time > max_time:
      max_time = time 
for flow in benign_pkt_times:
  for time in flow:
    if time > max_time:
      max_time = time 
      
max_size = 0 
for flow in mal_pkt_dir_sizes:
  for size in flow:
    if size > max_size:
      max_size = size 
for flow in benign_pkt_dir_sizes:
  for size in flow:
    if size > max_time:
      max_size = size 


for flow in mal_pkt_times:
  for index in range(len(flow)):
    flow[index] = flow[index]/max_time
    
for flow in benign_pkt_times:
  for index in range(len(flow)):
    flow[index] = flow[index]/max_time
      
for flow in mal_pkt_dir_sizes:
  for index in range(len(flow)):
    flow[index] = flow[index]/max_size
for flow in benign_pkt_dir_sizes:
  for index in range(len(flow)):
    flow[index] = flow[index]/max_size

here
here
here
here
here
here


In [0]:
## Normalize to the correct size 
NUM_PACKETS = 64
for flow in mal_pkt_times:
  while len(flow) < NUM_PACKETS:
    flow.append(0)
print("HERE")
for flow in benign_pkt_times:
  while len(flow) < NUM_PACKETS:
    flow.append(0)
print("HERE")         
for flow in mal_pkt_dir_sizes:
  while len(flow) < NUM_PACKETS:
    flow.append(0)
print("HERE")
for flow in benign_pkt_dir_sizes:
  while len(flow) < NUM_PACKETS:
    flow.append(0)
print("HERE")    
for flow in mal_directions:
  while len(flow) < NUM_PACKETS:
    flow.append(0)
print("HERE")    
for flow in benign_directions:
  while len(flow) < NUM_PACKETS:
    flow.append(0)
print("HERE")    

HERE
HERE
HERE
HERE
HERE
HERE


In [0]:
## Remove outlier flow that distrub classification 
final_mal_flows = []
final_time_mal_flows = []
final_pkt_mal_flow = [] 
final_dir_mal_flows = []
for index in range(len(continuous_mal_flows)):
  flow = continuous_mal_flows[index]
  keep_flow = True
  for feature_index in range(len(continuous_mal_flows[index])):
    if flow[feature_index] > quantile_values[feature_index]:
      keep_flow =False
  if keep_flow:
    final_mal_flows.append(flow)
    final_time_mal_flows.append(mal_pkt_times[index])
    final_pkt_mal_flow.append(mal_pkt_dir_sizes[index])
    final_dir_mal_flows.append(mal_directions[index])
    if index < NUM_NSIS:
      final_mal_flow_labels.append(nsis_label)
    elif index < NUM_NSIS+ NUM_GEODO:
      final_mal_flow_labels.append(geodo_label)
    elif index < NUM_NSIS +NUM_GEODO+NUM_VIRUT:
      final_mal_flow_labels.append(virut_label)
    elif index <  NUM_NSIS +NUM_GEODO+NUM_VIRUT + NUM_SHIFU:
      final_mal_flow_labels.append(shifu_label)
    elif index < INTER1 + NUM_CRIDEX:
      final_mal_flow_labels.append(cridex_label)
    elif index < INTER1+ NUM_CRIDEX +NUM_TINBA:
      final_mal_flow_labels.append(tinba_label)
    elif index < INTER1+ NUM_CRIDEX +NUM_TINBA+NUM_MIUREF:
      final_mal_flow_labels.append(miuref_label)
    elif index < INTER1+ NUM_CRIDEX +NUM_TINBA+NUM_MIUREF+NUM_ZEUS:
      final_mal_flow_labels.append(zeus_label)
    elif index < INTER2 +NUM_HTBOT:
      final_mal_flow_labels.append(htbot_label)
    elif index < INTER2+ NUM_HTBOT + NUM_NERIS:
      final_mal_flow_labels.append(neris_label)
  



In [0]:
## Get normalized flows 
final_benign_flows = []
final_time_benign_flows = []
final_pkt_benign_flow = []
final_dir_benign_flows =[]
num = 0
for index in range(len(continuous_benign_flows)):
  flow = continuous_benign_flows[index]
  keep_flow = True
  for feature_index in range(len(continuous_benign_flows[index])):
    if flow[feature_index] > quantile_values[feature_index]:
      keep_flow =False
      num +=1 
  if keep_flow:
    final_benign_flows.append(flow)
    final_time_benign_flows.append(benign_pkt_times[index])
    final_pkt_benign_flow.append(benign_pkt_dir_sizes[index])
    final_dir_benign_flows.append(benign_directions[index])


In [0]:
## Get max values for evaluation later on 
max_values = []
CONT_FEATURE_SIZE = 158
for i in range(0,CONT_FEATURE_SIZE):
  max_value = np.max(continuous_flows[:,i])
  max_values.append(max_value)

In [0]:
##  Perform final normalization 
continuous_flows = np.concatenate([final_benign_flows,final_mal_flows])
CONT_FEATURE_SIZE = 158
for i in range(0,CONT_FEATURE_SIZE):
  max_value = np.max(continuous_flows[:,i])
  min_value = np.min(continuous_flows[:,i])
  continuous_flows[:,i] = ((continuous_flows[:,i] - min_value)/(max_value-min_value +1e-12))
f_continuous_mal_flows = continuous_flows[len(final_benign_flows):]
f_continuous_benign_flows = continuous_flows[:len(final_benign_flows)]

In [0]:
## Get appropriate flows for each type of label 
nsis_label = [[1,0,0,0,0,0,0,0,0,0]]
geodo_label = [[0,1,0,0,0,0,0,0,0,0]]
virut_label = [[0,0,1,0,0,0,0,0,0,0]]
shifu_label = [[0,0,0,1,0,0,0,0,0,0]]
cridex_label = [[0,0,0,0,1,0,0,0,0,0]]
tinba_label = [[0,0,0,0,0,1,0,0,0,0]]
miuref_label = [[0,0,0,0,0,0,1,0,0,0]]
zeus_label = [[0,0,0,0,0,0,0,1,0,0]]
htbot_label = [[0,0,0,0,0,0,0,0,1,0]]
neris_label = [[0,0,0,0,0,0,0,0,0,1]]
NUM_NSIS = 0 
NUM_GEODO = 0 
NUM_VIRUT = 0 
NUM_SHIFU = 0 
NUM_CRIDEX = 0 
NUM_TINBA = 0 
NUM_MIUREF = 0 
NUM_ZEUS = 0 
NUM_HTBOT = 0 
NUM_NERIS = 0 
for label in final_mal_flow_labels:
  label_num = np.argmax(label)
  if label_num == 0:
    NUM_NSIS +=1 
  elif label_num == 1:
    NUM_GEODO +=1 
  elif label_num == 2:
    NUM_VIRUT +=1
  elif label_num == 3:
    NUM_SHIFU +=1
  elif label_num == 4:
    NUM_CRIDEX+=1
  elif label_num == 5:
    NUM_TINBA +=1
  elif label_num == 6:
    NUM_MIUREF+=1
  elif label_num == 7:
    NUM_ZEUS +=1
  elif label_num == 8:
    NUM_HTBOT+=1
  elif label_num == 9:
    NUM_NERIS+=1

In [0]:
nsis_flows_cont = f_continuous_mal_flows[:NUM_NSIS]
geodo_flows_cont= f_continuous_mal_flows[NUM_NSIS:NUM_NSIS+NUM_GEODO]
virut_flows_cont = f_continuous_mal_flows[NUM_NSIS+NUM_GEODO:NUM_NSIS+NUM_GEODO+NUM_VIRUT]
shifu_flows_cont = f_continuous_mal_flows[NUM_NSIS+NUM_GEODO+NUM_VIRUT:NUM_NSIS+NUM_GEODO+NUM_VIRUT+NUM_SHIFU]
intermediate_total =NUM_NSIS+NUM_GEODO+NUM_VIRUT+NUM_SHIFU
cridex_flows_cont = f_continuous_mal_flows[intermediate_total:intermediate_total+NUM_CRIDEX]
tinba_flows_cont = f_continuous_mal_flows[intermediate_total+NUM_CRIDEX:intermediate_total+NUM_CRIDEX+NUM_TINBA]
miuref_flows_cont = f_continuous_mal_flows[intermediate_total+NUM_CRIDEX+NUM_TINBA:intermediate_total+NUM_CRIDEX+NUM_TINBA+NUM_MIUREF]
intermediate_total =intermediate_total +NUM_CRIDEX+NUM_TINBA+NUM_MIUREF
zeus_flows_cont = f_continuous_mal_flows[intermediate_total:intermediate_total+NUM_ZEUS]
htbot_flows_cont = f_continuous_mal_flows[intermediate_total+NUM_ZEUS:intermediate_total+NUM_ZEUS+NUM_HTBOT]
neris_flows_cont = f_continuous_mal_flows[intermediate_total+NUM_ZEUS+NUM_HTBOT:intermediate_total+NUM_ZEUS+NUM_HTBOT+NUM_NERIS]

In [0]:
nsis_flows_size = final_pkt_mal_flow[:NUM_NSIS]
geodo_flows_size= final_pkt_mal_flow[NUM_NSIS:NUM_NSIS+NUM_GEODO]
virut_flows_size = final_pkt_mal_flow[NUM_NSIS+NUM_GEODO:NUM_NSIS+NUM_GEODO+NUM_VIRUT]
shifu_flows_size = final_pkt_mal_flow[NUM_NSIS+NUM_GEODO+NUM_VIRUT:NUM_NSIS+NUM_GEODO+NUM_VIRUT+NUM_SHIFU]
intermediate_total =NUM_NSIS+NUM_GEODO+NUM_VIRUT+NUM_SHIFU
cridex_flows_size = final_pkt_mal_flow[intermediate_total:intermediate_total+NUM_CRIDEX]
tinba_flows_size = final_pkt_mal_flow[intermediate_total+NUM_CRIDEX:intermediate_total+NUM_CRIDEX+NUM_TINBA]
miuref_flows_size = final_pkt_mal_flow[intermediate_total+NUM_CRIDEX+NUM_TINBA:intermediate_total+NUM_CRIDEX+NUM_TINBA+NUM_MIUREF]
intermediate_total =intermediate_total +NUM_CRIDEX+NUM_TINBA+NUM_MIUREF
zeus_flows_size = final_pkt_mal_flow[intermediate_total:intermediate_total+NUM_ZEUS]
htbot_flows_size = final_pkt_mal_flow[intermediate_total+NUM_ZEUS:intermediate_total+NUM_ZEUS+NUM_HTBOT]
neris_flows_size = final_pkt_mal_flow[intermediate_total+NUM_ZEUS+NUM_HTBOT:intermediate_total+NUM_ZEUS+NUM_HTBOT+NUM_NERIS]

In [0]:
nsis_flows_dir = final_dir_mal_flows[:NUM_NSIS]
geodo_flows_dir = final_dir_mal_flows[NUM_NSIS:NUM_NSIS+NUM_GEODO]
virut_flows_dir = final_dir_mal_flows[NUM_NSIS+NUM_GEODO:NUM_NSIS+NUM_GEODO+NUM_VIRUT]
shifu_flows_dir = final_dir_mal_flows[NUM_NSIS+NUM_GEODO+NUM_VIRUT:NUM_NSIS+NUM_GEODO+NUM_VIRUT+NUM_SHIFU]
intermediate_total =NUM_NSIS+NUM_GEODO+NUM_VIRUT+NUM_SHIFU
cridex_flows_dir = final_dir_mal_flows[intermediate_total:intermediate_total+NUM_CRIDEX]
tinba_flows_dir = final_dir_mal_flows[intermediate_total+NUM_CRIDEX:intermediate_total+NUM_CRIDEX+NUM_TINBA]
miuref_flows_dir = final_dir_mal_flows[intermediate_total+NUM_CRIDEX+NUM_TINBA:intermediate_total+NUM_CRIDEX+NUM_TINBA+NUM_MIUREF]
intermediate_total =intermediate_total +NUM_CRIDEX+NUM_TINBA+NUM_MIUREF
zeus_flows_dir = final_dir_mal_flows[intermediate_total:intermediate_total+NUM_ZEUS]
htbot_flows_dir = final_dir_mal_flows[intermediate_total+NUM_ZEUS:intermediate_total+NUM_ZEUS+NUM_HTBOT]
neris_flows_dir = final_dir_mal_flows[intermediate_total+NUM_ZEUS+NUM_HTBOT:intermediate_total+NUM_ZEUS+NUM_HTBOT+NUM_NERIS]

In [0]:
nsis_flows_time = final_time_mal_flows[:NUM_NSIS]
geodo_flows_time = final_time_mal_flows[NUM_NSIS:NUM_NSIS+NUM_GEODO]
virut_flows_time = final_time_mal_flows[NUM_NSIS+NUM_GEODO:NUM_NSIS+NUM_GEODO+NUM_VIRUT]
shifu_flows_time = final_time_mal_flows[NUM_NSIS+NUM_GEODO+NUM_VIRUT:NUM_NSIS+NUM_GEODO+NUM_VIRUT+NUM_SHIFU]
intermediate_total =NUM_NSIS+NUM_GEODO+NUM_VIRUT+NUM_SHIFU
cridex_flows_time = final_time_mal_flows[intermediate_total:intermediate_total+NUM_CRIDEX]
tinba_flows_time = final_time_mal_flows[intermediate_total+NUM_CRIDEX:intermediate_total+NUM_CRIDEX+NUM_TINBA]
miuref_flows_time = final_time_mal_flows[intermediate_total+NUM_CRIDEX+NUM_TINBA:intermediate_total+NUM_CRIDEX+NUM_TINBA+NUM_MIUREF]
intermediate_total =intermediate_total +NUM_CRIDEX+NUM_TINBA+NUM_MIUREF
zeus_flows_time = final_time_mal_flows[intermediate_total:intermediate_total+NUM_ZEUS]
htbot_flows_time = final_time_mal_flows[intermediate_total+NUM_ZEUS:intermediate_total+NUM_ZEUS+NUM_HTBOT]
neris_flows_time = final_time_mal_flows[intermediate_total+NUM_ZEUS+NUM_HTBOT:intermediate_total+NUM_ZEUS+NUM_HTBOT+NUM_NERIS]

In [0]:
## Get Training indices ##
TRAIN_SIZE = 0.9
nsis_index = np.random.choice(len(nsis_flows_cont), int(len(nsis_flows_cont)*TRAIN_SIZE), replace=False)
geodo_index = np.random.choice(len(geodo_flows_cont), int(len(geodo_flows_cont)*TRAIN_SIZE) , replace=False)
virut_index = np.random.choice(len(virut_flows_cont), int(len(virut_flows_cont)*TRAIN_SIZE) , replace=False)
shifu_index = np.random.choice(len(shifu_flows_cont), int(len(shifu_flows_cont)*TRAIN_SIZE) , replace=False)
cridex_index = np.random.choice(len(cridex_flows_cont), int(len(cridex_flows_cont)*TRAIN_SIZE) , replace=False)
tinba_index = np.random.choice(len(tinba_flows_cont), int(len(tinba_flows_cont)*TRAIN_SIZE) , replace=False)
miuref_index = np.random.choice(len(miuref_flows_cont), int(len(miuref_flows_cont)*TRAIN_SIZE) , replace=False)
zeus_index = np.random.choice(len(zeus_flows_cont), int(len(zeus_flows_cont)*TRAIN_SIZE) , replace=False)
htbot_index = np.random.choice(len(htbot_flows_cont), int(len(htbot_flows_cont)*TRAIN_SIZE) , replace=False)
neris_index = np.random.choice(len(neris_flows_cont), int(len(neris_flows_cont)*TRAIN_SIZE) , replace=False)
benign_index = np.random.choice(len(f_continuous_benign_flows), int(len(f_continuous_benign_flows)*TRAIN_SIZE) , replace=False)

In [0]:
## Create training and validation sets ##

## Continuous train and valid sets
nsis_train_cont = []
geodo_train_cont = []
virut_train_cont = []
shifu_train_cont = []
cridex_train_cont = []
tinba_train_cont = []
miuref_train_cont = []
zeus_train_cont = []
htbot_train_cont = []
neris_train_cont = []
benign_train_cont = []

nsis_valid_cont = []
geodo_valid_cont = []
virut_valid_cont = []
shifu_valid_cont = []
cridex_valid_cont = []
tinba_valid_cont = []
miuref_valid_cont = []
zeus_valid_cont = []
htbot_valid_cont = []
neris_valid_cont = []
benign_valid_cont = []

## Size train and valid sets
nsis_train_size = []
geodo_train_size = []
virut_train_size = []
shifu_train_size = []
cridex_train_size = []
tinba_train_size = []
miuref_train_size = []
zeus_train_size = []
htbot_train_size = []
neris_train_size = []
benign_train_size = []

nsis_valid_size = []
geodo_valid_size = []
virut_valid_size = []
shifu_valid_size = []
cridex_valid_size = []
tinba_valid_size = []
miuref_valid_size = []
zeus_valid_size = []
htbot_valid_size = []
neris_valid_size = []
benign_valid_size = []

## Dir train and valid sets
nsis_train_dir = []
geodo_train_dir = []
virut_train_dir = []
shifu_train_dir = []
cridex_train_dir = []
tinba_train_dir = []
miuref_train_dir = []
zeus_train_dir = []
htbot_train_dir = []
neris_train_dir = []
benign_train_dir = []

nsis_valid_dir = []
geodo_valid_dir = []
virut_valid_dir = []
shifu_valid_dir = []
cridex_valid_dir = []
tinba_valid_dir = []
miuref_valid_dir = []
zeus_valid_dir = []
htbot_valid_dir = []
neris_valid_dir = []
benign_valid_dir = []

## Time train and valid sets
nsis_train_time = []
geodo_train_time = []
virut_train_time = []
shifu_train_time = []
cridex_train_time = []
tinba_train_time = []
miuref_train_time = []
zeus_train_time = []
htbot_train_time = []
neris_train_time = []
benign_train_time = []

nsis_valid_time = []
geodo_valid_time = []
virut_valid_time = []
shifu_valid_time = []
cridex_valid_time = []
tinba_valid_time = []
miuref_valid_time = []
zeus_valid_time = []
htbot_valid_time = []
neris_valid_time = []
benign_valid_time = []



for index in range(len(nsis_flows_cont)):
  if index in nsis_index:
    nsis_train_cont.append(nsis_flows_cont[index])
    nsis_train_size.append(nsis_flows_size[index])
    nsis_train_dir.append(nsis_flows_dir[index])
    nsis_train_time.append(nsis_flows_time[index])
  else:
    nsis_valid_cont.append(nsis_flows_cont[index])
    nsis_valid_size.append(nsis_flows_size[index])
    nsis_valid_dir.append(nsis_flows_dir[index])
    nsis_valid_time.append(nsis_flows_time[index])

for index in range(len(geodo_flows_cont)):
  if index in geodo_index:
    geodo_train_cont.append(geodo_flows_cont[index])
    geodo_train_size.append(geodo_flows_size[index])
    geodo_train_dir.append(geodo_flows_dir[index])
    geodo_train_time.append(geodo_flows_time[index])
  else:    
    geodo_valid_cont.append(geodo_flows_cont[index])
    geodo_valid_size.append(geodo_flows_size[index])
    geodo_valid_dir.append(geodo_flows_dir[index])
    geodo_valid_time.append(geodo_flows_time[index])
    
for index in range(len(virut_flows_cont)):
  if index in virut_index:
    virut_train_cont.append(virut_flows_cont[index])
    virut_train_size.append(virut_flows_size[index])
    virut_train_dir.append(virut_flows_dir[index])
    virut_train_time.append(virut_flows_time[index])
  else:
    virut_valid_cont.append(virut_flows_cont[index])
    virut_valid_size.append(virut_flows_size[index])
    virut_valid_dir.append(virut_flows_dir[index])
    virut_valid_time.append(virut_flows_time[index])
    
for index in range(len(shifu_flows_cont)):
  if index in shifu_index:
    shifu_train_cont.append(shifu_flows_cont[index])
    shifu_train_size.append(shifu_flows_size[index])
    shifu_train_dir.append(shifu_flows_dir[index])
    shifu_train_time.append(shifu_flows_time[index])
  else:
    shifu_valid_cont.append(shifu_flows_cont[index])
    shifu_valid_size.append(shifu_flows_size[index])
    shifu_valid_dir.append(shifu_flows_dir[index])
    shifu_valid_time.append(shifu_flows_time[index])

for index in range(len(cridex_flows_cont)):
  if index in cridex_index:
    cridex_train_cont.append(cridex_flows_cont[index])
    cridex_train_size.append(cridex_flows_size[index])
    cridex_train_dir.append(cridex_flows_dir[index])
    cridex_train_time.append(cridex_flows_time[index])
  else:
    cridex_valid_cont.append(cridex_flows_cont[index])
    cridex_valid_size.append(cridex_flows_size[index])
    cridex_valid_dir.append(cridex_flows_dir[index])
    cridex_valid_time.append(cridex_flows_time[index])
    
for index in range(len(tinba_flows_cont)):
  if index in tinba_index:
    tinba_train_cont.append(tinba_flows_cont[index])
    tinba_train_size.append(tinba_flows_size[index])
    tinba_train_dir.append(tinba_flows_dir[index])
    tinba_train_time.append(tinba_flows_dir[index])
  else:
    tinba_valid_cont.append(tinba_flows_cont[index])
    tinba_valid_size.append(tinba_flows_size[index])
    tinba_valid_dir.append(tinba_flows_dir[index])
    tinba_valid_time.append(tinba_flows_time[index])
    

for index in range(len(miuref_flows_cont)):
  if index in miuref_index:
    miuref_train_cont.append(miuref_flows_cont[index])
    miuref_train_size.append(miuref_flows_size[index])
    miuref_train_dir.append(miuref_flows_dir[index])
    miuref_train_time.append(miuref_flows_time[index])
  else:
    miuref_valid_cont.append(miuref_flows_cont[index])
    miuref_valid_size.append(miuref_flows_size[index])
    miuref_valid_dir.append(miuref_flows_dir[index])
    miuref_valid_time.append(miuref_flows_time[index])

for index in range(len(zeus_flows_cont)):
  if index in zeus_index:
    zeus_train_cont.append(zeus_flows_cont[index])
    zeus_train_size.append(zeus_flows_size[index])
    zeus_train_dir.append(zeus_flows_dir[index])
    zeus_train_time.append(zeus_flows_time[index])
  else:
    zeus_valid_cont.append(zeus_flows_cont[index])
    zeus_valid_size.append(zeus_flows_size[index])
    zeus_valid_dir.append(zeus_flows_dir[index])
    zeus_valid_time.append(zeus_flows_time[index])
    
for index in range(len(htbot_flows_cont)):
  if index in htbot_index:
    htbot_train_cont.append(htbot_flows_cont[index])
    htbot_train_size.append(htbot_flows_size[index])
    htbot_train_dir.append(htbot_flows_dir[index])
    htbot_train_time.append(htbot_flows_time[index])
  else:
    htbot_valid_cont.append(htbot_flows_cont[index])
    htbot_valid_size.append(htbot_flows_size[index])
    htbot_valid_dir.append(htbot_flows_dir[index])
    htbot_valid_time.append(htbot_flows_time[index])

for index in range(len(neris_flows_cont)):
  if index in neris_index:
    neris_train_cont.append(neris_flows_cont[index])
    neris_train_size.append(neris_flows_size[index])
    neris_train_dir.append(neris_flows_dir[index])
    neris_train_time.append(neris_flows_time[index])
  else:
    neris_valid_cont.append(neris_flows_cont[index])
    neris_valid_size.append(neris_flows_size[index])
    neris_valid_dir.append(neris_flows_dir[index])
    neris_valid_time.append(neris_flows_time[index])
    
    
for index in range(len(f_continuous_benign_flows)):
  if index in benign_index:
    benign_train_cont.append(f_continuous_benign_flows[index])
    benign_train_size.append(final_pkt_benign_flow[index])
    benign_train_dir.append(final_dir_benign_flows[index])
    benign_train_time.append(final_time_benign_flows[index])
  else:
    benign_valid_cont.append(f_continuous_benign_flows[index])
    benign_valid_size.append(final_pkt_benign_flow[index])
    benign_valid_dir.append(final_dir_benign_flows[index])
    benign_valid_time.append(final_time_benign_flows[index])


In [0]:
## Method for performing clustering and SMOTE 
from sklearn.utils import resample
from sklearn.cluster import KMeans
import math
def cluster_resample(flows,time_flows,size_flows,dir_flows,benign_flows, num_clusters):
  NUM_SAMPLES = 5000

  actual_flows = np.concatenate([flows,time_flows,size_flows,dir_flows],axis=1)
  sorted_flows = []
  for i in range(num_clusters):
    sorted_flows.append([])
  ## sort in num clusters ##
  kmeans = KMeans(n_clusters=num_clusters)
  kmeans.fit(actual_flows)
  labels = kmeans.predict(actual_flows)
  centroids = kmeans.cluster_centers_
  for index in range(len(labels)):
    label_index = labels[index]
    sorted_flows[label_index].append(actual_flows[index])

  resampled_flows = []
  for i in range(num_clusters):
    resampled_flows.append([])

  for i in range(num_clusters):
    sampled_benign_flows = resample(benign_flows,
                                    replace=False, # sample with replacement
                                    n_samples=NUM_SAMPLES, # match number in majority class
                                    random_state=27) # reproducible results
    print(len(sorted_flows[i]))
    check = sorted_flows[i][0]
    y  = np.array([[0,1]]*len(sampled_benign_flows) +[[1,0]]*len(sorted_flows[i]))
    X = np.concatenate([sampled_benign_flows,sorted_flows[i]],axis = 0 )
    X_resampled, y_resampled = SMOTE().fit_resample(X, y) 
    resampled_flows[i] = X_resampled[NUM_SAMPLES:]
  final_flows = []
  final_time_flows =[]
  final_size_flows = []
  final_dir_flows = []
  for i in range(num_clusters):
    for flow in resampled_flows[i]:
      final_flows.append(flow[:158])  
      final_time_flows.append(flow[158:158+64])
      final_size_flows.append(flow[158+64:158+128])
      final_dir_flows.append(flow[158+128:158+128+64])
  print(len(final_flows))
  return final_flows,final_time_flows,final_size_flows,final_dir_flows

In [0]:
actual_benign_flows = np.concatenate([benign_train_cont,benign_train_time,benign_train_size,benign_train_dir],axis = 1)
re_nsis_train_cont,re_nsis_train_time,re_nsis_train_size,re_nsis_train_dir = cluster_resample(nsis_train_cont,nsis_train_time,nsis_train_size,nsis_train_dir,actual_benign_flows, 5)
re_geodo_train_cont,re_geodo_train_time,re_geodo_train_size,re_geodo_train_dir = cluster_resample(geodo_train_cont,geodo_train_time,geodo_train_size,geodo_train_dir,actual_benign_flows, 5)
re_virut_train_cont,re_virut_train_time,re_virut_train_size,re_virut_train_dir = cluster_resample(virut_train_cont,virut_train_time,virut_train_size,virut_train_dir,actual_benign_flows, 5)
re_neris_train_cont,re_neris_train_time,re_neris_train_size,re_neris_train_dir = cluster_resample(neris_train_cont,neris_train_time,neris_train_size,neris_train_dir,actual_benign_flows, 3)
re_miuref_train_cont,re_miuref_train_time,re_miuref_train_size,re_miuref_train_dir = cluster_resample(miuref_train_cont,miuref_train_time,miuref_train_size,miuref_train_dir,actual_benign_flows, 5)
re_tinba_train_cont,re_tinba_train_time,re_tinba_train_size,re_tinba_train_dir = cluster_resample(tinba_train_cont,tinba_train_time,tinba_train_size,tinba_train_dir,actual_benign_flows, 2)
re_zeus_train_cont,re_zeus_train_time,re_zeus_train_size,re_zeus_train_dir = cluster_resample(zeus_train_cont,zeus_train_time,zeus_train_size,zeus_train_dir,actual_benign_flows, 4)
re_htbot_train_cont,re_htbot_train_time,re_htbot_train_size,re_htbot_train_dir = cluster_resample(htbot_train_cont,htbot_train_time,htbot_train_size,htbot_train_dir,actual_benign_flows, 4)
re_shifu_train_cont,re_shifu_train_time,re_shifu_train_size,re_shifu_train_dir = cluster_resample(shifu_train_cont,shifu_train_time,shifu_train_size,shifu_train_dir,actual_benign_flows, 6)
re_cridex_train_cont,re_cridex_train_time,re_cridex_train_size,re_cridex_train_dir = cluster_resample(cridex_train_cont,cridex_train_time,cridex_train_size,cridex_train_dir,actual_benign_flows, 4)

1214
726
2303
159
849
25000
1827
29489
1281
1759
2380
73978
7072
16614
1379
2402
1359
52372
6056
9057
14173
43572
2228
5733
1683
2347
189
26466
6847
1224
13694
197
4477
4630
397
20000
2296
1231
1454
804
20000
1689
2355
1665
319
2122
1141
30000
3931
7337
2125
1163
24674


In [0]:
## Get  the mal train labels 
labels = nsis_label*len(re_nsis_train_cont) + geodo_label*len(re_geodo_train_cont )+virut_label*len(re_virut_train_cont)
labels = labels+shifu_label*len(re_shifu_train_cont)+cridex_label*len(re_cridex_train_cont)+tinba_label*len(re_tinba_train_cont)
labels = labels+miuref_label*len(re_miuref_train_cont)+zeus_label*len(re_zeus_train_cont)+htbot_label*len(re_htbot_train_cont)
mal_train_labels = labels+neris_label*len(re_neris_train_cont)
print(len(mal_train_labels))

329756


In [0]:
## Train meta flow data ##
all_flows = list(re_nsis_train_cont)+ list(re_geodo_train_cont) +list(re_virut_train_cont)
all_flows = all_flows + list(re_shifu_train_cont) + list(re_cridex_train_cont)+ list(re_tinba_train_cont)
all_flows = all_flows + list(re_miuref_train_cont) + list(re_zeus_train_cont) + list(re_htbot_train_cont)+ list(re_neris_train_cont)
mal_train_meta_X = list(all_flows)
print(len(mal_train_meta_X))

329756


In [0]:
## Train packet size data ## 
all_flows = list(re_nsis_train_size)+ list(re_geodo_train_size) +list(re_virut_train_size)
all_flows = all_flows + list(re_shifu_train_size) + list(re_cridex_train_size)+ list(re_tinba_train_size)
all_flows = all_flows + list(re_miuref_train_size) + list(re_zeus_train_size) + list(re_htbot_train_size)+ list(re_neris_train_size)
mal_train_size_X = list(all_flows)
print(len(mal_train_size_X))

329756


In [0]:
## Train packet dir data ## 
all_flows = list(re_nsis_train_dir)+ list(re_geodo_train_dir) +list(re_virut_train_dir)
all_flows = all_flows + list(re_shifu_train_dir) + list(re_cridex_train_dir)+ list(re_tinba_train_dir)
all_flows = all_flows + list(re_miuref_train_dir) + list(re_zeus_train_dir) + list(re_htbot_train_dir)+ list(re_neris_train_dir)
mal_train_dir_X = list(all_flows)
print(len(mal_train_dir_X))

329756


In [0]:
## Train packet time data ## 
all_flows = list(re_nsis_train_time)+ list(re_geodo_train_time) +list(re_virut_train_time)
all_flows = all_flows + list(re_shifu_train_time) + list(re_cridex_train_time)+ list(re_tinba_train_time)
all_flows = all_flows + list(re_miuref_train_time) + list(re_zeus_train_time) + list(re_htbot_train_time)+ list(re_neris_train_time)
mal_train_time_X = list(all_flows)
print(len(mal_train_time_X))

329756


In [0]:
## valid meta flow data ##
all_flows = list(nsis_valid_cont)+ list(geodo_valid_cont) +list(virut_valid_cont)
all_flows = all_flows + list(shifu_valid_cont) + list(cridex_valid_cont)+ list(tinba_valid_cont)
all_flows = all_flows + list(miuref_valid_cont) + list(zeus_valid_cont) + list(htbot_valid_cont)+ list(neris_valid_cont)
mal_valid_meta_X = list(all_flows)
print(len(mal_valid_meta_X))

17747


In [0]:
## valid packet size data ## 
all_flows = list(nsis_valid_size)+ list(geodo_valid_size) +list(virut_valid_size)
all_flows = all_flows + list(shifu_valid_size) + list(cridex_valid_size)+ list(tinba_valid_size)
all_flows = all_flows + list(miuref_valid_size) + list(zeus_valid_size) + list(htbot_valid_size)+ list(neris_valid_size)
mal_valid_size_X = list(all_flows)
print(len(mal_valid_size_X))

17747


In [0]:
## valid packet time data ## 
all_flows = list(nsis_valid_time)+ list(geodo_valid_time) +list(virut_valid_time)
all_flows = all_flows + list(shifu_valid_time) + list(cridex_valid_time)+ list(tinba_valid_time)
all_flows = all_flows + list(miuref_valid_time) + list(zeus_valid_time) + list(htbot_valid_time)+ list(neris_valid_time)
mal_valid_time_X = list(all_flows)
print(len(mal_valid_time_X))

17747


In [0]:
## valid packet dir data ## 
all_flows = list(nsis_valid_dir)+ list(geodo_valid_dir) +list(virut_valid_dir)
all_flows = all_flows + list(shifu_valid_dir) + list(cridex_valid_dir)+ list(tinba_valid_dir)
all_flows = all_flows + list(miuref_valid_dir) + list(zeus_valid_dir) + list(htbot_valid_dir)+ list(neris_valid_dir)
mal_valid_dir_X = list(all_flows)
print(len(mal_valid_dir_X))

17747


In [0]:
## valid labels ##
labels = nsis_label*len(nsis_valid_cont) + geodo_label*len(geodo_valid_cont )+virut_label*len(virut_valid_cont)
labels = labels+shifu_label*len(shifu_valid_cont)+cridex_label*len(cridex_valid_cont)+tinba_label*len(tinba_valid_cont)
labels = labels+miuref_label*len(miuref_valid_cont)+zeus_label*len(zeus_valid_cont)+htbot_label*len(htbot_valid_cont)
mal_valid_labels = labels+neris_label*len(neris_valid_cont)
print(len(mal_valid_labels))

17747


In [0]:
### Dump flows in appropriate pace 
import numpy as np
with open('./embedding/CNN_FULL/USTC_prn_benign_meta_train', 'wb') as fp:
  pickle.dump(benign_train_cont,fp)
  fp.close()

with open('./embedding/CNN_FULL/USTC_prn_mal_meta_train', 'wb') as fp:
  pickle.dump(np.array(mal_train_meta_X).astype(float),fp)
  fp.close()
  
with open('./embedding/CNN_FULL/USTC_prn_mal_meta_valid', 'wb') as fp:
  pickle.dump(np.array(mal_valid_meta_X).astype(float),fp)
  fp.close()
  
with open('./embedding/CNN_FULL/USTC_prn_mal_size_train', 'wb') as fp:
  pickle.dump(np.array(mal_train_size_X).astype(float),fp)
  fp.close()
  
with open('./embedding/CNN_FULL/USTC_prn_mal_size_valid', 'wb') as fp:
  pickle.dump(np.array(mal_valid_size_X).astype(float),fp)
  fp.close()
  
with open('./embedding/CNN_FULL/USTC_prn_mal_time_train', 'wb') as fp:
  pickle.dump(np.array(mal_train_time_X).astype(float),fp)
  fp.close()
  
with open('./embedding/CNN_FULL/USTC_prn_mal_time_valid', 'wb') as fp:
  pickle.dump(np.array(mal_valid_time_X).astype(float),fp)
  fp.close()
  
with open('./embedding/CNN_FULL/USTC_prn_mal_dir_train', 'wb') as fp:
  pickle.dump(np.array(mal_train_dir_X).astype(float),fp)
  fp.close()
  
with open('./embedding/CNN_FULL/USTC_prn_mal_dir_valid', 'wb') as fp:
  pickle.dump(np.array(mal_valid_dir_X).astype(float),fp)
  fp.close()
  
with open('./embedding/CNN_FULL/USTC_prn_mal_meta_train_labels', 'wb') as fp:
  pickle.dump(mal_train_labels,fp)
  fp.close()
  
with open('./embedding/CNN_FULL/USTC_prn_mal_meta_valid_labels', 'wb') as fp:
  pickle.dump(mal_valid_labels,fp)
  fp.close()
  
with open('./embedding/CNN_FULL/USTC_prn_benign_meta_valid', 'wb') as fp:
  pickle.dump(benign_valid_cont,fp)
  fp.close()
  
with open('./embedding/CNN_FULL/USTC_prn_benign_size_train', 'wb') as fp:
  pickle.dump(benign_train_size,fp)
  fp.close()
  
with open('./embedding/CNN_FULL/USTC_prn_benign_size_valid', 'wb') as fp:
  pickle.dump(benign_valid_size,fp)
  fp.close()
  
with open('./embedding/CNN_FULL/USTC_prn_benign_dir_train', 'wb') as fp:
  pickle.dump(benign_train_dir,fp)
  fp.close()
  
with open('./embedding/CNN_FULL/USTC_prn_benign_dir_valid', 'wb') as fp:
  pickle.dump(benign_valid_dir,fp)
  fp.close()

with open('./embedding/CNN_FULL/USTC_prn_benign_time_train', 'wb') as fp:
  pickle.dump(benign_train_time,fp)
  fp.close()
  
with open('./embedding/CNN_FULL/USTC_prn_benign_time_valid', 'wb') as fp:
  pickle.dump(benign_valid_time,fp)
  fp.close()

In [0]:
print(NUM_NSIS)
print(NUM_GEODO)
print(NUM_VIRUT)
print(NUM_SHIFU)
print(NUM_CRIDEX)
print(NUM_TINBA)
print(NUM_MIUREF)
print(NUM_ZEUS)
print(NUM_HTBOT)
print(NUM_NERIS)

5835
40818
32029
10324
16174
8968
13534
10779
6428
32541
