In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder


import boto3
import sagemaker

In [2]:
# session and role
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role

# create an S3 bucket
bucket = sagemaker_session.default_bucket()# session and role

In [3]:
# Read in the data

kdd = pd.read_csv("kddcup.data_10_percent")

In [4]:
col_names = ["duration","protocol_type","service","flag","src_bytes","dst_bytes","land","wrong_fragment",
             "urgent","hot","num_failed_logins","logged_in","num_compromised",
             "root_shell","su_attempted","num_root","num_file_creations",
             "num_shells","num_access_files","num_outbound_cmds",
             "is_hot_login","is_guest_login","count","srv_count","serror_rate",
             "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
             "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
             "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
             "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
             "dst_host_rerror_rate","dst_host_srv_rerror_rate","target"]

kdd.columns = col_names

In [5]:
kdd.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,target
0,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,217,2032,0,0,0,0,...,59,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.


## Creating and Preparing Training, Validation & Test Set

In [6]:
train_set, test_set = train_test_split(kdd, test_size=.3, random_state=42)
val_set, test_set = train_test_split(test_set, test_size=.5, random_state=42)

In [7]:
def intrude_attack(train_set, intrusion_rate= 0.01):
    
    train_normal = train_set.loc[train_set["target"] == 'normal.']
    train_attack = train_set.loc[train_set["target"] != 'normal.']

    n_attacks = round((len(train_normal)/100)*intrusion_rate)
    
    train_attack = train_attack.sample(n_attacks)
    
    train_set_noise = pd.concat([train_set, train_attack], axis = 0)
    
    return train_set_noise.sample(frac=1).drop('target', axis=1)

In [8]:
intrude_attack_rate = 0.01
X_train = intrude_attack(train_set)

y_val = val_set['target'].copy()
y_val = y_val.apply(lambda x: 0 if x == 'normal.' else 1) #0: normal, 1: anomaly, RCF assumption
X_val = val_set.drop('target', axis=1)

y_test = test_set['target'].copy()
y_test = y_test.apply(lambda x: 0 if x == 'normal.' else 1) #0: normal, 1: anomaly, RCF assumption
X_test = test_set.drop('target', axis=1)

In [9]:
print("X_train dimension: " + str(X_train.shape))
print("X_val dimension: " + str(X_val.shape))
print("y_val dimension: " + str(y_val.shape))
print("X_test dimension: " + str(X_test.shape))
print("y_test dimension: " + str(y_test.shape))

overall_length= X_train.shape[0]+X_test.shape[0]+X_val.shape[0]

print("Train: {} %, Val: {} %, Test: {} %".format(round(X_train.shape[0]/overall_length*100), round(X_val.shape[0]/overall_length*100), round(X_test.shape[0]/overall_length*100)))

X_train dimension: (345821, 41)
X_val dimension: (74103, 41)
y_val dimension: (74103,)
X_test dimension: (74103, 41)
y_test dimension: (74103,)
Train: 70 %, Val: 15 %, Test: 15 %


### Handling categorical data

In [10]:
# Take a view on our categorical data
X_train_cat = X_train.select_dtypes(include=object)

for i in range(0,len(X_train_cat.columns)):
    print(X_train_cat.columns[i])
    print("-"*5)
    print(X_train_cat[X_train_cat.columns[i]].drop_duplicates())
    print("-"*20)

protocol_type
-----
6319       tcp
173928    icmp
148334     udp
Name: protocol_type, dtype: object
--------------------
service
-----
6319         http
91786       other
60119     private
173928      ecr_i
50873        smtp
           ...   
461550     Z39_50
70504      systat
482842        X11
142415      tim_i
140583     tftp_u
Name: service, Length: 64, dtype: object
--------------------
flag
-----
6319          SF
91786        REJ
60119         S0
144729      RSTR
478539      RSTO
136506        S1
43142         SH
91621         S2
370279       OTH
28187         S3
42069     RSTOS0
Name: flag, dtype: object
--------------------


In [11]:
cat_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False) 

cat_features = X_train.select_dtypes(include=object).columns

j=0
for i in cat_features:
    
    # one-hot encoding
    cat_encoder = cat_encoder.fit(X_train[i].values.reshape(-1,1))
    X_train_cat_i= cat_encoder.transform(X_train[i].values.reshape(-1,1))
    X_val_cat_i = cat_encoder.transform(X_val[i].values.reshape(-1,1))
    X_test_cat_i = cat_encoder.transform(X_test[i].values.reshape(-1,1))
    
    # transforming to DataFrame
    X_train_cat_i = pd.DataFrame(X_train_cat_i, columns = cat_encoder.categories_, index = X_train.index)
    X_val_cat_i = pd.DataFrame(X_val_cat_i, columns = cat_encoder.categories_, index = X_val.index)
    X_test_cat_i = pd.DataFrame(X_test_cat_i, columns = cat_encoder.categories_, index = X_test.index)
    
    if j==0:
        X_train_cat = X_train_cat_i
        X_val_cat = X_val_cat_i
        X_test_cat = X_test_cat_i
    else:
        X_train_cat = pd.concat([X_train_cat, X_train_cat_i], axis = 1)
        X_val_cat = pd.concat([X_val_cat, X_val_cat_i], axis = 1)
        X_test_cat = pd.concat([X_test_cat, X_test_cat_i], axis = 1)
    j=j+1
    
X_train_cat_i = None
X_val_cat_i = None
X_test_cat_i = None

### Normalize data

In [12]:
# Select numerical data, which need to be normalized
X_train_num = X_train.select_dtypes(exclude=object)
X_val_num = X_val.select_dtypes(exclude=object)
X_test_num = X_test.select_dtypes(exclude=object)

scaler = StandardScaler()

X_train_norm = scaler.fit_transform(X_train_num)
X_train_norm = pd.DataFrame(X_train_norm, columns = X_train_num.columns, index = X_train_num.index)

X_val_norm = scaler.transform(X_val_num)
X_val_norm = pd.DataFrame(X_val_norm, columns = X_val_num.columns, index = X_val_num.index)

X_test_norm = scaler.transform(X_test_num)
X_test_norm = pd.DataFrame(X_test_norm, columns = X_test_num.columns, index = X_test_num.index)

X_train_num = None
X_val_num = None
X_test_num = None


## Creating Final Data Files

Input Random Cut Forest Algorithm: <br>
test data, text/csv format, first column represents the anomaly label: "1" = anomalous and "0" = normal

In [13]:
  
# Combining all the transformed data
X_train = None
X_val = None
X_test = None

X_train = pd.concat([X_train_norm, X_train_cat], axis=1)
X_val = pd.concat([X_val_norm, X_val_cat], axis=1)
X_test = pd.concat([X_test_norm, X_test_cat], axis=1)

def make_csv(X, y, filename, data_dir):
    '''Merges features and labels and converts them into one csv file with labels in the first column.
       :param x: Data features
       :param y: Data labels
       :param file_name: Name of csv file, ex. 'train.csv'
       :param data_dir: The directory where files will be saved
       '''
    # make data dir, if it does not exist
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    
    if y is not None:
        pd.DataFrame(y).join(pd.DataFrame(X)).to_csv(os.path.join(data_dir, filename), header=False, index=False)
    else:
        X.to_csv(os.path.join(data_dir, filename), header=False, index=False)

    # nothing is returned, but a print statement indicates that the function has run
    print('Path created: '+str(data_dir)+'/'+str(filename))

In [14]:
make_csv(X_train, None, 'train.csv', 'data')

Path created: data/train.csv


In [15]:
# should be the name of directory you created to save your features data
data_dir = 'data'

# set prefix, a descriptive name for a directory  
prefix = 'anomaly'

# upload all data to S3
train_location = sagemaker_session.upload_data(os.path.join(data_dir, 'train.csv'), key_prefix=prefix)

In [16]:
os.remove(os.path.join('data', 'train.csv'))

In [17]:
make_csv(X_val, y_val, 'val.csv', 'data')

Path created: data/val.csv


In [18]:
val_location = sagemaker_session.upload_data(os.path.join(data_dir, 'val.csv'), key_prefix=prefix)

In [19]:
os.remove(os.path.join('data', 'val.csv'))

In [20]:
make_csv(X_test, y_test, 'test.csv', 'data')

Path created: data/test.csv


In [21]:
test_location = sagemaker_session.upload_data(os.path.join(data_dir, 'test.csv'), key_prefix=prefix)

In [22]:
os.remove(os.path.join('data', 'test.csv'))

In [23]:
X_train.to_pickle('X_train.pkl')
X_val.to_pickle('X_val.pkl')
y_val.to_pickle('y_val.pkl')
X_test.to_pickle('X_test.pkl')
y_test.to_pickle('y_test.pkl')

In [24]:
print(test_location)

s3://sagemaker-us-east-1-517714493426/anomaly/test.csv
