In [1]:
import pandas as pd
import numpy as np
import math
import tensorflow as tf
from tensorflow.contrib import learn
import json
import jupyternotify
ip = get_ipython()
ip.register_magics(jupyternotify.JupyterNotifyMagics)

<IPython.core.display.Javascript object>

In [2]:
with open('./hidden_units.json') as file:
    hidden_units = json.load(file)

CORRCOEF_THRESHOLD = 0.5
NUM_STEPS = 10000
BATCH_SIZE = 1024

# Número de hidden layers: 1, 2, 5, 10, 20, 30
hidden_units = hidden_units[31] #Número de hidden units por hidden layer, cada posição do vetor é um layer diferente
activation_fn = tf.nn.leaky_relu

In [3]:
def input_fn(x, y, batch_size):
    dataset = tf.data.Dataset.from_tensor_slices((dict(x), y))
    dataset = dataset.shuffle(1000).repeat().batch(batch_size)
    return dataset

def input_fn_test(x, y, batch_size):
    dataset = tf.data.Dataset.from_tensor_slices((dict(x), y))
    dataset = dataset.batch(batch_size)
    return dataset

def classifier_fn(feature_columns, hidden_units, activation_fn, BATCH_SIZE, NUM_STEPS):  
    classifier = tf.estimator.DNNClassifier(
        feature_columns=feature_columns,
        hidden_units=hidden_units, #1 hidden layer com 1000 hidden units
        n_classes=23, #Pode distinguir até 23 valores diferentes
        activation_fn=activation_fn,
        batch_norm=False)

    # Treino
    classifier.train(
        input_fn=lambda:input_fn(x_train, y_train, BATCH_SIZE),
        steps=NUM_STEPS)

    # Teste
    eval_result = classifier.evaluate(
        input_fn=lambda:input_fn_test(x_test, y_test, BATCH_SIZE))

    #Abre arquivo e adiciona no fim
    file = open("./output.csv", "a")
    file.write(f'"{len(hidden_units)}","{hidden_units}","{eval_result["accuracy"]}","{eval_result["average_loss"]}","{eval_result["loss"]}","leaky_relu"\n')
    file.close()

In [4]:
column_names = ["duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent", "hot", "num_failed_logins", "logged_in",
                "num_compromised", "root_shell", "su_attempted", "num_root", "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds", "is_host_login",
                "is_guest_login", "count", "srv_count", "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
                "dst_host_count", "dst_host_srv_count", "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
                "dst_host_serror_rate", "dst_host_srv_serror_rate", "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "attack"]

dataset = pd.read_csv('../../dataset/kddcup.data_10_percent_corrected', header=None, names=column_names)

protocol_type = {"icmp": 0, "tcp": 1, "udp": 2}
service = {"auth": 0, "bgp": 1, "courier": 2, "csnet_ns": 3, "ctf": 4, "daytime": 5, "discard": 6, "domain": 7, "domain_u": 8, "echo": 9, "eco_i": 10, "ecr_i": 11, "efs": 12,
           "exec": 13, "finger": 14, "ftp": 15, "ftp_data": 16, "gopher": 17, "hostnames": 18, "http": 19, "http_443": 20, "imap4": 21, "IRC": 22, "iso_tsap": 23, "klogin": 24,
           "kshell": 25, "ldap": 26, "link": 27, "login": 28, "mtp": 29, "name": 30, "netbios_dgm": 31, "netbios_ns": 32, "netbios_ssn": 33, "netstat": 34, "nnsp": 35, "nntp": 36,
           "ntp_u": 37, "other": 38, "pm_dump": 39, "pop_2": 40, "pop_3": 41, "printer": 42, "private": 43, "red_i": 44, "remote_job": 45, "rje": 46, "shell": 47, "smtp": 48,
           "sql_net": 49, "ssh": 50, "sunrpc": 51, "supdup": 52, "systat": 53, "telnet": 54, "tftp_u": 55, "time": 56, "tim_i": 57, "urh_i": 58, "urp_i": 59, "uucp": 60,
           "uucp_path": 61, "vmnet": 62, "whois": 63, "X11": 64, "Z39_50": 65}
flag = {"OTH": 0, "REJ": 1, "RSTO": 2, "RSTOS0": 3, "RSTR": 4, "S0": 5, "S1": 6, "S2": 7, "S3": 8, "SF": 9, "SH": 10}
attack = {"back.": 0, "buffer_overflow.": 1, "ftp_write.": 2, "guess_passwd.": 3, "imap.": 4, "ipsweep.": 5, "land.": 6, "loadmodule.": 7, "multihop.": 8, "neptune.": 9,
          "nmap.": 10, "normal.": 11, "perl.": 12, "phf.": 13, "pod.": 14, "portsweep.": 15, "rootkit.": 16, "satan.": 17, "smurf.": 18, "spy.": 19, "teardrop.": 20,
          "warezclient.": 21, "warezmaster.": 22}

#Troca os valores de string para número
dataset = dataset.replace({"protocol_type": protocol_type, "service": service, "flag": flag, "attack": attack})

#Coloca todas em columas de feature em x e a de resultado em y
x, y = dataset, dataset.pop("attack")

In [5]:
included_features = column_names
included_features.remove("attack")


i = 0
j = 1
while i < len(included_features):
    col1 = dataset[column_names[i]]

    while j < len(included_features):
        col2 = dataset[column_names[j]]
        corrcoef = np.corrcoef(col1, col2)

        if abs(corrcoef[0][1]) > CORRCOEF_THRESHOLD:
            included_features.remove(included_features[j])
        else:
            j = j + 1

    i = i + 1
    j = i + 1

for feature in included_features: #Por algum motivo o pop dentro do while não funcionava
    x.pop(feature)

  c /= stddev[:, None]
  c /= stddev[None, :]


In [6]:
tf.logging.set_verbosity(tf.logging.WARN)

n_train = math.ceil(len(x.index)*0.9)
n_test = len(x.index) - n_train

x_train = x[:n_train]
y_train = y[:n_train]

x_test = x[n_train:n_train+n_test]
y_test = y[n_train:n_train+n_test]

feature_columns = []
for feature in x_train.keys():
    feature_columns.append(tf.feature_column.numeric_column(key=feature))

classifier_fn(feature_columns, hidden_units, activation_fn, BATCH_SIZE, NUM_STEPS)

%notify -m "Done!"



<IPython.core.display.Javascript object>