In [1]:
import pandas as pd
import numpy as cp
import math
import tensorflow as tf
from tensorflow.contrib import learn

In [2]:
def input_fn(x, y, batch_size):
    """An input function for training"""
    # Convert the inputs to a Dataset.
    dataset = tf.data.Dataset.from_tensor_slices((dict(x), y))

    # Shuffle, repeat, and batch the examples.
    dataset = dataset.shuffle(1000).repeat().batch(batch_size)

    # Return the dataset.
    return dataset


def input_fn_test(x, y, batch_size):
    """An input function for evaluation or prediction"""
    x=dict(x)
    if y is None:
        # No labels, use only features.
        inputs = x
    else:
        inputs = (x, y)

    # Convert the inputs to a Dataset.
    dataset = tf.data.Dataset.from_tensor_slices(inputs)

    # Batch the examples
    assert batch_size is not None, "batch_size must not be None"
    dataset = dataset.batch(batch_size)

    # Return the dataset.
    return dataset

In [3]:
column_names = ["duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent", "hot", "num_failed_logins", "logged_in",
                "num_compromised", "root_shell", "su_attempted", "num_root", "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds", "is_host_login",
                "is_guest_login", "count", "srv_count", "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
                "dst_host_count", "dst_host_srv_count", "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
                "dst_host_serror_rate", "dst_host_srv_serror_rate", "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "attack"]

dataset = pd.read_csv('../../dataset/kddcup.data_10_percent_corrected', header=None, names=column_names)

#Mapeio as string para número
protocol_type = {"icmp": 0, "tcp": 1, "udp": 2}
service = {"auth": 0, "bgp": 1, "courier": 2, "csnet_ns": 3, "ctf": 4, "daytime": 5, "discard": 6, "domain": 7, "domain_u": 8, "echo": 9, "eco_i": 10, "ecr_i": 11, "efs": 12,
           "exec": 13, "finger": 14, "ftp": 15, "ftp_data": 16, "gopher": 17, "hostnames": 18, "http": 19, "http_443": 20, "imap4": 21, "IRC": 22, "iso_tsap": 23, "klogin": 24,
           "kshell": 25, "ldap": 26, "link": 27, "login": 28, "mtp": 29, "name": 30, "netbios_dgm": 31, "netbios_ns": 32, "netbios_ssn": 33, "netstat": 34, "nnsp": 35, "nntp": 36,
           "ntp_u": 37, "other": 38, "pm_dump": 39, "pop_2": 40, "pop_3": 41, "printer": 42, "private": 43, "red_i": 44, "remote_job": 45, "rje": 46, "shell": 47, "smtp": 48,
           "sql_net": 49, "ssh": 50, "sunrpc": 51, "supdup": 52, "systat": 53, "telnet": 54, "tftp_u": 55, "time": 56, "tim_i": 57, "urh_i": 58, "urp_i": 59, "uucp": 60,
           "uucp_path": 61, "vmnet": 62, "whois": 63, "X11": 64, "Z39_50": 65}
flag = {"OTH": 0, "REJ": 1, "RSTO": 2, "RSTOS0": 3, "RSTR": 4, "S0": 5, "S1": 6, "S2": 7, "S3": 8, "SF": 9, "SH": 10}
attack = {"back.": 0, "buffer_overflow.": 1, "ftp_write.": 2, "guess_passwd.": 3, "imap.": 4, "ipsweep.": 5, "land.": 6, "loadmodule.": 7, "multihop.": 8, "neptune.": 9,
          "nmap.": 10, "normal.": 11, "perl.": 12, "phf.": 13, "pod.": 14, "portsweep.": 15, "rootkit.": 16, "satan.": 17, "smurf.": 18, "spy.": 19, "teardrop.": 20,
          "warezclient.": 21, "warezmaster.": 22}

#Troca os valores de string para número
dataset = dataset.replace({"protocol_type": protocol_type, "service": service, "flag": flag, "attack": attack})

#Coloca todas em columas de feature em x e a de resultado em y
x, y = dataset, dataset.pop("attack")

In [4]:
n_train = math.ceil(len(x.index)*0.9)
n_test = len(x.index) - n_train

x_train = x[:n_train]
y_train = y[:n_train]

x_test = x[n_train:n_train+n_test]
y_test = y[n_train:n_train+n_test]

#x_train = tf.keras.utils.normalize(x_train, axis=1)

NUM_STEPS = 10000
BATCH_SIZE = 100

feature_columns = []
for feature in x_train.keys():
    feature_columns.append(tf.feature_column.numeric_column(key=feature))

shallow = tf.estimator.DNNClassifier(
    feature_columns=feature_columns,
    hidden_units=[1000], #1 hidden layer com 1000 hidden units
    n_classes=23) #Pode distinguir até 23 valores diferentes

# Treino
print("Treinando")
shallow.train(
    input_fn=lambda:input_fn(x_train, y_train, BATCH_SIZE),
    steps=NUM_STEPS)

# Teste
print("Testando")
eval_result = shallow.evaluate(
    input_fn=lambda:input_fn_test(x_test, y_test, BATCH_SIZE))

print(f'Acc: {eval_result["accuracy"]}')
    
# dnn = learn.DNNClassifier(
#     feature_columns=feature_columns,
#     hidden_units=[1000],
#     n_classes=41,
#     optimizer=tf.train.ProximalAdagradOptimizer(learning_rate=0.2),
# )

# dnn.fit(x=x_train.values,y=y_train.values,steps=NUM_STEPS,batch_size = BATCH_SIZE)

# test_accuracy = dnn.evaluate(x=x_test.values,y=y_test.values,steps=NUM_STEPS)["accuracy"]
# print(f'test accuracy: {test_accuracy}')


#print(cp.asarray(included_features))
# feature_columns = learn.infer_real_valued_columns_from_input(cp.asarray(indexed_dataSet))

# dnn = learn.DNNClassifier(
#     feature_columns=feature_columns,
#     hidden_units=[100],
#     n_classes=len(included_features),
#     optimizer=tf.train.ProximalAdagradOptimizer(learning_rate=0.2),
# )

# dnn.fit(x=x_train.values,y=y_train.values,steps=NUM_STEPS,batch_size = BATCH_SIZE)

# test_accuracy = dnn.evaluate(x=x_test.values,y=y_test.values,steps=100)["accuracy"]
# print(f'test accuracy: {test_accuracy}')

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\Everton\\AppData\\Local\\Temp\\tmppjstaa5z', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001D2BA7D59B0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Treinando
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoin

INFO:tensorflow:loss = 0.0, step = 7501 (0.406 sec)
INFO:tensorflow:global_step/sec: 256.013
INFO:tensorflow:loss = 0.0, step = 7601 (0.391 sec)
INFO:tensorflow:global_step/sec: 246.181
INFO:tensorflow:loss = 0.0, step = 7701 (0.406 sec)
INFO:tensorflow:global_step/sec: 246.15
INFO:tensorflow:loss = 0.0, step = 7801 (0.422 sec)
INFO:tensorflow:global_step/sec: 237.071
INFO:tensorflow:loss = 2486.3325, step = 7901 (0.406 sec)
INFO:tensorflow:global_step/sec: 246.173
INFO:tensorflow:loss = 0.0, step = 8001 (0.406 sec)
INFO:tensorflow:global_step/sec: 246.166
INFO:tensorflow:loss = 0.0, step = 8101 (0.406 sec)
INFO:tensorflow:global_step/sec: 246.181
INFO:tensorflow:loss = 0.0, step = 8201 (0.406 sec)
INFO:tensorflow:global_step/sec: 246.18
INFO:tensorflow:loss = 0.0, step = 8301 (0.406 sec)
INFO:tensorflow:global_step/sec: 246.167
INFO:tensorflow:loss = 0.0, step = 8401 (0.406 sec)
INFO:tensorflow:global_step/sec: 246.173
INFO:tensorflow:loss = 0.0, step = 8501 (0.406 sec)
INFO:tensorflo