In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import sklearn.preprocessing as skpp
import sklearn.feature_selection as skfs
import sklearn.datasets as skds
import sklearn.metrics as skm

In [None]:
# Download the SA subset of KDD Cup 99.
# `subset` is passed by keyword because current scikit-learn releases reject
# positional arguments here, and `random_state` is fixed so the random draw of
# abnormal samples that builds the SA subset is the same on every run.
kddsa = skds.fetch_kddcup99(subset="SA", random_state=0)

In [None]:
# Show the textual description bundled with the fetched dataset object.
print(kddsa.DESCR)

.. _kddcup99_dataset:

Kddcup 99 dataset
-----------------

The KDD Cup '99 dataset was created by processing the tcpdump portions
of the 1998 DARPA Intrusion Detection System (IDS) Evaluation dataset,
created by MIT Lincoln Lab [1]. The artificial data (described on the `dataset's
homepage <https://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html>`_) was
generated using a closed network and hand-injected attacks to produce a
large number of different types of attack with normal activity in the
background. As the initial goal was to produce a large training set for
supervised learning algorithms, there is a large proportion (80.1%) of
abnormal data which is unrealistic in real world, and inappropriate for
unsupervised anomaly detection which aims at detecting 'abnormal' data, ie

1) qualitatively different from normal data

2) in large minority among the observations.

We thus transform the KDD Data set into two different data sets: SA and SF.

-SA is obtained by simply selecting all t

In [None]:
# Shape of the label array.
kddsa.target.shape

(100655,)

In [None]:
# Distinct labels present in the target array.
set(kddsa.target)

{b'back.',
 b'ipsweep.',
 b'neptune.',
 b'nmap.',
 b'normal.',
 b'pod.',
 b'portsweep.',
 b'rootkit.',
 b'satan.',
 b'smurf.',
 b'teardrop.',
 b'warezclient.'}

In [None]:
#descargamos los features
!wget http://kdd.ics.uci.edu/databases/kddcup99/kddcup.names

--2021-06-10 01:16:50--  http://kdd.ics.uci.edu/databases/kddcup99/kddcup.names
Resolving kdd.ics.uci.edu (kdd.ics.uci.edu)... 128.195.1.86
Connecting to kdd.ics.uci.edu (kdd.ics.uci.edu)|128.195.1.86|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1307 (1.3K)
Saving to: ‘kddcup.names.1’


2021-06-10 01:16:51 (170 MB/s) - ‘kddcup.names.1’ saved [1307/1307]



In [None]:
# Parse kddcup.names: the first line is the comma-separated list of classes,
# every following non-empty line is "<feature name>: <type>.".
features = []
with open("kddcup.names", "r") as f:
  linea = f.readline()
  print(linea)
  # Strip the newline and the trailing "." so the last class name is clean
  # (otherwise it ends up as 'warezmaster.\n').
  clases = linea.strip().rstrip(".").split(",")
  # Iterate over the remaining lines; blank lines are skipped rather than
  # ending the loop early.
  for linea in f:
    linea = linea.strip()    # drop surrounding whitespace / line breaks
    if linea:
      print(linea)
      # Keep only the name that precedes the ":" separator.
      features.append(linea.split(":")[0])
    

In [None]:
# Show the parsed class names and feature names, one list per line.
print(clases, features, sep="\n")

['back', 'buffer_overflow', 'ftp_write', 'guess_passwd', 'imap', 'ipsweep', 'land', 'loadmodule', 'multihop', 'neptune', 'nmap', 'normal', 'perl', 'phf', 'pod', 'portsweep', 'rootkit', 'satan', 'smurf', 'spy', 'teardrop', 'warezclient', 'warezmaster.\n']
['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate']


In [None]:
# Names of the columns treated as categorical (symbolic or binary flags).
cfeatures = [
    "protocol_type",
    "service",
    "flag",
    "land",
    "logged_in",
    "is_host_login",
    "is_guest_login",
]
cfeatures

['protocol_type',
 'service',
 'flag',
 'land',
 'logged_in',
 'is_host_login',
 'is_guest_login']

In [None]:
# Wrap the raw feature matrix in a DataFrame whose columns carry the names
# parsed from kddcup.names, then display it.
data = pd.DataFrame(kddsa.data, columns=features)
data

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,b'tcp',b'http',b'SF',181,5450,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0,0,0,0,1,0,0,9,9,1,0,0.11,0,0,0,0,0
1,0,b'tcp',b'http',b'SF',239,486,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0,0,0,0,1,0,0,19,19,1,0,0.05,0,0,0,0,0
2,0,b'tcp',b'http',b'SF',235,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0,0,0,0,1,0,0,29,29,1,0,0.03,0,0,0,0,0
3,0,b'tcp',b'http',b'SF',219,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0,0,0,0,1,0,0,39,39,1,0,0.03,0,0,0,0,0
4,0,b'tcp',b'http',b'SF',217,2032,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0,0,0,0,1,0,0,49,49,1,0,0.02,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100650,0,b'tcp',b'private',b'S0',0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,300,20,1,1,0,0,0.07,0.06,0,255,20,0.08,0.07,0,0,1,1,0,0
100651,0,b'tcp',b'private',b'S0',0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,264,19,1,1,0,0,0.07,0.05,0,255,19,0.07,0.07,0,0,1,1,0,0
100652,0,b'icmp',b'ecr_i',b'SF',1032,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,511,511,0,0,0,0,1,0,0,255,255,1,0,1,0,0,0,0,0
100653,0,b'tcp',b'private',b'S0',0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,149,2,1,1,0,0,0.01,0.07,0,255,9,0.04,0.07,0,0,1,1,0,0


In [None]:
# Print the distinct values of each categorical column.
for columna in cfeatures:
  valores = data[columna].unique()
  print(valores)

[b'tcp' b'udp' b'icmp']
[b'http' b'smtp' b'finger' b'domain_u' b'auth' b'telnet' b'ftp' b'eco_i'
 b'ntp_u' b'ecr_i' b'other' b'pop_3' b'ftp_data' b'ssh' b'domain'
 b'private' b'time' b'shell' b'IRC' b'urh_i' b'X11' b'urp_i' b'tftp_u'
 b'tim_i' b'red_i' b'uucp_path' b'csnet_ns' b'Z39_50' b'discard' b'vmnet'
 b'netbios_ns' b'remote_job' b'systat' b'gopher' b'iso_tsap' b'exec'
 b'nntp' b'daytime' b'hostnames' b'mtp' b'kshell' b'supdup' b'imap4'
 b'efs' b'http_443' b'netbios_ssn' b'courier' b'rje' b'sql_net' b'login']
[b'SF' b'S1' b'REJ' b'S2' b'S0' b'RSTO' b'S3' b'OTH' b'RSTR' b'SH']
[0 1]
[1 0]
[0]
[0 1]


In [None]:
# Check that there are no nulls: total count of null cells in the frame.
data.isnull().sum().sum()

0

In [None]:
# Encoding step: create the one-hot encoder with default settings.
ohe = skpp.OneHotEncoder()

In [None]:
# Fit the encoder on the categorical columns, then convert its output to a
# dense array and show the resulting shape.
ec_codificado = ohe.fit_transform(data[cfeatures])
ec = ec_codificado.toarray()
ec.shape

(100655, 70)

In [None]:
# Confirm the container type of the encoded matrix.
type(ec)

numpy.ndarray

In [None]:
# Categories learned by the encoder, one array per encoded column.
ohe.categories_

[array([b'icmp', b'tcp', b'udp'], dtype=object),
 array([b'IRC', b'X11', b'Z39_50', b'auth', b'courier', b'csnet_ns',
        b'daytime', b'discard', b'domain', b'domain_u', b'eco_i', b'ecr_i',
        b'efs', b'exec', b'finger', b'ftp', b'ftp_data', b'gopher',
        b'hostnames', b'http', b'http_443', b'imap4', b'iso_tsap',
        b'kshell', b'login', b'mtp', b'netbios_ns', b'netbios_ssn',
        b'nntp', b'ntp_u', b'other', b'pop_3', b'private', b'red_i',
        b'remote_job', b'rje', b'shell', b'smtp', b'sql_net', b'ssh',
        b'supdup', b'systat', b'telnet', b'tftp_u', b'tim_i', b'time',
        b'urh_i', b'urp_i', b'uucp_path', b'vmnet'], dtype=object),
 array([b'OTH', b'REJ', b'RSTO', b'RSTR', b'S0', b'S1', b'S2', b'S3',
        b'SF', b'SH'], dtype=object),
 array([0, 1], dtype=object),
 array([0, 1], dtype=object),
 array([0], dtype=object),
 array([0, 1], dtype=object)]

In [None]:
# Build one column name per encoded category: "<feature><category index>",
# following the order of cfeatures and of the encoder's categories_ list.
efeatures = [
    f"{nombre}{k}"
    for nombre, categorias in zip(cfeatures, ohe.categories_)
    for k in range(len(categorias))
]
efeatures

['protocol_type0',
 'protocol_type1',
 'protocol_type2',
 'service0',
 'service1',
 'service2',
 'service3',
 'service4',
 'service5',
 'service6',
 'service7',
 'service8',
 'service9',
 'service10',
 'service11',
 'service12',
 'service13',
 'service14',
 'service15',
 'service16',
 'service17',
 'service18',
 'service19',
 'service20',
 'service21',
 'service22',
 'service23',
 'service24',
 'service25',
 'service26',
 'service27',
 'service28',
 'service29',
 'service30',
 'service31',
 'service32',
 'service33',
 'service34',
 'service35',
 'service36',
 'service37',
 'service38',
 'service39',
 'service40',
 'service41',
 'service42',
 'service43',
 'service44',
 'service45',
 'service46',
 'service47',
 'service48',
 'service49',
 'flag0',
 'flag1',
 'flag2',
 'flag3',
 'flag4',
 'flag5',
 'flag6',
 'flag7',
 'flag8',
 'flag9',
 'land0',
 'land1',
 'logged_in0',
 'logged_in1',
 'is_host_login0',
 'is_guest_login0',
 'is_guest_login1']

In [None]:
# Quick look at the encoded matrix (numpy abbreviates the output).
print(ec)

[[0. 1. 0. ... 1. 1. 0.]
 [0. 1. 0. ... 1. 1. 0.]
 [0. 1. 0. ... 1. 1. 0.]
 ...
 [1. 0. 0. ... 1. 1. 0.]
 [0. 1. 0. ... 1. 1. 0.]
 [1. 0. 0. ... 1. 1. 0.]]


In [None]:
# Add the encoded features to the DataFrame as new columns, then display them.
data[efeatures] = ec
data[efeatures]

Unnamed: 0,protocol_type0,protocol_type1,protocol_type2,service0,service1,service2,service3,service4,service5,service6,service7,service8,service9,service10,service11,service12,service13,service14,service15,service16,service17,service18,service19,service20,service21,service22,service23,service24,service25,service26,service27,service28,service29,service30,service31,service32,service33,service34,service35,service36,service37,service38,service39,service40,service41,service42,service43,service44,service45,service46,service47,service48,service49,flag0,flag1,flag2,flag3,flag4,flag5,flag6,flag7,flag8,flag9,land0,land1,logged_in0,logged_in1,is_host_login0,is_guest_login0,is_guest_login1
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100650,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
100651,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
100652,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
100653,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0


In [None]:
# Remove the original categorical columns now that their encoded versions exist.
data = data.drop(columns=cfeatures)

In [None]:
# Preview the first rows of the transformed frame.
data.head()

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,protocol_type0,protocol_type1,protocol_type2,service0,service1,service2,...,service27,service28,service29,service30,service31,service32,service33,service34,service35,service36,service37,service38,service39,service40,service41,service42,service43,service44,service45,service46,service47,service48,service49,flag0,flag1,flag2,flag3,flag4,flag5,flag6,flag7,flag8,flag9,land0,land1,logged_in0,logged_in1,is_host_login0,is_guest_login0,is_guest_login1
0,0,181,5450,0,0,0,0,0,0,0,0,0,0,0,0,8,8,0,0,0,0,1,0,0,9,9,1,0,0.11,0,0,0,0,0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
1,0,239,486,0,0,0,0,0,0,0,0,0,0,0,0,8,8,0,0,0,0,1,0,0,19,19,1,0,0.05,0,0,0,0,0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
2,0,235,1337,0,0,0,0,0,0,0,0,0,0,0,0,8,8,0,0,0,0,1,0,0,29,29,1,0,0.03,0,0,0,0,0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
3,0,219,1337,0,0,0,0,0,0,0,0,0,0,0,0,6,6,0,0,0,0,1,0,0,39,39,1,0,0.03,0,0,0,0,0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
4,0,217,2032,0,0,0,0,0,0,0,0,0,0,0,0,6,6,0,0,0,0,1,0,0,49,49,1,0,0.02,0,0,0,0,0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0


In [None]:
# Per-column maxima; shows how different the column scales are before scaling.
data.max()

duration             58329.0
src_bytes          2194619.0
dst_bytes          5134218.0
wrong_fragment           3.0
urgent                   3.0
                     ...    
logged_in0               1.0
logged_in1               1.0
is_host_login0           1.0
is_guest_login0          1.0
is_guest_login1          1.0
Length: 104, dtype: float64

In [None]:
# Learn the per-column min/max from the data, then apply the scaling to it.
escalador = skpp.MinMaxScaler().fit(data)
x = escalador.transform(data)

In [None]:
# Put the scaled matrix back into a DataFrame with the same column names so it
# can be inspected, then display it.
data_escalada = pd.DataFrame(x, columns=data.columns)
data_escalada

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,protocol_type0,protocol_type1,protocol_type2,service0,service1,service2,...,service27,service28,service29,service30,service31,service32,service33,service34,service35,service36,service37,service38,service39,service40,service41,service42,service43,service44,service45,service46,service47,service48,service49,flag0,flag1,flag2,flag3,flag4,flag5,flag6,flag7,flag8,flag9,land0,land1,logged_in0,logged_in1,is_host_login0,is_guest_login0,is_guest_login1
0,0.0,0.000082,0.001062,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015656,0.015656,0.0,0.0,0.0,0.0,1.00,0.00,0.0,0.035294,0.035294,1.00,0.00,0.11,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,0.000109,0.000095,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015656,0.015656,0.0,0.0,0.0,0.0,1.00,0.00,0.0,0.074510,0.074510,1.00,0.00,0.05,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.0,0.000107,0.000260,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015656,0.015656,0.0,0.0,0.0,0.0,1.00,0.00,0.0,0.113725,0.113725,1.00,0.00,0.03,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.0,0.000100,0.000260,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011742,0.011742,0.0,0.0,0.0,0.0,1.00,0.00,0.0,0.152941,0.152941,1.00,0.00,0.03,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.0,0.000099,0.000396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011742,0.011742,0.0,0.0,0.0,0.0,1.00,0.00,0.0,0.192157,0.192157,1.00,0.00,0.02,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100650,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.587084,0.039139,1.0,1.0,0.0,0.0,0.07,0.06,0.0,1.000000,0.078431,0.08,0.07,0.00,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
100651,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.516634,0.037182,1.0,1.0,0.0,0.0,0.07,0.05,0.0,1.000000,0.074510,0.07,0.07,0.00,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
100652,0.0,0.000470,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,1.000000,0.0,0.0,0.0,0.0,1.00,0.00,0.0,1.000000,1.000000,1.00,0.00,1.00,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
100653,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.291585,0.003914,1.0,1.0,0.0,0.0,0.01,0.07,0.0,1.000000,0.035294,0.04,0.07,0.00,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0


In [None]:
# Distinct labels in the target array (needed to decide which one is "normal").
set(kddsa.target)

{b'back.',
 b'ipsweep.',
 b'neptune.',
 b'nmap.',
 b'normal.',
 b'pod.',
 b'portsweep.',
 b'rootkit.',
 b'satan.',
 b'smurf.',
 b'teardrop.',
 b'warezclient.'}

In [None]:
# Abnormal points are labelled -1 and normal ones 1.
# This boolean mask is True where the label is b'normal.'.
kddsa.target == b'normal.'

array([ True,  True,  True, ..., False, False, False])

In [None]:
# Encode the labels as 1 for b'normal.' and -1 for everything else:
# the mask is 0/1, so 1 - 2*mask yields 1 or -1.
es_anomalo = kddsa.target != b'normal.'
y = 1 - 2 * es_anomalo
y

array([ 1,  1,  1, ..., -1, -1, -1])

In [None]:
# Share of samples labelled -1 (anomalous) in the ground truth.
fraccion_anomalos = (y == -1).sum() / len(y)
print("% de anomalos = {:.2%}".format(fraccion_anomalos))

% de anomalos = 3.36%


In [None]:
# Apply VarianceThreshold with its default threshold and report the matrix
# shape before and after the selection.
print(x.shape)
selector = skfs.VarianceThreshold()
x = selector.fit_transform(x)
print(x.shape)

(100655, 104)
(100655, 102)


Detección de outliers

Envoltura elíptica

In [None]:
from sklearn.covariance import EllipticEnvelope
# Elliptic-envelope detector with a fixed seed; contamination is left at the
# library default here.
ee = EllipticEnvelope(random_state=0)

In [None]:
# -1 marks the anomalous samples
# 1 marks the normal samples
# Fit the detector on the whole matrix and label every row in one step.
yp = ee.fit_predict(x)



In [None]:
# Sanity check: the predictions should only span -1 .. 1.
print(f"min={yp.min()}, max={yp.max()}")

min=-1, max=1


In [None]:
# Fraction of samples the detector flagged as anomalous (-1).
(yp == -1).sum() / len(yp)

0.10000496746311659

In [None]:
# Confusion matrix of true labels (y) against elliptic-envelope predictions (yp).
skm.confusion_matrix(y, yp)

array([[  959,  2418],
       [ 9107, 88171]])

In [None]:
# Same confusion matrix as a labelled table: rows are the true classes,
# columns are the predicted classes.
matriz_confusion = skm.confusion_matrix(y, yp)
pd.DataFrame(
    matriz_confusion,
    index=["y_anomalo(pos)", "y_normal(negativo)"],
    columns=["yp_anomalos (pos)", "yp_normales (neg)"],
)

Unnamed: 0,yp_anomalos (pos),yp_normales (neg)
y_anomalo(pos),959,2418
y_normal(negativo),9107,88171


In [None]:
# Precision for the anomalous class by hand: true anomalies among everything
# predicted anomalous. It is computed from the confusion matrix instead of
# numbers typed in from a previous output, so it stays correct when the data
# or the model changes. labels=[-1, 1] pins row/column 0 to the anomalous class.
cm = skm.confusion_matrix(y, yp, labels=[-1, 1])
cm[0, 0] / cm[:, 0].sum()

0.0952712100139082

In [None]:
# Recall for the anomalous class by hand: detected anomalies among all true
# anomalies. It is computed from the confusion matrix instead of numbers typed
# in from a previous output. labels=[-1, 1] pins row/column 0 to the anomalous
# class.
cm = skm.confusion_matrix(y, yp, labels=[-1, 1])
cm[0, 0] / cm[0, :].sum()

0.283979863784424

In [None]:
# Library-computed precision and recall, treating -1 (anomalous) as positive.
precision_ee = skm.precision_score(y, yp, pos_label=-1)
recall_ee = skm.recall_score(y, yp, pos_label=-1)
print("presicion= {:.2%}".format(precision_ee))
print("recall= {:.2%}".format(recall_ee))

presicion= 9.53%
recall= 28.40%


In [None]:
# Refit the elliptic envelope for five contamination levels between 1% and 10%
# and keep each prediction vector, in the same order as the levels.
yps = []
for contaminacion in np.linspace(0.01, 0.1, 5):
  modelo = EllipticEnvelope(random_state=0, contamination=contaminacion)
  yps.append(modelo.fit_predict(x))
  print()


























In [None]:
# Report precision and recall (positive class = -1) for each contamination
# level, pairing every level with the prediction stored at the same position.
for contaminacion, prediccion in zip(np.linspace(0.01,0.1,5), yps):
  precision_i = skm.precision_score(y, prediccion, pos_label=-1)
  recall_i = skm.recall_score(y, prediccion, pos_label=-1)
  print("contamination={:.2%}, presicion={:.2%}, recall={:.2%}".format(
      contaminacion, precision_i, recall_i))

contamination=1.00%, presicion=0.00%, recall=0.00%
contamination=3.25%, presicion=0.00%, recall=0.00%
contamination=5.50%, presicion=17.23%, recall=28.25%
contamination=7.75%, presicion=12.23%, recall=28.25%
contamination=10.00%, presicion=9.53%, recall=28.40%


Isolation forest

In [None]:
from sklearn.ensemble import IsolationForest

In [None]:
# Isolation forest with 150 estimators, a fixed seed and contamination set to
# 0.1; verbose=2 makes the fit print progress.
iforest = IsolationForest(n_estimators=150, n_jobs=-1, random_state=0, contamination=0.1, verbose=2)
# Fit on the whole matrix and label every row in one step.
yp=iforest.fit_predict(x)

In [None]:
# Distinct predicted labels.
set(yp)

In [None]:
# Precision and recall of the isolation forest, with -1 (anomalous) as the
# positive class, next to the contamination the model was built with.
precision_if = skm.precision_score(y, yp, pos_label=-1)
recall_if = skm.recall_score(y, yp, pos_label=-1)
print("contamination={:.2%}, presicion={:.2%}, recall={:.2%}".format(
      iforest.contamination, precision_if, recall_if))

contamination=10.00%, presicion=33.31%, recall=99.29%


In [None]:
# Confusion matrix of the isolation-forest predictions as a labelled table:
# rows are the true classes, columns are the predicted classes.
matriz_confusion = skm.confusion_matrix(y, yp)
pd.DataFrame(
    matriz_confusion,
    index=["y_anomalo(pos)", "y_normal(negativo)"],
    columns=["yp_anomalos (pos)", "yp_normales (neg)"],
)

Unnamed: 0,yp_anomalos (pos),yp_normales (neg)
y_anomalo(pos),3353,24
y_normal(negativo),6713,90565


In [None]:
# Recall for the anomalous class by hand: detected anomalies among all true
# anomalies. It is computed from the confusion matrix instead of numbers typed
# in from a previous output. labels=[-1, 1] pins row/column 0 to the anomalous
# class.
cm = skm.confusion_matrix(y, yp, labels=[-1, 1])
cm[0, 0] / cm[0, :].sum()

0.992893100384957

In [None]:
# Precision for the anomalous class by hand: true anomalies among everything
# predicted anomalous. It is computed from the confusion matrix instead of
# numbers typed in from a previous output. labels=[-1, 1] pins row/column 0 to
# the anomalous class.
cm = skm.confusion_matrix(y, yp, labels=[-1, 1])
cm[0, 0] / cm[:, 0].sum()

0.3331015299026426

**Detección de novedades**


Local Outlier Factor

In [117]:
from sklearn.neighbors import LocalOutlierFactor

In [118]:
# Local Outlier Factor in novelty mode (novelty=True), using 10 neighbours and
# contamination set to 0.1.
lof = LocalOutlierFactor(n_neighbors=10, contamination=0.1, novelty=True, n_jobs=-1)

In [119]:
# Train only on the rows labelled normal (y == 1).
lof.fit(x[y==1])

LocalOutlierFactor(algorithm='auto', contamination=0.1, leaf_size=30,
                   metric='minkowski', metric_params=None, n_jobs=-1,
                   n_neighbors=10, novelty=True, p=2)

In [121]:
# Label every row of the full matrix with the fitted LOF model.
yp = lof.predict(x)

# Report the contamination of the LOF model itself (not the isolation
# forest's), so the printed value always matches the model being evaluated.
# Precision and recall use -1 (anomalous) as the positive class.
print("contamination={:.2%}, presicion={:.2%}, recall={:.2%}".format(
      lof.contamination,
      skm.precision_score(y,yp, pos_label=-1),
      skm.recall_score(y,yp, pos_label=-1)))

# Confusion matrix as a labelled table: rows are true classes, columns are
# predicted classes.
pd.DataFrame(skm.confusion_matrix(y,yp),
             index=["y_anomalo(pos)", "y_normal(negativo)"],
             columns = ["yp_anomalos (pos)","yp_normales (neg)"])

contamination=10.00%, presicion=29.00%, recall=99.56%


Unnamed: 0,yp_anomalos (pos),yp_normales (neg)
y_anomalo(pos),3362,15
y_normal(negativo),8231,89047
