## Importing the libraries

In [2]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Extracting the features

In [3]:
featuresPath = "/content/drive/MyDrive/Colab Notebooks/Anomaly Detection/features.txt"
with open(featuresPath, "r") as file:
  # splitting at ':' and taking the first word
  features = [line.split(':')[0] for line in file]
# dropping the first 2 lines as they are unnecessary
features.pop(0), features.pop(0)
features.append('attack_type')

In [4]:
len(features)

41

## Reading the dataset

In [5]:
path = "/content/drive/MyDrive/Colab Notebooks/Anomaly Detection/kddcup.data.gz"
df = pd.read_csv(path, compression='gzip', names=features, header=None)
df.head()

Unnamed: 0,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_type
0,tcp,http,SF,215,45076,0,0,0,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal.
0,tcp,http,SF,162,4528,0,0,0,0,0,...,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,normal.
0,tcp,http,SF,236,1228,0,0,0,0,0,...,2,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,normal.
0,tcp,http,SF,233,2032,0,0,0,0,0,...,3,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,normal.
0,tcp,http,SF,239,486,0,0,0,0,0,...,4,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,normal.


## Changing the categorical features to numerical

In [8]:
# getting all categorical columns
cat_columns = df.select_dtypes(['object']).columns
# converting all categorical columns to numeric
df[cat_columns] = df[cat_columns].apply(lambda x: pd.factorize(x)[0])
df.head()

Unnamed: 0,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_type
0,0,0,0,215,45076,0,0,0,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
0,0,0,0,162,4528,0,0,0,0,0,...,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
0,0,0,0,236,1228,0,0,0,0,0,...,2,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0
0,0,0,0,233,2032,0,0,0,0,0,...,3,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,0
0,0,0,0,239,486,0,0,0,0,0,...,4,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0
