In [1]:
pip install minisom



In [2]:
import sklearn.cluster
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from minisom import MiniSom

In [3]:
# Mount Google Drive at /content/drive; the training data is read from a path
# under /content/drive/MyDrive further down.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Column labels for the headerless input file. "idk" and "idk2" are placeholder
# names for the first two fields; both are dropped before modelling.
column_names = [
    "idk", "idk2", "Date", "Start time", "Duration", "Server",
    "Src port", "Dest port", "Src IP", "Dest IP", "Attack Score", "Name",
]

# NOTE: despite the variable name, the path points at a plain .txt file.
gzipped_file_path = '/content/drive/MyDrive/Data Mining/training_data.txt'

# Records are separated by single spaces and the file has no header row.
df = pd.read_csv(gzipped_file_path, sep=' ', header=None, names=column_names)

df.head()

Unnamed: 0,idk,idk2,Date,Start time,Duration,Server,Src port,Dest port,Src IP,Dest IP,Attack Score,Name
0,1,,07/15/1998,07:56:52,00:00:01,ntp/u,123,123,172.016.112.020,192.168.001.010,0,-
1,2,,07/15/1998,07:56:52,00:00:01,ntp/u,123,123,172.016.112.020,192.168.001.010,0,-
2,3,,07/15/1998,07:56:53,00:00:01,domain/u,53,53,172.016.112.020,192.168.001.010,0,-
3,4,,07/15/1998,07:56:53,00:00:01,domain/u,53,53,172.016.112.020,192.168.001.010,0,-
4,5,,07/15/1998,07:56:53,00:00:01,smtp,1024,25,172.016.113.204,194.007.248.153,0,-


In [5]:
def before_slash(string):
  """Return the part of ``string`` that precedes its first '/'.

  A value containing no '/' is returned unchanged,
  e.g. 'ntp/u' -> 'ntp' and 'smtp' -> 'smtp'.
  """
  return string.split('/', 1)[0] if '/' in string else string

# Derive the base service name (the text before any '/') for every record.
df["New server"] = df["Server"].map(before_slash)
df.head()

Unnamed: 0,idk,idk2,Date,Start time,Duration,Server,Src port,Dest port,Src IP,Dest IP,Attack Score,Name,New server
0,1,,07/15/1998,07:56:52,00:00:01,ntp/u,123,123,172.016.112.020,192.168.001.010,0,-,ntp
1,2,,07/15/1998,07:56:52,00:00:01,ntp/u,123,123,172.016.112.020,192.168.001.010,0,-,ntp
2,3,,07/15/1998,07:56:53,00:00:01,domain/u,53,53,172.016.112.020,192.168.001.010,0,-,domain
3,4,,07/15/1998,07:56:53,00:00:01,domain/u,53,53,172.016.112.020,192.168.001.010,0,-,domain
4,5,,07/15/1998,07:56:53,00:00:01,smtp,1024,25,172.016.113.204,194.007.248.153,0,-,smtp


In [6]:
# Total attack score per base service, using only rows whose score equals 1,
# ordered from the largest total to the smallest.
attack_rows = df[df['Attack Score'] == 1]
servers = (
    attack_rows.groupby('New server')['Attack Score']
    .sum()
    .reset_index()
    .sort_values(by='Attack Score', ascending=False)
    .reset_index(drop=True)
)

# Discard services whose total is exactly 1, 2 or 3.
servers = servers[~servers['Attack Score'].isin([1, 2, 3])]
servers.head(20)

Unnamed: 0,New server,Attack Score
0,ecr,249750
1,eco,16801
2,frag,11987
3,1,8301
4,ftp-data,3647
5,http,3569
6,telnet,3316
7,finger,2338
8,ftp,1965
9,smtp,1632


In [7]:
# Number of records for each value of 'Name', most frequent first.
# (The `servers` variable is reused here for a per-name table.)
name_counts = df.groupby('Name')['Attack Score'].count()
servers = name_counts.reset_index().sort_values(
    by='Attack Score', ascending=False, ignore_index=True
)
servers

Unnamed: 0,Name,Attack Score
0,neptune,1526628
1,-,790526
2,smurf,249609
3,satan,32632
4,ipsweep,15406
5,portsweep,10504
6,pod,10048
7,nmap,2357
8,teardrop,2172
9,warezclient,1766


In [8]:
# Downsample the dominant 'neptune' class by randomly discarding
# `drop_fraction` of its rows; all other classes are left untouched.
df_neptune = df[df['Name'] == 'neptune'].copy(deep=True)
df_others = df[df['Name'] != 'neptune'].copy(deep=True)

drop_fraction = 0.7

# Calculate the number of rows to drop
num_rows_to_drop = int(len(df_neptune) * drop_fraction)

# Choose the rows to drop with a fixed seed (the same value used for the
# shuffle of the combined frame) so the retained subset is identical on every
# run. The previous unseeded np.random.choice call changed it each time.
random_indices = df_neptune.sample(n=num_rows_to_drop, random_state=42).index

# Drop the selected rows. Despite its name, `df_dropped` holds the neptune
# rows that are KEPT.
df_dropped = df_neptune.drop(random_indices)
df_dropped

Unnamed: 0,idk,idk2,Date,Start time,Duration,Server,Src port,Dest port,Src IP,Dest IP,Attack Score,Name,New server
114461,7595,,07/02/1998,10:05:01,00:00:01,2,20055,2,010.020.030.040,172.016.112.050,1,neptune,2
114464,7598,,07/02/1998,10:05:01,00:00:01,tcpmux,13655,1,010.020.030.040,172.016.112.050,1,neptune,tcpmux
114468,7602,,07/02/1998,10:05:01,00:00:01,tcpmux,16471,1,010.020.030.040,172.016.112.050,1,neptune,tcpmux
114469,7603,,07/02/1998,10:05:01,00:00:01,tcpmux,16727,1,010.020.030.040,172.016.112.050,1,neptune,tcpmux
114470,7604,,07/02/1998,10:05:01,00:00:01,tcpmux,16983,1,010.020.030.040,172.016.112.050,1,neptune,tcpmux
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2506196,512891,,07/09/1998,14:23:00,00:00:01,1024,50545,1024,010.020.030.040,172.016.112.050,1,neptune,1024
2506199,512894,,07/09/1998,14:23:00,00:00:01,1024,51313,1024,010.020.030.040,172.016.112.050,1,neptune,1024
2506200,512895,,07/09/1998,14:23:00,00:00:01,1024,51569,1024,010.020.030.040,172.016.112.050,1,neptune,1024
2506206,512901,,07/09/1998,14:23:01,00:00:01,1024,53105,1024,010.020.030.040,172.016.112.050,1,neptune,1024


In [9]:
# Stack the untouched classes on top of the retained neptune rows, shuffle all
# rows with a fixed seed, and renumber them from 0.
combined_df = (
    pd.concat([df_others, df_dropped], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [10]:
# Preview the combined, shuffled dataset.
combined_df

Unnamed: 0,idk,idk2,Date,Start time,Duration,Server,Src port,Dest port,Src IP,Dest IP,Attack Score,Name,New server
0,21341,,07/13/1998,12:36:55,00:00:01,smtp,28803,25,195.073.151.050,172.016.112.050,0,-,smtp
1,196339,,07/08/1998,11:27:49,00:00:01,955,2992,955,135.013.216.191,172.016.113.050,1,neptune,955
2,10869,,06/25/1998,12:00:20,00:00:01,ftp-data,20,8255,197.218.177.069,172.016.113.204,0,-,ftp-data
3,17221,,06/19/1998,15:56:16,00:00:01,domain/u,1134,53,192.168.001.010,172.016.112.020,0,-,domain
4,172943,,07/08/1998,11:21:46,00:00:01,106,45677,106,135.013.216.191,172.016.113.050,1,neptune,106
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1575554,66890,,06/17/1998,11:56:54,00:00:01,ecr/i,7,7,207.230.054.138,172.016.112.050,1,smurf,ecr
1575555,194254,,07/03/1998,18:02:37,00:00:01,655,12647,655,230.001.010.020,172.016.112.050,1,neptune,655
1575556,3342,,06/16/1998,11:04:04,00:00:01,http,26691,80,172.016.112.194,209.001.236.048,0,-,http
1575557,217694,,07/17/1998,10:56:45,00:00:01,http,15428,80,172.016.117.052,207.025.071.022,0,-,http


In [11]:
# Class distribution after the neptune rows were downsampled.
print(combined_df['Name'].value_counts())

# From here on `df` refers to an independent copy of the combined data.
df = combined_df.copy()

-              790526
neptune        457989
smurf          249609
satan           32632
ipsweep         15406
portsweep       10504
pod             10048
nmap             2357
teardrop         2172
warezclient      1766
back             1281
dict              881
rootkit           254
land               34
warezmaster        19
loadmodule         11
eject              11
ffb                10
multihop            9
imap                7
anomaly             6
format              5
phf                 5
ftp-write           4
perlmagic           4
syslog              3
spy                 2
warez               1
format-fail         1
eject-fail          1
warzclient          1
Name: Name, dtype: int64


In [12]:
# Keep only records whose source port is not '-' and whose destination port is
# neither '-' nor 'customs', then renumber the rows from 0.
keep_mask = (
    (df['Src port'] != '-')
    & (df['Dest port'] != '-')
    & (df['Dest port'] != 'customs')
)
df = df[keep_mask].reset_index(drop=True)

In [13]:
def duration_to_seconds(duration_str):
    """Convert an 'HH:MM:SS' duration string into a whole number of seconds.

    Only the first three ':'-separated fields are read; each must parse as an
    integer.
    """
    fields = duration_str.split(":")
    total = 0
    # Weight hours, minutes and seconds in that order.
    for position, unit_seconds in enumerate((3600, 60, 1)):
        total += int(fields[position]) * unit_seconds
    return total

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Break each dotted IP address into four columns, one per '.'-separated part.
for prefix, ip_column in (('src', 'Src IP'), ('dest', 'Dest IP')):
    octet_columns = [prefix + '-net-id-1', prefix + '-net-id-2', prefix + '-subnet', prefix + '-device']
    df[octet_columns] = df[ip_column].str.split('.', expand=True)

# Express the HH:MM:SS duration as a number of seconds.
df["duration_seconds"] = df["Duration"].map(duration_to_seconds)

# The raw columns are no longer needed. This mutates `df` in place, so the cell
# cannot be re-run without rebuilding `df` first.
df.drop(columns=['Src IP', 'Dest IP', 'Duration', 'Start time'], inplace=True)

# Instantiated but not applied: the TF-IDF encoding of 'Server' is disabled.
tfidf_vectorizer = TfidfVectorizer()
# tfidf_matrix = tfidf_vectorizer.fit_transform(df['Server'])

# Modelling table: `df` without the placeholder, date and service-name columns.
train_data = df.drop(columns=['idk', 'idk2', 'Date', 'Server', 'New server'])

# Move the label columns to the end so the features form one leading block.
columns_to_move = ['Name', 'Attack Score']
new_order = [col for col in train_data.columns if col not in columns_to_move] + columns_to_move
train_data = train_data[new_order]


In [15]:
# Preview the modelling table (features first, 'Name' and 'Attack Score' last).
train_data

Unnamed: 0,Src port,Dest port,src-net-id-1,src-net-id-2,src-subnet,src-device,dest-net-id-1,dest-net-id-2,dest-subnet,dest-device,duration_seconds,Name,Attack Score
0,28803,25,195,073,151,050,172,016,112,050,1,-,0
1,2992,955,135,013,216,191,172,016,113,050,1,neptune,1
2,20,8255,197,218,177,069,172,016,113,204,1,-,0
3,1134,53,192,168,001,010,172,016,112,020,1,-,0
4,45677,106,135,013,216,191,172,016,113,050,1,neptune,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1371893,2041,161,194,027,251,021,192,168,001,001,1,-,0
1371894,7,7,207,230,054,138,172,016,112,050,1,smurf,1
1371895,12647,655,230,001,010,020,172,016,112,050,1,neptune,1
1371896,26691,80,172,016,112,194,209,001,236,048,1,-,0


In [16]:
# Count the '-' placeholders still present in 'Dest port' (0 if there are none).
# The value is stored in `count` but not displayed.
dest_port_counts = train_data['Dest port'].value_counts()
count = dest_port_counts.get('-', 0)

In [17]:
from sklearn.preprocessing import MinMaxScaler

# Scale every feature column to the [0, 1] range.
#
# Features are selected by name rather than by position (`iloc[:, :-2]`).
# The clustering cells append 'Cluster' and 'Cluster Name' to `train_data`;
# after that, "all but the last two columns" would include the string column
# 'Name' and a re-run of this cell would fail.
non_feature_columns = ['Name', 'Attack Score', 'Cluster', 'Cluster Name']
feature_columns = [col for col in train_data.columns if col not in non_feature_columns]

scaler = MinMaxScaler()
normalized_df = pd.DataFrame(
    scaler.fit_transform(train_data[feature_columns]),
    columns=feature_columns,
    index=train_data.index,  # keep rows aligned with train_data
)

print(normalized_df)

         Src port  Dest port  src-net-id-1  src-net-id-2  src-subnet  \
0        0.439868   0.000716      0.847162      0.284585    0.597610   
1        0.045693   0.028478      0.585153      0.047431    0.856574   
2        0.000305   0.246395      0.855895      0.857708    0.701195   
3        0.017318   0.001552      0.834061      0.660079    0.000000   
4        0.697561   0.003134      0.585153      0.047431    0.856574   
...           ...        ...           ...           ...         ...   
1371893  0.031169   0.004776      0.842795      0.102767    0.996016   
1371894  0.000107   0.000179      0.899563      0.905138    0.211155   
1371895  0.193140   0.019523      1.000000      0.000000    0.035857   
1371896  0.407614   0.002358      0.746725      0.059289    0.442231   
1371897  0.235610   0.002358      0.746725      0.059289    0.462151   

         src-device  dest-net-id-1  dest-net-id-2  dest-subnet  dest-device  \
0          0.192913       0.802956       0.062745     0.

In [None]:
# import seaborn as sns

# normalized_df_with_labels = normalized_df.copy(deep=True)
# normalized_df_with_labels['Name'] = train_data['Name']

# sns.pairplot(normalized_df_with_labels, hue="Name")
# plt.show()

K-means without PCA

In [None]:
# Partition the scaled features into 5 clusters (fixed seed).
kmeans = KMeans(n_clusters=5, random_state=0)
train_data['Cluster'] = kmeans.fit_predict(normalized_df)

# For each cluster, find the 'Name' value that occurs most often among its rows.
cluster_majority = (
    train_data.groupby('Cluster')['Name']
    .agg(lambda names: names.value_counts().index[0])
    .reset_index()
)

# Mapping: cluster id -> majority name.
cluster_names = dict(
    zip(cluster_majority['Cluster'].tolist(), cluster_majority['Name'].tolist())
)

# Label every row with the majority name of its cluster.
train_data['Cluster Name'] = train_data['Cluster'].map(cluster_names)


In [None]:
# Number of rows assigned to each majority-name label.
print(train_data['Cluster Name'].value_counts())

K-means with PCA

In [None]:
# Project the scaled features onto 5 principal components.
# random_state is fixed so that, if scikit-learn selects a randomized SVD
# solver, the projection is the same on every run.
pca = PCA(n_components=5, random_state=0)
df_pca = pd.DataFrame(data=pca.fit_transform(normalized_df))
df_pca.head()

In [None]:
# Cluster the PCA-reduced data. random_state=0 matches the K-means run on the
# unreduced features, so this result is reproducible and the two runs differ
# only in their input, not in their random initialisation.
kmeans = KMeans(n_clusters=5, random_state=0)

train_data['Cluster'] = kmeans.fit_predict(df_pca)

# Calculate majority class for each cluster
cluster_majority = train_data.groupby('Cluster')['Name'].agg(lambda x: x.value_counts().index[0]).reset_index()

# Assign cluster names based on majority class (cluster id -> majority name)
cluster_names = dict(
    zip(cluster_majority['Cluster'].tolist(), cluster_majority['Name'].tolist())
)



In [None]:
# Number of rows in each cluster, keyed by numeric cluster id.
cluster_counts = train_data['Cluster'].value_counts()

# Label every row with the majority name of its cluster.
train_data['Cluster Name'] = train_data['Cluster'].map(cluster_names)

# Show the cluster sizes, then the id -> name mapping.
print(cluster_counts, cluster_names, sep='\n')

In [None]:
# Number of rows assigned to each majority-name label (clusters from the PCA-reduced data).
print(train_data['Cluster Name'].value_counts())