In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline
import sklearn
from sklearn import datasets, linear_model, model_selection
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
import os
# Raise pandas' display limits to 100 rows, sequence items and columns.
for option_name in ("display.max_rows", "display.max_seq_items", "display.max_columns"):
    pd.set_option(option_name, 100)

In [2]:
notebook_path = os.path.abspath("DataAnalysis.ipynb")
# Absolute path of the "Data" directory, resolved against the current working directory.
temp = os.path.abspath("Data")

# Load in all of the relevant work sheets.
# We chose not to load in the worksheets that had attacks that we did not initially
#    plan on modeling to save space and computation time.
# Paths are built with os.path.join so the platform's own separator is used;
# a hard-coded backslash only resolves on Windows.

df1 = pd.read_csv(os.path.join(temp, "Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv"), low_memory = False)
df2 = pd.read_csv(os.path.join(temp, "Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv"), low_memory = False)
df3 = pd.read_csv(os.path.join(temp, "Tuesday-WorkingHours.pcap_ISCX.csv"), low_memory = False)
df4 = pd.read_csv(os.path.join(temp, "Wednesday-workingHours.pcap_ISCX.csv"), low_memory = False)

In [3]:
# Combine dataframes together.
# pd.concat stacks all four frames in a single call. DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0, so the chained-append form no
# longer runs on current pandas. As with append, each frame's original row
# index is kept (ignore_index is left at its default).
df = pd.concat([df1, df2, df3, df4])

In [4]:
# Remove every space from the column names of the combined frame and of each
# per-file frame. Each frame is renamed from its OWN header, so a frame whose
# columns differ in number or order is never relabelled with another frame's names.
for frame in (df, df1, df2, df3, df4):
    # regex=False: the pattern is a literal space, not a regular expression.
    frame.columns = frame.columns.str.replace(' ', '', regex=False)

In [None]:
# Count missing values per column of the combined frame.
# NaN counts noted for the flow-bytes column (named 'FlowBytes/s' in the column selection):
#         df2 = 15 , df3 = 201 , df4 = 1008
#         Recorded total of NaN rows: 1228
#         NOTE(review): the three per-file counts above add up to 1224, not 1228 --
#         presumably df1 accounts for the remaining 4; TODO confirm.
missing_per_column = df.isna().sum()
missing_per_column

In [5]:
# Count labels before dropping them
print("DF1")
print(df1.Label.value_counts())
print("\nDF2")
print(df2.Label.value_counts())
print("\nDF3")
print(df3.Label.value_counts())
print("\nDF4")
print(df4.Label.value_counts())

DF1
DDoS      128027
BENIGN     97718
Name: Label, dtype: int64

DF2
PortScan    158930
BENIGN      127537
Name: Label, dtype: int64

DF3
BENIGN         432074
FTP-Patator      7938
SSH-Patator      5897
Name: Label, dtype: int64

DF4
BENIGN              440031
DoS Hulk            231073
DoS GoldenEye        10293
DoS slowloris         5796
DoS Slowhttptest      5499
Heartbleed              11
Name: Label, dtype: int64


In [None]:
# Per our project plan, any attack with fewer than 5k instances is dropped;
# Heartbleed is the label filtered out here.
df = df.loc[df['Label'].ne('Heartbleed')]


# NaN rows are deliberately kept for now. All NaN values are in the Flow Bytes column, so first
# check whether that column is needed at all before dropping anything.

# df.dropna(how = 'any', subset=['DestinationPort', 'FlowDuration', 'TotalFwdPackets',
#        'TotalBackwardPackets', 'TotalLengthofFwdPackets',
#        'TotalLengthofBwdPackets', 'FwdPacketLengthMax', 'FwdPacketLengthMin',
#        'FwdPacketLengthMean', 'FwdPacketLengthStd', 'BwdPacketLengthMax',
#        'BwdPacketLengthMin', 'BwdPacketLengthMean', 'BwdPacketLengthStd',
#        'FlowBytes/s', 'FlowPackets/s', 'FlowIATMean', 'FlowIATStd',
#        'FlowIATMax', 'FlowIATMin', 'FwdIATTotal', 'FwdIATMean', 'FwdIATStd',
#        'FwdIATMax', 'FwdIATMin', 'BwdIATTotal', 'BwdIATMean', 'BwdIATStd',
#        'BwdIATMax', 'BwdIATMin', 'FwdPSHFlags', 'BwdPSHFlags', 'FwdURGFlags',
#        'BwdURGFlags', 'FwdHeaderLength', 'BwdHeaderLength', 'FwdPackets/s',
#        'BwdPackets/s', 'MinPacketLength', 'MaxPacketLength',
#        'PacketLengthMean', 'PacketLengthStd', 'PacketLengthVariance',
#        'FINFlagCount', 'SYNFlagCount', 'RSTFlagCount', 'PSHFlagCount',
#        'ACKFlagCount', 'URGFlagCount', 'CWEFlagCount', 'ECEFlagCount',
#        'Down/UpRatio', 'AveragePacketSize', 'AvgFwdSegmentSize',
#        'AvgBwdSegmentSize', 'FwdHeaderLength.1', 'FwdAvgBytes/Bulk',
#        'FwdAvgPackets/Bulk', 'FwdAvgBulkRate', 'BwdAvgBytes/Bulk',
#        'BwdAvgPackets/Bulk', 'BwdAvgBulkRate', 'SubflowFwdPackets',
#        'SubflowFwdBytes', 'SubflowBwdPackets', 'SubflowBwdBytes',
#        'Init_Win_bytes_forward', 'Init_Win_bytes_backward', 'act_data_pkt_fwd',
#        'min_seg_size_forward', 'ActiveMean', 'ActiveStd', 'ActiveMax',
#        'ActiveMin', 'IdleMean', 'IdleStd', 'IdleMax', 'IdleMin', 'Label'], inplace = True)

# df

In [None]:
# df.Label.value_counts()

In [None]:
# Keep Label first, followed by the feature columns in the order listed below.
# Selecting by explicit name raises KeyError if an expected column is absent.
target_column = ['Label']
feature_columns = [
    'DestinationPort', 'FlowDuration',
    'TotalFwdPackets', 'TotalBackwardPackets',
    'TotalLengthofFwdPackets', 'TotalLengthofBwdPackets',
    'FwdPacketLengthMax', 'FwdPacketLengthMin', 'FwdPacketLengthMean', 'FwdPacketLengthStd',
    'BwdPacketLengthMax', 'BwdPacketLengthMin', 'BwdPacketLengthMean', 'BwdPacketLengthStd',
    'FlowBytes/s', 'FlowPackets/s',
    'FlowIATMean', 'FlowIATStd', 'FlowIATMax', 'FlowIATMin',
    'FwdIATTotal', 'FwdIATMean', 'FwdIATStd', 'FwdIATMax', 'FwdIATMin',
    'BwdIATTotal', 'BwdIATMean', 'BwdIATStd', 'BwdIATMax', 'BwdIATMin',
    'FwdPSHFlags', 'BwdPSHFlags', 'FwdURGFlags', 'BwdURGFlags',
    'FwdHeaderLength', 'BwdHeaderLength',
    'FwdPackets/s', 'BwdPackets/s',
    'MinPacketLength', 'MaxPacketLength',
    'PacketLengthMean', 'PacketLengthStd', 'PacketLengthVariance',
    'FINFlagCount', 'SYNFlagCount', 'RSTFlagCount', 'PSHFlagCount',
    'ACKFlagCount', 'URGFlagCount', 'CWEFlagCount', 'ECEFlagCount',
    'Down/UpRatio', 'AveragePacketSize',
    'AvgFwdSegmentSize', 'AvgBwdSegmentSize',
    # NOTE(review): the '.1' suffix looks like pandas' renaming of a repeated
    # CSV header, i.e. a second 'FwdHeaderLength' column -- TODO confirm.
    'FwdHeaderLength.1',
    'FwdAvgBytes/Bulk', 'FwdAvgPackets/Bulk', 'FwdAvgBulkRate',
    'BwdAvgBytes/Bulk', 'BwdAvgPackets/Bulk', 'BwdAvgBulkRate',
    'SubflowFwdPackets', 'SubflowFwdBytes', 'SubflowBwdPackets', 'SubflowBwdBytes',
    'Init_Win_bytes_forward', 'Init_Win_bytes_backward',
    'act_data_pkt_fwd', 'min_seg_size_forward',
    'ActiveMean', 'ActiveStd', 'ActiveMax', 'ActiveMin',
    'IdleMean', 'IdleStd', 'IdleMax', 'IdleMin',
]
df = df[target_column + feature_columns]

In [None]:
def _rows_with_label(label):
    """Return the rows of the combined frame `df` whose Label equals `label`."""
    return df[df.Label == label]


# One subset of the combined frame per remaining label value.
ben = _rows_with_label('BENIGN')
dos_h = _rows_with_label('DoS Hulk')
p_scan = _rows_with_label('PortScan')
ddos = _rows_with_label('DDoS')
dos_ge = _rows_with_label('DoS GoldenEye')
ftp_pat = _rows_with_label('FTP-Patator')
ssh_pat = _rows_with_label('SSH-Patator')
dos_slowloris = _rows_with_label('DoS slowloris')
dos_slowhttp = _rows_with_label('DoS Slowhttptest')

In [None]:
# olist = list(ben.columns) # original list
# yvlist = list(['Label']) # y variable list

# # make a new list of only the values I want to compare
# xlist = [x for x in olist if x not in yvlist]

# fig = plt.figure(figsize=(20, 25))
# plotNum = 1
# for key in xlist:
#     ax = plt.subplot(6, 3, plotNum)
#     ax.scatter(x=ben[key], y=ben['Label'])
#     ax.set_xlabel(key)
#     ax.set_ylabel('Label')
#     plotNum = plotNum + 1

In [None]:
# # Making a correlation map
# corr = ben.corr()
# plt.figure(figsize=(79, 79))
# heatMap = sns.heatmap(
#     corr,
#     vmin = -1, vmax = 1, center = 0,
#     annot=True,
#     cmap=sns.diverging_palette(20, 220, n=200),
#     square=True
# )

In [None]:
# ben.dtypes

In [None]:
# Display the BENIGN subset as this cell's output.
ben

In [None]:
# Release the per-file frames; the combined `df` and the per-label subsets
# are what the rest of the analysis uses.
# Only df1-df4 are loaded above. The former `del df5` ... `del df8` referred to
# names that were never defined, so the cell raised NameError.
del df1, df2, df3, df4