In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline
import sklearn
from sklearn import datasets, linear_model, model_selection
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
import os
import time
from sklearn.naive_bayes import GaussianNB
# Notebook display limits: render at most 10 rows and 79 columns of a DataFrame.
pd.set_option("display.max_rows", 10)
pd.set_option("display.max_columns", 79)

In [2]:
# Absolute locations, resolved against the current working directory:
#   temp          -> the "Data" folder that the CSV inputs are read from
#   notebook_path -> this notebook file ("RunningData.ipynb")
temp = os.path.abspath("Data")
notebook_path = os.path.abspath("RunningData.ipynb")

In [None]:
def _read_data_csv(file_name):
    """Read one CSV file from the data directory (`temp`) into a DataFrame.

    The path is built with os.path.join so the platform's own separator is
    used instead of a hard-coded Windows backslash.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside the data directory.

    Returns
    -------
    pandas.DataFrame
        The parsed file, read with low_memory=False.
    """
    return pd.read_csv(os.path.join(temp, file_name), low_memory=False)


# Feature tables passed to train_test_split by the model cells below.
ps_lr = _read_data_csv("finalportscanlr.csv")
ps_bayes = _read_data_csv("final-portscan-bayes.csv")

ddos_lr = _read_data_csv("finalddoslr.csv")
ddos_bayes = _read_data_csv("finalddosnaivebayes.csv")

# Label tables paired with the feature tables above.
ps_label = _read_data_csv("PortScanLabel.csv")
ddos_label = _read_data_csv("DDOSLabel.csv")

# Full flow captures that the cleaning cells below operate on.
ddos_df = _read_data_csv("Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv")
ps_df = _read_data_csv("Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv")

In [None]:
# Normalise the column labels of both capture frames:
#   1. remove every space character,
#   2. turn the "/s" suffix into "_s" (e.g. "FlowBytes/s" -> "FlowBytes_s").
for capture_frame in (ddos_df, ps_df):
    capture_frame.columns = (
        capture_frame.columns
        .str.replace(' ', '')
        .str.replace('/s', '_s')
    )

In [None]:
# Keep every column except the final one, preserving column order.
# NOTE(review): presumably the trailing column is the class label - confirm.
# Re-running this cell removes one more column each time it executes.
all_but_last_column = slice(None, -1)

ddos_df = ddos_df.iloc[:, all_but_last_column]

ps_df = ps_df.iloc[:, all_but_last_column]

In [None]:
# Remove, in place, every DDoS row that has a missing value in any of the
# columns listed below.
ddos_required_columns = [
    'DestinationPort', 'FlowDuration',
    'TotalFwdPackets', 'TotalBackwardPackets',
    'TotalLengthofFwdPackets', 'TotalLengthofBwdPackets',
    'FwdPacketLengthMax', 'FwdPacketLengthMin',
    'FwdPacketLengthMean', 'FwdPacketLengthStd',
    'BwdPacketLengthMax', 'BwdPacketLengthMin',
    'BwdPacketLengthMean', 'BwdPacketLengthStd',
    'FlowBytes_s', 'FlowPackets_s',
    'FlowIATMean', 'FlowIATStd', 'FlowIATMax', 'FlowIATMin',
    'FwdIATTotal', 'FwdIATMean', 'FwdIATStd', 'FwdIATMax', 'FwdIATMin',
    'BwdIATTotal', 'BwdIATMean', 'BwdIATStd', 'BwdIATMax', 'BwdIATMin',
    'FwdPSHFlags', 'BwdPSHFlags', 'FwdURGFlags', 'BwdURGFlags',
    'FwdHeaderLength', 'BwdHeaderLength',
    'FwdPackets_s', 'BwdPackets_s',
    'MinPacketLength', 'MaxPacketLength',
    'PacketLengthMean', 'PacketLengthStd', 'PacketLengthVariance',
    'FINFlagCount', 'SYNFlagCount', 'RSTFlagCount', 'PSHFlagCount',
    'ACKFlagCount', 'URGFlagCount', 'CWEFlagCount', 'ECEFlagCount',
    'Down/UpRatio', 'AveragePacketSize',
    'AvgFwdSegmentSize', 'AvgBwdSegmentSize',
    'FwdHeaderLength.1',
    'FwdAvgBytes/Bulk', 'FwdAvgPackets/Bulk', 'FwdAvgBulkRate',
    'BwdAvgBytes/Bulk', 'BwdAvgPackets/Bulk', 'BwdAvgBulkRate',
    'SubflowFwdPackets', 'SubflowFwdBytes',
    'SubflowBwdPackets', 'SubflowBwdBytes',
    'Init_Win_bytes_forward', 'Init_Win_bytes_backward',
    'act_data_pkt_fwd', 'min_seg_size_forward',
    'ActiveMean', 'ActiveStd', 'ActiveMax', 'ActiveMin',
    'IdleMean', 'IdleStd', 'IdleMax', 'IdleMin',
]
ddos_df.dropna(subset = ddos_required_columns, how = 'any', inplace = True)

In [None]:
# Remove, in place, every port-scan row that has a missing value in any of
# the columns listed below.
ps_required_columns = [
    'DestinationPort', 'FlowDuration',
    'TotalFwdPackets', 'TotalBackwardPackets',
    'TotalLengthofFwdPackets', 'TotalLengthofBwdPackets',
    'FwdPacketLengthMax', 'FwdPacketLengthMin',
    'FwdPacketLengthMean', 'FwdPacketLengthStd',
    'BwdPacketLengthMax', 'BwdPacketLengthMin',
    'BwdPacketLengthMean', 'BwdPacketLengthStd',
    'FlowBytes_s', 'FlowPackets_s',
    'FlowIATMean', 'FlowIATStd', 'FlowIATMax', 'FlowIATMin',
    'FwdIATTotal', 'FwdIATMean', 'FwdIATStd', 'FwdIATMax', 'FwdIATMin',
    'BwdIATTotal', 'BwdIATMean', 'BwdIATStd', 'BwdIATMax', 'BwdIATMin',
    'FwdPSHFlags', 'BwdPSHFlags', 'FwdURGFlags', 'BwdURGFlags',
    'FwdHeaderLength', 'BwdHeaderLength',
    'FwdPackets_s', 'BwdPackets_s',
    'MinPacketLength', 'MaxPacketLength',
    'PacketLengthMean', 'PacketLengthStd', 'PacketLengthVariance',
    'FINFlagCount', 'SYNFlagCount', 'RSTFlagCount', 'PSHFlagCount',
    'ACKFlagCount', 'URGFlagCount', 'CWEFlagCount', 'ECEFlagCount',
    'Down/UpRatio', 'AveragePacketSize',
    'AvgFwdSegmentSize', 'AvgBwdSegmentSize',
    'FwdHeaderLength.1',
    'FwdAvgBytes/Bulk', 'FwdAvgPackets/Bulk', 'FwdAvgBulkRate',
    'BwdAvgBytes/Bulk', 'BwdAvgPackets/Bulk', 'BwdAvgBulkRate',
    'SubflowFwdPackets', 'SubflowFwdBytes',
    'SubflowBwdPackets', 'SubflowBwdBytes',
    'Init_Win_bytes_forward', 'Init_Win_bytes_backward',
    'act_data_pkt_fwd', 'min_seg_size_forward',
    'ActiveMean', 'ActiveStd', 'ActiveMax', 'ActiveMin',
    'IdleMean', 'IdleStd', 'IdleMax', 'IdleMin',
]
ps_df.dropna(subset = ps_required_columns, how = 'any', inplace = True)

In [None]:
# Delete rows whose FlowBytes_s or FlowPackets_s value is infinite.
#
# Depending on the pandas version, read_csv leaves these columns either as
# text (containing the literal "Infinity") or as floats (containing inf).
# The .str accessor only exists on text columns, so each dtype is handled
# separately instead of assuming text.
for rate_column in ('FlowBytes_s', 'FlowPackets_s'):
    rate_values = ddos_df[rate_column]
    if pd.api.types.is_numeric_dtype(rate_values):
        # Already numeric: drop infinities of either sign.
        infinite_rows = np.isinf(rate_values)
    else:
        # Text column: drop values that start with the literal "Infinity".
        # astype(str) keeps this safe if the column mixes strings and numbers.
        infinite_rows = rate_values.astype(str).str.startswith('Infinity')
    # Filter immediately so the second column is checked on the reduced frame.
    ddos_df = ddos_df[~infinite_rows]

In [None]:
# Delete port-scan rows whose FlowBytes_s or FlowPackets_s value is infinite.
#
# Depending on the pandas version, read_csv leaves these columns either as
# text (containing the literal "Infinity") or as floats (containing inf).
# The .str accessor only exists on text columns, so each dtype is handled
# separately instead of assuming text.
for rate_column in ('FlowBytes_s', 'FlowPackets_s'):
    rate_values = ps_df[rate_column]
    if pd.api.types.is_numeric_dtype(rate_values):
        # Already numeric: drop infinities of either sign.
        infinite_rows = np.isinf(rate_values)
    else:
        # Text column: drop values that start with the literal "Infinity".
        # astype(str) keeps this safe if the column mixes strings and numbers.
        infinite_rows = rate_values.astype(str).str.startswith('Infinity')
    # Filter immediately so the second column is checked on the reduced frame.
    ps_df = ps_df[~infinite_rows]

In [None]:
# Cast the two flow-rate columns of the DDoS frame to a numeric dtype,
# one column at a time (FlowBytes_s first, then FlowPackets_s).
for flow_rate_column in ('FlowBytes_s', 'FlowPackets_s'):
    ddos_df[flow_rate_column] = pd.to_numeric(ddos_df[flow_rate_column])

In [None]:
# Cast the two flow-rate columns of the port-scan frame to a numeric dtype,
# one column at a time (FlowBytes_s first, then FlowPackets_s).
for flow_rate_column in ('FlowBytes_s', 'FlowPackets_s'):
    ps_df[flow_rate_column] = pd.to_numeric(ps_df[flow_rate_column])

In [None]:
# Port scan / linear regression: fit on 60% of the rows, score on the other 40%.
xtrainDF, xtestDF, ytrainDF, ytestDF = train_test_split(
    ps_lr, ps_label, test_size=0.4, random_state=10
)
reg = linear_model.LinearRegression()
fit = reg.fit(xtrainDF, ytrainDF)

# Predict on the held-out split: xtestDF has the same columns the model was
# fitted on and ytestDF holds the labels for exactly those rows. Scoring
# against the full label table would include the training rows.
prediction_of_test = fit.predict(xtestDF)

# Root-mean-squared error on the held-out rows.
print(np.sqrt(sklearn.metrics.mean_squared_error(ytestDF, prediction_of_test)))

In [None]:
# DDoS / linear regression: fit on 60% of the rows, score on the other 40%.
xtrainDF, xtestDF, ytrainDF, ytestDF = train_test_split(
    ddos_lr, ddos_label, test_size=0.4, random_state=10
)
reg = linear_model.LinearRegression()
fit = reg.fit(xtrainDF, ytrainDF)

# Predict on the held-out split: xtestDF has the same columns the model was
# fitted on and ytestDF holds the labels for exactly those rows. Scoring
# against the full label table would include the training rows.
prediction_of_test = fit.predict(xtestDF)

# Root-mean-squared error on the held-out rows.
print(np.sqrt(sklearn.metrics.mean_squared_error(ytestDF, prediction_of_test)))

In [None]:
# Port scan / Gaussian naive Bayes: fit on 60% of the rows, score on the other 40%.
xtrainDF, xtestDF, ytrainDF, ytestDF = train_test_split(
    ps_bayes, ps_label, test_size=0.4, random_state=10
)
reg = GaussianNB()
# squeeze(axis=1) turns a single-column label frame into a 1-D Series, the
# target shape classifiers expect; a multi-column frame is left unchanged.
fit = reg.fit(xtrainDF, ytrainDF.squeeze(axis=1))

# Predict on the held-out split: xtestDF has the same columns the model was
# fitted on and ytestDF holds the labels for exactly those rows. Scoring
# against the full label table would include the training rows.
prediction_of_test = fit.predict(xtestDF)

# Root-mean-squared error between held-out labels and predicted classes.
print(np.sqrt(sklearn.metrics.mean_squared_error(ytestDF, prediction_of_test)))

In [None]:
# DDoS / Gaussian naive Bayes: fit on 60% of the rows, score on the other 40%.
xtrainDF, xtestDF, ytrainDF, ytestDF = train_test_split(
    ddos_bayes, ddos_label, test_size=0.4, random_state=10
)
reg = GaussianNB()
# squeeze(axis=1) turns a single-column label frame into a 1-D Series, the
# target shape classifiers expect; a multi-column frame is left unchanged.
fit = reg.fit(xtrainDF, ytrainDF.squeeze(axis=1))

# Predict on the held-out split: xtestDF has the same columns the model was
# fitted on and ytestDF holds the labels for exactly those rows. Scoring
# against the full label table would include the training rows.
prediction_of_test = fit.predict(xtestDF)

# Root-mean-squared error between held-out labels and predicted classes.
print(np.sqrt(sklearn.metrics.mean_squared_error(ytestDF, prediction_of_test)))