<a href="https://colab.research.google.com/github/BOLTZZ/Novel-Deep-Learning-IDS-Code/blob/main/Final_ML_Models_of_Kyoto_2006%2B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Pre-Processing:

In [None]:
# Install/upgrade the gdown CLI, then use it to download the formatted dataset from Google Drive by file id.
# NOTE(review): presumably this is the "2007data.csv" file read by the next cell - confirm.
!pip install --upgrade --no-cache-dir gdown
!gdown https://drive.google.com/uc?id=1D2_7bHpqPfh_Nf3nxCzkj-BBjVEFRgiP

In [None]:
# Load the CSV file into a pandas DataFrame.
import pandas as pd
df = pd.read_csv(filepath_or_buffer="2007data.csv")

In [None]:
# The data is very large, so we work on a random subsample drawn without replacement.
# train_test_split is reused as the sampler: of the ~ 26 million datapoints, the ~ 5 million
# that land in the 20% "test" partition become the working dataframe.
from sklearn.model_selection import train_test_split
y = df['Label']
x = df.drop(columns=['Label', 'Unnamed: 0'])
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
# Re-attach the labels to the sampled feature rows (joined on the shared index).
df = X_test.join(y_test.to_frame())
print(df.shape[0])

# Numerical Features:

Creating the testing and training data with normalization:

In [None]:
# Creating the testing/training data for the numerical features with z-score normalization.
# The split is done first and the mean/std are estimated on the training rows only, then
# applied to both partitions, so no statistics from the test rows leak into the features
# the models are trained on.
numerical_features = ['duration', 'source_bytes', 'dest_bytes', 'count', 'Same_srv_rate', 'Serror_rate', 'Srv_serror', 'Dst_host_count', 'Dst_host_srv_count', 'Dst_host_same_src_port_rate', 'Dst_host_serror_rate', 'Dst_hostsrv_serror_rate', 'Source_port_number', 'Dest_port_number']
df_numerical = df[numerical_features]
y = df.Label
x = df_numerical
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
# Statistics are computed on a float32 copy, as in the original normalization.
train_mean = X_train.astype('float32').mean()
train_std = X_train.astype('float32').std()
# A constant column has std 0; dividing by it would turn the whole column into NaN.
train_std[train_std == 0] = 1
X_train = (X_train - train_mean) / train_std
X_test = (X_test - train_mean) / train_std

Creating a logistic regression model:

In [None]:
# Creating the logistic regression model on the current training split.
# The 'sag' solver shuffles the data randomly, so the seed is pinned (same value as the
# train/test splits) to make the fitted model reproducible between runs.
from sklearn.linear_model import LogisticRegression
log_reg_numerical = LogisticRegression(solver = 'sag', random_state = 42)
log_reg_numerical.fit(X_train, y_train)

In [None]:
# Mean accuracy of log_reg_numerical on the held-out test split.
log_reg_numerical.score(X=X_test, y=y_test)

In [None]:
# Confusion matrix for log_reg_numerical on the test split.
# confusion_matrix puts the true labels on the rows and the predicted labels on the columns,
# in the order given by `labels`. The axes are therefore named from the actual label values
# instead of a fixed Positive/Negative order that may not match that ordering.
from sklearn.metrics import confusion_matrix
y_pred = log_reg_numerical.predict(X_test)
class_labels = sorted(set(y_test) | set(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
cm_df = pd.DataFrame(cm, columns=[f'Predicted {label}' for label in class_labels], index=[f'Actual {label}' for label in class_labels])
print(cm_df)

Creating a decision tree model:

In [None]:
# Creating the decision tree model on the current training split.
# The seed is pinned (same value as the train/test splits) so that tie-breaking between
# equally good splits is deterministic and the tree is reproducible between runs.
from sklearn.tree import DecisionTreeClassifier
decision_tree_numerical = DecisionTreeClassifier(random_state = 42)
decision_tree_numerical.fit(X_train, y_train)

In [None]:
# Mean accuracy of decision_tree_numerical on the held-out test split.
decision_tree_numerical.score(X=X_test, y=y_test)

In [None]:
# Confusion matrix for decision_tree_numerical on the test split.
# Rows are the true labels and columns the predicted labels, in the order given by `labels`;
# the axes are named from the actual label values so the names always match that ordering.
y_pred = decision_tree_numerical.predict(X_test)
class_labels = sorted(set(y_test) | set(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
cm_df = pd.DataFrame(cm, columns=[f'Predicted {label}' for label in class_labels], index=[f'Actual {label}' for label in class_labels])
print(cm_df)

Creating a random forest model of 70 trees:

In [None]:
# Creates the random forest model (70 trees) on the current training split.
# The seed is pinned (same value as the train/test splits) so the random sampling used
# to build the trees, and therefore the reported metrics, are reproducible between runs.
from sklearn.ensemble import RandomForestClassifier
random_forest_numerical = RandomForestClassifier(n_estimators = 70, random_state = 42)
random_forest_numerical.fit(X_train, y_train)

In [None]:
# Mean accuracy of random_forest_numerical on the held-out test split.
random_forest_numerical.score(X=X_test, y=y_test)

In [None]:
# Confusion matrix for random_forest_numerical on the test split.
# Rows are the true labels and columns the predicted labels, in the order given by `labels`;
# the axes are named from the actual label values so the names always match that ordering.
y_pred = random_forest_numerical.predict(X_test)
class_labels = sorted(set(y_test) | set(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
cm_df = pd.DataFrame(cm, columns=[f'Predicted {label}' for label in class_labels], index=[f'Actual {label}' for label in class_labels])
print(cm_df)

# Categorical Features:

Creating the testing/training data with normalization and binary encoding:

In [None]:
# Performing binary encoding on the categorical features.
!pip install category_encoders
import category_encoders as ce
categorical_features = ['service', 'Flag', 'IDS_detection', 'Malware_detection', 'Ashula_detection', 'Source_IP_addr', 'Dest_IP_addr', 'Start_time', 'Protocol']
df_binary = df[categorical_features].join(df['Label'])
binary_encoder = ce.BinaryEncoder()
df_binary = binary_encoder.fit_transform(df_binary)

In [None]:
# Creating the testing/training data for the encoded categorical features with z-score normalization.
# The split is done first and the mean/std are estimated on the training rows only, then
# applied to both partitions, so no statistics from the test rows leak into the features
# the models are trained on.
df_categorical = df_binary.drop(labels = 'Label', axis = 1)
y = df_binary.Label
x = df_categorical
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
# Statistics are computed on a float32 copy, as in the original normalization.
train_mean = X_train.astype('float32').mean()
train_std = X_train.astype('float32').std()
# A constant encoded column has std 0; dividing by it would turn the whole column into NaN.
train_std[train_std == 0] = 1
X_train = (X_train - train_mean) / train_std
X_test = (X_test - train_mean) / train_std

Creating a logistic regression model:

In [None]:
# Fit a logistic regression (lbfgs solver) on the current training split.
log_reg_categorical = LogisticRegression(solver='lbfgs')
log_reg_categorical.fit(X=X_train, y=y_train)

In [None]:
# Mean accuracy of log_reg_categorical on the held-out test split.
log_reg_categorical.score(X=X_test, y=y_test)

In [None]:
# Confusion matrix for log_reg_categorical on the test split.
# Rows are the true labels and columns the predicted labels, in the order given by `labels`;
# the axes are named from the actual label values so the names always match that ordering.
y_pred = log_reg_categorical.predict(X_test)
class_labels = sorted(set(y_test) | set(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
cm_df = pd.DataFrame(cm, columns=[f'Predicted {label}' for label in class_labels], index=[f'Actual {label}' for label in class_labels])
print(cm_df)

Creating a decision tree model:

In [None]:
# Creating the decision tree model on the current training split.
# The seed is pinned (same value as the train/test splits) so that tie-breaking between
# equally good splits is deterministic and the tree is reproducible between runs.
decision_tree_categorical = DecisionTreeClassifier(random_state = 42)
decision_tree_categorical.fit(X_train, y_train)

In [None]:
# Mean accuracy of decision_tree_categorical on the held-out test split.
decision_tree_categorical.score(X=X_test, y=y_test)

In [None]:
# Confusion matrix for decision_tree_categorical on the test split.
# Rows are the true labels and columns the predicted labels, in the order given by `labels`;
# the axes are named from the actual label values so the names always match that ordering.
y_pred = decision_tree_categorical.predict(X_test)
class_labels = sorted(set(y_test) | set(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
cm_df = pd.DataFrame(cm, columns=[f'Predicted {label}' for label in class_labels], index=[f'Actual {label}' for label in class_labels])
print(cm_df)

Creating a random forest model of 70 trees:

In [None]:
# Creates the random forest model (70 trees) on the current training split.
# The seed is pinned (same value as the train/test splits) so the random sampling used
# to build the trees, and therefore the reported metrics, are reproducible between runs.
from sklearn.ensemble import RandomForestClassifier
random_forest_categorical = RandomForestClassifier(n_estimators = 70, random_state = 42)
random_forest_categorical.fit(X_train, y_train)

In [None]:
# Mean accuracy of random_forest_categorical on the held-out test split.
random_forest_categorical.score(X=X_test, y=y_test)

In [None]:
# Confusion matrix for random_forest_categorical on the test split.
# Rows are the true labels and columns the predicted labels, in the order given by `labels`;
# the axes are named from the actual label values so the names always match that ordering.
y_pred = random_forest_categorical.predict(X_test)
class_labels = sorted(set(y_test) | set(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
cm_df = pd.DataFrame(cm, columns=[f'Predicted {label}' for label in class_labels], index=[f'Actual {label}' for label in class_labels])
print(cm_df)

# Numerical and Categorical Features:

Creating the testing/training data with normalization:

In [None]:
# Build the final dataframe: the raw numerical columns joined (on the index) with the
# binary-encoded frame, which also carries the Label column.
df_numerical = df.loc[:, numerical_features]
df_final = df_numerical.join(df_binary, how='left')

In [None]:
# Creating the training/testing data for the combined features with z-score normalization.
# The split is done first and the mean/std are estimated on the training rows only, then
# applied to both partitions, so no statistics from the test rows leak into the features
# the models are trained on.
df_both = df_final.drop(labels = 'Label', axis = 1)
y = df_final.Label
x = df_both
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
# Statistics are computed on a float32 copy, as in the original normalization.
train_mean = X_train.astype('float32').mean()
train_std = X_train.astype('float32').std()
# A constant column has std 0; dividing by it would turn the whole column into NaN.
train_std[train_std == 0] = 1
X_train = (X_train - train_mean) / train_std
X_test = (X_test - train_mean) / train_std

Creating the logistic regression model:

In [None]:
# Fit a logistic regression (lbfgs solver) on the current training split.
log_reg_final = LogisticRegression(solver='lbfgs')
log_reg_final.fit(X=X_train, y=y_train)

In [None]:
# Mean accuracy of log_reg_final on the held-out test split.
log_reg_final.score(X=X_test, y=y_test)

In [None]:
# Confusion matrix for log_reg_final on the test split.
# Rows are the true labels and columns the predicted labels, in the order given by `labels`;
# the axes are named from the actual label values so the names always match that ordering.
y_pred = log_reg_final.predict(X_test)
class_labels = sorted(set(y_test) | set(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
cm_df = pd.DataFrame(cm, columns=[f'Predicted {label}' for label in class_labels], index=[f'Actual {label}' for label in class_labels])
print(cm_df)

Creating the decision tree model:

In [None]:
# Creating the decision tree model on the current training split.
# The seed is pinned (same value as the train/test splits) so that tie-breaking between
# equally good splits is deterministic and the tree is reproducible between runs.
decision_tree_final = DecisionTreeClassifier(random_state = 42)
decision_tree_final.fit(X_train, y_train)

In [None]:
# Mean accuracy of decision_tree_final on the held-out test split.
decision_tree_final.score(X=X_test, y=y_test)

In [None]:
# Confusion matrix for decision_tree_final on the test split.
# Rows are the true labels and columns the predicted labels, in the order given by `labels`;
# the axes are named from the actual label values so the names always match that ordering.
y_pred = decision_tree_final.predict(X_test)
class_labels = sorted(set(y_test) | set(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
cm_df = pd.DataFrame(cm, columns=[f'Predicted {label}' for label in class_labels], index=[f'Actual {label}' for label in class_labels])
print(cm_df)

Creating the random forest model using 70 trees:

In [None]:
# Creates the random forest model (70 trees) on the current training split.
# The seed is pinned (same value as the train/test splits) so the random sampling used
# to build the trees, and therefore the reported metrics, are reproducible between runs.
random_forest_final = RandomForestClassifier(n_estimators = 70, random_state = 42)
random_forest_final.fit(X_train, y_train)

In [None]:
# Mean accuracy of random_forest_final on the held-out test split.
random_forest_final.score(X=X_test, y=y_test)

In [None]:
# Confusion matrix for random_forest_final on the test split.
# Rows are the true labels and columns the predicted labels, in the order given by `labels`;
# the axes are named from the actual label values so the names always match that ordering.
y_pred = random_forest_final.predict(X_test)
class_labels = sorted(set(y_test) | set(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
cm_df = pd.DataFrame(cm, columns=[f'Predicted {label}' for label in class_labels], index=[f'Actual {label}' for label in class_labels])
print(cm_df)