In [32]:
import numpy as np
import pandas as pd

In [33]:
# Define file paths
files = [
    'Data_of_Attack_Back_Normal.csv',
    'Data_of_Attack_Back.csv',
    'Data_of_Attack_Back_BufferOverflow.csv',
    'Data_of_Attack_Back_FTPWrite.csv',
    'Data_of_Attack_Back_GuessPassword.csv',
    'Data_of_Attack_Back_Neptune.csv',
    'Data_of_Attack_Back_NMap.csv',
    'Data_of_Attack_Back_PortSweep.csv',
    'Data_of_Attack_Back_RootKit.csv',
    'Data_of_Attack_Back_Satan.csv',
    'Data_of_Attack_Back_Smurf.csv'
]

# Load all files into a list of dataframes
dataframes = [pd.read_csv('C:/Users/Ajay/Downloads/Machine learning capstone project/Capstone Project 2/' + file) for file in files]


# Add Attack Column: Add a new column attack to each dataframe to denote the type of attack or normal.

In [34]:
attack_labels = [
    'Normal', 'Back', 'BufferOverflow', 'FTPWrite', 'GuessPassword',
    'Neptune', 'NMap', 'PortSweep', 'RootKit', 'Satan', 'Smurf'
]

for df, label in zip(dataframes, attack_labels):
    df['attack'] = label

In [35]:
dataframes[3].columns = ['duration', ' protocol_type', ' service', ' flag', ' src_bytes',
       ' dst_bytes', ' land', ' wrong_fragment', ' urgent', ' hot',
       ' num_failed_logins', ' logged_in', ' num_compromised', ' root_shell',
       ' su_attempted', ' num_root', ' num_file_creations', ' num_shells',
       ' num_access_files', ' num_outbound_cmds', ' is_host_login',
       ' is_guest_login', ' count', ' srv_count', ' serror_rate',
       ' srv_error_rate', ' rerror_rate', ' srv_rerror_rate', ' same_srv_rate',
       ' diff_srv_rate', ' srv_diff_host_rate', ' dst_host_count',
       ' dst_host_srv_count', ' dst_host_same_srv_rate',
       ' dst_host_diff_srv_rate', ' dst_host_same_src_port_rate',
       ' dst_host_srv_diff_host_rate', ' dst_host_serror_rate',
       ' dst_host_srv_serror_rate', ' dst_host_rerror_rate',
       ' dst_host_srv_rerror_rate', 'attack']

In [36]:
# there are extra spaces in column names of each dataset. therefore we will remove them
for df in dataframes:
    df.columns  = df.columns.str.replace(" ", "")

In [37]:
# Combining the dataframes
combined_df = pd.concat(dataframes, ignore_index=True)

# Resampling Data
1. For Binary Classification

In [38]:
from sklearn.utils import resample

In [42]:
# Separate majority and minority classes
df_normal = combined_df[combined_df['attack'] == 'Normal']
df_attack = combined_df[combined_df['attack'] != 'Normal']

# Upsample minority class
df_attack_upsampled = resample(df_attack,
                               replace=True,  # Sample with replacement
                               n_samples=len(df_normal),  # Match majority class
                               random_state=42)  # For reproducibility

# Combine majority class with upsampled minority class
balanced_df1 = pd.concat([df_normal, df_attack_upsampled])

2. For Multinomial Cassification

In [43]:
# Determine the minimum size among the attack types
min_size = combined_df['attack'].value_counts().min()

# Resample each class to the minimum size
balanced_dfs = []
for label in combined_df['attack'].unique():
    df = combined_df[combined_df['attack'] == label]
    df_resampled = resample(df,
                            replace=True,  # Sample with replacement
                            n_samples=min_size,  # Match minimum size
                            random_state=42)  # For reproducibility
    balanced_dfs.append(df_resampled)

# Combine all resampled dataframes
balanced_df2 = pd.concat(balanced_dfs)

# Feature encoding

# 1. Binomial Classification

In [64]:
# Nominal Features: Convert categorical features to numerical values using one-hot encoding

balanced_df1 = pd.get_dummies(balanced_df1, columns=['protocol_type', 'service', 'flag'])

In [65]:
# Binomial Features: Ensuring binary features are correctly formatted

binary_features = ['land', 'logged_in', 'root_shell', 'su_attempted', 'is_host_login', 'is_guest_login']
balanced_df1[binary_features] = balanced_df1[binary_features].astype(int)

In [47]:
from sklearn.preprocessing import StandardScaler

In [62]:
# Numeric Features

numeric_features = [
    'duration', 'src_bytes', 'dst_bytes', 'wrong_fragment', 'urgent', 'hot',
    'num_failed_logins', 'num_compromised', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'num_outbound_cmds', 'count', 'srv_count',
    'serror_rate', 'srv_error_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
    'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate'
]

scaler = StandardScaler()
balanced_df1[numeric_features] = scaler.fit_transform(balanced_df1[numeric_features])


# 2. Multinomial Classification

In [67]:
# Nominal Features: Convert categorical features to numerical values using one-hot encoding

balanced_df2 = pd.get_dummies(balanced_df2, columns=['protocol_type', 'service', 'flag'])

In [68]:
# Binomial Features: Ensuring binary features are correctly formatted

binary_features = ['land', 'logged_in', 'root_shell', 'su_attempted', 'is_host_login', 'is_guest_login']
balanced_df2[binary_features] = balanced_df2[binary_features].astype(int)

In [69]:
# Numeric Features

numeric_features = [
    'duration', 'src_bytes', 'dst_bytes', 'wrong_fragment', 'urgent', 'hot',
    'num_failed_logins', 'num_compromised', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'num_outbound_cmds', 'count', 'srv_count',
    'serror_rate', 'srv_error_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
    'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate'
]

scaler = StandardScaler()
balanced_df2[numeric_features] = scaler.fit_transform(balanced_df2[numeric_features])

# Model Training and Evaluation

In [50]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

1. Binomial Classification

In [70]:
# Prepare features and target variable
X = balanced_df1.drop(columns=['attack'])
y = balanced_df1['attack'].apply(lambda x: 1 if x != 'Normal' else 0)  # Attack vs. Normal

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[173263      0]
 [     4 172759]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    173263
           1       1.00      1.00      1.00    172763

    accuracy                           1.00    346026
   macro avg       1.00      1.00      1.00    346026
weighted avg       1.00      1.00      1.00    346026



2. Multinomial Classification

In [52]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

In [71]:
# Encode target variable
X = balanced_df2.drop(columns=['attack'])
le = LabelEncoder()
y = le.fit_transform(balanced_df2['attack'])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=le.classes_))


[[3 0 0 0 0 0 0 0 0 0 0]
 [0 2 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 4 0 0 0 0 0 0 0]
 [0 0 0 0 2 0 0 0 0 0 0]
 [0 0 0 0 0 3 0 0 0 0 0]
 [0 0 0 0 0 0 3 0 0 0 0]
 [0 0 0 0 0 0 0 2 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 1 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 1]]
                precision    recall  f1-score   support

          Back       1.00      1.00      1.00         3
BufferOverflow       1.00      1.00      1.00         2
      FTPWrite       1.00      1.00      1.00         1
 GuessPassword       1.00      1.00      1.00         4
          NMap       1.00      1.00      1.00         2
       Neptune       1.00      1.00      1.00         3
        Normal       0.75      1.00      0.86         3
     PortSweep       1.00      1.00      1.00         2
       RootKit       1.00      1.00      1.00         1
         Satan       1.00      0.50      0.67         2
         Smurf       1.00      1.00      1.00         1

      accuracy                           0.96     

# Thank You