In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
# Show every column when a DataFrame is printed instead of eliding the middle ones.
# Uses the public `pd.set_option`; `pd.pandas` is not a documented attribute.
pd.set_option('display.max_columns', None)

# Load the training data; the path is resolved relative to the working directory.
data_path = 'train.csv'
file_data = pd.read_csv(data_path)
print(file_data.head())

In [None]:
# Bar chart of how often each value of the target column occurs.
train_data = file_data['type_of_attack']
print(train_data)

# Tally the labels once, then split the tally into parallel label / count lists.
counter1 = Counter(train_data)
values1 = [label for label in counter1]
frequencies1 = [counter1[label] for label in values1]

plt.bar(values1, frequencies1)
plt.title('Frequency of Occurrences of Values')
plt.xlabel('Values')
plt.ylabel('Frequency')
plt.show()

In [99]:
# Columns that contain at least one missing value.
# The comparison is `> 0` (not `> 1`) so a column with a single missing entry
# is still reported.
null_counts = file_data.isnull().sum()
features_na = null_counts[null_counts > 0].index.tolist()
print(features_na)

[]


In [None]:
# Remove the 'Id' column (presumably a row identifier with no predictive value).
# `errors='ignore'` keeps the cell idempotent: `file_data` is overwritten here,
# so a second run would otherwise raise KeyError because 'Id' is already gone.
print(file_data.shape)
file_data = file_data.drop(columns='Id', errors='ignore')
print(file_data.shape)

In [None]:
# NUMERICAL FEATURES
# Every column whose dtype is not the generic object dtype ('O').
features_num = [name for name, dtype in file_data.dtypes.items() if dtype != 'O']
print(features_num)
print(len(features_num))

In [None]:
# CATEGORICAL FEATURES
# Every column stored with the generic object dtype ('O').
features_cat = [name for name, dtype in file_data.dtypes.items() if dtype == 'O']
print(features_cat)
print(len(features_cat))

In [None]:
# DISCRETE FEATURES
# Numerical columns with fewer than 25 distinct values (a missing value counts as one).
features_dis = [
    name for name in features_num
    if file_data[name].nunique(dropna=False) < 25
]
print(features_dis)
print(len(features_dis))

In [None]:
# CONTINUOUS FEATURES
# Numerical columns with 25 or more distinct values (a missing value counts as one).
features_cont = [
    name for name in features_num
    if file_data[name].nunique(dropna=False) >= 25
]
print(features_cont)
print(len(features_cont))

In [None]:
# Print "<feature name> <number of distinct values>" for each discrete feature.
for feature in features_dis:
    n_levels = file_data[feature].nunique(dropna=False)
    print(feature, n_levels)

In [None]:
%matplotlib inline
# For every value of every categorical feature, draw a pie chart of how the
# rows carrying that value are split across the 'type_of_attack' classes.
for feature in features_cat:
    data = file_data.copy()
    column = data[feature]
    for category in column.unique():
        in_category = column == category
        attack_counts = data.loc[in_category, 'type_of_attack'].value_counts()

        plt.figure(figsize=(6, 6))
        attack_counts.plot.pie(
            colors=plt.cm.Paired.colors,
            startangle=90,
            autopct='%1.1f%%',
        )
        plt.title(f'Distribution of type_of_attack for {category} in {feature}')
        plt.ylabel('')  # suppress the default y-label pandas puts on pie charts
        plt.show()


In [None]:
# 25-bin histogram of each continuous feature.
for feature in features_cont:
    data = file_data.copy()
    series = data[feature]
    series.hist(bins=25)
    plt.title(feature)
    plt.xlabel(feature)
    plt.ylabel('Frequency')
    plt.show()

In [None]:
# Scatter each continuous feature (x-axis) against the attack label (y-axis).
attack_labels = file_data['type_of_attack']
for feature in features_cont:
    plt.scatter(file_data[feature], attack_labels)
    plt.title(feature)
    plt.xlabel(feature)
    plt.ylabel('type_of_attack')
    plt.show()
    

In [None]:
# ONE-HOT ENCODING

# Encode every categorical column except the target. `drop_first=True` omits
# the first level of each feature, so k categories become k-1 indicator columns.
data_onehot = file_data.copy()
encodable = [feature for feature in features_cat if feature != 'type_of_attack']
for feature in encodable:
    data_onehot = pd.get_dummies(
        data_onehot,
        columns=[feature],
        prefix=feature,
        drop_first=True,
    )

print(data_onehot.shape)

In [None]:
# DROPPING FEATURES WITH VARIANCE LESS THAN THE THRESHOLD

from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split

# Split the encoded frame into the target vector and the predictor matrix.
y = data_onehot['type_of_attack'].to_numpy()
X = data_onehot.drop('type_of_attack', axis = 1)

var_thres = VarianceThreshold(threshold=1e-5)
var_thres.fit(X)

# get_support() is a boolean mask over X's columns marking the ones that are
# kept; inverting it yields the (near-)constant columns to discard.
kept_mask = var_thres.get_support()
const_columns = X.columns[~kept_mask].tolist()
print(const_columns)

print(X.shape)
X = X.drop(columns=const_columns)
print(X.shape)

In [None]:
# DROPPING FEATURES HAVING MUTUAL_INFO <= 1e-5

from sklearn.feature_selection import mutual_info_classif

# The estimator adds small random noise to continuous features, so the scores
# differ from run to run unless the seed is fixed. The next cell thresholds the
# scores at 1e-5, which means an unseeded call can select a different feature
# set on every execution.
mutual_info = mutual_info_classif(X, y, random_state=42)

# Label each score with its feature name.
mutual_info = pd.Series(mutual_info, index=X.columns)

# Features ranked from most to least informative.
mutual_info.sort_values(ascending=False).plot.bar(figsize = (20, 8))


In [None]:
# Features whose estimated mutual information with the target is at most 1e-5
# are removed from the predictor matrix.
low_mi_mask = mutual_info <= 1e-5
zero_mi_features = mutual_info.loc[low_mi_mask].index
print(X.shape)
X = X.drop(columns=zero_mi_features)
print(X.shape)

In [None]:
# DELIVERABLES: mutual_info, const_columns, features_cat (for ONE-HOT ENCODING)

In [None]:
# BASELINE MODEL, LOGISTIC REGRESSION

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# 80/20 hold-out split; these four variables are reused by the later model cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features for this model only (statistics come from the
# training split, so nothing leaks from the test split). The solver is
# sensitive to feature scale, and the SVM cell standardises the same data.
lr_scaler = StandardScaler()
X_train_lr = lr_scaler.fit_transform(X_train)
X_test_lr = lr_scaler.transform(X_test)

# max_iter is raised above the library default of 100 to give the solver room
# to converge.
model1 = LogisticRegression(random_state=42, max_iter=1000)
model1.fit(X_train_lr, y_train)
y_pred1 = model1.predict(X_test_lr)
print(f"Accuracy: {accuracy_score(y_test, y_pred1)}")
print(classification_report(y_test, y_pred1))

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Support-vector classifier with the solver capped at 1000 iterations.
model2 = SVC(max_iter=1000, random_state=42).fit(X_train_scaled, y_train)

y_pred2 = model2.predict(X_test_scaled)
svm_accuracy = accuracy_score(y_test, y_pred2)
print(f"Accuracy: {svm_accuracy}")
print(classification_report(y_test, y_pred2))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seeded like the other models in this notebook so the reported scores are
# reproducible across runs.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

def plot_confusion_matrix(y_test, y_pred, class_names):
    """Draw the confusion matrix of `y_pred` against `y_test`.

    Parameters
    ----------
    y_test : 1-D array-like
        True labels.
    y_pred : 1-D array-like
        Predicted labels.
    class_names : list
        Tick labels for the matrix axes.

    Notes
    -----
    Without an explicit `labels` argument, `confusion_matrix` orders its rows
    and columns by *sorted* label value, which need not match the order of
    `class_names`. When every observed label is itself one of `class_names`,
    the matrix is therefore built in `class_names` order so each tick label
    sits on the right row/column. Otherwise the labels cannot be mapped to the
    names automatically and the default sorted order is used; the caller must
    then supply `class_names` in sorted-label order.
    """
    observed = set(np.ravel(y_test).tolist()) | set(np.ravel(y_pred).tolist())
    if observed and observed <= set(class_names):
        cm = confusion_matrix(y_test, y_pred, labels=class_names)
    else:
        cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)

    # Plotting the confusion matrix
    fig, ax = plt.subplots(figsize=(8, 8))
    disp.plot(cmap='viridis', ax=ax, xticks_rotation='vertical')
    plt.title("Confusion Matrix")
    plt.show()

# NOTE(review): this order is not alphabetical; confirm how these names relate
# to the values stored in 'type_of_attack'.
class_names = ['ipsweep probe', 'back dos', 'satan probe', 'portsweep probe', 'normal']

plot_confusion_matrix(y_test, y_pred, class_names)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# KNN classifies by distance, so it is fitted on the standardised features
# produced in the SVM cell (X_train_scaled / X_test_scaled) rather than on the
# raw ones, where large-valued columns would dominate the distance.
knn_model = KNeighborsClassifier(n_neighbors=5)  # You can tune n_neighbors
knn_model.fit(X_train_scaled, y_train)
y_pred3 = knn_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred3)
print(f"KNN Test Accuracy: {accuracy:.4f}")
print("\nClassification Report:")

# classification_report pairs target_names with labels in *sorted* order unless
# `labels` is given. If the fitted classes are themselves the entries of
# class_names, pass that order explicitly so each row gets the right name;
# otherwise fall back to the default (labels=None).
label_order = class_names if set(knn_model.classes_.tolist()) <= set(class_names) else None
print(classification_report(y_test, y_pred3, labels=label_order, target_names=class_names))
plot_confusion_matrix(y_test, y_pred3, class_names)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# Encode the labels as integers and then as one-hot rows. The results go into
# NEW variables: overwriting y_train / y_test would make this cell fail on a
# second run and would break every earlier classifier cell if re-executed.
label_encoder = LabelEncoder()
y_train_int = label_encoder.fit_transform(y_train)
y_test_int = label_encoder.transform(y_test)
n_classes = len(np.unique(y_train_int))
y_train_cat = to_categorical(y_train_int, num_classes=n_classes)
y_test_cat = to_categorical(y_test_int, num_classes=n_classes)

# Train on the standardised features from the SVM cell, cast to a plain
# float32 array, instead of the unscaled X_train.
X_train_nn = np.asarray(X_train_scaled, dtype='float32')
X_test_nn = np.asarray(X_test_scaled, dtype='float32')

print("X_train shape:", X_train_nn.shape)
print("y_train shape:", y_train_cat.shape)
print("X_test shape:", X_test_nn.shape)
print("y_test shape:", y_test_cat.shape)
input_dim = X_train_nn.shape[1]

# Three hidden blocks (Dense -> BatchNorm -> Dropout) of shrinking width,
# followed by a softmax layer with one unit per class.
model = Sequential([
    Dense(128, activation='relu', input_dim=input_dim),
    BatchNormalization(),
    Dropout(0.3),

    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),

    Dense(32, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),

    Dense(n_classes, activation='softmax')
])
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Stop once validation loss has not improved for 5 epochs and roll back to the
# best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)
history = model.fit(
    X_train_nn,
    y_train_cat,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=1
)

# Convert predicted probabilities and one-hot targets back to class indices.
yp = model.predict(X_test_nn)
yp_classes = np.argmax(yp, axis=1)
y_test_classes = np.argmax(y_test_cat, axis=1)
print("\nModel Evaluation:")
print(f"Accuracy: {accuracy_score(y_test_classes, yp_classes):.4f}")
print("\nClassification Report:")
print(classification_report(y_test_classes, yp_classes))

# Learning curves: loss on the left, accuracy on the right.
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.tight_layout()
plt.show()
