# Installing Required Packages
This cell installs essential Python libraries such as pandas, numpy, and scikit-learn which are required for data manipulation, machine learning modeling, and evaluation.

In [None]:
%pip install pandas
%pip install numpy
%pip install scikit-learn



# Importing Libraries
This cell imports the necessary libraries and modules that will be used throughout the notebook.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import OneClassSVM
from sklearn.metrics import classification_report, roc_auc_score

# Loading the Dataset
The dataset is loaded into a pandas DataFrame from a CSV file. Ensure that the file path is correct and accessible.

In [None]:
data = pd.read_csv('CIDDS-001-external-week1.csv')

# Converting Byte Values
This function converts byte values represented as strings (with K, M, G suffixes) to their numerical equivalent in bytes. The 'Bytes' column in the dataset is then updated with the converted values.

In [None]:
def convert_bytes(value):
    if 'M' in value:
        return float(value.replace('M', '')) * 10**6
    elif 'K' in value:
        return float(value.replace('K', '')) * 10**3
    elif 'G' in value:
        return float(value.replace('G', '')) * 10**9
    else:
        return float(value)
data['Bytes'] = data['Bytes'].apply(convert_bytes)

# Cleaning the Data
The cell removes specific columns from the dataset that are not needed for the analysis.

In [None]:
data = data.drop(columns =['attackID','attackType','attackDescription'])

# Handling High Cardinality Variables
Identifying variables with a high number of unique values and reducing their cardinality by keeping the top 10 values and setting the rest to 'Other'.

In [None]:
high_cardinality_vars = [col for col in data.select_dtypes(include=['object']).columns if data[col].nunique() > 100]
print("High Cardinality Variables:", high_cardinality_vars)

High Cardinality Variables: ['Date first seen', 'Src IP Addr', 'Dst IP Addr']


# One-Hot Encoding
Applying one-hot encoding to categorical variables in the dataset to convert them into a format that can be provided to machine learning models.

In [None]:
for col in high_cardinality_vars:
    top_categories = data[col].value_counts().index[:10]
    data[col] = np.where(data[col].isin(top_categories), data[col], 'Other')

# More Data Cleaning
Dropping additional columns that are not required for the analysis.

In [None]:
data_encoded = pd.get_dummies(data, columns=['Proto', 'Flags', 'class'])

# Displaying the Processed Data
This cell displays the first five rows of the processed dataset to provide a quick overview of the data.

In [None]:
data_encoded = data_encoded.drop(columns=['Date first seen','Src IP Addr','Dst IP Addr'])

# Checking for Missing Values
Identifying and printing any missing values in the dataset.

In [None]:
print(data_encoded.head())

    Duration  Src Pt   Dst Pt  Packets       Bytes  Flows  Tos  Proto_GRE    \
0  81412.697    8082  56978.0     3057   2100000.0      1    0            0   
1  81412.697   56978   8082.0     4748   2500000.0      1    0            0   
2  81504.787    8082  56979.0     8639   9100000.0      1    0            0   
3  81504.787   56979   8082.0    12024  10300000.0      1    0            0   
4  82100.692    8082  51649.0    11012  27200000.0      1    0            0   

   Proto_ICMP   Proto_TCP    ...  Flags_.A.RS.  Flags_.A.RSF  Flags_.AP...  \
0            0            1  ...             0             0             1   
1            0            1  ...             0             0             1   
2            0            1  ...             0             0             1   
3            0            1  ...             0             0             1   
4            0            1  ...             0             0             0   

   Flags_.AP.S.  Flags_.AP.SF  Flags_.APRS.  Flags_.APRS

# Converting Boolean to Integer
Converting all boolean columns in the dataset to integers to ensure consistency in data types.

In [None]:
missing_values = data_encoded.isnull().sum()
print("\nMissing Values :", missing_values,'\n')


Missing Values : Duration            0
Src Pt              0
Dst Pt              0
Packets             0
Bytes               0
Flows               0
Tos                 0
Proto_GRE           0
Proto_ICMP          0
Proto_TCP           0
Proto_UDP           0
Flags_  0x53        0
Flags_  0xc2        0
Flags_  0xd2        0
Flags_  0xd3        0
Flags_  0xd6        0
Flags_  0xd7        0
Flags_  0xda        0
Flags_  0xdb        0
Flags_  0xdf        0
Flags_......        0
Flags_....S.        0
Flags_...R..        0
Flags_...RS.        0
Flags_.A....        0
Flags_.A..S.        0
Flags_.A..SF        0
Flags_.A.R..        0
Flags_.A.R.F        0
Flags_.A.RS.        0
Flags_.A.RSF        0
Flags_.AP...        0
Flags_.AP.S.        0
Flags_.AP.SF        0
Flags_.APRS.        0
Flags_.APRSF        0
class_normal        0
class_suspicious    0
class_unknown       0
dtype: int64 



# Filtering the Data
Filtering the dataset to include only specific rows based on a condition.

In [None]:
for col in data_encoded.columns:
    if data_encoded[col].dtype == 'bool':
        data_encoded[col] = data_encoded[col].astype(int)

# Saving Processed Data to CSV
Saving the processed and encoded dataset to a CSV file.

In [None]:
data_filtered = data_encoded[data_encoded['class_unknown'] == 0]

# Separating Normal and Anomalous Data
Dividing the dataset into normal and anomalous data based on the 'class_normal' column.

In [None]:
data_encoded.to_csv('data_encoded.csv',index=False)

# One-Class SVM Model and Evaluation
In the following cells, a One-Class SVM model is set up, trained, and evaluated. The model is used for anomaly detection, identifying normal and anomalous patterns in the data.

In [None]:
data_normal = data_encoded[data_encoded['class_normal'] == 1]
data_anomalous = data_encoded[data_encoded['class_normal'] == 0]

In [None]:
X = data_encoded.drop(columns=['class_normal'])
y = data_encoded['class_normal']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [None]:
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
X_train_normal = X_train_scaled[y_train == 0]
X_test_normal = X_test_scaled[y_test == 0]
X_test_anomalies = X_test_scaled[y_test == 1]

In [None]:
oc_svm = OneClassSVM(kernel='linear', nu=0.01)
#oc_svm = OneClassSVM(kernel='rbf', gamma='auto')

In [None]:
oc_svm.fit(X_train_normal)

In [None]:
y_pred_test_normal = oc_svm.predict(X_test_normal)
y_pred_test_normal = np.where(y_pred_test_normal == 1, 0, 1)

In [None]:
y_pred_test_anomalies = oc_svm.predict(X_test_anomalies)
y_pred_test_anomalies = np.where(y_pred_test_anomalies == 1, 0, 1)

In [None]:
y_pred_test_combined = np.concatenate([y_pred_test_normal, y_pred_test_anomalies])
y_test_combined = np.concatenate([np.zeros_like(y_pred_test_normal), np.ones_like(y_pred_test_anomalies)])

In [None]:
roc_auc_score_oc_svm = roc_auc_score(y_test_combined, y_pred_test_combined)
class_report_oc_svm = classification_report(y_test_combined, y_pred_test_combined)

In [None]:
print("Train-Test Split Validation")
print("AUC-ROC Score:", roc_auc_score_oc_svm)
print("Classification Report:\n", class_report_oc_svm)
print("\n" + "-"*50 + "\n")

Train-Test Split Validation
AUC-ROC Score: 0.9891264656956221
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99     24647
           1       0.95      1.00      0.97      9921

    accuracy                           0.98     34568
   macro avg       0.97      0.99      0.98     34568
weighted avg       0.99      0.98      0.98     34568


--------------------------------------------------



In [None]:
strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
roc_auc_scores = []
classification_reports = []

In [None]:
for fold, (train_index, test_index) in enumerate(strat_k_fold.split(data_anomalous, np.zeros(data_anomalous.shape[0])), start=1):
    anomalies_train, anomalies_test = data_anomalous.iloc[train_index], data_anomalous.iloc[test_index]
    X_test = pd.concat([data_normal, anomalies_test]).drop('class_normal', axis=1)
    y_test = pd.concat([pd.Series(np.zeros(data_normal.shape[0])), pd.Series(np.ones(anomalies_test.shape[0]))])

    scaler = MinMaxScaler()
    X_test_scaled = scaler.fit_transform(X_test)
    X_train_normal = data_normal.drop('class_normal', axis=1)
    X_train_normal_scaled = scaler.transform(X_train_normal)
    oc_svm.fit(X_train_normal_scaled)

    y_pred = oc_svm.predict(X_test_scaled)
    y_pred = np.where(y_pred == 1, 0, 1)

    roc_auc_score_oc_svm = roc_auc_score(y_test, y_pred)
    class_report_oc_svm = classification_report(y_test, y_pred)

    roc_auc_scores.append(roc_auc_score_oc_svm)
    classification_reports.append(class_report_oc_svm)

    print(f"Stratified K-Fold Cross-Validation: Fold {fold}")
    print("AUC-ROC Score:", roc_auc_score_oc_svm)
    print("Classification Report:\n", class_report_oc_svm)
    print("\n" + "-"*50 + "\n")


Stratified K-Fold Cross-Validation: Fold 1
AUC-ROC Score: 0.7770735374709704
Classification Report:
               precision    recall  f1-score   support

         0.0       0.82      0.99      0.90     49606
         1.0       0.97      0.56      0.71     24647

    accuracy                           0.85     74253
   macro avg       0.89      0.78      0.80     74253
weighted avg       0.87      0.85      0.84     74253


--------------------------------------------------

Stratified K-Fold Cross-Validation: Fold 2
AUC-ROC Score: 0.7789804632631563
Classification Report:
               precision    recall  f1-score   support

         0.0       0.82      0.99      0.90     49606
         1.0       0.97      0.57      0.72     24647

    accuracy                           0.85     74253
   macro avg       0.89      0.78      0.81     74253
weighted avg       0.87      0.85      0.84     74253


--------------------------------------------------

Stratified K-Fold Cross-Validation: Fo

In [None]:
average_roc_auc_score = np.mean(roc_auc_scores)
print("Average AUC-ROC Score Across All Folds:", average_roc_auc_score)

Average AUC-ROC Score Across All Folds: 0.7775794560144307
