In [None]:
#Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the network records from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer="network.csv")

## Data Exploration

In [None]:
# Size of the dataset as (number of rows, number of columns)
df.shape

(211043, 44)

In [None]:
# First 5 rows of the dataset (head() returns 5 rows by default)
df.head()

Unnamed: 0,src_ip,src_port,dst_ip,dst_port,proto,service,duration,src_bytes,dst_bytes,conn_state,...,http_response_body_len,http_status_code,http_user_agent,http_orig_mime_types,http_resp_mime_types,weird_name,weird_addl,weird_notice,label,type
0,192.168.1.37,4444,192.168.1.193,49178,tcp,-,290.371539,101568,2592,OTH,...,0,0,-,-,-,-,-,-,1,backdoor
1,192.168.1.193,49180,192.168.1.37,8080,tcp,-,0.000102,0,0,REJ,...,0,0,-,-,-,-,-,-,1,backdoor
2,192.168.1.193,49180,192.168.1.37,8080,tcp,-,0.000148,0,0,REJ,...,0,0,-,-,-,-,-,-,1,backdoor
3,192.168.1.193,49180,192.168.1.37,8080,tcp,-,0.000113,0,0,REJ,...,0,0,-,-,-,-,-,-,1,backdoor
4,192.168.1.193,49180,192.168.1.37,8080,tcp,-,0.00013,0,0,REJ,...,0,0,-,-,-,-,-,-,1,backdoor


In [None]:
# Column names, non-null counts and dtypes of the raw dataset
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211043 entries, 0 to 211042
Data columns (total 44 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   src_ip                  211043 non-null  object 
 1   src_port                211043 non-null  int64  
 2   dst_ip                  211043 non-null  object 
 3   dst_port                211043 non-null  int64  
 4   proto                   211043 non-null  object 
 5   service                 211043 non-null  object 
 6   duration                211043 non-null  float64
 7   src_bytes               211043 non-null  int64  
 8   dst_bytes               211043 non-null  int64  
 9   conn_state              211043 non-null  object 
 10  missed_bytes            211043 non-null  int64  
 11  src_pkts                211043 non-null  int64  
 12  src_ip_bytes            211043 non-null  int64  
 13  dst_pkts                211043 non-null  int64  
 14  dst_ip_bytes        

In [None]:
# Number of records per distinct value of the `proto` column
df["proto"].value_counts()

Unnamed: 0_level_0,count
proto,Unnamed: 1_level_1
tcp,168747
udp,42015
icmp,281


In [None]:
# Number of records per value of `type` (the attack category)
df["type"].value_counts()

Unnamed: 0_level_0,count
type,Unnamed: 1_level_1
normal,50000
backdoor,20000
ddos,20000
dos,20000
injection,20000
password,20000
scanning,20000
ransomware,20000
xss,20000
mitm,1043


In [None]:
# Attack vs. benign record counts (label 1 = attack, 0 = benign)
df["label"].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,161043
0,50000


## Data preprocessing

In [None]:
# Checking missing values: number of NaN entries in each column
df.isna().sum()

Unnamed: 0,0
src_ip,0
src_port,0
dst_ip,0
dst_port,0
proto,0
service,0
duration,0
src_bytes,0
dst_bytes,0
conn_state,0


In [None]:
# isnull() is an alias of isna(), so this repeats the per-column missing-value count
df.isnull().sum()

Unnamed: 0,0
src_ip,0
src_port,0
dst_ip,0
dst_port,0
proto,0
service,0
duration,0
src_bytes,0
dst_bytes,0
conn_state,0


In [None]:
# Remove the non-numeric and mostly empty columns, in place
df.drop(
    labels=[
        # endpoint addresses
        "src_ip", "dst_ip",
        # DNS fields
        "dns_query", "dns_AA", "dns_RD", "dns_RA", "dns_rejected",
        # SSL fields
        "ssl_version", "ssl_cipher", "ssl_resumed", "ssl_established",
        "ssl_subject", "ssl_issuer",
        # HTTP fields
        "http_trans_depth", "http_method", "http_uri", "http_version",
        "http_request_body_len", "http_response_body_len", "http_status_code",
        "http_user_agent", "http_orig_mime_types", "http_resp_mime_types",
        # "weird" fields
        "weird_name", "weird_addl", "weird_notice",
    ],
    axis="columns",
    inplace=True,
)

In [None]:
# Confirm which columns remain after the drop, and their dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211043 entries, 0 to 211042
Data columns (total 18 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   src_port      211043 non-null  int64  
 1   dst_port      211043 non-null  int64  
 2   proto         211043 non-null  object 
 3   service       211043 non-null  object 
 4   duration      211043 non-null  float64
 5   src_bytes     211043 non-null  int64  
 6   dst_bytes     211043 non-null  int64  
 7   conn_state    211043 non-null  object 
 8   missed_bytes  211043 non-null  int64  
 9   src_pkts      211043 non-null  int64  
 10  src_ip_bytes  211043 non-null  int64  
 11  dst_pkts      211043 non-null  int64  
 12  dst_ip_bytes  211043 non-null  int64  
 13  dns_qclass    211043 non-null  int64  
 14  dns_qtype     211043 non-null  int64  
 15  dns_rcode     211043 non-null  int64  
 16  label         211043 non-null  int64  
 17  type          211043 non-null  object 
dtypes: f

In [None]:
# Encoding categorical columns using LabelEncoder

from sklearn.preprocessing import LabelEncoder

# Text columns that are converted to integer codes
categorical_columns = ["proto", "service", "conn_state"]

# One fitted encoder per column. A single shared encoder is re-fitted on every
# iteration, so only the mapping of the last column would survive and the
# integer codes of the other columns could no longer be decoded.
encoders = {}

for col in categorical_columns:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    # encoders[col].classes_ / .inverse_transform() recover the original labels
    encoders[col] = encoder



In [None]:
# Check encoding: proto, service and conn_state should now hold integer codes
df.head()


Unnamed: 0,src_port,dst_port,proto,service,duration,src_bytes,dst_bytes,conn_state,missed_bytes,src_pkts,src_ip_bytes,dst_pkts,dst_ip_bytes,dns_qclass,dns_qtype,dns_rcode,label,type
0,4444,49178,1,0,290.371539,101568,2592,0,0,108,108064,31,3832,0,0,0,1,backdoor
1,49180,8080,1,0,0.000102,0,0,1,0,1,52,1,40,0,0,0,1,backdoor
2,49180,8080,1,0,0.000148,0,0,1,0,1,52,1,40,0,0,0,1,backdoor
3,49180,8080,1,0,0.000113,0,0,1,0,1,48,1,40,0,0,0,1,backdoor
4,49180,8080,1,0,0.00013,0,0,1,0,1,52,1,40,0,0,0,1,backdoor


In [None]:
# Replace every NaN in the frame with 0, in place
df.fillna(value=0, inplace=True)


In [None]:
# Standardize the feature columns with StandardScaler

from sklearn.preprocessing import StandardScaler

# Every column except the two targets; this also covers the label-encoded
# proto / service / conn_state columns
numeric_cols = df.columns.difference(["label", "type"])

# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split further down, so test rows contribute to the mean/std.
scaler = StandardScaler().fit(df[numeric_cols])
df[numeric_cols] = scaler.transform(df[numeric_cols])

In [None]:
# Inspect the first rows after standardization
df.head()

Unnamed: 0,src_port,dst_port,proto,service,duration,src_bytes,dst_bytes,conn_state,missed_bytes,src_pkts,src_ip_bytes,dst_pkts,dst_ip_bytes,dns_qclass,dns_qtype,dns_rcode,label,type
0,-1.771488,4.482402,-0.492369,-0.673165,0.501064,-0.009157,-0.014214,-1.445311,-0.006544,1.072197,4.81177,0.082107,0.011817,-0.083666,-0.151734,-0.207061,1,backdoor
1,0.545572,0.449865,-0.492369,-0.673165,-0.01365,-0.015099,-0.014358,-1.194698,-0.006544,-0.093652,-0.032474,-0.008608,-0.008122,-0.083666,-0.151734,-0.207061,1,backdoor
2,0.545572,0.449865,-0.492369,-0.673165,-0.01365,-0.015099,-0.014358,-1.194698,-0.006544,-0.093652,-0.032474,-0.008608,-0.008122,-0.083666,-0.151734,-0.207061,1,backdoor
3,0.545572,0.449865,-0.492369,-0.673165,-0.01365,-0.015099,-0.014358,-1.194698,-0.006544,-0.093652,-0.032654,-0.008608,-0.008122,-0.083666,-0.151734,-0.207061,1,backdoor
4,0.545572,0.449865,-0.492369,-0.673165,-0.01365,-0.015099,-0.014358,-1.194698,-0.006544,-0.093652,-0.032474,-0.008608,-0.008122,-0.083666,-0.151734,-0.207061,1,backdoor


In [None]:
# Shape after preprocessing as (rows, columns)
df.shape

(211043, 18)

## Feature Selection

Calculating the importance of the features to keep the most important ones.

In [None]:

# Binary target: 1 = attack, 0 = benign
y = df["label"]
# Feature matrix: every column except the two targets
X = df.drop(["label", "type"], axis=1)

In [None]:
from sklearn.feature_selection import mutual_info_classif

# Mutual information between each feature and the binary label.
# The estimator adds small random noise to continuous features, so the seed is
# fixed to make the ranking reproducible from one run to the next.
feature_scores = mutual_info_classif(X, y, random_state=42)

# Rank the features from most to least informative
feature_importance = (
    pd.DataFrame({"Feature": X.columns, "Importance": feature_scores})
    .sort_values(by="Importance", ascending=False)
)

# Display the ranking (18 is an upper bound; there are fewer features than that)
print(feature_importance.head(18))


         Feature  Importance
10  src_ip_bytes    0.422178
1       dst_port    0.421694
0       src_port    0.289152
12  dst_ip_bytes    0.251440
7     conn_state    0.210525
9       src_pkts    0.185828
5      src_bytes    0.183819
2          proto    0.172747
3        service    0.168637
4       duration    0.150177
14     dns_qtype    0.132363
6      dst_bytes    0.127591
11      dst_pkts    0.118415
13    dns_qclass    0.110705
15     dns_rcode    0.006645
8   missed_bytes    0.004464


In [None]:
# Drop the features whose mutual-information score is negligible.
# Selecting by threshold keeps the dropped set in line with the ranking stored
# in `feature_importance`: with the scores shown for that ranking this removes
# dns_rcode and missed_bytes (both below 0.01) and keeps every feature that
# scores around 0.11 or higher.
MIN_IMPORTANCE = 0.01
low_importance_features = feature_importance.loc[
    feature_importance["Importance"] < MIN_IMPORTANCE, "Feature"
].tolist()

X = df.drop(columns=["label", "type", *low_importance_features])  # Features



## Training SVM model

In [None]:
# Splitting the data into training and test data
from sklearn.model_selection import train_test_split

# stratify=y keeps the attack/benign ratio (about 76% / 24%) the same in both
# subsets, so the test metrics are computed on a representative class mix.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Fit an RBF-kernel SVM on the training split (fit() returns the fitted estimator)
svm_model = SVC(C=1, gamma="scale", kernel="rbf").fit(X_train, y_train)

# Predicted labels for the held-out test split
y_pred = svm_model.predict(X_test)

# Overall accuracy, then the per-class report
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


'from sklearn.svm import SVC\nfrom sklearn.metrics import accuracy_score, classification_report\n\n\n# Train model\nsvm_model = SVC(kernel="rbf", C=1, gamma="scale")\nsvm_model.fit(X_train, y_train)\n\n# Predict\ny_pred = svm_model.predict(X_test)\n\n# Evaluate\nprint("Accuracy:", accuracy_score(y_test, y_pred))\nprint("Classification Report:\n", classification_report(y_test, y_pred))'

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import numpy as np
from matplotlib.colors import LinearSegmentedColormap

# Two-colour gradient used to shade the cells
colors = ['lightblue', 'lightcoral']
cmap = LinearSegmentedColormap.from_list('custom_cmap', colors)

# Confusion matrix of the test predictions (rows: actual, columns: predicted)
cm = confusion_matrix(y_test, y_pred)

# Annotated heatmap drawn on an explicit Axes object
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap=cmap,
    cbar=False,
    annot_kws={"size": 12},
    linewidths=0.5,
    linecolor='black',
    xticklabels=["Normal", "Attack"],
    yticklabels=["Normal", "Attack"],
    ax=ax,
)
ax.set_xlabel("Predicted", fontsize=14)
ax.set_ylabel("Actual", fontsize=14)
ax.set_title("Confusion Matrix", fontsize=16)
ax.tick_params(axis="both", labelsize=12)
plt.show()

'import seaborn as sns\nimport matplotlib.pyplot as plt\nfrom sklearn.metrics import confusion_matrix\nimport numpy as np\n\n# Compute confusion matrix\ncm = confusion_matrix(y_test, y_pred)\n\n# Defining custom colormap\nfrom matplotlib.colors import LinearSegmentedColormap\n\ncolors = [\'lightblue\', \'lightcoral\'] \ncmap = LinearSegmentedColormap.from_list(\'custom_cmap\', colors)\n\n\n# Plot confusion matrix with custom colormap and improved aesthetics\nplt.figure(figsize=(6, 4))\nsns.heatmap(cm, annot=True, fmt="d", cmap=cmap,  # Use custom colormap\n            xticklabels=["Normal", "Attack"], yticklabels=["Normal", "Attack"],\n            annot_kws={"size": 12},\n            linewidths=0.5, linecolor=\'black\',\n            cbar=False)\nplt.xlabel("Predicted", fontsize=14)\nplt.ylabel("Actual", fontsize=14)\nplt.title("Confusion Matrix", fontsize=16)\nplt.xticks(fontsize=12)\nplt.yticks(fontsize=12)\nplt.show()'

## Data balancing and retraining the model

In [None]:
# Balance the training set with SMOTE oversampling
from imblearn.over_sampling import SMOTE
from collections import Counter

smote = SMOTE(random_state=42)

# Class counts of the training labels before resampling
print("Distribution des classes avant équilibrage :", Counter(y_train))

# Resample the training split only; the test split is not touched here
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Class counts of the training labels after resampling
print("Distribution des classes après équilibrage :", Counter(y_train_smote))


'#Balancing data using the SMOTE method \nfrom imblearn.over_sampling import SMOTE\nfrom collections import Counter\n\n# Initial distribution of the classes\nprint("Distribution des classes avant équilibrage :", Counter(y_train))\n\n# oversampling \nsmote = SMOTE(random_state=42)\nX_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)\n\n# Displaying the distribution after balancing\nprint("Distribution des classes après équilibrage :", Counter(y_train_smote))'

In [None]:
# Refit the same RBF SVM, this time on the SMOTE-balanced training data
svm_model = SVC(C=1, gamma="scale", kernel="rbf").fit(X_train_smote, y_train_smote)

# Predictions for the test split
y_pred = svm_model.predict(X_test)

# Overall accuracy, then the per-class report
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

'# Train model after class balancing\nsvm_model = SVC(kernel="rbf", C=1, gamma="scale")\nsvm_model.fit(X_train_smote, y_train_smote)\n\n# Predict\ny_pred = svm_model.predict(X_test)\n\n# Evaluate\nprint("Accuracy:", accuracy_score(y_test, y_pred))\nprint("Classification Report:\n", classification_report(y_test, y_pred))'

## Optimizing the model hyperparameters using Randomized Search

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from scipy.stats import uniform, loguniform

# Candidate gamma values: the two built-in heuristics plus 10 log-uniform draws.
# The draws are seeded so the candidate set is identical on every run.
gamma_candidates = ['scale', 'auto'] + list(
    loguniform(1e-3, 1e-1).rvs(10, random_state=42)
)

# Define parameter distributions.
# gamma is ignored by the linear kernel, so it is only searched together with
# the RBF kernel; otherwise many sampled settings would be duplicates.
param_dist = [
    {'kernel': ['rbf'], 'C': [1, 1.5, 2], 'gamma': gamma_candidates},
    {'kernel': ['linear'], 'C': [1, 1.5, 2]},
]

# Random search
random_search = RandomizedSearchCV(
    SVC(),
    param_distributions=param_dist,
    n_iter=20,  # Number of parameter settings sampled
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_train, y_train)

# Best parameters
print("Best Hyperparameters:", random_search.best_params_)

# best_estimator_ is already refitted on the whole training split with the best
# parameters (refit=True is the default), so it can predict directly
best_svm = random_search.best_estimator_
y_pred_best = best_svm.predict(X_test)

# Evaluate improved model
print("Optimized SVM Accuracy:", accuracy_score(y_test, y_pred_best))

' from sklearn.model_selection import RandomizedSearchCV\nfrom sklearn.svm import SVC\nfrom sklearn.metrics import accuracy_score\nfrom scipy.stats import uniform, loguniform\n\n# Define parameter distributions\nparam_dist = {\n    \'C\': [1 , 1.5 , 2],\n    \'gamma\': [\'scale\', \'auto\'] + list(loguniform(1e-3, 1e-1).rvs(10)),  # Mix of \'scale\', \'auto\', and random values\n    \'kernel\': [\'rbf\', \'linear\']\n}\n\n# Random search\nrandom_search = RandomizedSearchCV(\n    SVC(),\n    param_distributions=param_dist,\n    n_iter=20,  # Number of parameter settings sampled\n    cv=5,\n    scoring=\'accuracy\',\n    n_jobs=-1,\n    random_state=42\n)\n\nrandom_search.fit(X_train, y_train)\n\n# Best parameters\nprint("Best Hyperparameters:", random_search.best_params_)\n\n# Train SVM with best parameters\nbest_svm = random_search.best_estimator_\ny_pred_best = best_svm.predict(X_test)\n\n# Evaluate improved model\nprint("Optimized SVM Accuracy:", accuracy_score(y_test, y_pred_best)) 

# Multi-class SVM (detection of the attack type)

In [None]:
# Class distribution of the multi-class target: a multi-class SVM can detect
# specific attack types, but the classes need balancing first
df["type"].value_counts()

Unnamed: 0_level_0,count
type,Unnamed: 1_level_1
normal,50000
backdoor,20000
ddos,20000
dos,20000
injection,20000
password,20000
scanning,20000
ransomware,20000
xss,20000
mitm,1043



**Multi-Class SVM**

Goal: classify data into more than two categories (e.g., different types of cyberattacks).


**Algorithm:**

* One-vs-One (OvO): Trains an SVM for each pair of classes and chooses the most voted class.

* One-vs-Rest (OvR): Trains one SVM per class, separating it from the rest.



In [None]:
# Separating the features and the labels.
# "label" is the binary attack flag (1 = attack, 0 = benign); it directly
# reveals whether a record is an attack, so leaving it among the features
# would leak the target into the model.
X = df.drop(columns=["label", "type"])  # Features
y = df["type"]  # Target


In [None]:
from imblearn.over_sampling import SMOTE

# Encoding the labels (attack-type names -> integers)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split BEFORE scaling and oversampling so that the test set contains only
# real records the pipeline has never seen; stratify keeps every class
# (including the rare mitm class) present in both subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_scaled = scaler.transform(X)  # full scaled matrix, kept under its original name

# Managing class imbalance using SMOTE, on the training split only:
# synthetic samples must never end up in the evaluation data
smote = SMOTE(sampling_strategy="auto", random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
X_train, y_train = X_resampled, y_resampled

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initializing the multi-class model.
# Multi-class strategy: an SVM is a binary classifier, and SVC handles several
# classes by training one classifier per pair of classes (one-vs-one); this is
# always the strategy used internally for training.
# decision_function_shape="ovo" keeps decision_function() in that pairwise
# shape, (n_samples, n_classes * (n_classes - 1) / 2), instead of the
# one-vs-rest shape (n_samples, n_classes).
svm_model = SVC(kernel="rbf", C=1, gamma="scale", decision_function_shape="ovo")

# Training the model
svm_model.fit(X_train, y_train)

# Predicting
y_pred = svm_model.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Classification report, with the encoded classes shown under their original names
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
