In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import make_scorer

# Load the dataset
df = pd.read_csv('cybersecurity_attacks_transformed_cleaned.csv')

In [2]:


# Example dataset (use your own dataset)
data = pd.DataFrame({
    'Protocol': ['ICMP', 'TCP', 'UDP', 'ICMP', 'TCP', 'UDP'],
    'Traffic Type': ['HTTP', 'DNS', 'FTP', 'HTTP', 'DNS', 'FTP'],
    'Attack Type': ['DDoS', 'Intrusion', 'Malware', 'DDoS', 'Intrusion', 'Malware'],
    'Action Taken': ['Logged', 'Blocked', 'Ignored', 'Logged', 'Blocked', 'Ignored'],
    'Malware Indicators': ['IoC Detected', np.nan, 'IoC Detected', np.nan, 'IoC Detected', 'IoC Detected'],
    'Alerts/Warnings': ['Alert Triggered', np.nan, 'Alert Triggered', np.nan, 'Alert Triggered', 'Alert Triggered']
})

# Preprocessing steps (encoding and scaling)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Label encode categorical features
label_encoder = LabelEncoder()
data['Protocol_encoded'] = label_encoder.fit_transform(data['Protocol'])
data['Traffic Type_encoded'] = label_encoder.fit_transform(data['Traffic Type'])
data['Attack Type_encoded'] = label_encoder.fit_transform(data['Attack Type'])
data['Action Taken_encoded'] = label_encoder.fit_transform(data['Action Taken'])

# Handling missing values for one-hot encoding
data['Malware Indicators'] = data['Malware Indicators'].fillna('Unknown')
data['Alerts/Warnings'] = data['Alerts/Warnings'].fillna('Unknown')

# One-hot encode categorical features
data = pd.get_dummies(data, columns=['Malware Indicators', 'Alerts/Warnings'])

# Features and target
X = data.drop(columns=['Protocol', 'Traffic Type', 'Attack Type', 'Action Taken'])  # drop non-numeric features
y = np.random.choice([0, 1], size=len(X))  # Dummy binary target


In [3]:
import pandas as pd

# Assuming 'y' is your target variable (the labels)
class_counts = pd.Series(y).value_counts()

# Print the class counts
print("Class counts:\n", class_counts)

# Find the smallest class and how many samples it has
smallest_class = class_counts.idxmin()
smallest_class_count = class_counts.min()

print(f"The smallest class is: {smallest_class}")
print(f"It has {smallest_class_count} samples.")


Class counts:
 1    5
0    1
Name: count, dtype: int64
The smallest class is: 0
It has 1 samples.


In [4]:
from sklearn.model_selection import StratifiedKFold

# Use 2 splits since the smallest class has 2 samples
skf = StratifiedKFold(n_splits=2)

# Example: Iterating over splits
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print("Train indices:", train_index)
    print("Test indices:", test_index)


Train indices: [3 4 5]
Test indices: [0 1 2]
Train indices: [0 1 2]
Test indices: [3 4 5]




In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Dictionary of models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Support Vector Machine": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB()
}


In [6]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

# Define scoring metrics
scoring = {
    "accuracy": make_scorer(accuracy_score),
    "precision": make_scorer(precision_score, zero_division=0),
    "recall": make_scorer(recall_score, zero_division=0),
    "f1": make_scorer(f1_score, zero_division=0)
}

# Set up StratifiedKFold (adjust n_splits as needed)
cv = StratifiedKFold(n_splits=2)

# Evaluate each model
for model_name, model in models.items():
    print(f"Evaluating {model_name}:")
    for metric_name, scorer in scoring.items():
        scores = cross_val_score(model, X, y, cv=cv, scoring=scorer)
        print(f"  {metric_name.capitalize()} Scores: {scores}")
        print(f"  Mean {metric_name.capitalize()}: {scores.mean():.4f}")


Evaluating Logistic Regression:
  Accuracy Scores: [       nan 0.66666667]
  Mean Accuracy: nan
  Precision Scores: [nan  1.]
  Mean Precision: nan
  Recall Scores: [       nan 0.66666667]
  Mean Recall: nan
  F1 Scores: [nan 0.8]
  Mean F1: nan
Evaluating Support Vector Machine:
  Accuracy Scores: [nan  1.]
  Mean Accuracy: nan
  Precision Scores: [nan  1.]
  Mean Precision: nan
  Recall Scores: [nan  1.]
  Mean Recall: nan
  F1 Scores: [nan  1.]
  Mean F1: nan
Evaluating Decision Tree:
  Accuracy Scores: [0.66666667 0.66666667]
  Mean Accuracy: 0.6667
  Precision Scores: [0.66666667 1.        ]
  Mean Precision: 0.8333
  Recall Scores: [1.         0.66666667]
  Mean Recall: 0.8333
  F1 Scores: [0.8 0.8]
  Mean F1: 0.8000
Evaluating Random Forest:


1 fits failed out of a total of 2.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Duckie\miniconda3\envs\pythonCYBR520\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Duckie\miniconda3\envs\pythonCYBR520\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Duckie\miniconda3\envs\pythonCYBR520\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1301, in fit
    raise ValueError(
ValueError: This solver needs samples 

  Accuracy Scores: [0.66666667 0.66666667]
  Mean Accuracy: 0.6667
  Precision Scores: [0.66666667 1.        ]
  Mean Precision: 0.8333




  Recall Scores: [1.         0.66666667]
  Mean Recall: 0.8333
  F1 Scores: [0.8 0.8]
  Mean F1: 0.8000
Evaluating K-Nearest Neighbors:




  Accuracy Scores: [nan nan]
  Mean Accuracy: nan
  Precision Scores: [nan nan]
  Mean Precision: nan
  Recall Scores: [nan nan]
  Mean Recall: nan
  F1 Scores: [nan nan]
  Mean F1: nan
Evaluating Naive Bayes:
  Accuracy Scores: [0.66666667 1.        ]
  Mean Accuracy: 0.8333
  Precision Scores: [0.66666667 1.        ]
  Mean Precision: 0.8333
  Recall Scores: [1. 1.]
  Mean Recall: 1.0000
  F1 Scores: [0.8 1. ]
  Mean F1: 0.9000


Traceback (most recent call last):
  File "C:\Users\Duckie\miniconda3\envs\pythonCYBR520\Lib\site-packages\sklearn\metrics\_scorer.py", line 139, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "C:\Users\Duckie\miniconda3\envs\pythonCYBR520\Lib\site-packages\sklearn\metrics\_scorer.py", line 371, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "C:\Users\Duckie\miniconda3\envs\pythonCYBR520\Lib\site-packages\sklearn\metrics\_scorer.py", line 89, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Duckie\miniconda3\envs\pythonCYBR520\Lib\site-packages\sklearn\utils\_response.py", line 211, in _get_response_values
    y_pred = prediction_method(X)
             ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Duckie\miniconda3\envs\pythonCYBR520\Lib\site-packages\sklearn\neighbors\_classification.py", line 271, in predict
    neigh_ind = self.kneighbors(X, return_distance=False)
                

In [7]:
import pandas as pd

results = []

for model_name, model in models.items():
    for metric_name, scorer in scoring.items():
        scores = cross_val_score(model, X, y, cv=cv, scoring=scorer)
        results.append({
            "Model": model_name,
            "Metric": metric_name,
            "Mean Score": scores.mean(),
            "Std Dev": scores.std()
        })

# Convert to DataFrame
results_df = pd.DataFrame(results)
print(results_df)

# Optionally save to a CSV file
results_df.to_csv("model_cross_validation_results.csv", index=False)


1 fits failed out of a total of 2.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Duckie\miniconda3\envs\pythonCYBR520\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Duckie\miniconda3\envs\pythonCYBR520\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Duckie\miniconda3\envs\pythonCYBR520\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1301, in fit
    raise ValueError(
ValueError: This solver needs samples 

                     Model     Metric  Mean Score   Std Dev
0      Logistic Regression   accuracy         NaN       NaN
1      Logistic Regression  precision         NaN       NaN
2      Logistic Regression     recall         NaN       NaN
3      Logistic Regression         f1         NaN       NaN
4   Support Vector Machine   accuracy         NaN       NaN
5   Support Vector Machine  precision         NaN       NaN
6   Support Vector Machine     recall         NaN       NaN
7   Support Vector Machine         f1         NaN       NaN
8            Decision Tree   accuracy    0.666667  0.000000
9            Decision Tree  precision    0.833333  0.166667
10           Decision Tree     recall    0.833333  0.166667
11           Decision Tree         f1    0.800000  0.000000
12           Random Forest   accuracy    0.666667  0.000000
13           Random Forest  precision    0.833333  0.166667
14           Random Forest     recall    0.833333  0.166667
15           Random Forest         f1   