# License

Copyright 2024 Zoltan DOBRADY  
Contact: zoltan.dobrady@hotmail.com  

Licensed under the Apache License, Version 2.0 (the "License");  
you may not use this file except in compliance with the License.  
You may obtain a copy of the License at  

    http://www.apache.org/licenses/LICENSE-2.0  

Unless required by applicable law or agreed to in writing, software  
distributed under the License is distributed on an "AS IS" BASIS,  
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
See the License for the specific language governing permissions and  
limitations under the License.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import os
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score
from collections import Counter
from sklearn.preprocessing import LabelEncoder

from matplotlib import pyplot
#from scapy.all import *

import warnings
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, learning_curve
from sklearn.metrics import classification_report, confusion_matrix

# Loading CSV files
training_file_path = 'Private_measuring/training1a.csv'
testing_file_path = 'Private_measuring/testing1a.csv'
training_data = pd.read_csv(training_file_path)
testing_data = pd.read_csv(testing_file_path)

# Adding 'attack' column to the training data: 1 when the 'Info' text contains
# the literal substring '[SYN]', 0 otherwise.
# The column is cast to str and matched with na=False so that empty cells
# (which read_csv loads as NaN) are labelled 0 instead of raising TypeError,
# and regex=False keeps the square brackets literal rather than a character class.
training_data['attack'] = (
    training_data['Info']
    .astype(str)
    .str.contains('[SYN]', regex=False, na=False)
    .astype(int)
)

# Integer-encode the 'Source' and 'Destination' columns.
# Each encoder is fitted on the union of the values found in the training and
# the testing file, so that transform() knows every label in either frame.
all_sources = set(training_data['Source']) | set(testing_data['Source'])
all_destinations = set(training_data['Destination']) | set(testing_data['Destination'])

# fit() returns the encoder itself, so construction and fitting can be chained
le_source = LabelEncoder().fit(list(all_sources))
le_destination = LabelEncoder().fit(list(all_destinations))

# Add the encoded columns to both frames (training first, then testing)
for frame in (training_data, testing_data):
    frame['Source_encoded'] = le_source.transform(frame['Source'])
    frame['Destination_encoded'] = le_destination.transform(frame['Destination'])

# Selecting training data: the two encoded address columns plus packet length
# as features, and the 'attack' column as the target.
X_train = training_data[['Source_encoded', 'Destination_encoded', 'Length']]
y_train = training_data['attack']

# Grid Search for RandomForestClassifier
param_grid = {
    'n_estimators': [5, 10],                 # Multiple options for the size of the forest
    'max_depth': [1, 2, 5, 10, None],        # Greater range for tree depth
    # An integer min_samples_split must be >= 2. The previous value 1 made
    # every candidate using it fail to fit; GridSearchCV scored those as NaN
    # and the failure warnings were hidden by the global warnings filter.
    'min_samples_split': [2, 5],             # Multiple options for sample splitting
    'min_samples_leaf': [1, 2],              # Multiple options for leaf samples
    'max_features': ['sqrt', 'log2', None],  # Multiple options for the maximum number of features
    'bootstrap': [True, False],              # Whether to use bootstrap or not
    'criterion': ['gini', 'entropy'],        # Variety of decision criteria
}

# Exhaustive search over param_grid with 3-fold cross-validation, using all cores
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print("Best parameters found by Grid Search:", best_params)

# Model with the best parameters. GridSearchCV (refit=True by default) has
# already retrained a clone of `rf` (same random_state=42) with best_params on
# the full training set, so that estimator is reused instead of fitting an
# identical forest a second time.
best_rf_model = grid_search.best_estimator_

# Predict on the test file using the same three feature columns
X_test = testing_data[['Source_encoded', 'Destination_encoded', 'Length']]
predictions = best_rf_model.predict(X_test)

# Store the predicted label next to each test row
testing_data['attack'] = predictions

# Total of the predicted labels, reported as the number of predicted attacks
num_predicted_attacks = predictions.sum()
print(f"Number of predicted attacks: {num_predicted_attacks}")

# Decode the integer codes back into the 'Source' / 'Destination' columns,
# then discard the helper columns that held the codes
for column, encoder in (('Source', le_source), ('Destination', le_destination)):
    testing_data[column] = encoder.inverse_transform(testing_data[f'{column}_encoded'])
testing_data.drop(columns=['Source_encoded', 'Destination_encoded'], inplace=True)

# Write the labelled test rows, without the index, to a new CSV file
output_file_path = 'predicted_testing1a_with_original_ips_2.csv'
testing_data.to_csv(output_file_path, index=False)

# Classification report and confusion matrix.
# NOTE: both compare y_train with the model's predictions for X_train, i.e.
# they are measured on the training data itself, not on held-out data.
# The predictions are computed once and shared by both metrics.
train_predictions = best_rf_model.predict(X_train)
print("Classification Report:\n")
print(classification_report(y_train, train_predictions))
print("Confusion Matrix:\n")
conf_matrix = confusion_matrix(y_train, train_predictions)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
# confusion_matrix puts true labels on rows and predicted labels on columns
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.show()

# Learning curve: accuracy of the tuned model on growing fractions of the
# training data, evaluated with 5-fold cross-validation
train_sizes, train_scores, validation_scores = learning_curve(
    best_rf_model,
    X_train,
    y_train,
    train_sizes=[0.1, 0.33, 0.55, 0.78, 1.0],
    cv=5,
    scoring='accuracy',
)

# Aggregate across the folds (axis 1) for every training-set size
train_scores_mean = train_scores.mean(axis=1)
train_scores_std = train_scores.std(axis=1)
validation_scores_mean = validation_scores.mean(axis=1)
validation_scores_std = validation_scores.std(axis=1)

fig, ax = plt.subplots(figsize=(12, 8))

# Shaded bands: mean +/- one standard deviation
ax.fill_between(
    train_sizes,
    train_scores_mean - train_scores_std,
    train_scores_mean + train_scores_std,
    alpha=0.1,
    color="r",
)
ax.fill_between(
    train_sizes,
    validation_scores_mean - validation_scores_std,
    validation_scores_mean + validation_scores_std,
    alpha=0.1,
    color="g",
)

# Mean curves drawn on top of the bands
ax.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training Accuracy")
ax.plot(train_sizes, validation_scores_mean, 'o-', color="g", label="Validation Accuracy")

ax.set_title("Learning Curve")
ax.set_xlabel("Number of Training Samples")
ax.set_ylabel("Accuracy")
ax.legend(loc="best")
plt.show()
