In [1]:
# Data manipulation and processing
import pandas as pd
import numpy as np
import re  # Added for regex operations

# Sklearn preprocessing and scaling
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    precision_score, recall_score, f1_score, roc_curve, auc,
    accuracy_score, classification_report, precision_recall_curve, confusion_matrix
)

# Model selection and evaluation
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC  # Importing SVM
from sklearn.utils import resample
import matplotlib.pyplot as plt
import warnings

# Ignore warnings
warnings.filterwarnings("ignore")

In [2]:
# Load data
data = pd.read_csv('/home/achoo/Desktop/Honeypot/sanitized_logs_combined.csv', delimiter=',', header=None)
data.columns = ['eventid', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'session', 
                'protocol', 'version', 'hassh', 'hasshAlgorithms', 'message', 
                'sensor', 'timestamp']

# Check initial shape of data
print(f"Initial shape of data: {data.shape}")

# Define keywords that indicate malicious activity
malicious_keywords = ['failed', 'whoami', 'uname', 'chattr', 'cat', ' rm', '.ssh', 'authorized_keys',
                      'grep', 'chmod', 'curl', 'not found', 'mkdir']

# Create a target column based on the presence of keywords in the 'message' column
data['attack'] = data['message'].apply(lambda x: 1 if any(keyword in x for keyword in malicious_keywords) else 0)

# Keep only selected columns
data = data[['message', 'hasshAlgorithms', 'eventid', 'protocol', 'attack']]

Initial shape of data: (19317, 13)


In [3]:
# Initialize a LabelEncoder to encode categorical columns
le = LabelEncoder()

# Encode categorical columns (hasshAlgorithms, eventid, protocol)
data['hasshAlgorithms'] = le.fit_transform(data['hasshAlgorithms'])
data['eventid'] = le.fit_transform(data['eventid'])
data['protocol'] = le.fit_transform(data['protocol'])

# Separate the data into two groups: class 0 (non-malicious) and class 1 (malicious)
class_0 = data[data['attack'] == 0]
class_1 = data[data['attack'] == 1]

# Randomly undersample class 0 to match the number of instances in class 1
class_0_balanced = class_0.sample(n=len(class_1), random_state=42)

# Combine the balanced classes back together
balanced_data = pd.concat([class_0_balanced, class_1])

# Shuffle the balanced dataset
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [4]:
# Vectorize the 'message' column using TF-IDF
tfidf = TfidfVectorizer(max_features=500)  # Limit the number of features to prevent overfitting
message_tfidf = tfidf.fit_transform(balanced_data['message'])

# Convert the TF-IDF features into a DataFrame and concatenate with the other features
message_tfidf_df = pd.DataFrame(message_tfidf.toarray(), columns=tfidf.get_feature_names_out())
balanced_data = pd.concat([balanced_data.drop('message', axis=1), message_tfidf_df], axis=1)

# Separate features (X) and labels (y)
X = balanced_data.drop(['attack'], axis=1)
y = balanced_data['attack']

# Initialize StandardScaler to scale feature data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

In [5]:
# Initialize Support Vector Machine Classifier
model_svm = SVC(probability=True, random_state=42)  # Enable probability estimates

# Train the model
model_svm.fit(X_train, y_train)

# Predict on validation set
y_pred_svm = model_svm.predict(X_val)

# Evaluate the SVM model
print("SVM Classifier Evaluation:")
print(f"Accuracy: {accuracy_score(y_val, y_pred_svm):.4f}")
print(confusion_matrix(y_val, y_pred_svm))
print(classification_report(y_val, y_pred_svm))

SVM Classifier Evaluation:
Accuracy: 0.9942
[[692   3]
 [  5 689]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       695
           1       1.00      0.99      0.99       694

    accuracy                           0.99      1389
   macro avg       0.99      0.99      0.99      1389
weighted avg       0.99      0.99      0.99      1389



In [6]:
# Initialize XGBoost Classifier
model_xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Train the model
model_xgb.fit(X_train, y_train)

# Predict on validation set
y_pred_xgb = model_xgb.predict(X_val)

# Evaluate the XGBoost model
print("XGBoost Classifier Evaluation:")
print(f"Accuracy: {accuracy_score(y_val, y_pred_xgb):.4f}")
print(confusion_matrix(y_val, y_pred_xgb))
print(classification_report(y_val, y_pred_xgb))

XGBoost Classifier Evaluation:
Accuracy: 0.9986
[[693   2]
 [  0 694]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       695
           1       1.00      1.00      1.00       694

    accuracy                           1.00      1389
   macro avg       1.00      1.00      1.00      1389
weighted avg       1.00      1.00      1.00      1389



In [7]:
# Initialize VotingClassifier with soft voting
voting_clf = VotingClassifier(
    estimators=[('xgb', model_xgb), ('svm', model_svm)],
    voting='soft'
)

# Train the VotingClassifier
voting_clf.fit(X_train, y_train)

# Predict on validation set
y_pred_voting = voting_clf.predict(X_val)

# Evaluate the Voting Classifier
print("Voting Classifier (Soft) Evaluation:")
print(f"Accuracy: {accuracy_score(y_val, y_pred_voting):.4f}")
print(confusion_matrix(y_val, y_pred_voting))
print(classification_report(y_val, y_pred_voting))

Voting Classifier (Soft) Evaluation:
Accuracy: 0.9986
[[693   2]
 [  0 694]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       695
           1       1.00      1.00      1.00       694

    accuracy                           1.00      1389
   macro avg       1.00      1.00      1.00      1389
weighted avg       1.00      1.00      1.00      1389



In [8]:
# Import necessary libraries for saving models
import pickle

# Save the trained models to disk
filename_xgb = 'finalized_model_XGB.sav'
filename_svm = 'finalized_model_SVM.sav'  # Added filename for SVM
filename_voting = 'finalized_model_Voting.sav'

# Save each model using pickle
with open(filename_xgb, 'wb') as file:
    pickle.dump(model_xgb, file)

with open(filename_svm, 'wb') as file:  # Save SVM model
    pickle.dump(model_svm, file)

with open(filename_voting, 'wb') as file:
    pickle.dump(voting_clf, file)

# Confirm models have been saved
print("Models have been saved successfully.")

Models have been saved successfully.


In [9]:
# Load the trained models
with open('finalized_model_XGB.sav', 'rb') as f:
    model_xgb = pickle.load(f)

with open('finalized_model_SVM.sav', 'rb') as f:  
    model_svm = pickle.load(f)

with open('finalized_model_Voting.sav', 'rb') as f:
    voting_classifier = pickle.load(f)

In [10]:
# Load test data
test_data = pd.read_csv('/home/achoo/Desktop/Honeypot/test_sanitized_logs_combined.csv', delimiter=',', header=None)
test_data.columns = ['eventid', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'session', 
                     'protocol', 'version', 'hassh', 'hasshAlgorithms', 'message', 
                     'sensor', 'timestamp']

malicious_keywords = ['failed', 'whoami', 'uname', 'chattr', 'cat', ' rm', '.ssh', 'authorized_keys',
                      'grep', 'chmod', 'curl', 'not found', 'mkdir']

# Define a function to check for malicious login attempts
def flag_malicious(message):
    # Check for standard malicious keywords
    if any(keyword in message for keyword in malicious_keywords):
        return 1
    # Check for failed login attempts with random values
    failed_login_pattern = r'login attempt \[root\/[^\]]+\] failed'
    if re.search(failed_login_pattern, message):
        return 1
    return 0

# Create a target column based on the presence of keywords in the 'message' column
test_data['attack'] = test_data['message'].apply(lambda x: 1 if any(keyword in x for keyword in malicious_keywords) else 0)

# Keep only selected columns
test_data = test_data[['message', 'hasshAlgorithms', 'eventid', 'protocol', 'attack']]

# Initialize a LabelEncoder to encode categorical columns
le = LabelEncoder()

# Encode categorical columns (hasshAlgorithms, eventid, protocol)
test_data['hasshAlgorithms'] = le.fit_transform(test_data['hasshAlgorithms'])
test_data['eventid'] = le.fit_transform(test_data['eventid'])
test_data['protocol'] = le.fit_transform(test_data['protocol'])

# Vectorize the 'message' column using the same TF-IDF vectorizer settings
tfidf = TfidfVectorizer(max_features=500)
message_tfidf = tfidf.fit_transform(test_data['message'])

# Convert the TF-IDF features into a DataFrame and concatenate with the other features
message_tfidf_df = pd.DataFrame(message_tfidf.toarray(), columns=tfidf.get_feature_names_out())
test_data = pd.concat([test_data.drop('message', axis=1), message_tfidf_df], axis=1)

# Separate features (X_test) and labels (y_test)
X_test = test_data.drop(['attack'], axis=1)
y_test = test_data['attack']

# Initialize StandardScaler to scale feature data
scaler = StandardScaler()
X_test_scaled = scaler.fit_transform(X_test)

In [11]:
# Evaluate XGBoost Classifier on test data
xgb_pred = model_xgb.predict(X_test_scaled)
print("XGBoost Classifier Test Evaluation:")
print(f"Accuracy: {accuracy_score(y_test, xgb_pred):.4f}")
print(confusion_matrix(y_test, xgb_pred))
print(classification_report(y_test, xgb_pred))

# Evaluate SVM Classifier on test data
svm_pred = model_svm.predict(X_test_scaled)
print("SVM Classifier Test Evaluation:")
print(f"Accuracy: {accuracy_score(y_test, svm_pred):.4f}")
print(confusion_matrix(y_test, svm_pred))
print(classification_report(y_test, svm_pred))

# Evaluate Voting Classifier on test data
voting_pred = voting_clf.predict(X_test_scaled)
print("Voting Classifier Test Evaluation:")
print(f"Accuracy: {accuracy_score(y_test, voting_pred):.4f}")
print(confusion_matrix(y_test, voting_pred))
print(classification_report(y_test, voting_pred))

XGBoost Classifier Test Evaluation:
Accuracy: 0.8217
[[17296   437]
 [ 3413   451]]
              precision    recall  f1-score   support

           0       0.84      0.98      0.90     17733
           1       0.51      0.12      0.19      3864

    accuracy                           0.82     21597
   macro avg       0.67      0.55      0.54     21597
weighted avg       0.78      0.82      0.77     21597

SVM Classifier Test Evaluation:
Accuracy: 0.8302
[[15697  2036]
 [ 1631  2233]]
              precision    recall  f1-score   support

           0       0.91      0.89      0.90     17733
           1       0.52      0.58      0.55      3864

    accuracy                           0.83     21597
   macro avg       0.71      0.73      0.72     21597
weighted avg       0.84      0.83      0.83     21597

Voting Classifier Test Evaluation:
Accuracy: 0.8418
[[17290   443]
 [ 2974   890]]
              precision    recall  f1-score   support

           0       0.85      0.98      0.91 