In [1]:
# Data manipulation and processing
import pandas as pd
import numpy as np
import re  # Added for regex operations

# Sklearn preprocessing and scaling
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    precision_score, recall_score, f1_score, roc_curve, auc,
    accuracy_score, classification_report, precision_recall_curve, confusion_matrix
)

# Model selection and evaluation
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC  # Importing SVM
from sklearn.utils import resample
import matplotlib.pyplot as plt
import warnings

# Ignore warnings
warnings.filterwarnings("ignore")

In [2]:
# Load data
data = pd.read_csv('/home/achoo/Desktop/Honeypot/sanitized_logs_combined.csv', delimiter=',', header=None)
data.columns = ['eventid', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'session', 
                'protocol', 'version', 'hassh', 'hasshAlgorithms', 'message', 
                'sensor', 'timestamp']

# Check initial shape of data
print(f"Initial shape of data: {data.shape}")

# Define keywords that indicate malicious activity
malicious_keywords = ['failed', 'whoami', 'uname', 'chattr', 'cat', ' rm', '.ssh', 'authorized_keys',
                      'grep', 'chmod', 'curl', 'not found', 'mkdir', '/bin/', '/tmp/', 'sshd', '.sh', 
                      'ssh-rsa', 'ps', 'crontab', 'uptime', 'ifconfig', 'cpuinfo', 'df', 'chpasswd', 
                      'free', 'pkill', 'pgrep', 'admin']

# Create a target column based on the presence of keywords in the 'message' column
data['attack'] = data['message'].apply(lambda x: 1 if any(keyword in x for keyword in malicious_keywords) else 0)

# Keep only selected columns
data = data[['message', 'hasshAlgorithms', 'eventid', 'protocol', 'attack']]

Initial shape of data: (38520, 13)


In [3]:
# Initialize a LabelEncoder to encode categorical columns
le = LabelEncoder()

# Encode categorical columns (hasshAlgorithms, eventid, protocol)
data['hasshAlgorithms'] = le.fit_transform(data['hasshAlgorithms'])
data['eventid'] = le.fit_transform(data['eventid'])
data['protocol'] = le.fit_transform(data['protocol'])

# Separate the data into two groups: class 0 (non-malicious) and class 1 (malicious)
class_0 = data[data['attack'] == 0]
class_1 = data[data['attack'] == 1]

# Randomly undersample class 0 to match the number of instances in class 1
class_0_balanced = class_0.sample(n=len(class_1), random_state=42)

# Combine the balanced classes back together
balanced_data = pd.concat([class_0_balanced, class_1])

# Shuffle the balanced dataset
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [4]:
# Vectorize the 'message' column using TF-IDF
tfidf = TfidfVectorizer(max_features=500)  # Limit the number of features to prevent overfitting
message_tfidf = tfidf.fit_transform(balanced_data['message'])

# Convert the TF-IDF features into a DataFrame and concatenate with the other features
message_tfidf_df = pd.DataFrame(message_tfidf.toarray(), columns=tfidf.get_feature_names_out())
balanced_data = pd.concat([balanced_data.drop('message', axis=1), message_tfidf_df], axis=1)

# Separate features (X) and labels (y)
X = balanced_data.drop(['attack'], axis=1)
y = balanced_data['attack']

# Initialize StandardScaler to scale feature data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

In [5]:
# Initialize Support Vector Machine Classifier
model_svm = SVC(probability=True, random_state=42)  # Enable probability estimates

# Train the model
model_svm.fit(X_train, y_train)

# Predict on validation set
y_pred_svm = model_svm.predict(X_val)

# Evaluate the SVM model
print("SVM Classifier Evaluation:")
print(f"Accuracy: {accuracy_score(y_val, y_pred_svm):.4f}")
print(confusion_matrix(y_val, y_pred_svm))
print(classification_report(y_val, y_pred_svm))

SVM Classifier Evaluation:
Accuracy: 0.9850
[[2953    8]
 [  81 2879]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.99      2961
           1       1.00      0.97      0.98      2960

    accuracy                           0.98      5921
   macro avg       0.99      0.98      0.98      5921
weighted avg       0.99      0.98      0.98      5921



In [6]:
# Initialize XGBoost Classifier
model_xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Train the model
model_xgb.fit(X_train, y_train)

# Predict on validation set
y_pred_xgb = model_xgb.predict(X_val)

# Evaluate the XGBoost model
print("XGBoost Classifier Evaluation:")
print(f"Accuracy: {accuracy_score(y_val, y_pred_xgb):.4f}")
print(confusion_matrix(y_val, y_pred_xgb))
print(classification_report(y_val, y_pred_xgb))

XGBoost Classifier Evaluation:
Accuracy: 0.9863
[[2960    1]
 [  80 2880]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.99      2961
           1       1.00      0.97      0.99      2960

    accuracy                           0.99      5921
   macro avg       0.99      0.99      0.99      5921
weighted avg       0.99      0.99      0.99      5921



In [8]:
# Initialize Random Forest Classifier
model_rf = RandomForestClassifier(random_state=42)

# Train the Random Forest model
model_rf.fit(X_train, y_train)

# Predict on validation set
y_pred_rf = model_rf.predict(X_val)

# Evaluate the Random Forest model
print("Random Forest Classifier Evaluation:")
print(f"Accuracy: {accuracy_score(y_val, y_pred_rf):.4f}")
print(confusion_matrix(y_val, y_pred_rf))
print(classification_report(y_val, y_pred_rf))

Random Forest Classifier Evaluation:
Accuracy: 0.9862
[[2958    3]
 [  79 2881]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.99      2961
           1       1.00      0.97      0.99      2960

    accuracy                           0.99      5921
   macro avg       0.99      0.99      0.99      5921
weighted avg       0.99      0.99      0.99      5921



In [9]:
# Initialize VotingClassifier with soft voting
voting_clf = VotingClassifier(
    estimators=[('svm', model_svm), ('xgb', model_xgb), ('rf', model_rf)],
    voting='soft'
)

# Train the VotingClassifier
voting_clf.fit(X_train, y_train)

# Predict on validation set
y_pred_voting = voting_clf.predict(X_val)

# Evaluate the Voting Classifier
print("Voting Classifier (Soft) Evaluation:")
print(f"Accuracy: {accuracy_score(y_val, y_pred_voting):.4f}")
print(confusion_matrix(y_val, y_pred_voting))
print(classification_report(y_val, y_pred_voting))

Voting Classifier (Soft) Evaluation:
Accuracy: 0.9863
[[2959    2]
 [  79 2881]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.99      2961
           1       1.00      0.97      0.99      2960

    accuracy                           0.99      5921
   macro avg       0.99      0.99      0.99      5921
weighted avg       0.99      0.99      0.99      5921



In [12]:
# Import necessary libraries for saving models
import pickle

# Save the trained models to disk
filename_xgb = 'finalized_model_XGB.sav'
filename_svm = 'finalized_model_SVM.sav'
filename_rf = 'finalized_model_RF.sav'
filename_voting = 'finalized_model_Voting.sav'

# Confirm models have been saved
print("Models have been saved successfully.")

# Save each model using pickle
with open(filename_xgb, 'wb') as file:
    pickle.dump(model_xgb, file)

with open(filename_svm, 'wb') as file: 
    pickle.dump(model_svm, file)

with open(filename_rf, 'wb') as file:  
    pickle.dump(model_rf, file)

with open(filename_voting, 'wb') as file:
    pickle.dump(voting_clf, file)

Models have been saved successfully.


In [16]:
# Load each model from disk
with open(filename_xgb, 'rb') as file:
    model_xgb_loaded = pickle.load(file)

with open(filename_svm, 'rb') as file:
    model_svm_loaded = pickle.load(file)
    
with open(filename_rf, 'rb') as file: 
    model_rf_loaded = pickle.load(file)

with open(filename_voting, 'rb') as file:
    voting_clf_loaded = pickle.load(file)

In [17]:
# Load test data
test_data = pd.read_csv('/home/achoo/Desktop/Honeypot/test_sanitized_logs_combined.csv', delimiter=',', header=None)
test_data.columns = ['eventid', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'session', 
                     'protocol', 'version', 'hassh', 'hasshAlgorithms', 'message', 
                     'sensor', 'timestamp']

malicious_keywords = ['failed', 'whoami', 'uname', 'chattr', 'cat', ' rm', '.ssh', 'authorized_keys',
                      'grep', 'chmod', 'curl', 'not found', 'mkdir', '/bin/', '/tmp/', 'sshd', '.sh', 
                      'ssh-rsa', 'ps', 'crontab', 'uptime', 'ifconfig', 'cpuinfo', 'df', 'chpasswd', 
                      'free', 'pkill', 'pgrep', 'admin']

# Define a function to check for malicious login attempts
def flag_malicious(message):
    # Check for standard malicious keywords
    if any(keyword in message for keyword in malicious_keywords):
        return 1
    # Check for failed login attempts with random values
    failed_login_pattern = r'login attempt \[root\/[^\]]+\] failed'
    if re.search(failed_login_pattern, message):
        return 1
    return 0

# Create a target column based on the presence of keywords in the 'message' column
test_data['attack'] = test_data['message'].apply(lambda x: 1 if any(keyword in x for keyword in malicious_keywords) else 0)

# Keep only selected columns
test_data = test_data[['message', 'hasshAlgorithms', 'eventid', 'protocol', 'attack']]

# Initialize a LabelEncoder to encode categorical columns
le = LabelEncoder()

# Encode categorical columns (hasshAlgorithms, eventid, protocol)
test_data['hasshAlgorithms'] = le.fit_transform(test_data['hasshAlgorithms'])
test_data['eventid'] = le.fit_transform(test_data['eventid'])
test_data['protocol'] = le.fit_transform(test_data['protocol'])

# Vectorize the 'message' column using the same TF-IDF vectorizer settings
tfidf = TfidfVectorizer(max_features=500)
message_tfidf = tfidf.fit_transform(test_data['message'])

# Convert the TF-IDF features into a DataFrame and concatenate with the other features
message_tfidf_df = pd.DataFrame(message_tfidf.toarray(), columns=tfidf.get_feature_names_out())
test_data = pd.concat([test_data.drop('message', axis=1), message_tfidf_df], axis=1)

# Separate features (X_test) and labels (y_test)
X_test = test_data.drop(['attack'], axis=1)
y_test = test_data['attack']

# Initialize StandardScaler to scale feature data
scaler = StandardScaler()
X_test_scaled = scaler.fit_transform(X_test)

In [18]:
# Evaluate XGBoost Classifier on test data
xgb_pred = model_xgb.predict(X_test_scaled)
print("XGBoost Classifier Test Evaluation:")
print(f"Accuracy: {accuracy_score(y_test, xgb_pred):.4f}")
print(confusion_matrix(y_test, xgb_pred))
print(classification_report(y_test, xgb_pred))

# Evaluate SVM Classifier on test data
svm_pred = model_svm.predict(X_test_scaled)
print("SVM Classifier Test Evaluation:")
print(f"Accuracy: {accuracy_score(y_test, svm_pred):.4f}")
print(confusion_matrix(y_test, svm_pred))
print(classification_report(y_test, svm_pred))

# Evaluate Random Forest Classifier on test data
rf_pred = model_rf.predict(X_test_scaled)
print("Random Forest Classifier Test Evaluation:")
print(f"Accuracy: {accuracy_score(y_test, rf_pred):.4f}")
print(confusion_matrix(y_test, rf_pred))
print(classification_report(y_test, rf_pred))

# Evaluate Voting Classifier on test data
voting_pred = voting_clf.predict(X_test_scaled)
print("Voting Classifier Test Evaluation:")
print(f"Accuracy: {accuracy_score(y_test, voting_pred):.4f}")
print(confusion_matrix(y_test, voting_pred))
print(classification_report(y_test, voting_pred))

XGBoost Classifier Test Evaluation:
Accuracy: 0.7825
[[23056  2152]
 [ 6381  7646]]
              precision    recall  f1-score   support

           0       0.78      0.91      0.84     25208
           1       0.78      0.55      0.64     14027

    accuracy                           0.78     39235
   macro avg       0.78      0.73      0.74     39235
weighted avg       0.78      0.78      0.77     39235

SVM Classifier Test Evaluation:
Accuracy: 0.7563
[[24229   979]
 [ 8582  5445]]
              precision    recall  f1-score   support

           0       0.74      0.96      0.84     25208
           1       0.85      0.39      0.53     14027

    accuracy                           0.76     39235
   macro avg       0.79      0.67      0.68     39235
weighted avg       0.78      0.76      0.73     39235

Random Forest Classifier Test Evaluation:
Accuracy: 0.6606
[[25206     2]
 [13316   711]]
              precision    recall  f1-score   support

           0       0.65      1.00    