In [8]:
import csv
import ipaddress
import pickle
import re
import select
import subprocess
import time
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold, train_test_split

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [9]:
class KFoldValidation():
    """K-fold cross-validation helper that prints and returns average scores."""

    def GetAverageScore(self, n, features, label, model, X_train=None, random_state=None):
        """Fit ``model`` on ``n`` shuffled folds and report the mean train/test score.

        Args:
            n: Number of folds, forwarded to ``KFold(n_splits=n)``.
            features: Feature array indexable by an integer index array
                (e.g. a NumPy array); one row per sample.
            label: Target array aligned row-for-row with ``features``.
            model: Estimator exposing ``fit(X, y)`` and ``score(X, y)``. It is
                refit in place on every fold.
            X_train: Ignored. Kept only so existing five-argument callers keep
                working; fold indices are now derived from ``features``.
            random_state: Optional seed forwarded to ``KFold`` so the shuffled
                folds can be reproduced. ``None`` keeps the previous
                unseeded behaviour.

        Returns:
            Tuple ``(average_train_score, average_test_score)`` where each
            value is the mean of ``model.score`` over the ``n`` folds.
        """
        X = features
        y = label
        kf = KFold(n_splits=n, random_state=random_state, shuffle=True)

        train_acc = 0.0
        test_acc = 0.0
        # Split on the same array that is indexed below. Splitting on a
        # different array (previously X_train) yields indices that neither
        # cover all of X nor correspond to the rows they are applied to.
        for train_i, test_i in kf.split(X):
            xtrain, xtest = X[train_i], X[test_i]
            ytrain, ytest = y[train_i], y[test_i]
            model.fit(xtrain, ytrain)
            train_acc += model.score(xtrain, ytrain)
            test_acc += model.score(xtest, ytest)

        avg_train = train_acc / n
        avg_test = test_acc / n
        print("Average Train Accuracy : " + str(avg_train))
        print("Average Test Accuracy : " + str(avg_test))
        return avg_train, avg_test

In [10]:
class SSHDataAddition:
    """Appends SSH feature rows to a CSV dataset file."""

    # Column order used for every row written by writeSSH.
    df = ['user', 'is_private', 'is_failure', 'is_root', 'is_valid', 'not_valid_count', 
          'ip_failure', 'ip_success', 'no_failure', 'first', 'td', 'ts', 'class']

    def __init__(self):
        print("Initializing data addition modules")

    def writeSSH(self, df_temp, path='data/SSH.csv'):
        """Append one row to the SSH CSV file.

        Args:
            df_temp: Mapping of column name -> value for the row. Keys are
                expected to come from ``self.df``; ``csv.DictWriter`` raises
                ``ValueError`` for keys that are not in its fieldnames.
            path: Destination CSV file. Defaults to the previously hard-coded
                ``'data/SSH.csv'``.

        No header row is written; the row is printed before being appended.
        """
        print(df_temp)
        # The context manager closes the handle after every row (the old code
        # left it to the garbage collector); newline='' is what the csv module
        # requires to avoid doubled line endings on some platforms.
        with open(path, 'a+', newline='') as ssh_file:
            ssh_writer = csv.DictWriter(ssh_file, fieldnames=self.df)
            ssh_writer.writerow(df_temp)

In [11]:
class Parse_SSH:
    """Extracts per-line values (user, IP, timestamp, flags) from SSH log lines.

    NOTE(review): parts of ``ParseUsr`` and ``SSHProcessed`` are elided in this
    notebook (the "# ..." placeholders). Names such as ``usr`` and ``ip`` used
    after those placeholders are assigned by code that is not shown here.
    """

    def __init__(self):
        # Most recent extracted record per IP; written by SSHProcessed.
        self.dict = {}
        # Account names that isValid treats as known users.
        self.user_account = ['osamac', 'kamran', 'student', 'root']
        self.number_of_failure = 0

    def isValid(self, name):
        """Return "1" if ``name`` is one of ``self.user_account``, else "0"."""
        return "1" if name in self.user_account else "0"

    def ParseUsr(self, line):
        """Return the user name found in ``line``, or "-1" when none matched."""
        # Various regex patterns to extract user from log lines
        # ...
        return usr.group(2) if usr else "-1"

    def ParseIP(self, line):
        """Return the dotted-quad address following "from " in ``line``, or "-1"."""
        ip = re.search(r'(\bfrom\s)(\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b)', line)
        return ip.group(2) if ip else "-1"

    def isPrivate(self, ip):
        """Return 1 if ``ip`` is a private address, else 0.

        Values that cannot be parsed as an IP address (including the "-1"
        sentinel returned by ParseIP) yield 0.
        """
        try:
            return int(ipaddress.ip_address(ip).is_private)
        except (ValueError, TypeError):
            # Only swallow "not an address" failures; a bare except would
            # also hide KeyboardInterrupt and genuine programming errors.
            return 0

    def ParseDate(self, line, year=2018):
        """Convert the leading 15-character timestamp of ``line`` to a POSIX timestamp.

        Args:
            line: Log line starting with a ``'%b %d %H:%M:%S'`` stamp
                (e.g. ``"Jan  5 10:20:30 ..."``), which carries no year.
            year: Year to attach to the stamp. Defaults to 2018, the value
                that was previously hard-coded.

        Returns:
            ``datetime.timestamp()`` of the parsed naive datetime, which
            Python interprets as local time.

        Raises:
            ValueError: If the prefix does not match the format or the date
                does not exist in ``year``.
        """
        stamp = line[0:15]
        # Parse the year together with the stamp. Parsing without a year
        # defaults to 1900, so "Feb 29" was rejected before the year could be
        # replaced, even for leap years.
        parsed = datetime.strptime('{:04d} {}'.format(year, stamp), '%Y %b %d %H:%M:%S')
        return parsed.timestamp()

    def isRoot(self, line):
        """Return "1" if the substring "root" occurs anywhere in ``line``, else "0"."""
        return "1" if "root" in line else "0"

    def SSHProcessed(self, line):
        """Build the record for ``line`` and store it in ``self.dict`` under its IP.

        Returns the stored record, or an empty dict when no user or no IP was
        extracted (either is "-1").
        """
        t = self.ParseDate(line)
        # Logic to classify log entries as successful or failed
        # ...
        if usr != "-1" and ip != "-1":
            self.dict[ip] = {
                # Dictionary of extracted values
            }
            return self.dict[ip]
        return {}

In [16]:
class SSHPerdiction:
    """Loads a pickled SSH model and predicts the class of single instances."""

    # Keys read by prepareDict, in the order the feature vector is built.
    # This matches the training columns left after 'user', 'td', 'ts' and
    # 'class' are removed.
    FEATURE_KEYS = (
        'is_private',
        'is_failure',
        'is_root',
        'is_valid',
        'not_valid_count',
        'ip_failure',
        'ip_success',
        'no_failure',
        'first',
    )

    def __init__(self, model_path="models/ssh_rfc.pkl"):
        """Load the model from ``model_path``.

        Args:
            model_path: Pickle file holding the fitted model. Defaults to the
                classifier file ``models/ssh_rfc.pkl`` (not ``ssh_rfr.pkl``).
        """
        print("Loading Models:")
        # SECURITY: unpickling can execute arbitrary code. Only load model
        # files that come from a trusted source.
        with open(model_path, 'rb') as model_file:
            self.ssh_model = pickle.load(model_file)

    def prepareDict(self, dict):
        """Return the feature values of ``dict`` as a list ordered by FEATURE_KEYS.

        Raises:
            KeyError: If any feature key is missing from ``dict``.
        """
        return [dict[key] for key in self.FEATURE_KEYS]

    def predictSSH(self, instance):
        """Return the model's prediction for one feature vector.

        ``instance`` is wrapped in a list because ``predict`` is called with a
        batch; the first (only) prediction is returned.
        """
        return self.ssh_model.predict([instance])[0]

In [24]:
# Reading CSV Data
colum = ['user', 'is_private', 'is_failure', 'is_root', 'is_valid', 'not_valid_count',
         'ip_failure', 'ip_success', 'no_failure', 'first', 'td', 'ts', 'class']

# One seed for both the split and the forest so a re-run reproduces the scores.
RANDOM_STATE = 22

# Bound to `ssh_csv`, not `csv`: rebinding `csv` at module level would shadow
# the csv module that SSHDataAddition.writeSSH uses and break it after this
# cell runs.
# NOTE(review): this reads "SSH.csv" while writeSSH appends to 'data/SSH.csv' -
# confirm which location is intended.
ssh_csv = pd.read_csv("SSH.csv")
df = pd.DataFrame(ssh_csv, columns=colum)

# Preprocess Data
# Drop the columns that are not model inputs, then split off the target.
df.pop('user')
df.pop('ts')
df.pop('td')
label = np.array(df.pop("class"))
features = np.array(df)
X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.1, shuffle=True, random_state=RANDOM_STATE)

# Training Model
print("Training Model..")
# 'class' is a discrete label, so a classifier is used rather than a regressor.
model = RandomForestClassifier(random_state=RANDOM_STATE)
model.fit(X_train, y_train)

# Accuracy Calculation
print("Train Accuracy : ", model.score(X_train, y_train))
print("Test Accuracy : ", model.score(X_test, y_test))

Training Model..
Train Accuracy :  1.0
Test Accuracy :  1.0


In [29]:
# Confusion Matrix
# Rows are the true classes, columns the predicted classes.
y_pred = model.predict(X_test)
cnf_matrix = confusion_matrix(y_test, y_pred)
print(cnf_matrix)

# K-Fold Validation
# Cross-validate on the training split only. Passing X_train/y_train as the
# data keeps the fold indices aligned with the arrays they index and keeps
# the held-out test rows out of cross-validation.
kf = KFoldValidation()
kf.GetAverageScore(3, X_train, y_train, RandomForestClassifier(), X_train)

[[22  0]
 [ 1  6]]
Average Train Accuracy : 1.0
Average Test Accuracy : 0.9921568627450981
