In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from scipy.spatial import distance
import matplotlib.pyplot as plt
import pickle
import joblib

In [2]:
# log = "/content/drive/MyDrive/Colab Notebooks/DATASET/auth.log"
log = '/home/iamdpk/Project Work/SOC-support-system/Dataset/auth.log'
path = '/home/iamdpk/Project Work/SOC-support-system/backend/soc/mlmodels/auth_system/'

In [3]:
def authParser(log_file):
  file_name = log_file
  file = open(file_name, "r")
  data = []
  order = ["date", "time", "ip", "process", "event"]
  
  for line in file.readlines():
    split1 = re.split(":", line, 3)
    event = split1[3]
    date = re.findall("^\S+\s+[0-9]{2}", line)
    time = re.findall("([0-9]{2}\\:[0-9]{2}\\:[0-9]{2})", line)[0]
    ip = re.findall("([a-z]{2}\\-[0-9]*\\-[0-9]*\\-[0-9]*\\-[0-9]*)", line)[0]
    process = re.findall("([a-zA-Z]*\\[[0-9]*\\])", line)[0]
    details = [date, time, ip, process, event]
    structure = {key:value for key, value in zip(order, details)}
    data.append(structure)
  return data

In [4]:
data1 = authParser(log)

In [5]:
df1 = pd.DataFrame(data1)
df1

Unnamed: 0,date,time,ip,process,event
0,[Nov 30],06:39:00,ip-172-31-27-153,CRON[21882],pam_unix(cron:session): session closed for us...
1,[Nov 30],06:47:01,ip-172-31-27-153,CRON[22087],pam_unix(cron:session): session opened for us...
2,[Nov 30],06:47:03,ip-172-31-27-153,CRON[22087],pam_unix(cron:session): session closed for us...
3,[Nov 30],07:07:14,ip-172-31-27-153,sshd[22116],Connection closed by 122.225.103.87 [preauth]\n
4,[Nov 30],07:07:35,ip-172-31-27-153,sshd[22118],Connection closed by 122.225.103.87 [preauth]\n
...,...,...,...,...,...
86834,[Dec 31],22:26:20,ip-172-31-27-153,sshd[7998],Connection closed by 218.2.0.133 [preauth]\n
86835,[Dec 31],22:27:07,ip-172-31-27-153,sshd[8001],Connection closed by 218.2.0.133 [preauth]\n
86836,[Dec 31],22:27:48,ip-172-31-27-153,sshd[8003],Invalid user admin from 218.2.0.133\n
86837,[Dec 31],22:27:48,ip-172-31-27-153,sshd[8003],input_userauth_request: invalid user admin [p...


In [6]:
df1_copy = df1.copy()

In [7]:
def clean(dataframe, column_name):
  dataframe[column_name] = dataframe[column_name].str.lower()
  # Remove numbers and special-characters from Event
  dataframe[column_name] = dataframe[column_name].str.replace('\d+', '')
  dataframe[column_name] = dataframe[column_name].str.replace(r'[^\w\s]+', '')
  # remove after 'user' including user :(
  dataframe[column_name] = dataframe[column_name].apply(lambda x: x.split(' user ')[0])
  return dataframe

In [8]:
df1_clean = clean(df1,"event")
df1_clean

  dataframe[column_name] = dataframe[column_name].str.replace('\d+', '')
  dataframe[column_name] = dataframe[column_name].str.replace(r'[^\w\s]+', '')


Unnamed: 0,date,time,ip,process,event
0,[Nov 30],06:39:00,ip-172-31-27-153,CRON[21882],pam_unixcronsession session closed for
1,[Nov 30],06:47:01,ip-172-31-27-153,CRON[22087],pam_unixcronsession session opened for
2,[Nov 30],06:47:03,ip-172-31-27-153,CRON[22087],pam_unixcronsession session closed for
3,[Nov 30],07:07:14,ip-172-31-27-153,sshd[22116],connection closed by preauth\n
4,[Nov 30],07:07:35,ip-172-31-27-153,sshd[22118],connection closed by preauth\n
...,...,...,...,...,...
86834,[Dec 31],22:26:20,ip-172-31-27-153,sshd[7998],connection closed by preauth\n
86835,[Dec 31],22:27:07,ip-172-31-27-153,sshd[8001],connection closed by preauth\n
86836,[Dec 31],22:27:48,ip-172-31-27-153,sshd[8003],invalid
86837,[Dec 31],22:27:48,ip-172-31-27-153,sshd[8003],input_userauth_request invalid


In [9]:
stopwords1 = ['pam_unixcronsession' 'by', 'string', 'from', 'bye', 'for', 'port', 'sshd', 'ssh', 'root', 'preauth']

In [10]:
def remStopWord(dataframe, column_name, s_words):
  import nltk
  from nltk.corpus import stopwords
  nltk.download('stopwords')
  all_stopwords = stopwords.words('english')
  all_stopwords.extend(s_words)
  dataframe[column_name] = dataframe[column_name].apply(lambda x: ' '.join([word for word in x.split() if word not in (all_stopwords)]))
  return dataframe

In [11]:
df1_clean = remStopWord(df1_clean, "event", stopwords1)
df1_clean

[nltk_data] Downloading package stopwords to /home/iamdpk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,date,time,ip,process,event
0,[Nov 30],06:39:00,ip-172-31-27-153,CRON[21882],pam_unixcronsession session closed
1,[Nov 30],06:47:01,ip-172-31-27-153,CRON[22087],pam_unixcronsession session opened
2,[Nov 30],06:47:03,ip-172-31-27-153,CRON[22087],pam_unixcronsession session closed
3,[Nov 30],07:07:14,ip-172-31-27-153,sshd[22116],connection closed
4,[Nov 30],07:07:35,ip-172-31-27-153,sshd[22118],connection closed
...,...,...,...,...,...
86834,[Dec 31],22:26:20,ip-172-31-27-153,sshd[7998],connection closed
86835,[Dec 31],22:27:07,ip-172-31-27-153,sshd[8001],connection closed
86836,[Dec 31],22:27:48,ip-172-31-27-153,sshd[8003],invalid
86837,[Dec 31],22:27:48,ip-172-31-27-153,sshd[8003],input_userauth_request invalid


In [12]:
def loadModel(path, mod_name):
  return pickle.load(open(path + 'auth_'+mod_name+'.sav', 'rb'))

In [14]:
def authToken(input_data):
  return list(set(str(input_data).split(' ')))

In [15]:
loaded_vectorizer = loadModel(path,'vectorizer_model')
loaded_vectorizer

In [16]:
vector_op1 = loaded_vectorizer.transform(df1_clean['event'])
vector_op1

<86839x277 sparse matrix of type '<class 'numpy.float64'>'
	with 220161 stored elements in Compressed Sparse Row format>

In [17]:
loaded_pca = loadModel(path,'pca_model')
loaded_pca

In [18]:
pca_data1 = loaded_pca.transform(vector_op1.todense())
pca_data1



array([[ 3.04223988e-01,  5.12274123e-01,  3.06410907e-14, ...,
        -6.19885948e-01,  2.43192246e-01, -3.67199292e-01],
       [ 2.94032589e-01,  4.36985991e-01,  3.07641817e-14, ...,
        -2.85186866e-01,  6.48906022e-01, -6.43508876e-01],
       [ 3.04223988e-01,  5.12274123e-01,  3.06410907e-14, ...,
        -6.19885948e-01,  2.43192246e-01, -3.67199292e-01],
       ...,
       [ 8.04050563e-01, -2.49367568e-01, -4.42742406e-01, ...,
         4.08813436e-03, -8.08968572e-03, -9.19350297e-04],
       [ 8.04050563e-01, -2.49367568e-01,  4.42742406e-01, ...,
         4.08813436e-03, -8.08968572e-03, -9.19350297e-04],
       [ 3.12452616e-01,  5.82015486e-01,  3.08167358e-14, ...,
        -5.14356949e-01, -3.20301718e-01,  1.67126177e-01]])

In [20]:
loaded_model_kmeans = loadModel(path,'kmeans_model')
loaded_model_kmeans

In [21]:
model_data1 = loaded_model_kmeans.predict(pca_data1)
model_data1

array([4, 4, 4, ..., 1, 2, 4], dtype=int32)

In [22]:
loaded_model2 = loadModel(path,'sgd_model')
loaded_model2

In [23]:
model_data2 = loaded_model2.predict(pd.DataFrame(vector_op1.todense()))
model_data2

array(['anomaly', 'anomaly', 'anomaly', ..., 'normal', 'normal',
       'anomaly'], dtype='<U7')