<a href="https://colab.research.google.com/github/kashindra-mahato/SOC-support-system/blob/kashindra/authlog_training_testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

##Imports

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from scipy.spatial import distance
import matplotlib.pyplot as plt
import pickle

##Parser for auth

In [2]:
def authParser(log_file):
  """Parse an auth.log-style file into one dict per log line.

  Lines are expected to look like
  ``Nov 30 06:39:00 ip-172-31-27-153 CRON[21882]: <event text>``.
  Blank lines are skipped.

  Args:
    log_file: Path of the log file to read.

  Returns:
    A list of dicts with the keys ``date``, ``time``, ``ip``, ``process``
    and ``event``. ``date`` is the list produced by ``re.findall`` (for
    example ``['Nov 30']``); the other values are strings. ``event`` is
    everything after the third ``:`` of the line, including the leading
    space and the trailing newline.

  Raises:
    IndexError: If a non-blank line has fewer than three ``:`` or lacks a
      time, host or process field.
  """
  order = ["date", "time", "ip", "process", "event"]
  # Compiled once, outside the per-line loop. The day accepts one or two
  # digits because syslog pads single-digit days with a space ("Dec  1").
  date_re = re.compile(r"^\S+\s+[0-9]{1,2}")
  time_re = re.compile(r"([0-9]{2}:[0-9]{2}:[0-9]{2})")
  ip_re = re.compile(r"([a-z]{2}-[0-9]*-[0-9]*-[0-9]*-[0-9]*)")
  process_re = re.compile(r"([a-zA-Z]*\[[0-9]*\])")

  data = []
  # The context manager closes the handle even if a line fails to parse.
  with open(log_file, "r") as file:
    for line in file:
      if not line.strip():
        continue
      # The first two ':' belong to the timestamp, the third ends the
      # process tag; the remainder is the event text.
      event = line.split(":", 3)[3]
      details = [
        date_re.findall(line),
        time_re.findall(line)[0],
        ip_re.findall(line)[0],
        process_re.findall(line)[0],
        event,
      ]
      data.append(dict(zip(order, details)))
  return data

##parser for single line(string)

In [3]:
def authParserLine(line):
  """Parse a single auth.log line into a dict of fields.

  Args:
    line: One log line such as
      ``Nov 30 09:22:03 ip-172-31-27-153 sshd[22218]: <event text>``.

  Returns:
    A dict with the keys ``date``, ``time``, ``ip``, ``process`` and
    ``event``. ``date`` is the list produced by ``re.findall`` (for example
    ``['Nov 30']``); it is kept as a list because the dict is later passed
    straight to ``pd.DataFrame``. The other values are strings. ``event``
    is everything after the third ``:``.

  Raises:
    IndexError: If the line has fewer than three ``:`` or lacks a time,
      host or process field.
  """
  order = ["date", "time", "ip", "process", "event"]

  # The first two ':' belong to the timestamp, the third ends the process tag.
  event = line.split(":", 3)[3]
  # One or two day digits: syslog pads single-digit days ("Dec  1").
  date = re.findall(r"^\S+\s+[0-9]{1,2}", line)
  time = re.findall(r"([0-9]{2}:[0-9]{2}:[0-9]{2})", line)[0]
  ip = re.findall(r"([a-z]{2}-[0-9]*-[0-9]*-[0-9]*-[0-9]*)", line)[0]
  process = re.findall(r"([a-zA-Z]*\[[0-9]*\])", line)[0]

  return dict(zip(order, [date, time, ip, process, event]))

##function for Cleaning

In [4]:
def clean(dataframe, column_name):
  """Normalise a text column in place and return the same dataframe.

  The column is lower-cased, digits are removed, every character that is
  neither a word character nor whitespace is removed, and finally anything
  from the first ``' user '`` onwards is cut off.

  Args:
    dataframe: pandas DataFrame holding the column; it is modified in place.
    column_name: Name of the string column to clean.

  Returns:
    The same ``dataframe`` object that was passed in.
  """
  column = dataframe[column_name].str.lower()
  # regex=True is explicit: pandas >= 2.0 treats the pattern as a literal
  # string by default, which would leave digits and punctuation untouched.
  column = column.str.replace(r'\d+', '', regex=True)
  column = column.str.replace(r'[^\w\s]+', '', regex=True)
  # Drop the user name and everything after it so events differing only by
  # account collapse to the same text.
  dataframe[column_name] = column.apply(lambda text: text.split(' user ')[0])
  return dataframe

##Stop word remover

In [5]:
def remStopWord(dataframe, column_name, s_words):
  """Remove English and caller-supplied stop words from a text column.

  Args:
    dataframe: pandas DataFrame holding the column; it is modified in place.
    column_name: Name of the string column to filter.
    s_words: Iterable of extra words to drop in addition to NLTK's English
      stop-word list.

  Returns:
    The same ``dataframe`` object, with the column rebuilt from the
    remaining whitespace-separated words joined by single spaces.
  """
  import nltk
  from nltk.corpus import stopwords
  try:
    english_words = stopwords.words('english')
  except LookupError:
    # Corpus not installed yet: download it once, then retry. This avoids a
    # network round trip on every call.
    nltk.download('stopwords')
    english_words = stopwords.words('english')
  # A set gives O(1) membership tests inside the per-word filter below.
  blocked = set(english_words)
  blocked.update(s_words)
  dataframe[column_name] = dataframe[column_name].apply(
    lambda text: ' '.join(word for word in text.split() if word not in blocked))
  return dataframe

##Tokenizer

In [6]:
def authToken(input_data):
  """Return the distinct space-separated tokens of ``str(input_data)``.

  The text is split on single spaces only, so consecutive spaces produce an
  empty-string token. Duplicates are removed through a set, so the order of
  the returned list is not defined.
  """
  pieces = str(input_data).split(' ')
  distinct = set(pieces)
  return list(distinct)

##Vectorizer

In [7]:
def authVectorizer(path, input_data):
  """Fit a TF-IDF vectorizer on the events, persist it and return the vectors.

  Args:
    path: Prefix the file name is appended to as-is (no separator is
      inserted), e.g. a directory path ending in ``/``.
    input_data: Iterable of event strings.

  Returns:
    A tuple ``(vector_op, vectorizer)``: the transformed input and the
    fitted ``TfidfVectorizer``.

  Side effects:
    Writes the pickled vectorizer to ``path + 'auth_vectorizer_model.sav'``.
    The vectorizer references ``authToken`` as its tokenizer.
    NOTE(review): unpickling presumably needs ``authToken`` to be importable
    in the loading process - confirm.
  """
  vectorizer = TfidfVectorizer(tokenizer=authToken)
  vectorizer.fit(input_data)
  vector_op = vectorizer.transform(input_data)
  # Context manager so the model file is flushed and closed deterministically.
  with open(path + 'auth_vectorizer_model.sav', 'wb') as model_file:
    pickle.dump(vectorizer, model_file)
  return vector_op, vectorizer

##principal component analyzer

In [8]:
def pca(path, input_data):
  """Fit a PCA on the data, persist it and return the projection.

  Args:
    path: Prefix the file name is appended to as-is (no separator is
      inserted).
    input_data: Dense 2-D feature data (array, ``np.matrix`` or similar).

  Returns:
    A tuple ``(data, model)``: the transformed input and the fitted ``PCA``.

  Side effects:
    Writes the pickled model to ``path + 'auth_pca_model.sav'``.
  """
  # np.matrix (what scipy's .todense() returns) is rejected by recent
  # scikit-learn releases; a plain ndarray works everywhere.
  features = np.asarray(input_data)
  # A fractional n_components is passed through to PCA unchanged (0.95).
  reducer = PCA(n_components=0.95)
  reducer.fit(features)
  data = reducer.transform(features)
  with open(path + 'auth_pca_model.sav', 'wb') as model_file:
    pickle.dump(reducer, model_file)
  return data, reducer

##training model

In [9]:
def train(path, input_data, mod):
  """Train, persist and return the model selected by ``mod``.

  Args:
    path: Prefix the model file name is appended to as-is.
    input_data: For ``'kmeans'``, the feature matrix. For ``'sgd'``, a
      DataFrame whose ``y_label`` column holds the target and whose other
      columns are the features.
    mod: ``'kmeans'`` or ``'sgd'``.

  Returns:
    For ``'kmeans'``: a tuple ``(data, model)`` where ``data`` is the output
    of ``model.transform(input_data)``.
    For ``'sgd'``: the fitted classifier only. Its accuracy on the held-out
    20 % split is printed.

  Raises:
    ValueError: If ``mod`` is neither ``'kmeans'`` nor ``'sgd'``.

  Side effects:
    Writes ``auth_kmeans_model.sav`` or ``auth_sgd_model.sav`` under ``path``.
  """
  if mod == 'kmeans':
    model = KMeans(7)
    model.fit(input_data)
    data = model.transform(input_data)
    with open(path + 'auth_kmeans_model.sav', 'wb') as model_file:
      pickle.dump(model, model_file)
    return data, model

  if mod == 'sgd':
    model = SGDClassifier(loss='perceptron')
    X = input_data.drop('y_label', axis=1)
    y = input_data['y_label']
    # No random_state is set, so the split (and the printed score) varies
    # between runs.
    x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.8)
    model.fit(x_train, y_train)
    with open(path + 'auth_sgd_model.sav', 'wb') as model_file:
      pickle.dump(model, model_file)
    print(model.score(x_test, y_test))
    return model

  # Fail loudly instead of returning None, which callers would only notice
  # later as an unpacking or attribute error.
  raise ValueError("mod must be 'kmeans' or 'sgd', got %r" % (mod,))


##model loader

In [10]:
def loadModel(path, mod_name):
  """Load a pickled model saved as ``path + 'auth_' + mod_name + '.sav'``.

  Args:
    path: Prefix the file name is appended to as-is (no separator is
      inserted).
    mod_name: Middle part of the file name, e.g. ``'kmeans_model'``.

  Returns:
    The unpickled object.
  """
  # NOTE: pickle.load can execute arbitrary code from the file; only load
  # model files that this project wrote itself.
  with open(path + 'auth_' + mod_name + '.sav', 'rb') as model_file:
    return pickle.load(model_file)

##function to calculate the distance between each instance and its assigned centroid

In [11]:
def fromCentDistance(pca_data, model):
  """Return each sample's Euclidean distance to its assigned centroid.

  Args:
    pca_data: 2-D array-like of samples, one row per sample.
    model: Fitted clustering model exposing ``cluster_centers_`` and
      ``predict`` (e.g. the KMeans returned by ``train``).

  Returns:
    A list of floats, one per row of ``pca_data``, in row order.
  """
  points = np.asarray(pca_data, dtype=float)
  labels = np.asarray(model.predict(pca_data))
  # Fancy indexing picks, for every row, the centroid of its predicted cluster.
  assigned_centroids = np.asarray(model.cluster_centers_)[labels]
  # Computed with numpy only: the notebook later rebinds the global name
  # `distance`, so relying on scipy's module under that name is fragile.
  return np.linalg.norm(points - assigned_centroids, axis=1).tolist()

##function to calculate modified_zscore

In [12]:
def modified_zscore(distance, consistency_correction=1.4826):
    """Compute modified z-scores based on the median absolute deviation.

    Each score is ``(x - median) / (consistency_correction * MAD)``, where
    MAD is the median of ``|x - median|``.

    Args:
        distance: 1-D array-like of numeric values.
        consistency_correction: Factor applied to the MAD. The default
            1.4826 is the constant conventionally paired with the *median*
            absolute deviation.

    Returns:
        A tuple ``(mod_zscore, mad)``: a numpy array of scores in input
        order and the median absolute deviation (which may be 0).

    Notes:
        When the MAD is 0 (more than half of the values equal the median)
        the mean absolute deviation scaled by 1.253314 is used instead, to
        avoid dividing by zero. If that is 0 as well (all values identical)
        every score is 0.
    """
    values = np.asarray(distance, dtype=float)
    median = np.median(values)

    deviation_from_med = values - median
    abs_deviation = np.abs(deviation_from_med)

    # Median, not mean: the 1.4826 factor is only meaningful for the MAD.
    mad = np.median(abs_deviation)
    scale = consistency_correction * mad
    if scale == 0:
        scale = 1.253314 * np.mean(abs_deviation)
    if scale == 0:
        return np.zeros_like(deviation_from_med), mad

    return deviation_from_med / scale, mad

##function to plot anomalies(z>3)

In [13]:
def plot_anomaly(data, threshold):
    """Plot the scores in descending order, highlighting those above a threshold.

    Args:
        data: pandas Series of scores (it must provide ``sort_values``).
        threshold: Scores strictly greater than this value are drawn in red
            as anomalies; the rest are drawn in blue. A horizontal line
            marks the threshold.
    """
    scores = data.sort_values(ascending=False).values
    count = len(scores)
    ranks = np.linspace(1, count, count)
    is_outlier = scores > threshold

    plt.figure(dpi=100)
    series = (
        (is_outlier, 'r', 'anomalies'),
        (~is_outlier, 'b', 'normal'),
    )
    for mask, colour, name in series:
        plt.plot(ranks[mask], scores[mask], 'o', color=colour, label=name)
    plt.axhline(threshold, color='r', label='threshold', alpha=0.5)
    plt.legend(loc='upper right')
    plt.title('Modified z-score vs. Log Event', fontweight='bold')
    # NOTE(review): tick positions are fixed at 0, 2, ..., 20 regardless of
    # how many scores are plotted - confirm this is intended.
    plt.xticks(np.arange(0, 21, step=2.0))
    plt.xlabel('Event')
    plt.ylabel('Modified z-score')
    plt.show()

#Training

In [14]:
# Alternative Colab location of the same log, kept for reference:
# log = "/content/drive/MyDrive/Colab Notebooks/DATASET/auth.log"
# Raw auth.log used for training (machine-specific absolute path).
log = '/home/iamdpk/Project Work/SOC-support-system/Dataset/auth.log'

In [15]:
# Prefix under which the fitted models are pickled. The helper functions
# append the file name directly, so the trailing slash is required.
path = '/home/iamdpk/Project Work/SOC-support-system/resources/auth_system/'

In [17]:
# Parse every log line into a dict of fields and display the result.
data = authParser(log)
data

[{'date': ['Nov 30'],
  'time': '06:39:00',
  'ip': 'ip-172-31-27-153',
  'process': 'CRON[21882]',
  'event': ' pam_unix(cron:session): session closed for user root\n'},
 {'date': ['Nov 30'],
  'time': '06:47:01',
  'ip': 'ip-172-31-27-153',
  'process': 'CRON[22087]',
  'event': ' pam_unix(cron:session): session opened for user root by (uid=0)\n'},
 {'date': ['Nov 30'],
  'time': '06:47:03',
  'ip': 'ip-172-31-27-153',
  'process': 'CRON[22087]',
  'event': ' pam_unix(cron:session): session closed for user root\n'},
 {'date': ['Nov 30'],
  'time': '07:07:14',
  'ip': 'ip-172-31-27-153',
  'process': 'sshd[22116]',
  'event': ' Connection closed by 122.225.103.87 [preauth]\n'},
 {'date': ['Nov 30'],
  'time': '07:07:35',
  'ip': 'ip-172-31-27-153',
  'process': 'sshd[22118]',
  'event': ' Connection closed by 122.225.103.87 [preauth]\n'},
 {'date': ['Nov 30'],
  'time': '07:08:13',
  'ip': 'ip-172-31-27-153',
  'process': 'sshd[22120]',
  'event': ' Connection closed by 122.225.103.87

In [None]:
# One row per parsed log line.
df = pd.DataFrame(data)

In [None]:
# Keep an untouched copy: clean() below modifies `df` in place, so df_copy
# is where the raw event text survives.
df_copy = df.copy()

In [None]:
# clean() returns the frame it was given, so df_clean and df are the same object.
df_clean = clean(df,"event")

In [None]:
# Extra tokens removed in addition to NLTK's English stop words.
# 'pam_unixcronsession' is what clean() leaves of "pam_unix(cron:session)".
# The comma after it matters: without it Python concatenates the adjacent
# literals into the single token 'pam_unixcronsessionby'.
stopwords = ['pam_unixcronsession', 'by', 'string', 'from', 'bye', 'for', 'port', 'sshd', 'ssh', 'root', 'preauth']

In [None]:
df_clean = remStopWord(df_clean, "event", stopwords)

In [None]:
vector_op, vectorizer = authVectorizer(path, df_clean['event'])

In [None]:
pca_data, pca = pca(path, vector_op.todense())

In [None]:
model_data, model = train(path, pca_data, 'kmeans')

In [None]:
distance = fromCentDistance(pca_data, model)

In [None]:
df_copy['distance'] = distance

In [None]:
df_copy['label'] = model.labels_

In [None]:
mod_zscore, mad = modified_zscore(df_copy['distance'])

In [None]:
df_copy['mod_zscore'] = mod_zscore.tolist()

In [None]:
# Events whose modified z-score exceeds 3 are labelled anomalies, everything
# else normal. `<=` keeps a score of exactly 3 from being left unlabelled (NaN).
df_copy.loc[df_copy.mod_zscore>3, 'y_label'] = 'anomaly'
df_copy.loc[df_copy.mod_zscore<=3, 'y_label'] = 'normal'

In [None]:
# The classifier is trained on the TF-IDF features (not the PCA projection).
df_sgd = pd.DataFrame(vector_op.todense())

In [None]:
# Target column: the anomaly/normal labels derived from the z-scores.
df_sgd['y_label'] = df_copy['y_label']

In [None]:
# Note: this rebinds `model` from the fitted KMeans to the SGD classifier.
model = train(path, df_sgd, 'sgd')

#Output

In [None]:
# Plot all z-scores in descending order with the anomaly threshold at 3.
plot_anomaly(df_copy['mod_zscore'],3)

In [None]:
# Distinct anomalous events as cleaned text (`df` was cleaned in place).
df.loc[df_copy.mod_zscore>3].event.unique()

In [None]:
# The same anomalous events as raw text (`df_copy` was copied before cleaning).
df_copy.loc[df_copy.mod_zscore>3].event.unique()

In [None]:
# Distinct cleaned events per cluster label, one cell for each of the 7 clusters.
df.loc[df_copy.label == 0].event.unique()

In [None]:
df.loc[df_copy.label == 1].event.unique()

In [None]:
df.loc[df_copy.label == 2].event.unique()

In [None]:
df.loc[df_copy.label == 3].event.unique()

In [None]:
df.loc[df_copy.label == 4].event.unique()

In [None]:
df.loc[df_copy.label == 5].event.unique()

In [None]:
df.loc[df_copy.label == 6].event.unique()

#Testing

In [None]:
input = "Nov 30 09:22:03 ip-172-31-27-153 sshd[22218]: Did not receive identification string from 196.200.90.236"

In [None]:
data1 = authParserLine(input)

In [None]:
# NOTE(review): data1 holds scalars except for 'date', which is a list;
# presumably that list is what lets pandas build a one-row frame here
# without an explicit index - confirm before changing the parser output.
df1 = pd.DataFrame(data1)

In [None]:
# Copy taken before cleaning, since clean() modifies df1 in place.
df1_copy = df1.copy()

In [None]:
# clean() returns the frame it was given, so df1_clean and df1 are the same object.
df1_clean = clean(df1,"event")

In [None]:
# Same extra stop words as used for training. The comma after the first
# entry matters: without it Python concatenates the adjacent literals into
# the single token 'pam_unixcronsessionby'.
stopwords1 = ['pam_unixcronsession', 'by', 'string', 'from', 'bye', 'for', 'port', 'sshd', 'ssh', 'root', 'preauth']

In [None]:
df1_clean = remStopWord(df1_clean, "event", stopwords1)

In [None]:
loaded_vectorizer = loadModel(path,'vectorizer_model')

In [None]:
vector_op1 = loaded_vectorizer.transform(df1_clean['event'])

In [None]:
loaded_pca = loadModel(path,'pca_model')

In [None]:
pca_data1 = loaded_pca.transform(vector_op1.todense())

In [None]:
loaded_model_kmeans = loadModel(path,'kmeans_model')

In [None]:
model_data1 = loaded_model_kmeans.predict(pca_data1)

In [None]:
model_data1

In [None]:
loaded_model2 = loadModel(path,'sgd_model')

In [None]:
model_data2 = loaded_model2.predict(pd.DataFrame(vector_op1.todense()))

In [None]:
model_data2