**References**

[1] Dataset from AI Competition for Predicting Security Risk Level through Log Analysis, https://dacon.io/competitions/official/235717/data<br>
[2] Baseline from AI Competition for Predicting Security Risk Level through Log Analysis, https://dacon.io/competitions/official/235717/codeshare/2536?page=1&dtype=recent<br>
[3] Removing redundant log records, https://github.com/Kitsunetic/log-analytics

**Mount Google Drive and change the working directory to the one that contains the data**
* This is necessary only when you use Google Drive

In [None]:
import os, shutil
from google.colab import drive

# Mount Google Drive inside the Colab VM.
drive.mount("/content/gdrive")

# "/content/gdrive/My Drive/" is the location where you will start
# when you connect to your own Google Drive.
# The path is absolute so that re-running this cell still works after the
# working directory has already been changed by a previous run.
data_dir = "/content/gdrive/My Drive/Test/DACON_Log"
os.chdir(data_dir)

Mounted at /content/gdrive


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

**Load data and remove redundant log records**

In [None]:
# Remove all log records that have exactly the same contents but different levels,
# because we cannot know which level is the correct one.
# We have confirmed that the log records of levels 2, 4, and 6, which have few
# instances, do not have such a case.
all_data = pd.read_csv('train.csv')

# db maps: log text -> level -> {"cnt": number of rows, "list": row positions}
db = {}
for i, (level, full_log) in enumerate(zip(all_data.level, all_data.full_log)):
    per_level = db.setdefault(full_log, {})
    entry = per_level.setdefault(level, {"cnt": 0, "list": []})
    entry["cnt"] += 1
    entry["list"].append(i)
keys = list(db)

# Collect the rows of every log text that was seen with more than one level.
index_list = [
    row
    for key in keys
    if len(db[key]) > 1
    for entry in db[key].values()
    for row in entry["list"]
]
all_data = all_data.drop(index_list)

**Split train and test data**

In [None]:
# Stratified split: for every level, the first `portion` of its records goes to
# the training set and the remainder goes to the test set.
portion = 0.5
split_seed = 42  # fixed seed so that the row shuffling below is reproducible

# One sub-frame per level (levels 0..6) and the number of training rows taken from it.
data_list = [all_data[all_data['level'] == level] for level in range(7)]
data_count = [int(data.shape[0] * portion) for data in data_list]

train = pd.concat([data[:count] for data, count in zip(data_list, data_count)])
test = pd.concat([data[count:] for data, count in zip(data_list, data_count)])

# Shuffle the rows once (the concatenation above leaves them grouped by level)
# and rebuild a clean 0..n-1 index.
train = train.sample(frac=1, random_state=split_seed).reset_index(drop=True)
test = test.sample(frac=1, random_state=split_seed).reset_index(drop=True)

**Data preprocessing**

In [None]:
# Remove all uninformative parts of the log text.
# BE CAREFUL that the order of the substitution rules below matters.

# True  -> matched parts are deleted from the log text
# False -> matched parts are replaced by a placeholder token (<time>, <ip>, <val>, <num>)
remove_uninformative_tokens = True

# (regular expression, placeholder token) pairs, applied from top to bottom.
log_substitutions = [
    # Date & Time #1: 2021-02-19T21:08:00 (ISO 8601 Dates and Times)
    (r'(?:\d{4})-(?:\d{2})-(?:\d{2})T(?:\d{2}):(?:\d{2}):(?:\d{2}(?:\.\d*)?)(?:(?:[+-](?:\d{2}):(?:\d{2})|Z)?)', '<time>'),
    # Date & Time #2: 0000-00-00 00:00:00
    (r'(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2})', '<time>'),
    # Date & Time #3: Jan 1 00:00:00
    (r'(\b\d{1,2}\D{0,3})?\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|(Nov|Dec)(?:ember)?)\D*(\d{1,2}(st|nd|rd|th)?)?(([,.\-\/])\D?)?((19[7-9]\d|20\d{2})|\d{2})* (\d{2}):(\d{2}):(\d{2})', '<time>'),
    # IP address (optionally with a port)
    (r'(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)', '<ip>'),
    # Combination of alphanumeric and special characters
    (r"([A-Za-z]+[0-9]|[0-9]+[A-Za-z])[A-Za-z0-9]*[><?@+'`~^%&\*\[\]\{\}.!#|\\\"$';,:;=/\(\),\-\w+]*", '<val>'),
    # Value part of a key=value pair
    (r'(?<==)([A-Za-z]+[0-9]|[0-9]+[A-Za-z])[A-Za-z0-9]*', '<val>'),
    # Numbers
    (r'[-+]?([0-9]*[.])?[0-9]+([eE][-+]?\d+)?', '<num>'),
]


def _scrub_logs(logs, substitutions, drop_tokens):
    """Apply every substitution rule, in order, to a Series of log strings.

    Parameters
    ----------
    logs : pandas.Series
        Series of log strings to clean.
    substitutions : list of (str, str)
        Ordered (regular expression, placeholder) pairs.
    drop_tokens : bool
        When true, matches are replaced by the empty string; otherwise they are
        replaced by the rule's placeholder.

    Returns
    -------
    pandas.Series
        The cleaned log strings.
    """
    for pattern, placeholder in substitutions:
        # regex=True is passed explicitly: since pandas 2.0 the default is
        # regex=False, which would treat the patterns as literal strings.
        logs = logs.str.replace(pattern, '' if drop_tokens else placeholder, regex=True)
    return logs


train['full_log'] = _scrub_logs(train['full_log'], log_substitutions, remove_uninformative_tokens)
test['full_log'] = _scrub_logs(test['full_log'], log_substitutions, remove_uninformative_tokens)




In [None]:
# Pull the model inputs (log text as a Python list) and the targets
# (levels as a NumPy array copy) out of each frame.
train_text = train['full_log'].tolist()
train_level = train['level'].to_numpy(copy=True)

test_text = test['full_log'].tolist()
test_level = test['level'].to_numpy(copy=True)

In [None]:
# Do vectorization with CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Settings of the bag-of-words model.
# vocabulary_size: 200 is used here; 5000 and 500 are the listed alternatives.
# ngram_span: (1, 1) selects unigrams; (2, 2) is the bigram alternative.
vocabulary_size = 200
ngram_span = (1, 1)

vectorizer = CountVectorizer(
    analyzer="word",
    max_features=vocabulary_size,
    ngram_range=ngram_span,
)

# Learn the vocabulary from the training logs and count the terms of each log.
train_features = vectorizer.fit_transform(train_text)

In [None]:
# Just for displaying the dimensions of the feature matrix returned by
# vectorizer.fit_transform (the cell output shows its shape and storage format)
train_features

<235943x500 sparse matrix of type '<class 'numpy.int64'>'
	with 7557257 stored elements in Compressed Sparse Row format>

In [None]:
# Just for showing the result of the vectorization:
# the total count of every vocabulary term over all training logs, most frequent first.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is used when available, with a fallback for older versions.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
counts = pd.DataFrame(train_features.sum(axis=0), columns=feature_names)
counts.T.sort_values(by=0, ascending=False)



Unnamed: 0,0
error,382318
type,347542
audit,310388
msg,250773
elasticsearch,224314
...,...
description,291
oss_telemetry,290
no_shard_available_action_exception,285
json,285


**Model training**

In [None]:
# Inputs (term-count matrix) and targets (levels) used to fit the model.
train_x, train_y = train_features, train_level

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A fixed random_state makes the fitted forest (and therefore the reported
# score and confusion matrix) reproducible from run to run.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest.fit(train_x, train_y)

RandomForestClassifier()

**Model test**

In [None]:
# Reuse the vocabulary learned from the training logs. Calling fit_transform here
# would re-fit the vectorizer on the test logs, producing columns that no longer
# line up with the features the forest was trained on.
test_features=vectorizer.transform(test_text)

In [None]:
# Just for showing the result of the vectorization:
# the total count of every vocabulary term over all test logs, most frequent first.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is used when available, with a fallback for older versions.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
counts = pd.DataFrame(test_features.sum(axis=0), columns=feature_names)
counts.T.sort_values(by=0, ascending=False)

Unnamed: 0,0
error,383506
type,348522
audit,311097
msg,251389
elasticsearch,225057
...,...
hits,292
limit,291
home,290
source,286


In [None]:
# Evaluation inputs and targets, then the forest's score on them.
eval_x, eval_y = test_features, test_level

forest.score(eval_x, eval_y)

0.9891966348089597

In [None]:
# Analyze the result with a confusion matrix:
# rows are the real levels, columns are the levels predicted by the forest.
pred = forest.predict(eval_x)
crosstab = pd.crosstab(
    index=eval_y,
    columns=pred,
    rownames=['real'],
    colnames=['pred'],
)
crosstab

pred,0,1,3,4,5,6
real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,166139,310,278,1,2,0
1,153,65527,356,0,5,0
2,0,1,5,0,0,0
3,1105,2,880,0,82,0
4,0,0,0,5,0,0
5,21,12,215,0,842,0
6,0,1,0,0,0,3
