# **Part 1. 데이터 선정 이유 및 문제 정의**


> 데이터 선정 이유: 
* 올해 여름 한국 원자력 연구원이 사이버테러 공격을 당한 뉴스를 보고 이 데이터셋으로 로그데이터를 통한 보안 위험도 예측과 공격 탐지를 학습해보기로 결정하였습니다.


> 이번 프로젝트의 목적 및 데이터셋 활용 방안:
* 기존에 확인된 위험요소는 위험도를 0~6까지 총 7단계로 분류가 되어있고, 새로운 위험요소를 7로 지정하여 총 8단계로 분류하는 모델을 만들어볼 계획입니다. 새로운 형태의 침해시에는, 프로세스의 동작패턴이나 명령어의 사용종류가 달라지므로, 로그데이터의 분석을 통해서 기존에 없던 패턴의 새로운 공격에 직면하였을 때에 신속하고 정확하게 대비할 수 있도록 모델링을 해보겠습니다.


# **Part 2. 데이터셋과 필요한 라이브러리 호출**

### **1. 데이터 불러오기**

In [1]:
import pandas as pd
import numpy as np
import random
import re
import os
import gc

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import tensorflow as tf
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_colwidth', None)

In [2]:
train = pd.read_csv('/content/drive/MyDrive/security_log/train.csv')
test = pd.read_csv('/content/drive/MyDrive/security_log/test.csv')
sample_submit = pd.read_csv('/content/drive/MyDrive/security_log/sample_submission.csv')

In [3]:
print(train.shape)
train.head(3)

(472972, 3)


Unnamed: 0,id,level,full_log
0,0,0,"Sep 24 10:02:22 localhost kibana: {""type"":""error"",""@timestamp"":""2020-09-24T01:02:22Z"",""tags"":[""warning"",""stats-collection""],""pid"":6458,""level"":""error"",""error"":{""message"":""No Living connections"",""name"":""Error"",""stack"":""Error: No Living connections\n at sendReqWithConnection (/usr/share/kibana/node_modules/elasticsearch/src/lib/transport.js:226:15)\n at next (/usr/share/kibana/node_modules/elasticsearch/src/lib/connection_pool.js:214:7)\n at process._tickCallback (internal/process/next_tick.js:61:11)""},""message"":""No Living connections""}"
1,1,0,"Feb 8 16:21:00 localhost logstash: [2021-02-08T16:21:00,548][INFO ][logstash.outputs.elasticsearch] retrying failed action with response code: 503 ({""type""=>""unavailable_shards_exception"", ""reason""=>""[wazuh-alerts-audit-3.x-2021.16.08][3] primary shard is not active Timeout: [1m], request: [BulkShardRequest [[wazuh-alerts-audit-3.x-2021.16.08][3]] containing [25] requests]""})"
2,2,0,"Jan 13 01:50:40 localhost kibana: {""type"":""error"",""@timestamp"":""2021-01-12T16:50:40Z"",""tags"":[""warning"",""stats-collection""],""pid"":4332,""level"":""error"",""error"":{""message"":""No Living connections"",""name"":""Error"",""stack"":""Error: No Living connections\n at sendReqWithConnection (/usr/share/kibana/node_modules/elasticsearch/src/lib/transport.js:226:15)\n at next (/usr/share/kibana/node_modules/elasticsearch/src/lib/connection_pool.js:214:7)\n at process._tickCallback (internal/process/next_tick.js:61:11)""},""message"":""No Living connections""}"


In [4]:
print(test.shape)
test.head(3)

(1418916, 2)


Unnamed: 0,id,full_log
0,1000000,"Feb 8 15:47:26 localhost kibana: {""type"":""error"",""@timestamp"":""2021-02-08T06:47:26Z"",""tags"":[""warning"",""stats-collection""],""pid"":4604,""level"":""error"",""error"":{""message"":""[search_phase_execution_exception] all shards failed"",""name"":""Error"",""stack"":""[search_phase_execution_exception] all shards failed :: {\""path\"":\""/.kibana_task_manager/_doc/_search\"",\""query\"":{\""ignore_unavailable\"":true},\""body\"":\""{\\\""sort\\\"":[{\\\""task.runAt\\\"":\\\""asc\\\""},{\\\""_id\\\"":\\\""desc\\\""}],\\\""query\\\"":{\\\""bool\\\"":{\\\""must\\\"":[{\\\""term\\\"":{\\\""type\\\"":\\\""task\\\""}},{\\\""bool\\\"":{\\\""filter\\\"":{\\\""term\\\"":{\\\""_id\\\"":\\\""oss_telemetry-vis_telemetry\\\""}}}}]}}}\"",\""statusCode\"":503,\""response\"":\""{\\\""error\\\"":{\\\""root_cause\\\"":[],\\\""type\\\"":\\\""search_phase_execution_exception\\\"",\\\""reason\\\"":\\\""all shards failed\\\"",\\\""phase\\\"":\\\""query\\\"",\\\""grouped\\\"":true,\\\""failed_shards\\\"":[]},\\\""status\\\"":503}\""}\n at respond (/usr/share/kibana/node_modules/elasticsearch/src/lib/transport.js:308:15)\n at checkRespForFailure (/usr/share/kibana/node_modules/elasticsearch/src/lib/transport.js:267:7)\n at HttpConnector.<anonymous> (/usr/share/kibana/node_modules/elasticsearch/src/lib/connectors/http.js:166:7)\n at IncomingMessage.wrapper (/usr/share/kibana/node_modules/elasticsearch/node_modules/lodash/lodash.js:4935:19)\n at IncomingMessage.emit (events.js:194:15)\n at endReadableNT (_stream_readable.js:1103:12)\n at process._tickCallback (internal/process/next_tick.js:63:19)""},""message"":""[search_phase_execution_exception] all shards failed""}"
1,1000001,"Sep 24 03:46:39 localhost kibana: {""type"":""error"",""@timestamp"":""2020-09-23T18:46:39Z"",""tags"":[""warning"",""stats-collection""],""pid"":6458,""level"":""error"",""error"":{""message"":""No Living connections"",""name"":""Error"",""stack"":""Error: No Living connections\n at sendReqWithConnection (/usr/share/kibana/node_modules/elasticsearch/src/lib/transport.js:226:15)\n at next (/usr/share/kibana/node_modules/elasticsearch/src/lib/connection_pool.js:214:7)\n at process._tickCallback (internal/process/next_tick.js:61:11)""},""message"":""No Living connections""}"
2,1000002,"type=SYSCALL msg=audit(1611888200.428:210563): arch=c000003e syscall=257 success=yes exit=10 a0=ffffffffffffff9c a1=7f89fd9c9030 a2=90800 a3=0 items=1 ppid=1 pid=86518 auid=4294967295 uid=0 gid=980 euid=0 suid=0 fsuid=0 egid=980 sgid=980 fsgid=980 tty=(none) ses=4294967295 comm=""ossec-syscheckd"" exe=""/var/esild/bin/ossec-syscheckd"" subj=system_u:system_r:unconfined_service_t:s0 key=""audit-wazuh-r"" type=CWD msg=audit(1611888200.428:210563): cwd=""/"" type=PATH msg=audit(1611888200.428:210563): item=0 name=""/etc/java/security/security.d"" inode=34054258 dev=fd:00 mode=040755 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:etc_t:s0 objtype=NORMAL cap_fp=0000000000000000 cap_fi=0000000000000000 cap_fe=0 cap_fver=0 type=PROCTITLE msg=audit(1611888200.428:210563): proctitle=""/var/esild/bin/ossec-syscheckd"""


In [5]:
print(sample_submit.shape)
sample_submit.head(3)

(1418916, 2)


Unnamed: 0,id,level
0,1000000,0
1,1000001,0
2,1000002,0


### **2. 데이터 전처리**

> **중복값과 결측치 처리**

In [6]:
train.isnull().sum()

id          0
level       0
full_log    0
dtype: int64

In [7]:
test.isnull().sum()

id          0
full_log    0
dtype: int64

In [8]:
sample_submit.isnull().sum()

id       0
level    0
dtype: int64

> **데이터 전처리**

In [9]:
train.head(3)

Unnamed: 0,id,level,full_log
0,0,0,"Sep 24 10:02:22 localhost kibana: {""type"":""error"",""@timestamp"":""2020-09-24T01:02:22Z"",""tags"":[""warning"",""stats-collection""],""pid"":6458,""level"":""error"",""error"":{""message"":""No Living connections"",""name"":""Error"",""stack"":""Error: No Living connections\n at sendReqWithConnection (/usr/share/kibana/node_modules/elasticsearch/src/lib/transport.js:226:15)\n at next (/usr/share/kibana/node_modules/elasticsearch/src/lib/connection_pool.js:214:7)\n at process._tickCallback (internal/process/next_tick.js:61:11)""},""message"":""No Living connections""}"
1,1,0,"Feb 8 16:21:00 localhost logstash: [2021-02-08T16:21:00,548][INFO ][logstash.outputs.elasticsearch] retrying failed action with response code: 503 ({""type""=>""unavailable_shards_exception"", ""reason""=>""[wazuh-alerts-audit-3.x-2021.16.08][3] primary shard is not active Timeout: [1m], request: [BulkShardRequest [[wazuh-alerts-audit-3.x-2021.16.08][3]] containing [25] requests]""})"
2,2,0,"Jan 13 01:50:40 localhost kibana: {""type"":""error"",""@timestamp"":""2021-01-12T16:50:40Z"",""tags"":[""warning"",""stats-collection""],""pid"":4332,""level"":""error"",""error"":{""message"":""No Living connections"",""name"":""Error"",""stack"":""Error: No Living connections\n at sendReqWithConnection (/usr/share/kibana/node_modules/elasticsearch/src/lib/transport.js:226:15)\n at next (/usr/share/kibana/node_modules/elasticsearch/src/lib/connection_pool.js:214:7)\n at process._tickCallback (internal/process/next_tick.js:61:11)""},""message"":""No Living connections""}"


In [10]:
train = train.drop('id', axis=1)

In [11]:
train['level'].value_counts()

0    334065
1    132517
3      4141
5      2219
2        12
4        10
6         8
Name: level, dtype: int64

In [12]:
def pre(df):
  df = df.str.lower()
  word = ['mon','tue','wed','thu','fri','sat','sun','jan','feb','mar','apr','may','jun','jul','aug','sep','oct','nov','dec']
  df = df.apply(lambda x: re.sub(r'[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]',' ',x))
  df = df.apply(lambda x: ' '.join([w for w in x.split() if w not in (word)]))
  df = df.apply(lambda x: re.sub(r'[^a-zA-Zㄱ-ㅣ가-힣0-9:=\s\(\)./,\<\>]+',' ',x))
  df = df.str.replace(r'[0-9]', '')

  return df

In [13]:
train['full_log'] = pre(train.full_log)
test['full_log'] = pre(test.full_log)

In [14]:
train_text = list(train['full_log'])
train_level = np.array(train['level'])

> **토큰 벡터화**

In [15]:
# CountVectorizer로 벡터화
vectorizer = CountVectorizer(analyzer='word', max_features=10000)

train_features = vectorizer.fit_transform(train_text)

In [16]:
train_features.shape

(472972, 10000)

In [17]:
#TfidfVectorizer로 벡터화

tf_vectorizer = TfidfVectorizer(analyzer="word", max_features=10000)

train_Tfid = tf_vectorizer.fit_transform(train_text)

In [18]:
train_Tfid.shape

(472972, 10000)

# **3. 모델링 (Modeling)**

## **3-1. Random Forest Classifier**

In [19]:
# 훈련 데이터 셋과 검증 데이터 셋으로 분리

X_train, X_val, y_train, y_val = train_test_split(train_features, train_level, test_size=0.2, random_state=2)

In [20]:
# RandomForestClassifier로 모델링 => model_rf

model_rf = RandomForestClassifier(n_estimators=100,
                                  max_depth=5,
                                  class_weight='balanced',
                                  random_state=2)
model_rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=2, verbose=0,
                       warm_start=False)

In [21]:
# model_rf 모델 검증

model_rf.score(X_val, y_val)

0.9182726359744172

In [22]:
# crosstab으로 확인

pred_rf = model_rf.predict(X_val)
crosstab = pd.crosstab(y_val, pred_rf, rownames=['real'], colnames=['pred'])
crosstab

pred,0,1,2,3,4,5,6
real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,62220,0,12,679,2,61,3766
1,49,23408,18,2974,0,101,16
3,0,0,0,817,0,40,0
4,0,0,0,0,3,0,0
5,8,0,1,3,0,416,1


In [23]:
# 새로운 위험요소에 대한 가정 추가 => 예측치의 예측 확률이 0.90이하인 경우, 즉 확신이 없을 경우 이상치로 판단

preds = model_rf.predict(X_val)
probas = model_rf.predict_proba(X_val)
print(preds.shape)
print(probas.shape)

(94595,)
(94595, 7)


In [24]:
preds[np.where(np.max(probas, axis=1) < 0.90)] = 7
new_crosstab = pd.crosstab(y_val, preds, rownames=['real'], colnames=['pred'])
new_crosstab

pred,1,7
real,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,66740
1,15569,10997
3,0,857
4,0,3
5,0,429


In [25]:
# test 데이터 전처리

test_text = list(test['full_log'])
ids = list(test['id'])

In [26]:
# test 데이터 vectorizer

test_features=vectorizer.transform(test_text)

In [27]:
results = model_rf.predict(test_features)
results_proba = model_rf.predict_proba(test_features)

In [28]:
results[np.where(np.max(results_proba, axis=1) < 0.9)] = 7

In [29]:
sample_submit['level']=results

In [30]:
sample_submit['level'].value_counts()

7    1187780
1     231136
Name: level, dtype: int64

## **3-2. Tfid + Random Forest Classifier**

In [31]:
# 훈련 데이터 셋과 검증 데이터 셋으로 분리

X_train, X_val, y_train, y_val = train_test_split(train_Tfid, train_level, test_size=0.2, random_state=2)

In [32]:
model_rf2 = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=2)
model_rf2.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=2, verbose=0,
                       warm_start=False)

In [33]:
# model_rf2 모델 검증

model_rf2.score(X_val, y_val)

0.9975368676991384

In [34]:
# crosstab으로 확인

pred_rf2 = model_rf2.predict(X_val)
crosstab = pd.crosstab(y_val, pred_rf2, rownames=['real'], colnames=['pred'])
crosstab

pred,0,1,2,3,4,5
real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,66646,82,0,1,0,11
1,115,26444,1,0,0,6
3,1,0,0,855,0,1
4,0,0,0,0,3,0
5,15,0,0,0,0,414


## **3-3. Extra Trees Classifier**

In [35]:
model_et = ExtraTreesClassifier(n_estimators=100, random_state=2)

In [36]:
model_et.fit(X_train, y_train)

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100,
                     n_jobs=None, oob_score=False, random_state=2, verbose=0,
                     warm_start=False)

In [37]:
# model_et 모델 검증

model_et.score(X_val, y_val)

0.9979068661134309

In [38]:
# crosstab으로 확인

pred_et = model_et.predict(X_val)
crosstab = pd.crosstab(y_val, pred_et, rownames=['real'], colnames=['pred'])
crosstab

pred,0,1,3,4,5
real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,66694,45,1,0,0
1,134,26431,0,0,1
3,1,0,855,0,1
4,0,0,0,3,0
5,15,0,0,0,414


In [39]:
# 새로운 위험요소에 대한 가정 추가 => 예측치의 예측 확률이 0.90이하인 경우, 즉 확신이 없을 경우 이상치로 판단

preds = model_et.predict(X_val)
probas = model_et.predict_proba(X_val)
print(preds.shape)
print(probas.shape)

(94595,)
(94595, 7)


In [40]:
preds[np.where(np.max(probas, axis=1) < 0.90)] = 7
new_crosstab = pd.crosstab(y_val, preds, rownames=['real'], colnames=['pred'])
new_crosstab

pred,0,1,3,4,5,7
real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,66540,28,1,0,0,171
1,46,26418,0,0,1,101
3,1,0,855,0,1,0
4,0,0,0,3,0,0
5,6,0,0,0,413,10


In [41]:
# test 데이터 전처리

test_text = list(test['full_log'])
ids = list(test['id'])

In [42]:
#test 데이터 vectorizer

test_features = vectorizer.transform(test_text)

In [43]:
results = model_et.predict(test_features)
results_proba = model_et.predict_proba(test_features)

In [44]:
results[np.where(np.max(results_proba, axis=1) < 0.9)] = 7

In [45]:
sample_submit['level']=results

In [46]:
sample_submit['level'].value_counts()

0    946579
1    379726
7     81794
3      7645
5      3172
Name: level, dtype: int64