In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import f1_score

In [None]:
# Never truncate column text so complete `full_log` values can be inspected.
# `None` is the supported "unlimited" value; the old `-1` sentinel is deprecated
# and rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

  """Entry point for launching an IPython kernel.


# Data

In [None]:
DIR = './data'
MASKING_VER = 'masked_03'
SUBMISSION_FILE = f'{DIR}/submission_extra_{MASKING_VER}_drop_1'

# Read the four CSV files in a fixed order; each one uses its first column as
# the DataFrame index.
train, validation, test, submission = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        f'{DIR}/train_{MASKING_VER}_drop.csv',
        f'{DIR}/validation_{MASKING_VER}.csv',
        f'{DIR}/test_{MASKING_VER}.csv',
        f'{DIR}/sample_submission.csv',
    )
)

# Quick sanity check of the loaded shapes.
train.shape, validation.shape, test.shape, submission.shape

  mask |= (ar1 == a)


((472550, 3), (3, 1), (1418916, 1), (1418916, 1))

In [None]:
train['level'].value_counts()  # number of log rows per level in the training data

0    334020
1    132182
3    4139  
5    2180  
2    11    
4    10    
6    8     
Name: level, dtype: int64

In [None]:
# Raw log text is the model input; the level column is the target.
X_data, y_data = train['full_log'], train['level']

# Vectorize

In [None]:
# TF-IDF was judged to perform better than CountVectorizer for this model,
# so it is used here.
# NOTE(review): the vectorizer is fitted on every training row, including the
# rows that are later held out for evaluation, so the hold-out scores are
# computed with IDF statistics that have already seen those rows.
VOCAB_SIZE = 10000

vectorizer = TfidfVectorizer(max_features=VOCAB_SIZE, analyzer='word')
X_features = vectorizer.fit_transform(X_data)

In [None]:
# Display the vectorized training matrix (shape, dtype and storage format).
X_features

<472550x9980 sparse matrix of type '<class 'numpy.float64'>'
	with 17482703 stored elements in Compressed Sparse Row format>

# Train

## Split

In [None]:
# Hold out 8% of the vectorized rows for evaluation; stratifying on the level
# keeps the class proportions the same in both parts.
X_train, X_eval, y_train, y_eval = train_test_split(
    X_features,
    y_data,
    test_size=0.08,
    stratify=y_data,
    random_state=100,
)

X_train.shape, y_train.shape, X_eval.shape, y_eval.shape

((434746, 9980), (434746,), (37804, 9980), (37804,))

In [None]:
y_train.value_counts()  # per-level row counts in the training part after the split

0    307298
1    121608
3    3808  
5    2006  
2    10    
4    9     
6    7     
Name: level, dtype: int64

In [None]:
y_eval.value_counts()  # per-level row counts in the held-out part produced by test_size

0    26722
1    10574
3    331  
5    174  
2    1    
4    1    
6    1    
Name: level, dtype: int64

## Extra

RandomForestClassifier보다 훨씬 더 무작위성을 갖는 모델인 ExtraTreesClassifier를 사용

In [None]:
# Extra-Trees draws random split thresholds, so an unseeded model gives
# different scores and predictions on every run. Fix the seed (same value as
# the train/eval split) to make the notebook reproducible.
clf = ExtraTreesClassifier(n_estimators=100, n_jobs=-1, random_state=100)
clf.fit(X_train, y_train)

ExtraTreesClassifier(n_jobs=-1)

In [None]:
# Class-probability estimates for the held-out rows (one column per class known to clf).
proba = clf.predict_proba(X_eval)

In [None]:
# The columns of `proba` follow the order of `clf.classes_`, so translate the
# arg-max column index into the actual level label instead of assuming that
# column i always means level i (that breaks if a rare level is absent from
# the training split).
pred = clf.classes_[np.argmax(proba, axis=-1)]
crosstab = pd.crosstab(y_eval, pred, rownames=['real'], colnames=['pred'])

print(f1_score(y_eval, pred, average='macro'))  # macro F1 is the competition metric
crosstab

0.9983030090558923


pred,0,1,2,3,4,5,6
real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,26721,1,0,0,0,0,0
1,21,10553,0,0,0,0,0
2,0,0,1,0,0,0,0
3,3,0,0,328,0,0,0
4,0,0,0,0,1,0,0
5,2,0,0,0,0,172,0
6,0,0,0,0,0,0,1


In [None]:
THRESHOLD = 0.9  # confidence threshold, set to 0.9 for every level

In [None]:
# Open-set rule: a row whose highest class probability is below THRESHOLD is
# relabelled as level 7; all other rows keep their predicted level.
pred_open = np.where(proba.max(axis=1) < THRESHOLD, 7, pred)
new_crosstab = pd.crosstab(y_eval, pred_open, rownames=['real'], colnames=['pred'])

# y_eval holds no level-7 rows, so a plain macro average would add a class
# whose F1 is 0 and deflate the score by roughly one eighth regardless of model
# quality. Average only over the levels that really occur in y_eval; rows sent
# to level 7 still count as misses for their true level.
print(f1_score(y_eval, pred_open, average='macro', labels=np.unique(y_eval)))
new_crosstab

0.8726583249476694


pred,0,1,2,3,4,5,6,7
real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,26658,0,0,0,0,0,0,64
1,7,10552,0,0,0,0,0,15
2,0,0,1,0,0,0,0,0
3,2,0,0,326,0,0,0,3
4,0,0,0,0,1,0,0,0
5,1,0,0,0,0,171,0,2
6,0,0,0,0,0,0,1,0


In [None]:
# Class probabilities for the complete vectorized training set. X_features
# includes the rows the classifier was fitted on, so scores derived from this
# are not an unbiased estimate.
proba_all = clf.predict_proba(X_features)

In [None]:
# Map the arg-max column index to its level through `clf.classes_`; the column
# order of predict_proba is the order of `clf.classes_`, which only coincides
# with 0..n-1 when every level is present in the fitted data.
pred_all = clf.classes_[np.argmax(proba_all, axis=-1)]
crosstab = pd.crosstab(y_data, pred_all, rownames=['real'], colnames=['pred'])

print(f1_score(y_data, pred_all, average='macro'))  # macro F1 over the full training set
crosstab

0.9998634765468999


pred,0,1,2,3,4,5,6
real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,334019,1,0,0,0,0,0
1,23,132159,0,0,0,0,0
2,0,0,11,0,0,0,0
3,3,0,0,4136,0,0,0
4,0,0,0,0,10,0,0
5,2,0,0,0,0,2178,0
6,0,0,0,0,0,0,8


In [None]:
# Apply the same open-set rule to the full training set: rows whose top class
# probability is below THRESHOLD become level 7.
pred_open_all = np.where(proba_all.max(axis=1) < THRESHOLD, 7, pred_all)
new_crosstab = pd.crosstab(y_data, pred_open_all, rownames=['real'], colnames=['pred'])

# y_data contains no level-7 rows; without `labels` the macro average would
# include level 7 with F1 = 0 and understate the score. Average over the levels
# present in y_data only.
print(f1_score(y_data, pred_open_all, average='macro', labels=np.unique(y_data)))
new_crosstab

0.874812626617933


pred,0,1,2,3,4,5,6,7
real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,333954,0,0,0,0,0,0,66
1,8,132158,0,0,0,0,0,16
2,0,0,11,0,0,0,0,0
3,2,0,0,4134,0,0,0,3
4,0,0,0,0,10,0,0,0
5,1,0,0,0,0,2177,0,2
6,0,0,0,0,0,0,8,0


## Validate

In [None]:
# Check the model on the validation samples: vectorize their log text with the
# already-fitted vectorizer, then get class probabilities.
X_valid = vectorizer.transform(validation['full_log'])
valid_proba = clf.predict_proba(X_valid)

In [None]:
# Most likely level per row, looked up through `clf.classes_` because the
# probability columns follow that order; rows whose top probability is below
# THRESHOLD are reported as level 7 instead.
results = np.where(
    valid_proba.max(axis=1) < THRESHOLD,
    7,
    clf.classes_[np.argmax(valid_proba, axis=-1)],
)

print(valid_proba)
results

[[0.01 0.99 0.   0.   0.   0.   0.  ]
 [0.01 0.   0.   0.22 0.   0.77 0.  ]
 [0.74 0.16 0.   0.01 0.   0.08 0.01]]


array([1, 7, 7], dtype=int64)

In [None]:
validation['full_log'][0]  # author's note: of the 3 validation samples, the first one is classified incorrectly (expected level is not visible here — TODO confirm)

'type=ANOM_PROMISCUOUS msg=audit(<NUM>.<NUM>:<NUM>): dev=enp2s0 prom=<NUM> old_prom=<NUM> auid=<NUM> uid=<NUM> gid=<NUM> ses=<NUM> type=SYSCALL msg=audit(<NUM>.<NUM>:<NUM>): arch=<NUM> syscall=<NUM> success=yes exit=<NUM> a0=<NUM> a1=<NUM> a2=<NUM> a3=<NUM> items=<NUM> ppid=<NUM> pid=<NUM> auid=<NUM> uid=<NUM> gid=<NUM> euid=<NUM> suid=<NUM> fsuid=<NUM> egid=<NUM> sgid=<NUM> fsgid=<NUM> tty=(none) ses=<NUM> comm="W#<NUM>-enp2s0" exe="/usr/sbin/suricata" subj=system_u:system_r:unconfined_service_t:s0 key=(null) type=PROCTITLE msg=audit(<NUM>.<NUM>:<NUM>): proctitle=<NUM>'

# Predict

In [None]:
# Vectorize the test log text with the fitted vectorizer and compute the class
# probabilities used for the submission.
X_test = vectorizer.transform(test['full_log'])
results_proba = clf.predict_proba(X_test)

In [None]:
# Final test labels: the most likely level (resolved through `clf.classes_`,
# whose order the probability columns follow), replaced by level 7 whenever the
# top probability is below THRESHOLD.
results = np.where(
    results_proba.max(axis=1) < THRESHOLD,
    7,
    clf.classes_[np.argmax(results_proba, axis=-1)],
)

# 결과 저장

In [None]:
# Store the thresholded test predictions in the submission frame.
# NOTE(review): SUBMISSION_FILE is defined earlier, but no write to it appears
# in this section — confirm the CSV is actually saved in a later cell.
submission['level'] = results
# Distribution of the predicted levels, ordered by level.
submission['level'].value_counts().sort_index()

0    1001107
1    396030 
2    34     
3    12908  
4    34     
5    6419   
6    25     
7    2359   
Name: level, dtype: int64