In [1]:
import os
import sys
import csv

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

In [2]:
CUR_DIR = os.getcwd()
CUR_DIR

'/Users/lugu/Documents/git-repositories/ds.lminho248@gmail.com/dacon_prediction_security_risk_through_log_analysis'

In [3]:
def load_data(fn, dtypes=None, delimiter=','):
    
    return pd.read_csv(fn)

In [4]:
trainPath = os.path.join(CUR_DIR, 'data', 'train.csv')
validPath = os.path.join(CUR_DIR, 'data', 'validation_sample.csv')
testPath = os.path.join(CUR_DIR, 'data', 'test.csv')

trainSet = load_data(trainPath)
validSet = load_data(validPath)
testSet = load_data(testPath)

print("Length of TrainSet : %d" % len(trainSet))
print("Length of TestSet : %d" % len(testSet))
print("Length of ValidSet : %d" % len(validSet))

Length of TrainSet : 472972
Length of TestSet : 1418916
Length of ValidSet : 3


## Data Preprocessing

In [5]:
trainSet['full_log'] = trainSet['full_log'].str.replace(r'[0-9]', '<num>', regex=True)
testSet['full_log'] = testSet['full_log'].str.replace(r'[0-9]', '<num>', regex=True)

In [6]:
trainText = list(trainSet['full_log'])
trainLevel = np.array(trainSet['level'])

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer='word', max_features=10000)

trainFeatures = vectorizer.fit_transform(trainText)
trainFeatures

<472972x10000 sparse matrix of type '<class 'numpy.int64'>'
	with 16452589 stored elements in Compressed Sparse Row format>

## Modeling

In [8]:
from sklearn.model_selection import train_test_split

TEST_SIZE = 0.2
RANDOM_SEED = 42

trainX, validX, trainY, validY = train_test_split(
                                    trainFeatures, trainLevel, 
                                    test_size=TEST_SIZE, random_state=RANDOM_SEED
                                )

In [9]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=100)

forest.fit(trainX, trainY)

RandomForestClassifier()

In [10]:
forest.score(validX, validY)

0.9978857233469

In [11]:
pred = forest.predict(validX)
crosstab = pd.crosstab(validY, pred, rownames=['real'], colnames=['pred'])

crosstab

pred,0,1,2,3,4,5
real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,66723,61,0,0,0,2
1,116,26430,0,0,0,1
2,0,0,2,0,0,0
3,1,1,0,799,0,0
4,0,0,0,0,3,0
5,12,3,0,0,0,438
6,1,0,0,2,0,0


In [12]:
# 새로운 위험요소에 대한 가정 추가
# 예측치의 예측 확률이 90% 이하인 경우, 즉 확신이 없을 경우 이상치로 판단
preds = forest.predict(validX)
probas = forest.predict_proba(validX)
print(preds.shape)
print(probas.shape)

(94595,)
(94595, 7)


In [13]:
preds[np.where(np.max(probas, axis=1) < 0.90)] = 7
newCrosstab = pd.crosstab(validY, preds, rownames=['real'], colnames=['pred'])

newCrosstab

pred,0,1,2,3,4,5,7
real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,66511,1,0,0,0,0,274
1,23,26406,0,0,0,0,118
2,0,0,2,0,0,0,0
3,0,0,0,790,0,0,11
4,0,0,0,0,2,0,1
5,3,0,0,0,0,432,18
6,0,0,0,0,0,0,3


## Prediction

In [14]:
testText = list(testSet['full_log'])
testIds = list(testSet['id'])

In [15]:
testFeatures = vectorizer.transform(testText)

In [16]:
results = forest.predict(testFeatures)
resultsProba = forest.predict_proba(testFeatures)
results[np.where(np.max(resultsProba, axis=1) < 0.90)] = 7

In [17]:
submissionPath = os.path.join(CUR_DIR, 'data', 'sample_submission.csv')

submission = load_data(submissionPath)

In [18]:
submission['level'] = results

submission

Unnamed: 0,id,level
0,1000000,0
1,1000001,0
2,1000002,1
3,1000003,0
4,1000004,1
...,...,...
1418911,2418911,0
1418912,2418912,0
1418913,2418913,1
1418914,2418914,0


In [19]:
submission.to_csv('baseline.csv', index=False)