In [1]:
# XGBoost 분류기 훈련
# 경사 부스팅 Gradient Boosting
# XGBoost : 악성코드 탐지기
# 이진트리를 생성해 노드를 겹쳐서 최종 값을 뽑아내는 형태

In [2]:
import pandas as pd

df = pd.read_csv('./data/file_pe_headers.csv',sep=',')
y = df['Malware']
X = df.drop(['Name','Malware'], axis=1).to_numpy()

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [4]:
# 오류창 끄는 옵션
import warnings
warnings.filterwarnings(action='ignore')

# XGBoost 모델 훈련
from xgboost import XGBClassifier

XGB_model_instance = XGBClassifier() # XGB 인스턴스
XGB_model_instance.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [5]:
# 모델 성능 평가
from sklearn.metrics import accuracy_score

y_test_pred = XGB_model_instance.predict(X_test)
accuracy_score(y_test, y_test_pred) # 테스트라벨, 예측라벨

0.9935418082936778

In [6]:
# statsmodels 시계열 분석
# 시계열 : 연속적인 시간에서 얻은 값들로 이뤄진 수열
# 시계열 예시 : 주식시장의 가격
# 사이버 공격(Cyber Attack)을 예측하는데 유용하다

# 랜덤 데이터 생성
from random import random

x = [x for x in range(1,100)]
time_series = [2*x + random() for x in x]
time_series

[2.3002209092674537,
 4.165726566952758,
 6.059715060622859,
 8.721731744849734,
 10.610639165249355,
 12.872569814508402,
 14.081810694234226,
 16.690177857399835,
 18.774710207341816,
 20.21012339441905,
 22.271328443872868,
 24.238475966409524,
 26.61353665729824,
 28.803321361804578,
 30.186172345641342,
 32.164422401786695,
 34.97519996387735,
 36.98335815312373,
 38.246163233418095,
 40.61772664747659,
 42.13011851962157,
 44.595045134476784,
 46.559654503344234,
 48.900822424499346,
 50.152566270123565,
 52.063820529887664,
 54.324668536025875,
 56.11076532134533,
 58.75924118049378,
 60.99342090524681,
 62.02467776514638,
 64.07245957915907,
 66.66705093781637,
 68.56589946618884,
 70.28070965410174,
 72.45568643982453,
 74.88798628490592,
 76.8329287393881,
 78.49863179704684,
 80.81892392443073,
 82.00666397674556,
 84.9601301586821,
 86.23223796748847,
 88.2237797461605,
 90.91884526098863,
 92.79459698502093,
 94.9252883776787,
 96.79226083695325,
 98.52628405170803,
 100.7

In [7]:
# 일반 선형그래프 출력
import plotly.graph_objects as go

fig = go.Figure(data=go.Scatter(x=x, y=time_series))
fig.show()

In [8]:
# 자동회귀 AR, Auto Gression
# 선형적인 특징에 의존한 머신러닝 기법
from statsmodels.tsa.ar_model import AR

model = AR(time_series)
model_fit = model.fit()
model_fit.predict(len(time_series), len(time_series)) # 100~100를 넣었을때 결과 예측

array([200.55231048])

In [9]:
model_fit.params # 매개변수

array([ 1.22648191e+01,  1.67085948e-01, -8.71071070e-03,  1.31179515e-01,
        5.91785571e-02,  1.47528128e-01,  1.21533886e-01,  5.06388980e-02,
        4.68215154e-02,  4.00846408e-02,  1.44929283e-01, -1.38020262e-01,
        2.37567019e-01])

In [10]:
# 이동 평균 MA, Moving Average
from statsmodels.tsa.arima_model import ARMA

model = ARMA(time_series, order=(0,1))
model_fit = model.fit(disp=False)
model_fit.predict(len(time_series), len(time_series)) # 100~100를 넣었을때 결과 예측

array([148.22192145])

In [11]:
model_fit.params # 매개변수

array([100.44815866,   0.99996446])

In [12]:
# 단순지수평활 SES, Simple Exponential Smoothing
from statsmodels.tsa.holtwinters import SimpleExpSmoothing

model = SimpleExpSmoothing(time_series)
model_fit = model.fit()
model_fit.predict(100,100)

array([198.95351068])