# xgboost

In [5]:
import numpy as np
import pandas as pd

import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error

## 데이터 가져오기

In [6]:
train_set = pd.read_csv('train_set.csv')
train_set = train_set[train_set['ECLO'] <= 25]
test_set = pd.read_csv('test_set.csv')

## train test valid 분리

In [7]:
train_x, test_x, train_y, test_y = train_test_split(train_set[['요일', '기상상태', '도로형태', '노면상태', '사고유형']], train_set['ECLO'], test_size=0.2)

In [8]:
train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, test_size=0.2)

## xgboost 모델 정의

In [9]:
model = xgb.XGBRegressor(gpu_id=0, seed=42)

In [10]:
model.fit(train_x, train_y, eval_set=[(valid_x, valid_y)],
            early_stopping_rounds=300, verbose=False)



## gridsearch

In [11]:
n_estimators = [10, 100, 1000, 10000]
depthes = [5, 10, 15, 20]
max_idx = [0, 0]

In [12]:
tmp = []
for i in n_estimators:
    del model
    model = xgb.XGBRegressor(gpu_id=0, seed=42)
    model.set_params(**{'n_estimators':i})
    model.fit(train_x, train_y, eval_set=[(valid_x, valid_y)],
            early_stopping_rounds=300, verbose=False)
    tmp.append(np.sqrt(mean_squared_log_error(test_y, model.predict(test_x))))

max_idx[0] = np.argmin(tmp)
tmp



[0.4472752252787772,
 0.4485371270239892,
 0.4485371270239892,
 0.4485371270239892]

In [18]:
max_idx

[0, 0]

In [24]:
tmp = []
for i in depthes:
    del model
    model = xgb.XGBRegressor(gpu_id=0, seed=42)
    model.set_params(**{'n_estimators':10, 'max_depth':i})
    model.fit(train_x, train_y, eval_set=[(valid_x, valid_y)],
            early_stopping_rounds=300, verbose=False)
    tmp.append(np.sqrt(mean_squared_log_error(test_y, model.predict(test_x))))

max_idx[1] = np.argmin(tmp)
tmp



[0.44682088708620815,
 0.44873293437816325,
 0.44961810449174205,
 0.4496259249071217]

In [25]:
max_idx

[0, 0]

In [26]:
del model
model = xgb.XGBRegressor(gpu_id=0, seed=42)
model.set_params(**{'n_estimators':10, 'max_depth':5})
model.fit(train_x, train_y, eval_set=[(valid_x, valid_y)],
            early_stopping_rounds=300, verbose=False)



## 성능 측정

In [27]:
np.sqrt(mean_squared_log_error(test_y, model.predict(test_x)))

0.44682088708620815

## 결과 저장

In [29]:
ECLOs = model.predict(test_set[['요일', '기상상태', '도로형태', '노면상태', '사고유형']])
ECLOs

array([4.1614995, 3.6551104, 5.095342 , ..., 4.893866 , 4.9667215,
       4.9667215], dtype=float32)

In [30]:
min(ECLOs)

2.6877718

In [32]:
ans = pd.read_csv('open/sample_submission.csv')
ans['ECLO'] = ECLOs
ans.to_csv('ans_xgb.csv', index=False)