# Divide & Conquer

In [1]:
import numpy as np
import pandas as pd

import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import GridSearchCV

## 데이터 가져오기

In [2]:
train_set = pd.read_csv('train_set.csv')
train_set = train_set[train_set['ECLO'] <= 20]
test_set = pd.read_csv('test_set.csv')

## train test valid 분리

In [37]:
train_dead_x, test_dead_x, train_dead_y, test_dead_y = train_test_split(train_set[['요일', '기상상태', '도로형태', '노면상태', '사고유형']], train_set['사망자수'], test_size=0.2, random_state=42)
traind_heavy_x, test_heavy_x, train_heavy_y, test_heavy_y = train_test_split(train_set[['요일', '기상상태', '도로형태', '노면상태', '사고유형']], train_set['중상자수'], test_size=0.2, random_state=42)
train_light_x, test_light_x, train_light_y, test_light_y = train_test_split(train_set[['요일', '기상상태', '도로형태', '노면상태', '사고유형']], train_set['경상자수'], test_size=0.2, random_state=42)
train_patients_x, test_patients_x, train_patients_y, test_patients_y = train_test_split(train_set[['요일', '기상상태', '도로형태', '노면상태', '사고유형']], train_set['부상자수'], test_size=0.2, random_state=42)
train_x, test_x, train_y, test_y = train_test_split(train_set[['요일', '기상상태', '도로형태', '노면상태', '사고유형']], train_set['ECLO'], test_size=0.2, random_state=42)

In [38]:
train_dead_x, valid_dead_x, train_dead_y, valid_dead_y = train_test_split(train_dead_x, train_dead_y, test_size=0.2, random_state=42)
train_heavy_x, valid_heavy_x, train_heavy_y, valid_heavy_y = train_test_split(traind_heavy_x, train_heavy_y, test_size=0.2, random_state=42)
train_light_x, valid_light_x, train_light_y, valid_light_y = train_test_split(train_light_x, train_light_y, test_size=0.2, random_state=42)
train_patients_x, valid_patients_x, train_patients_y, valid_patients_y = train_test_split(train_patients_x, train_patients_y, test_size=0.2, random_state=42)
train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

# 모델 정의

In [39]:
model_dead = xgb.XGBRegressor(gpu_id=0, seed=42)
model_heavy = xgb.XGBRegressor(gpu_id=0, seed=42)
model_light = xgb.XGBRegressor(gpu_id=0, seed=42)
model_patients = xgb.XGBRegressor(gpu_id=0, seed=42)

## 모델 학습

In [63]:
n_estimators = [i for i in range(10, 100, 10)]
depthes = [i for i in range(5, 11)]

In [64]:
xgb_params = {
    'n_estimators' : n_estimators,
    'max_depth' : depthes
}

In [65]:
grid_dead = GridSearchCV(model_dead, param_grid=xgb_params, cv=3, refit=True, n_jobs=10, scoring='neg_mean_squared_log_error')
grid_dead.fit(train_dead_x, train_dead_y, eval_set=[(valid_dead_x, valid_dead_y)])
test_dead_result = []
for i in grid_dead.predict(test_dead_x):
    if i > 0:
        test_dead_result.append(i)
    else:
        test_dead_result.append(0)
grid_dead.best_params_

 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]


[0]	validation_0-rmse:0.11493
[1]	validation_0-rmse:0.11466
[2]	validation_0-rmse:0.11452
[3]	validation_0-rmse:0.11445
[4]	validation_0-rmse:0.11441
[5]	validation_0-rmse:0.11439
[6]	validation_0-rmse:0.11439
[7]	validation_0-rmse:0.11438
[8]	validation_0-rmse:0.11437
[9]	validation_0-rmse:0.11438


{'max_depth': 5, 'n_estimators': 10}

In [66]:
grid_heavy = GridSearchCV(model_heavy, param_grid=xgb_params, cv=3, refit=True, n_jobs=10, scoring='neg_mean_squared_log_error')
grid_heavy.fit(train_heavy_x, train_heavy_y, eval_set=[(valid_heavy_x, valid_heavy_y)])
test_heavy_result = []
for i in model_heavy.predict(test_heavy_x):
    if i > 0:
        test_heavy_result.append(i)
    else:
        test_heavy_result.append(0)
grid_heavy.best_params_

         nan         nan         nan -0.10788914         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan]


[0]	validation_0-rmse:0.51040
[1]	validation_0-rmse:0.50916
[2]	validation_0-rmse:0.50854
[3]	validation_0-rmse:0.50822
[4]	validation_0-rmse:0.50807
[5]	validation_0-rmse:0.50798
[6]	validation_0-rmse:0.50793
[7]	validation_0-rmse:0.50789
[8]	validation_0-rmse:0.50787
[9]	validation_0-rmse:0.50783


{'max_depth': 5, 'n_estimators': 10}

In [67]:
grid_light = GridSearchCV(model_light, param_grid=xgb_params, cv=3, refit=True, n_jobs=10, scoring='neg_mean_squared_log_error')
grid_light.fit(train_light_x, train_light_y, eval_set=[(valid_light_x, valid_light_y)])
test_light_result = []
for i in model_light.predict(test_light_x):
    if i > 0:
        test_light_result.append(i)
    else:
        test_light_result.append(0)
grid_light.best_params_

         nan         nan         nan -0.17952347         nan         nan
         nan         nan         nan         nan         nan         nan
 -0.17954491         nan         nan         nan         nan         nan
         nan         nan         nan -0.17959266         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan]


[0]	validation_0-rmse:0.91711
[1]	validation_0-rmse:0.90694
[2]	validation_0-rmse:0.90192
[3]	validation_0-rmse:0.89952
[4]	validation_0-rmse:0.89837
[5]	validation_0-rmse:0.89773
[6]	validation_0-rmse:0.89744
[7]	validation_0-rmse:0.89729
[8]	validation_0-rmse:0.89721
[9]	validation_0-rmse:0.89705


{'max_depth': 6, 'n_estimators': 10}

In [69]:
grid_patients = GridSearchCV(model_patients, param_grid=xgb_params, cv=3, refit=True, n_jobs=10, scoring='neg_mean_squared_log_error')
grid_patients.fit(train_patients_x, train_patients_y, eval_set=[(valid_patients_x, valid_patients_y)])
test_patients_result = []
for i in model_patients.predict(test_patients_x):
    if i > 0:
        test_patients_result.append(i)
    else:
        test_patients_result.append(0)
grid_patients.best_params_

         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan]


[0]	validation_0-rmse:0.34917
[1]	validation_0-rmse:0.34897
[2]	validation_0-rmse:0.34888
[3]	validation_0-rmse:0.34883
[4]	validation_0-rmse:0.34880
[5]	validation_0-rmse:0.34879
[6]	validation_0-rmse:0.34878
[7]	validation_0-rmse:0.34878
[8]	validation_0-rmse:0.34878
[9]	validation_0-rmse:0.34879


{'max_depth': 5, 'n_estimators': 10}

## 성능 측정

In [70]:
print('사망자수 :', np.sqrt(mean_squared_log_error(test_dead_y, test_dead_result)))
print('중상자수 :', np.sqrt(mean_squared_log_error(test_heavy_y, test_heavy_result)))
print('경상자수 :', np.sqrt(mean_squared_log_error(test_light_y, test_light_result)))
print('부상자수 :', np.sqrt(mean_squared_log_error(test_patients_y, test_patients_result)))

사망자수 : 0.07969607110437096
중상자수 : 0.3284977832244147
경상자수 : 0.4244114524305828
부상자수 : 0.2102914007452747


In [71]:
test_dead_result = np.array(test_dead_result, dtype=float)
test_heavy_result = np.array(test_heavy_result, dtype=float)
test_light_result = np.array(test_light_result, dtype=float)
test_patients_result = np.array(test_patients_result, dtype=float)
np.sqrt(mean_squared_log_error(test_y, test_dead_result * 10 + test_heavy_result * 5 + test_light_result * 3 + test_patients_result * 1))

0.44009525234153585

## 결과 저장

In [72]:
ECLOs = \
    model_dead.predict(test_set[['요일', '기상상태', '도로형태', '노면상태', '사고유형']]) * 10 \
    + model_heavy.predict(test_set[['요일', '기상상태', '도로형태', '노면상태', '사고유형']]) * 5 \
    + model_light.predict(test_set[['요일', '기상상태', '도로형태', '노면상태', '사고유형']]) * 3 \
    + model_patients.predict(test_set[['요일', '기상상태', '도로형태', '노면상태', '사고유형']])

In [73]:
ans = pd.read_csv('open/sample_submission.csv')
ans['ECLO'] = ECLOs
ans.to_csv('ans_div_con.csv', index=False)