# Baseline Modeling

## 1. Setting Up Modules

In [1]:
# load modules
import numpy as np
import pandas as pd

# split
from sklearn.model_selection import train_test_split

# models 
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# metrics
from sklearn.metrics import mean_squared_log_error

In [2]:
# Read the training and test CSV files (in that order) into DataFrames.
train, test = map(pd.read_csv, ('../Data/train.csv', '../Data/test.csv'))

In [3]:
# Keep only the data that is needed.
# Train: a fixed list of feature columns plus the target column '사망자수'.
use_train = train.loc[
    :,
    ['사고일시', '요일', '기상상태', '도로형태', '노면상태', '사고유형', '시군구', '사망자수'],
]
# Test: every column except 'ID' (Index.difference returns the names sorted).
use_test = test.loc[:, test.columns.difference(['ID'])]

## 2. Data Preprocessing

In [4]:
# One-hot encoding of the nominal accident attributes.
#
# The test frame is encoded with the category levels learned from the train
# frame. Encoding each frame independently makes the resulting dummy columns
# (and the baseline level removed by drop_first=True) depend on which levels
# happen to occur in each frame, so train and test can silently disagree.
onehot_cols = ['기상상태', '도로형태', '노면상태', '사고유형']

# Casting to 'category' fixes the set of levels per column; get_dummies then
# emits one column per level (minus the first), exactly as it does for the
# raw columns.
train_nominal = use_train[onehot_cols].astype('category')
onehot = pd.get_dummies(train_nominal, drop_first=True)
use_train2 = pd.concat([use_train, onehot], axis=1).drop(onehot_cols, axis=1)

# Reuse train's categorical dtypes for test: levels absent from test still get
# an all-zero dummy column, and values unseen in train become NaN and encode
# as an all-zero row.
test_nominal = test[onehot_cols].astype(train_nominal.dtypes.to_dict())
onehot_test = pd.get_dummies(test_nominal, drop_first=True)
use_test2 = pd.concat([use_test, onehot_test], axis=1).drop(onehot_cols, axis=1)

In [5]:
# Day of week -> weekend flag: Monday-Friday = 0, Saturday-Sunday = 1.
weekday_names = ['월요일', '화요일', '수요일', '목요일', '금요일']

for frame in (use_train2, use_test2):
    is_weekday = frame['요일'].isin(weekday_names)
    frame['주말'] = np.where(is_weekday, 0, 1)

# The raw day-of-week column is no longer needed once the flag exists.
use_train3 = use_train2.drop(columns='요일')
use_test3 = use_test2.drop(columns='요일')

In [6]:
# Split the accident timestamp ('사고일시').

## First convert the column to pandas datetime values in both frames.
for frame in (use_train3, use_test3):
    frame['사고일시'] = pd.to_datetime(frame['사고일시'])

In [7]:
# Derive year, month, day and hour columns from the accident timestamp.
for frame in (use_train3, use_test3):
    timestamp = frame['사고일시'].dt
    for part in ('year', 'month', 'day', 'hour'):
        frame[part] = getattr(timestamp, part)

In [8]:
# Split the region string ('시군구') on spaces into three columns.
# NOTE(review): the assignment needs the split to yield exactly three columns,
# i.e. presumably every value has the form "<시> <구> <동가>" — confirm.
region_parts = ['시', '구', '동가']
# Columns removed after the split: the raw timestamp, the raw region string,
# the first region part and the year.
dropped_cols = ['사고일시', '시군구', '시', 'year']

use_train3[region_parts] = use_train3['시군구'].str.split(' ', expand=True)
use_train4 = use_train3.drop(columns=dropped_cols)

use_test3[region_parts] = use_test3['시군구'].str.split(' ', expand=True)
use_test4 = use_test3.drop(columns=dropped_cols)

In [9]:
# Type change (attributed to Sang-hyeok in the original note):
# store the two remaining region parts as pandas 'category' columns.
for frame in (use_train4, use_test4):
    for region_col in ('구', '동가'):
        frame[region_col] = frame[region_col].astype('category')

In [11]:
# Add every feature column that exists in train but not in test, filled with 0.
# The original cell hard-coded only '기상상태_안개' (there is no fog in the test
# data, so that dummy column is absent). This version covers any such column
# and leaves columns that already exist in the test frame untouched instead of
# overwriting them with 0.
missing_in_test = (
    use_train4.columns
    .difference(use_test4.columns)
    .difference(['사망자수'])  # the target is not a feature
)
for missing_col in missing_in_test:
    use_test4[missing_col] = 0

## 3. Modeling

In [12]:
## 3. hold-out train/validation split (30% held out, fixed seed)
# Index.difference returns the feature names sorted, which fixes column order.
feature_names = use_train4.columns.difference(['사망자수'])
features = use_train4[feature_names]
target = use_train4['사망자수']

x_train, x_valid, y_train, y_valid = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=42,
)

In [13]:
# Positional indices of the feature columns whose dtype is 'category'.
# After reset_index, column 0 holds the dtypes and the row index is the
# column position within x_train.
labels = x_train.dtypes.reset_index()
is_category = labels[0] == 'category'
categorical_cols = labels.index[is_category].tolist()

In [14]:
# Models: three gradient-boosting regressors, each configured with a Poisson
# objective and the same random seed.
xgb_params = {
    'objective': 'count:poisson',
    'random_state': 42,
    'use_label_encoder': False,
    'enable_categorical': True,
    'tree_method': 'hist',
}
lgbm_params = {
    'objective': 'poisson',
    'random_state': 42,
}
cb_params = {
    'cat_features': categorical_cols,  # positional indices of category columns
    'objective': 'Poisson',
    'random_state': 42,
}

xgb = XGBRegressor(**xgb_params)
lgbm = LGBMRegressor(**lgbm_params)
cb = CatBoostRegressor(**cb_params)

In [15]:
# Fitting: XGBoost and LightGBM train on the training split only.
for booster in (xgb, lgbm):
    booster.fit(x_train, y_train)

# CatBoost additionally receives the validation split as eval_set, keeps the
# best iteration measured on it, and draws its live training plot.
cb.fit(
    x_train,
    y_train,
    eval_set=(x_valid, y_valid),
    use_best_model=True,
    plot=True,
)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 298
[LightGBM] [Info] Number of data points in the train set: 27726, number of used features: 22
[LightGBM] [Info] Start training from score -4.887792


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.8618622	test: 0.8619330	best: 0.8619330 (0)	total: 472ms	remaining: 7m 51s
1:	learn: 0.7484133	test: 0.7485016	best: 0.7485016 (1)	total: 562ms	remaining: 4m 40s
2:	learn: 0.6524432	test: 0.6526774	best: 0.6526774 (2)	total: 653ms	remaining: 3m 37s
3:	learn: 0.5660794	test: 0.5662360	best: 0.5662360 (3)	total: 769ms	remaining: 3m 11s
4:	learn: 0.4986906	test: 0.4989890	best: 0.4989890 (4)	total: 849ms	remaining: 2m 48s
5:	learn: 0.4414234	test: 0.4418494	best: 0.4418494 (5)	total: 918ms	remaining: 2m 32s
6:	learn: 0.3919861	test: 0.3925812	best: 0.3925812 (6)	total: 1.02s	remaining: 2m 24s
7:	learn: 0.3500069	test: 0.3507135	best: 0.3507135 (7)	total: 1.05s	remaining: 2m 10s
8:	learn: 0.3134048	test: 0.3142272	best: 0.3142272 (8)	total: 1.17s	remaining: 2m 8s
9:	learn: 0.2815096	test: 0.2824084	best: 0.2824084 (9)	total: 1.27s	remaining: 2m 5s
10:	learn: 0.2537776	test: 0.2547556	best: 0.2547556 (10)	total: 1.37s	remaining: 2m 3s
11:	learn: 0.2304418	test: 0.2315149	best: 0

<catboost.core.CatBoostRegressor at 0x1e8fce110d0>

In [242]:
# Predict on the validation split with each fitted model.
pred_xgb, pred_lgbm, pred_cb = (
    model.predict(x_valid) for model in (xgb, lgbm, cb)
)

In [243]:
# Metrics: root mean squared logarithmic error (RMSLE) on the validation split.
def _rmsle(y_true, y_pred):
    """Return the RMSLE between the true and predicted values.

    Computed as the square root of scikit-learn's mean squared log error.
    The `squared=False` argument used previously is deprecated in
    scikit-learn 1.4 and removed in 1.6; taking the square root explicitly
    gives the same value on every version.
    """
    return np.sqrt(mean_squared_log_error(y_true, y_pred))


rmsle_xgb = _rmsle(y_valid, pred_xgb)
rmsle_lgbm = _rmsle(y_valid, pred_lgbm)
rmsle_cb = _rmsle(y_valid, pred_cb)

print(f'xgboost : {rmsle_xgb}')
print(f'lightgbm : {rmsle_lgbm}')
print(f'catboost : {rmsle_cb}')

xgboost : 0.06535774685603425
lightgbm : 0.06426787080362707
catboost : 0.06283688668842609


In [290]:
# Produce test predictions with the best-performing model (CatBoost).
# Test columns are selected in the same order as the training features.
test_features = use_test4.loc[:, x_train.columns]
pred_test = cb.predict(test_features)

In [294]:
# Build the submission file: take the sample submission, set the '사망자수'
# column to the test predictions, and save it as a cp949-encoded CSV.
sample_submission = pd.read_csv('../Data/sample_submission.csv')
baseline_submission = sample_submission.assign(**{'사망자수': pred_test})
baseline_submission.to_csv(
    '../Data/baseline_사망자수.csv',
    index=False,
    encoding='cp949',
)