In [1]:
import random
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings(action='ignore') 

In [2]:
def seed_everything(seed):
    """Seed the random number generators this notebook relies on.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    exports the seed through the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed applied to every generator.
    """
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    # setting it here presumably only affects child processes -- confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)  # fix the seed for the whole notebook

In [3]:
# Load the raw training table from the CSV file in the working directory.
train_df = pd.read_csv(filepath_or_buffer='./train.csv')

In [4]:
# Derive calendar features (year, month, day) from the timestamp text so the
# model can pick up temporal patterns.
# The slices read characters 0-3, 5-6 and 8-9, i.e. they assume a
# 'YYYY-MM-DD...' layout. Vectorised .str slicing replaces the per-row lambda.
train_df['year'] = train_df['timestamp'].str[0:4].astype('int64')
train_df['month'] = train_df['timestamp'].str[5:7].astype('int64')
train_df['day'] = train_df['timestamp'].str[8:10].astype('int64')

In [9]:
# Separate the regression target from the model inputs.
# The identifier, the raw timestamp text, the supply column and the target
# itself are excluded from the inputs.
train_y = train_df.loc[:, 'price(원/kg)']
train_x = train_df.drop(['ID', 'timestamp', 'supply(kg)', 'price(원/kg)'], axis=1)

In [11]:
# Convert the categorical (qualitative) columns to integer codes.
qual_col = ['item', 'corporation', 'location']

# Keep every fitted encoder, keyed by column name, so the identical mapping can
# be applied to other data later (or inverted to recover the original labels).
label_encoders = {}

for col in qual_col:
    le = LabelEncoder()
    train_x[col] = le.fit_transform(train_x[col])
    label_encoders[col] = le
    # For test data, reuse the fitted encoder with transform() only; fitting
    # on test data would be data leakage:
    # test_x[col] = label_encoders[col].transform(test_x[col])

print('Done.')

Done.


In [6]:
from sklearn.model_selection import train_test_split

In [12]:
# Hold out 20% of the rows for validation (80/20 split).
# random_state pins the shuffle so the split no longer depends on the global
# NumPy RNG state, i.e. on which cells happened to run before this one.
# NOTE: this cell rebinds train_x/train_y to the 80% subset, so re-running it
# without re-running the earlier cells splits the already-reduced data again.
# NOTE(review): rows are shuffled at random although the features are
# date-derived; if the goal is forecasting future prices, a chronological
# split may be more appropriate -- confirm.
train_x, val_x, train_y, val_y = train_test_split(
    train_x, train_y, test_size=0.2, random_state=42
)

In [24]:
# Baseline model: random forest with default hyper-parameters.
# random_state makes the bootstrap/feature sampling repeatable between runs;
# n_jobs=-1 trains the trees on all cores, matching the boosted models below.
model = RandomForestRegressor(random_state=42, n_jobs=-1)
model.fit(train_x, train_y)

In [14]:
from sklearn.metrics import mean_squared_error

In [22]:
def result_report(model, name):
    """Print the train and validation RMSE of a fitted regressor.

    Reads the notebook-level ``train_x``/``train_y`` and ``val_x``/``val_y``
    variables, so the train/validation split must already exist when this is
    called.

    Args:
        model: Fitted estimator exposing ``predict``.
        name: Label printed above the two scores.
    """
    train_pred = model.predict(train_x)
    val_pred = model.predict(val_x)

    # Take the square root of the MSE explicitly: the ``squared=False`` flag
    # of mean_squared_error was deprecated in scikit-learn 1.4 and removed in
    # 1.6, whereas this form works on every version.
    train_rmse = np.sqrt(mean_squared_error(train_y, train_pred))
    val_rmse = np.sqrt(mean_squared_error(val_y, val_pred))
    print(name)
    print('Train RMSE :', train_rmse)
    print('Validation RMSE :', val_rmse)

In [25]:
# Score the random forest currently bound to `model` on both splits.
result_report(model=model, name='Random Forest')

Random Forest
Train RMSE : 439.79034575698597
Validation RMSE : 1199.5721039917535


In [26]:
from xgboost import XGBRegressor

# Gradient-boosted trees with early stopping monitored on the validation split.
# early_stopping_rounds is configured on the estimator: passing it to fit()
# was deprecated in XGBoost 1.6 and removed in 2.0.
# random_state pins any randomised behaviour of the booster between runs.
# NOTE(review): in the recorded run the validation RMSE was still decreasing
# at the final round, so early stopping apparently never triggered; more
# rounds or a larger learning rate may be worth trying.
# NOTE(review): the same validation split drives early stopping and the
# reported score, so that score is presumably somewhat optimistic.
model = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    n_jobs=-1,
    random_state=42,
    early_stopping_rounds=100,
)
model.fit(train_x, train_y, eval_set=[(val_x, val_y)], verbose=100)

[0]	validation_0-rmse:2251.62511
[100]	validation_0-rmse:1232.17228
[200]	validation_0-rmse:1205.57148
[300]	validation_0-rmse:1191.92982
[400]	validation_0-rmse:1179.54287
[500]	validation_0-rmse:1166.08458
[600]	validation_0-rmse:1155.65529
[700]	validation_0-rmse:1147.99722
[800]	validation_0-rmse:1139.02033
[900]	validation_0-rmse:1132.09019
[999]	validation_0-rmse:1126.23905


In [27]:
# Score the XGBoost model currently bound to `model` on both splits.
result_report(model=model, name='XGBoost')

XGBoost
Train RMSE : 1015.3071411058783
Validation RMSE : 1126.2390493304895


In [40]:
# conda install lightgbm

In [41]:
from lightgbm import LGBMRegressor

In [43]:
# LightGBM 4.0 removed the early_stopping_rounds/verbose arguments of fit();
# early stopping and periodic logging are requested through callbacks instead.
from lightgbm import early_stopping, log_evaluation

# random_state pins LightGBM's randomised behaviour between runs.
model = LGBMRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=-1, random_state=42)
model.fit(
    train_x,
    train_y,
    eval_set=[(val_x, val_y)],
    callbacks=[
        early_stopping(stopping_rounds=100),  # stop after 100 rounds without improvement
        log_evaluation(period=100),  # print the validation metric every 100 rounds
    ],
)

[100]	valid_0's l2: 1.51024e+06
[200]	valid_0's l2: 1.45589e+06
[300]	valid_0's l2: 1.42425e+06
[400]	valid_0's l2: 1.3961e+06
[500]	valid_0's l2: 1.37313e+06
[600]	valid_0's l2: 1.35144e+06
[700]	valid_0's l2: 1.32915e+06
[800]	valid_0's l2: 1.31092e+06
[900]	valid_0's l2: 1.29673e+06
[1000]	valid_0's l2: 1.28612e+06


In [44]:
# Score the LightGBM model currently bound to `model` on both splits.
result_report(model=model, name='LightGBM')

LightGBM
Train RMSE : 994.2266918184708
Validation RMSE : 1134.072719818625
