In [5]:
import pandas as pd
# Show up to 100 rows and 100 columns when a frame is displayed.
# The full option names are spelled out: pandas resolves abbreviated names by
# pattern matching, which can break if a later release adds a similarly named option.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
import numpy as np
import os

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from lightgbm import LGBMRegressor

In [7]:
# Read every input CSV from the local data/ folder in one pass.
# The order of the paths must match the order of the names on the left-hand side below.
_CSV_PATHS = (
    "data/cus_info.csv",
    "data/iem_info_20210902.csv",
    "data/stk_bnc_hist.csv",
    "data/stk_hld_train.csv",
    "data/stk_hld_test.csv",
    "data/sample_submission.csv",
)
cus, iem, hist, train, test, submission = map(pd.read_csv, _CSV_PATHS)

In [22]:
# As mentioned earlier, the baseline trains the model on an artificially generated "hist_d" column.
# "hist_d" is set to 0.6 times "hold_d" (the stock holding period), truncated toward zero
# to a whole number.
# The model therefore learns to predict "hold_d" given that the stock has been held for "hist_d".
# NOTE(review): the saved preview of this frame shows hist_d equal to hold_d, which does not
# match the 0.6 factor used here -- re-run the notebook to confirm the displayed output.

train["hist_d"] = np.trunc(train["hold_d"] * 0.6)

In [23]:
# Preview the first three rows of train, including the "hist_d" column.
train.head(3)

Unnamed: 0,act_id,iem_cd,byn_dt,hold_d,hist_d
0,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A006360,20180726,11,11.0
1,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A005930,20180131,80,80.0
2,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A005070,20180517,5,5.0


In [24]:
# Attach customer info (cus_info) and stock info (iem_info) to both train and test.
# Left joins keep every row of train/test even when a key has no match in cus or iem.

train_data = (
    train
    .merge(cus, how="left", on=["act_id"])
    .merge(iem, how="left", on=["iem_cd"])
)

test_data = (
    test
    .merge(cus, how="left", on=["act_id"])
    .merge(iem, how="left", on=["iem_cd"])
)

In [25]:
# Preview the first three rows after the customer and stock columns were merged in.
train_data.head(3)

Unnamed: 0,act_id,iem_cd,byn_dt,hold_d,hist_d,sex_dit_cd,cus_age_stn_cd,ivs_icn_cd,cus_aet_stn_cd,mrz_pdt_tp_sgm_cd,lsg_sgm_cd,tco_cus_grd_cd,tot_ivs_te_sgm_cd,mrz_btp_dit_cd,iem_krl_nm,btp_cfc_cd,mkt_pr_tal_scl_tp_cd,stk_dit_cd
0,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A006360,20180726,11,11.0,1,9,3,2,2,9,5,5,8,GS건설,1,1,1
1,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A005930,20180131,80,80.0,1,9,3,2,2,9,5,5,8,삼성전자,9,1,1
2,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A005070,20180517,5,5.0,1,9,3,2,2,9,5,5,8,코스모신소재,12,2,99


In [26]:
# Extract the target (Y) from train_data and remove the hold_d column from it.
# pop() returns the column and deletes it from the frame in place.

train_label = train_data.pop("hold_d")

In [27]:
# A little extra preprocessing to build the final train and test feature tables.

# stk_p = tot_aet_amt / bnc_qty (presumably a per-unit price -- confirm against the data spec).
# A zero bnc_qty produces inf (or NaN for 0/0); fillna alone would leave the inf values in
# place, so they are turned into NaN first and then every missing value is filled with 0.
hist["stk_p"] = (hist["tot_aet_amt"] / hist["bnc_qty"]).replace([np.inf, -np.inf], np.nan)
hist = hist.fillna(0)

# The merge can multiply rows and the date filter can drop rows, so the target is carried
# through both steps and re-extracted afterwards; otherwise train_label could silently go
# out of step with the surviving feature rows. assign() builds a new frame, so train_data
# is left untouched if the merge fails.
train_data = pd.merge(train_data.assign(hold_d=train_label.to_numpy()),
                      hist, how = "left", on = ["act_id", "iem_cd"])
# Keep only the history row whose base date equals the purchase date.
train_data = train_data[(train_data["byn_dt"] == train_data["bse_dt"])]
train_data = train_data.reset_index(drop = True)
train_label = train_data.pop("hold_d")

test_data = pd.merge(test_data, hist, how = "left", on = ["act_id", "iem_cd"])
test_data = test_data[(test_data["byn_dt"] == test_data["bse_dt"])]
test_data = test_data.reset_index(drop = True)

# Predictions are later written row-for-row into submission, so a changed row count
# here means the date filter dropped or duplicated test rows.
if len(test_data) != len(submission):
    raise ValueError(
        f"test_data has {len(test_data)} rows after filtering on byn_dt == bse_dt, "
        f"but submission has {len(submission)} rows"
    )

# Drop identifiers and date keys that are not used as model features.
train_data = train_data.drop(["act_id", "iem_cd", "byn_dt", "bse_dt"], axis = 1)
test_data = test_data.drop(["act_id", "iem_cd", "byn_dt", "submit_id", "hold_d", "bse_dt"], axis = 1)

# Encode the stock name as an integer. The encoder is fitted on the full iem table so
# train and test share one mapping.
L_encoder = LabelEncoder()
L_encoder.fit(iem["iem_krl_nm"])
train_data["iem_krl_nm"] = L_encoder.transform(train_data["iem_krl_nm"])
test_data["iem_krl_nm"] = L_encoder.transform(test_data["iem_krl_nm"])

In [28]:
# Preview the first three rows of the processed training features.
train_data.head(3)

Unnamed: 0,hist_d,sex_dit_cd,cus_age_stn_cd,ivs_icn_cd,cus_aet_stn_cd,mrz_pdt_tp_sgm_cd,lsg_sgm_cd,tco_cus_grd_cd,tot_ivs_te_sgm_cd,mrz_btp_dit_cd,iem_krl_nm,btp_cfc_cd,mkt_pr_tal_scl_tp_cd,stk_dit_cd,bnc_qty,tot_aet_amt,stk_par_pr,stk_p
0,11.0,1,9,3,2,2,9,5,5,8,101,1,1,1,274.0,11782000.0,5000.0,43000.0
1,80.0,1,9,3,2,2,9,5,5,8,1361,9,1,1,2.0,4990000.0,5000.0,2495000.0
2,5.0,1,9,3,2,2,9,5,5,8,2530,12,2,99,786.0,14619600.0,1000.0,18600.0


In [29]:
# Preview the first three rows of the processed test features.
test_data.head(3)

Unnamed: 0,hist_d,sex_dit_cd,cus_age_stn_cd,ivs_icn_cd,cus_aet_stn_cd,mrz_pdt_tp_sgm_cd,lsg_sgm_cd,tco_cus_grd_cd,tot_ivs_te_sgm_cd,mrz_btp_dit_cd,iem_krl_nm,btp_cfc_cd,mkt_pr_tal_scl_tp_cd,stk_dit_cd,bnc_qty,tot_aet_amt,stk_par_pr,stk_p
0,153,1,9,3,2,2,9,5,5,8,418,4,1,1,300.0,3945000.0,5000.0,13150.0
1,335,1,9,3,2,2,9,5,5,8,2230,10,3,99,198.0,2524500.0,500.0,12750.0
2,139,1,9,3,2,2,9,5,5,8,1515,13,2,99,138.0,4291800.0,500.0,31100.0


In [30]:
# Give the features and the target a fresh 0..n-1 index, discarding the old one.
train_data = train_data.reset_index(drop=True)
train_label = train_label.reset_index(drop=True)

In [31]:
# Train one LightGBM regressor per fold of a 10-fold split and keep every fitted model
# in `models`.
models = []

folds = KFold(n_splits=10)
for train_idx, val_idx in folds.split(train_data):

    # KFold yields positional indices, so features and target are both selected with .iloc.
    train_x = train_data.iloc[train_idx, :]
    train_y = train_label.iloc[train_idx]
    val_x = train_data.iloc[val_idx, :]
    val_y = train_label.iloc[val_idx]

    model = LGBMRegressor(objective= "regression",
                          max_depth= 5,
                          n_estimators= 2000,
                          learning_rate= 0.01,
                          num_leaves = 31)

    # The held-out fold is the evaluation set: training stops once its score has not
    # improved for 300 rounds, and progress is logged every 500 rounds.
    # The metric name was previously corrupted by a stray paste ("rmtrain.head()se");
    # the saved training log reports rmse, so that is the metric restored here.
    # NOTE(review): early_stopping_rounds / verbose were removed from fit() in
    # LightGBM 4.0; switch to callbacks when upgrading.
    model.fit(train_x, train_y,
              eval_set=[(val_x, val_y)],
              eval_metric=["rmse"],
              early_stopping_rounds=300,
              verbose=500)

    models.append(model)

Training until validation scores don't improve for 300 rounds
[500]	valid_0's rmse: 4.81221	valid_0's l2: 23.1574
[1000]	valid_0's rmse: 4.58261	valid_0's l2: 21.0003
[1500]	valid_0's rmse: 4.56186	valid_0's l2: 20.8106
[2000]	valid_0's rmse: 4.5428	valid_0's l2: 20.637
Did not meet early stopping. Best iteration is:
[1999]	valid_0's rmse: 4.54242	valid_0's l2: 20.6336
Training until validation scores don't improve for 300 rounds
[500]	valid_0's rmse: 1.53335	valid_0's l2: 2.35117
Early stopping, best iteration is:
[586]	valid_0's rmse: 1.51763	valid_0's l2: 2.3032
Training until validation scores don't improve for 300 rounds
[500]	valid_0's rmse: 1.0532	valid_0's l2: 1.10922
Early stopping, best iteration is:
[567]	valid_0's rmse: 0.989849	valid_0's l2: 0.979801
Training until validation scores don't improve for 300 rounds
[500]	valid_0's rmse: 0.49342	valid_0's l2: 0.243464
[1000]	valid_0's rmse: 0.320453	valid_0's l2: 0.10269
[1500]	valid_0's rmse: 0.31769	valid_0's l2: 0.100927
[20

In [32]:
# Predict the test set with every fold model, then average the predictions
# element-wise across models.
result = [fold_model.predict(test_data) for fold_model in models]
predict = np.mean(result, axis=0)

In [33]:
# Display the averaged predictions.
predict

array([152.61107196, 334.68199433, 138.80052218, ..., 792.50358315,
        11.00602787,   4.01231495])

In [34]:
# Round the averaged predictions to whole numbers and store them as the hold_d column of submission.
submission["hold_d"] = np.round(predict)

In [35]:
# Write the submission to CSV without the index column.
submission.to_csv("dacon_baseline.csv", index = False)