In [1]:
import pandas as pd
import numpy as np
import os

import json

from sklearn.model_selection  import train_test_split

## 데이터 불러오기

In [2]:
# Directory the preprocessed arrays are read from, and directory results are written to.
DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'

# File names of the training arrays inside DATA_IN_PATH.
TRAIN_Q1_DATA_FILE = 'train_q1.npy'
TRAIN_Q2_DATA_FILE = 'train_q2.npy'
TRAIN_LABEL_DATA_FILE = 'train_label.npy'


# Load the training data.
# The path is handed straight to np.load so NumPy opens and closes each file
# itself; wrapping it in a bare open(...) left the file handle unclosed.
train_q1_data = np.load(DATA_IN_PATH + TRAIN_Q1_DATA_FILE)
train_q2_data = np.load(DATA_IN_PATH + TRAIN_Q2_DATA_FILE)
train_labels = np.load(DATA_IN_PATH + TRAIN_LABEL_DATA_FILE)

In [5]:
# Display the dimensions of the first-question array.
np.shape(train_q1_data)

(298526, 31)

In [3]:
# Pair the two question arrays along a new axis 1, giving one (q1, q2) pair per sample.
question_pair = [train_q1_data, train_q2_data]
train_input = np.stack(question_pair, axis=1)

In [7]:
# Print the dimensions of the stacked input array.
print(np.shape(train_input))

(298526, 2, 31)


## 훈련, 검증 데이터셋 나누기

In [8]:
# Hold out 20% of the samples for validation; the fixed random_state keeps the
# split reproducible.
# NOTE: train_input is rebound to the training portion here, so this cell must
# not be re-run without first re-running the cell that builds train_input.
train_input, eval_input, train_label, eval_label = train_test_split(
    train_input,
    train_labels,
    test_size=0.2,
    random_state=4242,
)

## xgboost 불러오기

In [10]:
import xgboost as xgb

  from pandas import MultiIndex, Int64Index


## 모델 구성 및 학습

In [11]:
# Sum each input over axis 1 (the axis the two questions were stacked along)
# and wrap the result, together with its labels, in an XGBoost DMatrix.
train_data = xgb.DMatrix(np.sum(train_input, axis=1), label=train_label)  # training data
eval_data = xgb.DMatrix(np.sum(eval_input, axis=1), label=eval_label)  # evaluation data

# (DMatrix, name) pairs handed to xgb.train as its evaluation list.
data_list = list(zip((train_data, eval_data), ('train', 'valid')))

In [12]:
# Parameters passed to the XGBoost trainer.
params = {
    'objective': 'binary:logistic',  # logistic objective for the binary label
    'eval_metric': 'rmse',  # evaluate with root mean square error
}

# Train for at most 1000 boosting rounds, evaluating on data_list each round,
# with early stopping configured at 10 rounds.
bst = xgb.train(
    params,
    train_data,
    num_boost_round=1000,
    evals=data_list,
    early_stopping_rounds=10,
)

[0]	train-rmse:0.48371	valid-rmse:0.48430
[1]	train-rmse:0.47361	valid-rmse:0.47458
[2]	train-rmse:0.46684	valid-rmse:0.46820
[3]	train-rmse:0.46220	valid-rmse:0.46382
[4]	train-rmse:0.45864	valid-rmse:0.46047
[5]	train-rmse:0.45582	valid-rmse:0.45799
[6]	train-rmse:0.45374	valid-rmse:0.45604
[7]	train-rmse:0.45098	valid-rmse:0.45348
[8]	train-rmse:0.44942	valid-rmse:0.45216
[9]	train-rmse:0.44777	valid-rmse:0.45069
[10]	train-rmse:0.44670	valid-rmse:0.44980
[11]	train-rmse:0.44571	valid-rmse:0.44897
[12]	train-rmse:0.44512	valid-rmse:0.44848
[13]	train-rmse:0.44424	valid-rmse:0.44775
[14]	train-rmse:0.44264	valid-rmse:0.44623
[15]	train-rmse:0.44206	valid-rmse:0.44580
[16]	train-rmse:0.44154	valid-rmse:0.44538
[17]	train-rmse:0.44104	valid-rmse:0.44513
[18]	train-rmse:0.43992	valid-rmse:0.44412
[19]	train-rmse:0.43946	valid-rmse:0.44371
[20]	train-rmse:0.43890	valid-rmse:0.44322
[21]	train-rmse:0.43817	valid-rmse:0.44261
[22]	train-rmse:0.43713	valid-rmse:0.44176
[23]	train-rmse:0.436

## 테스트 데이터 가져오기

In [15]:
# File names of the test arrays inside DATA_IN_PATH.
TEST_Q1_DATA_FILE = 'test_q1.npy'
TEST_Q2_DATA_FILE = 'test_q2.npy'
TEST_ID_DATA_FILE = 'test_id.npy'

# Load the test data.
# The path is handed straight to np.load so NumPy opens and closes each file
# itself; wrapping it in a bare open(...) left the file handle unclosed.
test_q1_data = np.load(DATA_IN_PATH + TEST_Q1_DATA_FILE)
test_q2_data = np.load(DATA_IN_PATH + TEST_Q2_DATA_FILE)
# NOTE(review): allow_pickle=True lets np.load unpickle objects, which can
# execute arbitrary code if the file is not trusted. Presumably the id array
# was saved with an object dtype; confirm, and only load files you produced.
test_id_data = np.load(DATA_IN_PATH + TEST_ID_DATA_FILE, allow_pickle=True)

## 예측하기

In [16]:
# Build the test features the same way as the training features: stack the
# two question arrays along axis 1, then sum over that axis.
test_input = np.stack((test_q1_data, test_q2_data), axis=1)
test_data = xgb.DMatrix(test_input.sum(axis=1))

# The booster was trained with early stopping, so it can contain rounds trained
# after the best validation round. Restrict prediction to the rounds up to and
# including the best one instead of scoring with every trained round.
# (iteration_range requires XGBoost >= 1.4.)
best_round_count = bst.best_iteration + 1
test_predict = bst.predict(test_data, iteration_range=(0, best_round_count))

In [17]:
# Create the output directory if it is missing. exist_ok=True makes this a
# no-op when the directory already exists and avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(DATA_OUT_PATH, exist_ok=True)

# One row per test id with the model's prediction, written without the index column.
output = pd.DataFrame({'test_id': test_id_data, 'is_duplicate': test_predict})
output.to_csv(DATA_OUT_PATH + 'simple_xgb.csv', index=False)