# 라이브러리 호출

In [2]:
import pandas as pd
import random
import os
import numpy as np
import sklearn as sk
import sklearn.ensemble as sken
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
from tqdm import tqdm
import lightgbm as lgb 

In [3]:
# Notebook-wide settings: reproducibility seed, CV fold count, and data file paths.
CONFIG = {
    # Reproducibility / cross-validation
    'seed': 21,
    'n_splits': 5,
    # Output files
    'sample_submission': './data/sample_submission.csv',
    'submission': './data/submission.csv',
    # Input files
    'train': './data/train.csv',
    'test': './data/test.csv',
}

In [4]:
def seed_everything(seed):
    """Seed the Python and NumPy global random generators.

    Also writes ``seed`` (as a string) to the ``PYTHONHASHSEED`` environment
    variable of the current process.

    Args:
        seed: Value passed to ``random.seed`` and ``np.random.seed``.
    """
    for reseed in (random.seed, np.random.seed):
        reseed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

# Fix the global seeds from the single source of truth in CONFIG, so changing
# CONFIG['seed'] cannot silently diverge from the value used here.
seed_everything(CONFIG['seed'])

In [5]:
# Read the training and test tables (in that order) from the CSV paths stored in CONFIG.
train, test = (pd.read_csv(CONFIG[split]) for split in ('train', 'test'))

In [6]:
# Column names used as model inputs; the order here is the column order of the
# feature matrix built from them.
feature = [
    '가입일',
    '음성사서함이용',
    '주간통화시간',
    '주간통화횟수',
    '주간통화요금',
    '저녁통화시간',
    '저녁통화횟수',
    '저녁통화요금',
    '밤통화시간',
    '밤통화횟수',
    '밤통화요금',
    '상담전화건수',
]

In [7]:
# Build the NumPy inputs for training and inference.
x_train = train[feature].values
y_train = train['전화해지여부'].values
# Select the test columns through the same `feature` list (and therefore the
# same column order) as the training matrix, instead of "everything except ID".
# This keeps the two matrices aligned even if the test CSV has extra columns or
# a different column order.
# NOTE: `test` is rebound from a DataFrame to an ndarray here, so this cell
# cannot be re-run without first re-running the cell that loads the CSVs.
test = test[feature].values

In [23]:
# Stratified K-fold training: fit one XGBoost classifier per fold, report its
# accuracy on the held-out fold, then predict the test set with every model.
kf = StratifiedKFold(n_splits = CONFIG['n_splits'], shuffle = True, random_state = CONFIG['seed'])
models = []
n_estimators = 555  # upper bound on boosting rounds; early stopping may end training sooner

# XGBoost eval_metric options, for reference:
#   rmse     : root mean square error
#   mae      : mean absolute error
#   logloss  : negative log-likelihood
#   error    : binary classification error rate (threshold 0.5)
#   merror   : multiclass classification error rate
#   mlogloss : multiclass logloss
#   auc      : area under the curve
for idxm, (train_index, valid_index) in enumerate(kf.split(x_train, y_train)):
    # X_test / Y_test are the held-out validation fold, not the competition test set.
    X_train, X_test = x_train[train_index], x_train[valid_index]
    Y_train, Y_test = y_train[train_index], y_train[valid_index]
    # NOTE(review): with learning_rate=1e-3 and early_stopping_rounds=2 the
    # logged runs stop after only 2-5 boosting rounds, because the thresholded
    # "error" metric barely moves between rounds. Consider a larger patience
    # and/or learning rate, or a continuous metric such as "logloss".
    model = xgb.XGBClassifier(
        n_estimators = n_estimators,       # previously defined but never passed to the model
        random_state = CONFIG['seed'],     # tie the model's RNG to the notebook seed
        eval_metric = "error",
        learning_rate = 1e-3,
        min_child_weight = 1,
        max_depth = 5,
        early_stopping_rounds = 2)
    # The validation fold drives early stopping and the per-round log output.
    model.fit(X_train, Y_train, eval_set=[(X_test, Y_test)])
    print(f"Score : {model.score(X_test, Y_test)}")
    models.append(model)

# One list of hard class predictions per fold model: shape (n_models, n_test_rows).
preds = [model.predict(test).tolist() for model in models]

[0]	validation_0-error:0.10828
[1]	validation_0-error:0.10828
[2]	validation_0-error:0.10811
[3]	validation_0-error:0.10811
Score : 0.891887417218543
[0]	validation_0-error:0.10894
[1]	validation_0-error:0.10894
[2]	validation_0-error:0.10877
[3]	validation_0-error:0.10877
Score : 0.8912251655629139
[0]	validation_0-error:0.10679
[1]	validation_0-error:0.10662
[2]	validation_0-error:0.10629
[3]	validation_0-error:0.10662
[4]	validation_0-error:0.10629
Score : 0.8937086092715232
[0]	validation_0-error:0.10877
[1]	validation_0-error:0.10894
Score : 0.8912251655629139
[0]	validation_0-error:0.10811
[1]	validation_0-error:0.10811
Score : 0.891887417218543


In [24]:
# Majority vote over the fold models' hard 0/1 predictions.
# pred_vec has one row per model and one column per test row.
pred_vec = np.asarray(preds)
if pred_vec.ndim != 2:
    # Happens when no model produced predictions (or the rows are ragged);
    # fail loudly instead of building a meaningless submission.
    raise ValueError(f"expected a 2-D (n_models, n_rows) prediction array, got shape {pred_vec.shape}")

# Count the votes for each class per test row in one vectorised pass.
votes_for_0 = (pred_vec == 0).sum(axis=0)
votes_for_1 = (pred_vec == 1).sum(axis=0)
# Strict '>' means class 0 wins ties, matching the previous
# `x.index(max(x))` behaviour (first maximum wins).
ensemble = (votes_for_1 > votes_for_0).astype(int).tolist()

In [25]:
# Fill the sample submission with the ensembled labels and write it to disk.
submit = pd.read_csv(CONFIG['sample_submission'])
submit['전화해지여부'] = ensemble
# Use the configured output path instead of repeating the literal, so the
# destination is controlled from CONFIG only.
submit.to_csv(CONFIG['submission'], index = False)
# Last expression of the cell, so the preview is actually displayed.
submit.head()