In [None]:
# (Re)load the autoreload extension and enable mode 2 so that edited modules
# are re-imported without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures directly in the cell output.
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd

import warnings
from pathlib import Path

from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

import xgboost as xgb

In [None]:
# Display settings for DataFrame output: up to 100 columns, 4 decimal places.
# The fully qualified key 'display.max_columns' is required: the abbreviated
# pattern 'max_columns' also matches 'styler.render.max_columns' in recent
# pandas releases, which makes set_option raise OptionError.
pd.set_option('display.max_columns', 100)
pd.set_option('display.precision', 4)

# Silence every warning for the rest of the notebook to keep cell output short.
warnings.filterwarnings('ignore')

# 데이터 로드

In [None]:
# # Local version (uncomment these paths when running outside Colab)

# data_dir = Path('../input/dankook')
# feature_dir = Path('../output/feature')
# val_dir = Path('../output/oof_pred')
# test_dir = Path('../output/test_pred')
# sub_dir = Path('../output/sub')


# train_file = data_dir / 'train.csv'
# test_file = data_dir / 'test.csv'
# sample_file = data_dir / 'sample_submission.csv'

In [None]:
# Colab

# Mount Google Drive; every directory below lives under the mounted drive.
from google.colab import drive
drive.mount('/content/drive')

# Input data directory and the output directories for features,
# out-of-fold predictions, test predictions and submissions.
data_dir = Path('/content/drive/My Drive/Colab Notebooks/input/dankook')
feature_dir = Path('/content/drive/My Drive/Colab Notebooks/output/feature')
val_dir = Path('/content/drive/My Drive/Colab Notebooks/output/oof_pred')
test_dir = Path('/content/drive/My Drive/Colab Notebooks/output/test_pred')
sub_dir = Path('/content/drive/My Drive/Colab Notebooks/output/sub')

# Raw competition files inside data_dir.
train_file = data_dir / 'train.csv'
test_file = data_dir / 'test.csv'
sample_file = data_dir / 'sample_submission.csv'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Naming scheme: every artifact written by this notebook is prefixed with
# '<algorithm_name>_<feature_name>'.
algorithm_name = 'xgbcv'
feature_name = 'polynomial_feature'
model_name = f'{algorithm_name}_{feature_name}'

# Input: the three versions of the polynomial feature table (read below).
polynomial_feature_Ver1_file = feature_dir / f'{feature_name}_Ver1.csv'
polynomial_feature_Ver2_file = feature_dir / f'{feature_name}_Ver2.csv'
polynomial_feature_Ver3_file = feature_dir / f'{feature_name}_Ver3.csv'

# Output: out-of-fold predictions on the training rows, one file per version.
xgb_oof_pred_ver1_file = val_dir / f'{model_name}_oof_pred_ver1.csv'
xgb_oof_pred_ver2_file = val_dir / f'{model_name}_oof_pred_ver2.csv'
xgb_oof_pred_ver3_file = val_dir / f'{model_name}_oof_pred_ver3.csv'

# Output: predictions on the test rows, one file per version.
xgb_test_pred_ver1_file = test_dir / f'{model_name}_test_pred_ver1.csv'
xgb_test_pred_ver2_file = test_dir / f'{model_name}_test_pred_ver2.csv'
xgb_test_pred_ver3_file = test_dir / f'{model_name}_test_pred_ver3.csv'

# Output: submission files holding the predicted class, one file per version.
xgb_submission_ver1_file = sub_dir / f'{model_name}_submission_Ver1.csv'
xgb_submission_ver2_file = sub_dir / f'{model_name}_submission_Ver2.csv'
xgb_submission_ver3_file = sub_dir / f'{model_name}_submission_Ver3.csv'

In [None]:
# Random seed passed to StratifiedKFold so the fold assignment is reproducible.
SEED = 2020
# Number of target classes; also the width of the prediction matrices.
num_class = 3
# Number of cross-validation folds.
n_splits = 5
# Name of the label column; rows where it equals -1 are treated as test rows.
target_column = 'class'

## Ver1 데이터 로드

In [None]:
# Read the Ver1 polynomial feature table; the first CSV column becomes the index.
dataset = pd.read_csv(polynomial_feature_Ver1_file, index_col=0)
print(dataset.shape)
# Preview the first rows as the cell output.
dataset.head()

In [None]:
# Split the combined Ver1 table into labeled training rows and unlabeled test
# rows. Rows whose target equals -1 are the test rows.
is_test = dataset[target_column] == -1

# train set
# drop() returns a new frame, so the slice of `dataset` is never mutated in
# place (in-place drops on a .loc slice trigger SettingWithCopyWarning).
Ver1_X = dataset.loc[~is_test, :].drop(columns=target_column)
# astype() returns a new Series; the result has to be kept for the cast to
# take effect.
Ver1_y = dataset.loc[~is_test, target_column].astype(int)

# test set
Ver1_test = dataset.loc[is_test, :].drop(columns=target_column)

print(Ver1_X.shape, Ver1_y.shape, Ver1_test.shape)

## Ver2 데이터 로드

In [None]:
# Read the Ver2 polynomial feature table; the first CSV column becomes the index.
# NOTE(review): the saved output of this cell is a ParserError. If the read
# fails, `dataset` still holds the previously loaded table - confirm the Ver2
# CSV is intact before trusting any Ver2 result downstream.
dataset = pd.read_csv(polynomial_feature_Ver2_file, index_col=0)
print(dataset.shape)
# Preview the first rows as the cell output.
dataset.head()

ParserError: ignored

In [None]:
# Split the combined Ver2 table into labeled training rows and unlabeled test
# rows. Rows whose target equals -1 are the test rows.
is_test = dataset[target_column] == -1

# train set
# drop() returns a new frame, so the slice of `dataset` is never mutated in
# place (in-place drops on a .loc slice trigger SettingWithCopyWarning).
Ver2_X = dataset.loc[~is_test, :].drop(columns=target_column)
# astype() returns a new Series; the result has to be kept for the cast to
# take effect.
Ver2_y = dataset.loc[~is_test, target_column].astype(int)

# test set
Ver2_test = dataset.loc[is_test, :].drop(columns=target_column)

print(Ver2_X.shape, Ver2_y.shape, Ver2_test.shape)

## Ver3 데이터 로드

In [None]:
# Read the Ver3 polynomial feature table; the first CSV column becomes the index.
dataset = pd.read_csv(polynomial_feature_Ver3_file, index_col=0)
print(dataset.shape)
# Preview the first rows as the cell output.
dataset.head()

In [None]:
# Split the combined Ver3 table into labeled training rows and unlabeled test
# rows. Rows whose target equals -1 are the test rows.
is_test = dataset[target_column] == -1

# train set
# drop() returns a new frame, so the slice of `dataset` is never mutated in
# place (in-place drops on a .loc slice trigger SettingWithCopyWarning).
Ver3_X = dataset.loc[~is_test, :].drop(columns=target_column)
# astype() returns a new Series; the result has to be kept for the cast to
# take effect.
Ver3_y = dataset.loc[~is_test, target_column].astype(int)

# test set
Ver3_test = dataset.loc[is_test, :].drop(columns=target_column)

print(Ver3_X.shape, Ver3_y.shape, Ver3_test.shape)

# 모델 학습 

## Xgboost 

In [None]:
# Xgboost
# Parameters for the native training API (xgb.train).
# 'n_estimators' is intentionally not listed: it belongs to the scikit-learn
# wrapper, while xgb.train takes the number of rounds from `num_boost_round`
# (together with early stopping) in the training cell.
# NOTE(review): 'gpu_hist' / 'gpu_predictor' are deprecated in xgboost >= 2.0
# in favour of tree_method='hist' with device='cuda' - confirm the installed
# version before changing them.
xgb_params = {
    'n_jobs' : -1,
    'eval_metric': 'mlogloss',
    'eta': 0.3, # learning_rate
    'booster': 'gbtree',
    'objective': 'multi:softprob',  # predict() returns one probability per class
    'num_class': num_class,  # shared notebook constant instead of a literal 3
    'random_state': SEED,  # shared notebook constant instead of a literal 2020
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor'
}

In [None]:
# Stratified K-fold training of XGBoost on each of the three feature versions.
#
# For every version this cell fills two matrices of class probabilities:
#   * xgb_oof_pred_verN  - out-of-fold predictions for the training rows
#   * xgb_test_pred_verN - test predictions averaged over the folds
# and appends the best validation mlogloss of every fold model to `mlogloss`.

mlogloss = []

xgb_oof_pred_ver1 = np.zeros((Ver1_X.shape[0], num_class))
# Was mistakenly created as a second `xgb_test_pred_ver2`, which left
# `xgb_test_pred_ver1` undefined (NameError on first use).
xgb_test_pred_ver1 = np.zeros((Ver1_test.shape[0], num_class))
xgb_oof_pred_ver2 = np.zeros((Ver2_X.shape[0], num_class))
xgb_test_pred_ver2 = np.zeros((Ver2_test.shape[0], num_class))
xgb_oof_pred_ver3 = np.zeros((Ver3_X.shape[0], num_class))
xgb_test_pred_ver3 = np.zeros((Ver3_test.shape[0], num_class))

# Each feature version is bundled with its own labels and output matrices.
# This replaces the dispatch on hard-coded column counts (527 / 903), which
# also indexed X.shape[2] on a 2-D frame and raised IndexError.
versions = [
    ('ver1', Ver1_X, Ver1_y, Ver1_test, xgb_oof_pred_ver1, xgb_test_pred_ver1),
    ('ver2', Ver2_X, Ver2_y, Ver2_test, xgb_oof_pred_ver2, xgb_test_pred_ver2),
    ('ver3', Ver3_X, Ver3_y, Ver3_test, xgb_oof_pred_ver3, xgb_test_pred_ver3),
]

for version, X, y, test, oof_pred, test_pred in versions:
    kFold = StratifiedKFold(n_splits=n_splits, random_state=SEED, shuffle=True)
    # The test matrix does not change between folds, so build it once.
    dtest = xgb.DMatrix(test)

    for trn_idx, val_idx in kFold.split(X, y):
        X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval = xgb.DMatrix(X_val, label=y_val)
        watchlist = [(dtrain, 'train'), (dval, 'val')]

        # Early stopping watches the last entry of `evals`, i.e. the validation fold.
        xgb_clf = xgb.train(
            params=xgb_params,
            dtrain=dtrain,
            num_boost_round=5000,
            evals=watchlist,
            early_stopping_rounds=50,
            verbose_eval=5000,
        )
        mlogloss.append(xgb_clf.best_score)

        # Predict
        # Every training row is in exactly one validation fold, so the
        # out-of-fold prediction is stored as is. The test prediction is the
        # mean over the folds. The earlier extra division by 4 made each row
        # sum to 0.25 instead of 1.
        # NOTE(review): confirm no downstream blend relied on that 1/4 weight.
        oof_pred[val_idx] = xgb_clf.predict(dval)
        test_pred += xgb_clf.predict(dtest) / n_splits

    print("*"*100)
    print("Traning has finished")

# Overall out-of-fold log loss per feature version.
for version, _, y, _, oof_pred, _ in versions:
    print(f'xgb {version} logloss= ', log_loss(y, oof_pred))

# 제출 파일 및 기타 파일 생성

In [None]:
# Create submission files

# The sample submission supplies the index and layout of the output file.
sub = pd.read_csv(sample_file,index_col=0)

# One submission per feature version: the predicted class is the column with
# the highest probability. Writing them in a loop also removes the unclosed
# `sub.to_csv(` call that made the whole cell a SyntaxError.
for test_pred, submission_file in [
    (xgb_test_pred_ver1, xgb_submission_ver1_file),  # Ver1
    (xgb_test_pred_ver2, xgb_submission_ver2_file),  # Ver2
    (xgb_test_pred_ver3, xgb_submission_ver3_file),  # Ver3
]:
    sub[target_column] = np.argmax(test_pred, axis=1)
    sub.to_csv(submission_file)

In [None]:
# Create xgb_oof_pred files

# Write the out-of-fold probability matrix of each feature version
# (Ver1, Ver2, Ver3) as comma-separated text with 18 decimal places.
for out_path, pred_matrix in (
    (xgb_oof_pred_ver1_file, xgb_oof_pred_ver1),
    (xgb_oof_pred_ver2_file, xgb_oof_pred_ver2),
    (xgb_oof_pred_ver3_file, xgb_oof_pred_ver3),
):
    np.savetxt(out_path, pred_matrix, fmt='%.18f', delimiter=',')

In [None]:
# Create xgb_test_pred files

# Write the test probability matrix of each feature version
# (Ver1, Ver2, Ver3) as comma-separated text with 18 decimal places.
for out_path, pred_matrix in (
    (xgb_test_pred_ver1_file, xgb_test_pred_ver1),
    (xgb_test_pred_ver2_file, xgb_test_pred_ver2),
    (xgb_test_pred_ver3_file, xgb_test_pred_ver3),
):
    np.savetxt(out_path, pred_matrix, fmt='%.18f', delimiter=',')