In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# from sklearn.metrics import mean_squared_error, mean_absolute_error, f1_score, make_scorer
from sklearn.model_selection import KFold
from catboost import Pool, CatBoostRegressor

# from sklearn.feature_selection import SelectKBest, f_regression

In [2]:
# Seed NumPy's global random generator so NumPy-based randomness repeats across runs.
np.random.seed(7)

# data load

In [3]:
# Project folders, all resolved relative to the current working directory.
base = os.getcwd()
data_path, submit_path, model_path = (
    os.path.join(base, folder) for folder in ('data', 'submit', 'model')
)
# Derived arrays (e.g. saved predictions) live in a sub-folder of the data folder.
save_data_path = os.path.join(data_path, 'sub')

def load_data(name):
    """Return the array stored in ``<name>.npy`` inside ``data_path``."""
    filename = f"{name}.npy"
    return np.load(os.path.join(data_path, filename))

def reshape(data):
    """Flatten every leading axis of ``data`` into rows, keeping the last axis as columns.

    The row count used to be hard-coded as ``data.shape[0] * 40 * 40``; it is now
    derived from the array's own shape, so 40x40 inputs give exactly the same
    result and other grid sizes work too.

    Parameters
    ----------
    data : numpy.ndarray
        Array whose last axis holds the per-row values.

    Returns
    -------
    numpy.ndarray
        2-D array of shape ``(product of leading axes, data.shape[-1])``.
    """
    n_columns = data.shape[-1]
    # Plain Python integer product: exact, and well-defined even when an axis is 0.
    n_rows = 1
    for extent in data.shape[:-1]:
        n_rows *= extent
    return data.reshape(n_rows, n_columns)

def save_data(data, name):
    """Save ``data`` as ``<name>.npy`` inside ``save_data_path``.

    The target folder is created on demand, because nothing else in the
    notebook creates it and ``np.save`` does not create missing directories.
    """
    os.makedirs(save_data_path, exist_ok=True)
    np.save(os.path.join(save_data_path, f"{name}.npy"), data)

def load_y_data(name):
    """Return the array stored in ``<name>.npy`` inside ``save_data_path``."""
    filename = f"{name}.npy"
    return np.load(os.path.join(save_data_path, filename))

# Make sure the folder for saved models exists; exist_ok avoids the
# check-then-create race of a separate isdir() test.
os.makedirs(model_path, exist_ok=True)

In [4]:
# Load the training and test arrays and flatten each into a 2-D row/column table.
data, x_test = (reshape(load_data(split)) for split in ('dl_train', 'test'))

# Separate the dataset into features and target

In [5]:
# Every column but the last is a feature; the last column is the target,
# kept 2-D with shape (n_rows, 1).
X, Y = data[:, :-1], data[:, -1:]
# From here on `data` only stands in for the row indices (it is what the K-fold split iterates over).
# NOTE(review): this rebinding makes the cell non-re-runnable without first re-running the load cell.
data = range(len(X))
print(X.shape, Y.shape)

(121561600, 14) (121561600, 1)


# K-Fold

# CatBoostRegressor

In [None]:
# random_state only has an effect when shuffle=True, and recent scikit-learn
# releases raise ValueError when it is set together with shuffle=False.
# Dropping it leaves the (contiguous, deterministic) folds unchanged.
kfold = KFold(n_splits=5, shuffle=False)
cat_features = list()   # no categorical feature columns are declared
train_preds = list()    # one prediction vector over X per fold
test_preds = list()     # one prediction vector over x_test per fold
scores = list()         # NOTE(review): declared but never filled by this cell

for i, (train_idx, val_idx) in enumerate(kfold.split(data)):
    train_dataset = Pool(data=X[train_idx, :],
                         label=Y[train_idx, :],
                         cat_features=cat_features)

    eval_dataset = Pool(data=X[val_idx, :],
                        label=Y[val_idx, :],
                        cat_features=cat_features)

    clf = CatBoostRegressor(iterations=800, learning_rate=0.001,
                            depth=5, l2_leaf_reg=20,
                            bootstrap_type='Bernoulli', subsample=0.6,
                            eval_metric='MAE', metric_period=10,
                            od_type='Iter', od_wait=45, random_seed=7,
                            allow_writing_files=True)

    # The validation fold drives both overfitting detection and best-iteration selection.
    clf.fit(train_dataset,
            eval_set=eval_dataset,
            use_best_model=True,
            plot=True, verbose=True)

    # One model file per fold; the fold index is the file-name suffix.
    clf.save_model(os.path.join(model_path, f"cat_mae_800_0.001_5_{i}"))

    # NOTE(review): these predictions cover all of X (training and validation
    # rows alike), so they are not out-of-fold predictions.
    train_preds.append(clf.predict(X))
    test_preds.append(clf.predict(x_test))



In [None]:
# Turn the per-fold prediction lists into arrays (one row per fold).
train_preds, test_preds = map(np.array, (train_preds, test_preds))

In [None]:
# Persist the per-fold predictions under save_data_path.
save_data(data=train_preds, name="cat_nfold_train")
save_data(data=test_preds, name="cat_nfold_test")

# Submission (제출)

In [9]:
def submit(clf, name, preprocess=None):
    """Predict the test set with ``clf`` and write ``<name>.csv`` into ``submit_path``.

    Parameters
    ----------
    clf
        Fitted model exposing ``predict``.
    name : str
        Output file name without the ``.csv`` extension.
    preprocess
        Accepted for backward compatibility; it is currently not used.

    Notes
    -----
    ``sample_submission.csv`` (read from ``data_path``) is used as the template:
    its first column is kept and every other column is overwritten with predictions.
    """
    x_test = reshape(load_data('test'))

    pred = clf.predict(x_test)

    submission = pd.read_csv(os.path.join(data_path, 'sample_submission.csv'))
    target_columns = submission.columns[1:]
    # Replace the target columns outright instead of writing into them in place:
    # in-place setting through iloc breaks on newer pandas when the template
    # columns have an incompatible dtype (e.g. integer placeholders).
    # The column count comes from the template rather than a hard-coded 1600.
    submission[target_columns] = pred.reshape(-1, len(target_columns))

    # to_csv does not create missing folders.
    os.makedirs(submit_path, exist_ok=True)
    submission.to_csv(os.path.join(submit_path, f'{name}.csv'), index=False)

In [10]:
# NOTE(review): `clf` is whatever model the training loop bound last (the final fold), not an ensemble.
# NOTE(review): the file name encodes 500 / 0.01 / 4, but the training cell uses
# iterations=800, learning_rate=0.001, depth=5 — confirm which run this submission belongs to.
submit(clf, 'catboost_train_val_Bernoulli_500_0.01_4_mae')

- https://dacon.io/competitions/official/235591/mysubmission/
- D:\인공지능_공모전\github\submit