In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, f1_score, make_scorer
from sklearn.model_selection import KFold
from catboost import Pool, CatBoostRegressor, cv

# from sklearn.feature_selection import SelectKBest, f_regression

In [None]:
# Seed NumPy's global random generator so NumPy-driven randomness repeats between runs.
np.random.seed(seed=7)

# metrics

In [None]:
# def mae(y_true, y_pred) :
#     y_true, y_pred = np.array(y_true), np.array(y_pred)
    
#     y_true = y_true.reshape(1, -1)[0]
#     y_pred = y_pred.reshape(1, -1)[0]
#     over_threshold = y_true >= 0.1
    
#     return np.mean(np.abs(y_true[over_threshold] - y_pred[over_threshold]))

# def fscore(y_true, y_pred):
#     y_true, y_pred = np.array(y_true), np.array(y_pred)
    
#     y_true = y_true.reshape(1, -1)[0]
#     y_pred = y_pred.reshape(1, -1)[0]
#     remove_NAs = y_true >= 0
    
#     y_true = np.where(y_true[remove_NAs] >= 0.1, 1, 0)
#     y_pred = np.where(y_pred[remove_NAs] >= 0.1, 1, 0)
    
#     return (f1_score(y_true, y_pred))

# def maeOverFscore(y_true, y_pred):
#     return mae(y_true, y_pred) / (fscore(y_true, y_pred) + 1e-07)

# def score(y_val, pred):
#     f_value =  fscore(y_val, pred)
#     mae_value = maeOverFscore(y_val, pred)
#     print(f"fscore        : {f_value}")
#     print(f"maeOverFscore : {mae_value}")
    
#     return (f_value, mae_value)

# def maeOverFscore_lgb(y_true, y_pred):
#     return "maeOverFscore", mae(y_true, y_pred) / (fscore(y_true, y_pred) + 1e-07), False

# def fscore_lgb(y_true, y_pred):
#     return "fscore", fscore(y_true, y_pred), False

# maeOverFscore_sklearn = make_scorer(maeOverFscore)
# fscore_sklearn = make_scorer(fscore)

In [None]:
# class maeOverFscoreMetric(object):
#     def get_final_error(self, error, weight):
#         return error / (weight + 1e-07)

#     def is_max_optimal(self):
#         return False

#     def evaluate(self, approxes, target, weight):
        
#         accuracy_sum = 0
#         weight_sum = 0 

#         for i in range(len(approxes[0])):
#             w = 1.0 if weight is None else weight[i]
#             weight_sum += w
#             accuracy_sum += w * (best_class[i] == target[i])

#         return mae(target, approxes), fscore(target, approxes)

# data load

In [None]:
# Project folders are all resolved relative to the current working directory.
base = os.getcwd()
data_path, submit_path, model_path = (
    os.path.join(base, folder) for folder in ('data', 'submit', 'model')
)

def load_data(name):
    """Load the array stored as ``<name>.npy`` inside ``data_path``.

    Parameters
    ----------
    name : file name without the ``.npy`` extension.

    Returns
    -------
    Whatever ``np.load`` returns for that file.
    """
    npy_file = os.path.join(data_path, "{}.npy".format(name))
    return np.load(npy_file)

def reshape(data):
    """Flatten every leading axis of ``data`` into rows, keeping the last axis as columns.

    The original implementation hard-coded a 40x40 grid
    (``data.shape[0] * 40 * 40`` rows). The row count is now derived from the
    array itself, so an ``(N, 40, 40, C)`` input yields exactly the same
    ``(N * 1600, C)`` result as before, while other grid sizes work as well.

    Parameters
    ----------
    data : array exposing ``.shape`` and ``.reshape`` (e.g. a NumPy array).

    Returns
    -------
    A 2-D array of shape ``(prod(data.shape[:-1]), data.shape[-1])``.
    """
    n_columns = data.shape[-1]
    # Computed explicitly (instead of passing -1) so empty arrays are handled too.
    n_rows = int(np.prod(data.shape[:-1]))
    return data.reshape(n_rows, n_columns)

# Make sure the model output directory exists. exist_ok=True replaces the
# separate isdir() check, so there is no window between checking and creating.
os.makedirs(model_path, exist_ok=True)

In [None]:
# Read dl_train.npy from the data folder and flatten it to 2-D with reshape().
data = reshape(load_data(name='dl_train'))

# separate dataset

In [None]:
# Split the flattened table into features (every column but the last) and the
# target (last column, kept 2-D with shape (n_rows, 1)).
X = data[:, :-1]
Y = data[:, -1:]
# `data` is deliberately NOT rebound to a range any more: doing so made this cell
# fail with a TypeError when re-run. The later cells only pass `data` to
# KFold.split(), and it still has one entry per row.
print(X.shape, Y.shape)

# CV

In [None]:
# CatBoost pool for cross-validation, built from the first 36,468,480 rows only;
# no column is treated as categorical.
dataset = Pool(
    data=X[:36468480],
    label=Y[:36468480],
    cat_features=[],
)

In [None]:
# CatBoost settings shared by the cv() calls below.
params = dict(
    iterations=1000,
    depth=4,
    loss_function="MAE",
    verbose=False,
)

In [None]:
# 4-fold CatBoost cross-validation on `dataset` with the shared `params`.
# `plot` is a boolean flag; the original passed the string "True", which only
# worked because any non-empty string is truthy ("False" would also enable it).
scores = cv(dataset,
            params,
            fold_count=4,
            plot=True)

In [None]:
# MetricVisualizer is not part of the notebook's import cell, so the bare name
# raised NameError on a fresh kernel. Import it here before referencing it.
from catboost import MetricVisualizer

MetricVisualizer

In [None]:
# Run CatBoost's built-in 5-fold cv() on the training part of the first K-Fold split.
#
# random_state is no longer passed to KFold: with shuffle=False it has no effect,
# and recent scikit-learn versions raise a ValueError for that combination.
# The resulting (unshuffled, contiguous) folds are unchanged.
kfold = KFold(n_splits=4, shuffle=False)
scores = list()
best_iterations = list()
best_scores = list()
cat_features = []

# Only the first split is used, so take it directly instead of looping and breaking.
train_idx, val_idx = next(kfold.split(data))

train_dataset = Pool(data=X[train_idx, :],
                     label=Y[train_idx, :],
                     cat_features=cat_features)

# `plot` expects a boolean, not the string "True".
scores = cv(train_dataset,
            params,
            fold_count=5,
            plot=True)

# K-Fold

In [None]:
# Train one CatBoost model per K-Fold split, save each model to `model_path`
# and record its validation metrics.
#
# Fixes relative to the original cell:
#   * KFold no longer receives random_state together with shuffle=False
#     (recent scikit-learn raises ValueError; the folds themselves are unchanged).
#   * CatBoostRegressor gets only random_seed; random_state is an alias of it and
#     CatBoost refuses to accept both.
#   * The undefined score() helper (only present as commented-out code) is replaced
#     by the already imported sklearn metrics, called as (y_true, y_pred).
kfold = KFold(n_splits=4, shuffle=False)
scores = list()            # one (MAE, RMSE) tuple per fold
best_iterations = list()   # best iteration chosen on each fold's eval set
best_scores = list()       # CatBoost's best-score dict for each fold
cat_features = []

for i, (train_idx, val_idx) in enumerate(kfold.split(data)):

    train_dataset = Pool(data=X[train_idx, :],
                         label=Y[train_idx, :],
                         cat_features=cat_features)

    eval_dataset = Pool(data=X[val_idx, :],
                        label=Y[val_idx, :],
                        cat_features=cat_features)

    clf = CatBoostRegressor(iterations=1000, learning_rate=0.1,
                            depth=4, l2_leaf_reg=20,
                            bootstrap_type='Bernoulli', subsample=0.6,
                            eval_metric='RMSE', metric_period=50,
                            od_type='Iter', od_wait=45, random_seed=7,
                            allow_writing_files=True)

    # The eval set drives both the overfitting detector and use_best_model.
    clf.fit(train_dataset,
            eval_set=eval_dataset,
            use_best_model=True, verbose=True)

    clf.save_model(os.path.join(model_path, f"cat_{i}"))

    # Reuse the validation pool for prediction instead of slicing X a second time.
    y_val = Y[val_idx, :].ravel()
    val_pred = clf.predict(eval_dataset)

    fold_mae = mean_absolute_error(y_val, val_pred)
    fold_rmse = np.sqrt(mean_squared_error(y_val, val_pred))

    best_iterations.append(clf.get_best_iteration())
    best_scores.append(clf.get_best_score())
    scores.append((fold_mae, fold_rmse))

# CatBoostRegressor

In [None]:
# Configuration of the final model that is fitted on the complete training set.
clf = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.1,
    depth=4,
    l2_leaf_reg=20,
    bootstrap_type='Bernoulli',
    subsample=0.6,
    eval_metric='RMSE',
    metric_period=50,
    od_type='Iter',
    od_wait=45,
    random_seed=17,
    allow_writing_files=False,
)

In [None]:
# Fit the final model on all rows.
# use_best_model=True was removed: it needs an eval_set to pick the best
# iteration from, and CatBoost raises an error when it is requested without one.
clf.fit(X, Y,
        cat_features=[], verbose=True)

# 시각화

In [None]:
# Plot the feature importances of the fitted CatBoost model.
# lgb.plot_importance() only accepts LightGBM boosters/models and raises a
# TypeError for a CatBoostRegressor, so the importances are taken from CatBoost
# itself and drawn with matplotlib.
importances = np.asarray(clf.get_feature_importance())
order = np.argsort(importances)  # ascending, so the most important bar ends up on top
feature_labels = [str(clf.feature_names_[idx]) for idx in order]

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_labels, importances[order])
ax.set_title('Feature importance')
ax.set_xlabel('Feature importance')
ax.set_ylabel('Features');

# 제출

In [None]:
def submit(clf, name, preprocess=None):
    """Predict on the test set and write a submission CSV.

    Loads ``test.npy`` through ``load_data``/``reshape``, predicts with ``clf``,
    copies the predictions into every column after the first of
    ``sample_submission.csv`` and saves the result as ``<name>.csv`` in
    ``submit_path``.

    Parameters
    ----------
    clf : fitted model exposing ``predict``.
    name : output file name without the ``.csv`` extension.
    preprocess : accepted for backward compatibility; currently unused.
    """
    x_test = reshape(load_data('test'))

    pred = np.asarray(clf.predict(x_test))

    submission = pd.read_csv(os.path.join(data_path, 'sample_submission.csv'))

    # Take the number of prediction columns from the template instead of
    # hard-coding 1600; the first column is left untouched.
    target_columns = list(submission.columns[1:])
    # Column assignment (rather than an iloc block write) replaces the template
    # columns outright, so integer placeholders cannot clash with float predictions.
    submission[target_columns] = pred.reshape(-1, len(target_columns))

    # The notebook only creates model_path up front; create the output folder on demand.
    os.makedirs(submit_path, exist_ok=True)
    submission.to_csv(os.path.join(submit_path, f'{name}.csv'), index=False)

In [None]:
# Write the submission file for the model fitted on the full training set.
# NOTE(review): the file name says "lightbgm" although `clf` is a CatBoostRegressor — confirm the intended name.
submit(clf, name='lightbgm_all_31_800')

- https://dacon.io/competitions/official/235591/mysubmission/
- D:\인공지능_공모전\github\submit