In [1]:
import os

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, f1_score, make_scorer
from sklearn.model_selection import KFold

In [2]:
np.random.seed(7)

# metrics

In [3]:
def mae(y_true, y_pred):
    """Mean absolute error over the samples whose true value is >= 0.1.

    Both inputs are converted to arrays and flattened to 1-D. Positions where
    the true value is below 0.1 are excluded from the average.

    Args:
        y_true: array-like of true values (any shape).
        y_pred: array-like of predictions, same number of elements as y_true.

    Returns:
        The mean of |y_true - y_pred| over the retained positions.
    """
    truth = np.array(y_true).ravel()
    guess = np.array(y_pred).ravel()

    # Only positions at or above the 0.1 threshold contribute to the error.
    keep = truth >= 0.1
    abs_err = np.abs(truth[keep] - guess[keep])

    return np.mean(abs_err)

def fscore(y_true, y_pred):
    """F1 score of the binarised problem "value >= 0.1".

    Inputs are converted to arrays and flattened. Positions whose true value is
    negative are dropped first; the remaining true values and predictions are
    each thresholded at 0.1 into 0/1 labels and passed to ``f1_score``.

    Args:
        y_true: array-like of true values (any shape).
        y_pred: array-like of predictions, same number of elements as y_true.

    Returns:
        The F1 score computed by ``sklearn.metrics.f1_score``.
    """
    truth = np.array(y_true).ravel()
    guess = np.array(y_pred).ravel()

    # Negative true values are filtered out before scoring.
    valid = truth >= 0

    truth_label = (truth[valid] >= 0.1).astype(int)
    guess_label = (guess[valid] >= 0.1).astype(int)

    return f1_score(truth_label, guess_label)

def maeOverFscore(y_true, y_pred):
    """Ratio of ``mae`` to ``fscore`` for the same inputs (lower is better).

    1e-07 is added to the denominator so an F1 score of zero does not divide
    by zero; in that case the result becomes very large instead.
    """
    error = mae(y_true, y_pred)
    f1 = fscore(y_true, y_pred)
    return error / (f1 + 1e-07)

def score(y_val, pred):
    """Print ``fscore`` and ``maeOverFscore`` for the given truth/prediction pair."""
    f1 = fscore(y_val, pred)
    print(f"fscore        : {f1}")
    ratio = maeOverFscore(y_val, pred)
    print(f"maeOverFscore : {ratio}")

# scikit-learn scorer objects wrapping the metrics defined in this cell.
# fscore is a "higher is better" metric, which is make_scorer's default.
fscore_sklearn = make_scorer(fscore)
# maeOverFscore is an error ratio (lower is better). greater_is_better=False makes
# the scorer return the negated value so that sklearn model selection, which always
# maximises the score, prefers the model with the smallest ratio.
maeOverFscore_sklearn = make_scorer(maeOverFscore, greater_is_better=False)

# data load

In [4]:
# Project folders, resolved against the directory the notebook is run from.
base = os.getcwd()
data_path, submit_path = (os.path.join(base, folder) for folder in ('data', 'submit'))

def load_data(name):
    """Load ``<name>.npy`` from ``data_path`` and return the array."""
    npy_file = os.path.join(data_path, f"{name}.npy")
    return np.load(npy_file)

# def load(name):
#     if name == "test" :
#         return load_data('x', 'test')
#     return (load_data(f'x_{name}'), load_data(f'y_{name}'))

def reshape(data):
    """Reshape ``data`` to 2-D: ``shape[0] * 40 * 40`` rows by ``shape[-1]`` columns.

    The 40 * 40 factor is hard-coded, so the input must contain exactly
    ``shape[0] * 40 * 40 * shape[-1]`` elements or numpy raises a ValueError.
    """
    n_samples, n_columns = data.shape[0], data.shape[-1]
    return data.reshape(n_samples * 40 * 40, n_columns)

In [6]:
# Load data/EDA.npy via load_data and print its shape as a quick sanity check.
data = load_data('EDA')
print(data.shape)

(121608234, 15)


# Feature Selection

## select K
![img](feacture_selection.PNG)

### selectK 7
- [False, False,  True,  True,  True,  True,  True,  True, False, False, False, False, False, True]

### selectK 8
- [False, False,  True,  True,  True,  True,  True,  True, False, False, False, True, False, True]

In [7]:
# Column indices marked True in the "selectK 7" mask: columns 2..7 plus 13.
selectK_7 = [*range(2, 8), 13]
# The "selectK 8" mask keeps the same columns and additionally column 11.
selectK_8 = sorted(selectK_7 + [11])

# Separate dataset

In [8]:
# Features: the selectK_8 columns. Indexing with a list is advanced indexing,
# so X is an independent copy of those columns.
X = data[:, selectK_8]
# Target: the last column. A plain slice is only a view onto `data`, which would
# keep the whole original array alive after `del data`; copy it so the memory
# of the full array can actually be released.
Y = data[:, -1].copy()

del data

# LightGBM

In [39]:
# Build the train/validation split this notebook relies on from here onwards
# (x_train / y_train / x_val / y_val were not defined by any earlier cell).
# One of four shuffled folds (25% of the rows) is held out, using the same
# seed (7) as the rest of the notebook.
# NOTE(review): the split used for the recorded outputs is not in the notebook;
# adjust the ratio if a different hold-out was intended.
holdout_train_idx, holdout_val_idx = next(
    KFold(n_splits=4, shuffle=True, random_state=7).split(X)
)
x_train, y_train = X[holdout_train_idx], Y[holdout_train_idx]
x_val, y_val = X[holdout_val_idx], Y[holdout_val_idx]

train = lgb.Dataset(x_train, y_train)
# Tie the validation set to the training set so both share the same feature binning.
val = lgb.Dataset(x_val, y_val, reference=train)

# LightGBM train

In [48]:
# LightGBM training parameters: GBDT regression evaluated with MAE, seeded with 7.
params = {
    'learning_rate': 0.01,
    'max_depth': -1,
    'boosting': 'gbdt',
    'objective': 'regression',
    'metric': 'mae',
    'is_training_metric': True,
    'num_leaves': 1024,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.7,
    'bagging_freq': 5,
    'seed': 7,
}

# Up to 1000 boosting rounds, evaluated on `val`, logging every 10 rounds and
# stopping once the validation metric has not improved for 100 rounds.
model = lgb.train(
    params,
    train,
    num_boost_round=1000,
    valid_sets=val,
    verbose_eval=10,
    early_stopping_rounds=100,
)

Training until validation scores don't improve for 100 rounds
[10]	valid_0's l1: 0.146157
[20]	valid_0's l1: 0.146157
[30]	valid_0's l1: 0.146157
[40]	valid_0's l1: 0.146157
[50]	valid_0's l1: 0.146157
[60]	valid_0's l1: 0.146157
[70]	valid_0's l1: 0.146157
[80]	valid_0's l1: 0.146157
[90]	valid_0's l1: 0.146157
[100]	valid_0's l1: 0.146157
Early stopping, best iteration is:
[1]	valid_0's l1: 0.146157


In [49]:
# Predict on the validation features with the trained booster, then print
# fscore and maeOverFscore for those predictions.
pred = model.predict(x_val)

score(y_val, pred)

fscore        : 0.0
maeOverFscore : 21738683.940531477


# LightGBM LGBMRegressor

In [31]:
# lightgbm.LGBMRegressor(boosting_type='gbdt', num_leaves=31, max_depth=- 1, learning_rate=0.1, \
#                              n_estimators=100, subsample_for_bin=200000, objective=None, class_weight=None, \
#                              min_split_gain=0.0, min_child_weight=0.001, min_child_samples=20, subsample=1.0, \
#                              subsample_freq=0, colsample_bytree=1.0, reg_alpha=0.0, reg_lambda=0.0, random_state=None, \
#                              n_jobs=- 1, silent=True, importance_type='split')

In [32]:
# scikit-learn style LightGBM regressor. Every constructor argument is spelled
# out explicitly; the arguments are grouped by purpose below.
clf = lgb.LGBMRegressor(
    # booster / tree shape
    boosting_type='gbdt',
    num_leaves=31,
    max_depth=-1,
    # boosting schedule
    learning_rate=0.01,
    n_estimators=400,
    subsample_for_bin=200000,
    objective=None,
    class_weight=None,
    # split / leaf constraints
    min_split_gain=0.0,
    min_child_weight=0.001,
    min_child_samples=20,
    # row / column sampling
    subsample=1.0,
    subsample_freq=0,
    colsample_bytree=1.0,
    # regularisation
    reg_alpha=0.0,
    reg_lambda=0.0,
    # runtime
    random_state=None,
    n_jobs=-1,
    silent=True,
    importance_type='split',
)

In [None]:
# fit(X, y, sample_weight=None, init_score=None, eval_set=None, eval_names=None, \
#     eval_sample_weight=None, eval_init_score=None, eval_metric=None, \
#     early_stopping_rounds=None, verbose=True, feature_name='auto', \
#     categorical_feature='auto', callbacks=None, init_model=None)

In [50]:
def maeOverFscore_lgb(y_true, y_pred):
    """Adapt ``maeOverFscore`` to LightGBM's custom eval-metric protocol.

    LightGBM calls a custom ``eval_metric`` with ``(y_true, y_pred)`` and expects
    ``(eval_name, eval_result, is_higher_better)`` back. An sklearn scorer built
    with ``make_scorer`` has a different call signature (estimator, X, y) and
    cannot be used here.

    Returns:
        ('maeOverFscore', value, False) -- False because lower is better.
    """
    return 'maeOverFscore', maeOverFscore(y_true, y_pred), False


# eval_names labels the evaluation *datasets* and is documented as a list,
# one entry per element of eval_set.
clf.fit(x_train, y_train, eval_set=[(x_val, y_val)],
        eval_metric=maeOverFscore_lgb, early_stopping_rounds=100,
        verbose=True, eval_names=['maeOverFscore'])

# KFold

In [None]:
# 4-fold shuffled cross-validation (seed 7) of a Ridge baseline with alpha=10.
kfold = KFold(n_splits=4, random_state=7, shuffle=True)

for train_idx, val_idx in kfold.split(Y):
    # Fit on the training rows of this fold.
    ridge = Ridge(alpha=10.0)
    ridge.fit(X[train_idx, :], Y[train_idx])

    # Print fscore / maeOverFscore on the held-out rows of this fold.
    score(Y[val_idx], ridge.predict(X[val_idx, :]))

# 시각화

In [51]:
# Feature-importance chart for the booster trained with lgb.train, drawn on a 10x6 figure.
fig, ax = plt.subplots(figsize=(10, 6))
lgb.plot_importance(model, ax=ax)

In [20]:
# lgb_param = {
#     'objective': 'regression',
#     'metrics': maeOverFscore_sklearn,
#     'learning_rate' : 0.01,
#     'eval_metric': maeOverFscore_sklearn,
#     'early_stopping_rounds' : 100,
#     'eval_set': val_data,
#     'verbose': True,
#     'stratified':False,
#     'verbose_eval': 10,
#     'nfold': 5,
#     'num_boost_round': 99999,
# }

# cv_result = lgb.cv(
#     lgb_param,
#     train_data
# )

# lgb_model = lgb.train(
#   lgb_param,
#   train_data,
#   num_boost_round=len(cv_result['l1-mean'])
# )

# Submit

In [11]:
def submit(clf, name, preprocess=None):
    """Predict on the test set and write a submission CSV.

    Loads ``test.npy`` from ``data_path``, flattens it with ``reshape``,
    optionally applies ``preprocess.transform``, predicts with ``clf`` and
    writes the predictions into every column after the first of
    ``sample_submission.csv``, saved as ``<submit_path>/<name>.csv``.

    Args:
        clf: fitted model exposing ``predict(X)``.
        name: file name (without extension) of the CSV to write.
        preprocess: optional fitted transformer exposing ``transform(X)``,
            applied to the test features before prediction.
    """
    # Make sure the output folder exists up front, so a missing directory is not
    # discovered only after the prediction has already been computed.
    os.makedirs(submit_path, exist_ok=True)

    x_test = reshape(load_data('test'))

    if preprocess is not None:
        x_test = preprocess.transform(x_test)
        print("transform")

    pred = clf.predict(x_test)

    submission = pd.read_csv(os.path.join(data_path, 'sample_submission.csv'))
    # One submission row per sample: the 40 * 40 per-cell predictions that
    # reshape() flattened are folded back into 1600 columns.
    submission.iloc[:, 1:] = pred.reshape(-1, 40 * 40)

    submission.to_csv(os.path.join(submit_path, f'{name}.csv'), index=False)

In [20]:
# `lgb_model` only exists in commented-out code, so use the booster that was
# actually trained with lgb.train (`model`).
# NOTE(review): the recorded output "transform" shows the original run passed a
# `preprocess`; the model was fit on the selectK_8 columns only, so the test
# features presumably need the same column selection -- confirm before submitting.
submit(model, 'lightbgm_selectK')

transform


- https://dacon.io/competitions/official/235591/mysubmission/
- D:\인공지능_공모전\github\submit