In [1]:
import pandas as pd
import numpy as np
import os
import warnings
import optuna
import math
import pandas as pd
from tqdm import tqdm
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.preprocessing import LabelEncoder

class BetaEncoder(object):
        
    def __init__(self, group):
        self.group = group
        self.stats = None
        
    # get counts from df
    def fit(self, df, target_col):
        self.prior_mean = np.mean(df[target_col]) 
        stats = df[[target_col, self.group]].groupby(self.group)
        stats = stats.agg(['sum', 'count'])[target_col]    
        stats.rename(columns={'sum': 'n', 'count': 'N'}, inplace=True)
        stats.reset_index(level=0, inplace=True)           
        self.stats = stats

    # extract posterior statistics
    def transform(self, df, stat_type, N_min=1):
        
        df_stats = pd.merge(df[[self.group]], self.stats, how='left')
        n = df_stats['n'].copy()
        N = df_stats['N'].copy()
        
        # fill in missing
        nan_indexs = np.isnan(n)
        n[nan_indexs] = self.prior_mean
        N[nan_indexs] = 1.0
        
        # prior parameters
        N_prior = np.maximum(N_min-N, 0)
        alpha_prior = self.prior_mean*N_prior
        beta_prior  = (1-self.prior_mean)*N_prior
        
        # posterior parameters
        alpha =  alpha_prior + n
        beta =  beta_prior  + N-n
        
        # calculate statistics
        if stat_type=='mean':
            num = alpha
            dem = alpha+beta
                    
        elif stat_type=='mode':
            num = alpha-1
            dem = alpha+beta-2
            
        elif stat_type=='median':
            num = alpha-1/3
            dem = alpha+beta-2/3
        
        elif stat_type=='var':
            num = alpha*beta
            dem = (alpha+beta)**2*(alpha+beta+1)
                    
        elif stat_type=='skewness':
            num = 2*(beta-alpha)*np.sqrt(alpha+beta+1)
            dem = (alpha+beta+2)*np.sqrt(alpha*beta)

        elif stat_type=='kurtosis':
            num = 6*(alpha-beta)**2*(alpha+beta+1) - alpha*beta*(alpha+beta+2)
            dem = alpha*beta*(alpha+beta+2)*(alpha+beta+3)
            
        # replace missing
        value = num/dem
        value[np.isnan(value)] = np.nanmedian(value)
        return value

In [3]:
raw_data = pd.read_csv("./data/training_data.csv")
# display(raw_data)

train = pd.read_csv("./data/train_feat.csv")
# display(train.info())
feat_cols = ['土地面積','移轉層次','總樓層數','屋齡','建物面積','車位面積','車位個數','橫坐標','縱坐標','主建物面積','陽台面積',
                '附屬建物面積','N_lib_2000','avg_distances_高中','avg_distances_國小','avg_distances_火車','avg_distances_醫療',
                'avg_distances_公車','avg_distances_國中','avg_distances_大學','avg_distances_便利','avg_distances_AT',
                'avg_distances_金融','avg_distances_捷運','avg_distances_郵局',
                'avg_tax','density','edu_p', '縣市_台北市','縣市_台中市','縣市_台南市','縣市_新北市','縣市_高雄市','縣市_桃園市']
cat_cols = ['使用分區','主要用途','主要建材','建物型態','縣市_鄉鎮市區']

raw_data['縣市_鄉鎮市區'] = raw_data['縣市'] + '_' + raw_data['鄉鎮市區']

selected_X = train[feat_cols]
cat_X = raw_data[cat_cols+['單價']]
X = pd.concat([selected_X, cat_X], axis=1)
Y = pd.read_csv("./data/train_output.csv")
Y = np.log(Y)

test = pd.read_csv("./data/test_feat.csv")
raw_test_data = pd.read_csv("./data/public_dataset.csv")
raw_test_data['縣市_鄉鎮市區'] = raw_test_data['縣市'] + '_' + raw_test_data['鄉鎮市區']
selected_X = test[feat_cols]
cat_X = raw_test_data[cat_cols]
X_test = pd.concat([selected_X, cat_X], axis=1)


# display(X.head())
# display(Y)
# print(X_test.info())
# print(X_test['主要用途'].value_counts())
# print(X_test['使用分區'].value_counts())
# print(X_test['主要建材'].value_counts())
# print(X_test['建物型態'].value_counts())
# print(X_test['縣市_鄉鎮市區'].value_counts())

In [None]:
## best_p_list = []
best_s_list = []
best_score = math.inf
# N_min_list = [10,20,30,50,70,110]
# for i in N_min_list:
def objective(trial):
    global best_score
    params = {
        "metric": "mape",
        "num_iterations": trial.suggest_int("num_iterations", 300, 1000), 
        "verbosity": -1,
        "bagging_freq": 1,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 64, 2**10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 10, 100),
        "max_bin": trial.suggest_int("max_bin", 128, 1024),
    }
    N_min = 72

    for col in cat_cols:
        # print(f"now at {col}")
        le = LabelEncoder()
        le.fit(np.concatenate([X[col], X_test[col]]))
        X[col] = le.transform(X[col])
        X_test[col] = le.transform(X_test[col])

    x_train, x_valid, y_train, y_valid = train_test_split(X, Y, test_size=0.2, random_state=507)
    x_train.reset_index(inplace=True, drop=True)
    x_valid.reset_index(inplace=True, drop=True)
    y_train.reset_index(inplace=True, drop=True)
    y_valid.reset_index(inplace=True, drop=True)
    y_valid = np.exp(y_valid)
    
    
    for c in cat_cols:
        # fit encoder
        be = BetaEncoder(c)
        be.fit(x_train, '單價')
        # mean
        feature_name = f'{c}_mean'
        x_train[feature_name] = be.transform(x_train, 'mean', N_min)
        x_valid[feature_name]  = be.transform(x_valid,  'mean', N_min)
        X_test[feature_name]  = be.transform(X_test,  'mean', N_min)

    x_train = x_train.drop(['單價']+cat_cols,axis=1)
    x_valid = x_valid.drop(['單價']+cat_cols,axis=1)
    X_test_d = X_test.drop(cat_cols,axis=1)

    
    model = LGBMRegressor(**params)
    model.fit(x_train, y_train)
    pred = model.predict(x_valid)
    pred = np.exp(pred)
    mape = mean_absolute_percentage_error(y_valid, pred)
    
    if(mape < best_score):
        x_train.to_csv('data/x_train.csv', index=False)
        y_train.to_csv('data/y_train.csv', index=False)
        x_valid.to_csv('data/x_valid.csv', index=False)
        y_valid.to_csv('data/y_valid.csv', index=False)
        X_test_d.to_csv('data/x_test.csv', index=False)
        # public_pred = model.predict(X_test_d)
        # public_pred.to_csv('pred.csv', index=False)
        best_score = mape

    return mape



In [5]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=80)
print('Best hyperparameters:', study.best_params)
print('Best MAPE:', study.best_value)

[I 2023-11-13 05:25:40,625] A new study created in memory with name: no-name-88dbd9c8-b714-41a7-9331-adc591691b51




[I 2023-11-13 05:25:42,961] Trial 0 finished with value: 0.09346875073169705 and parameters: {'num_iterations': 310, 'learning_rate': 0.036514289392378226, 'num_leaves': 791, 'subsample': 0.5922799226763265, 'colsample_bytree': 0.7777033506802487, 'min_data_in_leaf': 47, 'max_bin': 592, 'N_min': 287}. Best is trial 0 with value: 0.09346875073169705.




[I 2023-11-13 05:25:55,913] Trial 1 finished with value: 0.09343368612031824 and parameters: {'num_iterations': 701, 'learning_rate': 0.025889860366320672, 'num_leaves': 443, 'subsample': 0.9487872028660727, 'colsample_bytree': 0.9888038202083119, 'min_data_in_leaf': 10, 'max_bin': 170, 'N_min': 141}. Best is trial 1 with value: 0.09343368612031824.




[I 2023-11-13 05:26:05,142] Trial 2 finished with value: 0.21054973856725687 and parameters: {'num_iterations': 685, 'learning_rate': 0.0012765323307470122, 'num_leaves': 1014, 'subsample': 0.7961043442174119, 'colsample_bytree': 0.23403390073781602, 'min_data_in_leaf': 16, 'max_bin': 754, 'N_min': 122}. Best is trial 1 with value: 0.09343368612031824.




[I 2023-11-13 05:26:09,329] Trial 3 finished with value: 0.11561923298524794 and parameters: {'num_iterations': 881, 'learning_rate': 0.002240799140036408, 'num_leaves': 880, 'subsample': 0.6129247545519445, 'colsample_bytree': 0.667315498892915, 'min_data_in_leaf': 39, 'max_bin': 319, 'N_min': 238}. Best is trial 1 with value: 0.09343368612031824.




[I 2023-11-13 05:26:11,407] Trial 4 finished with value: 0.0937015690952173 and parameters: {'num_iterations': 834, 'learning_rate': 0.09298877487328733, 'num_leaves': 275, 'subsample': 0.7788495919958072, 'colsample_bytree': 0.2590436461254215, 'min_data_in_leaf': 93, 'max_bin': 398, 'N_min': 16}. Best is trial 1 with value: 0.09343368612031824.




[I 2023-11-13 05:26:15,087] Trial 5 finished with value: 0.0924706819760521 and parameters: {'num_iterations': 493, 'learning_rate': 0.03194350805595127, 'num_leaves': 364, 'subsample': 0.5809872496249076, 'colsample_bytree': 0.6584581500739689, 'min_data_in_leaf': 33, 'max_bin': 584, 'N_min': 190}. Best is trial 5 with value: 0.0924706819760521.




[I 2023-11-13 05:26:21,378] Trial 6 finished with value: 0.09247413560202257 and parameters: {'num_iterations': 584, 'learning_rate': 0.008541943123872792, 'num_leaves': 850, 'subsample': 0.5634843792149204, 'colsample_bytree': 0.5513379413694787, 'min_data_in_leaf': 15, 'max_bin': 209, 'N_min': 272}. Best is trial 5 with value: 0.0924706819760521.




[I 2023-11-13 05:26:29,564] Trial 7 finished with value: 0.09300700767898562 and parameters: {'num_iterations': 565, 'learning_rate': 0.009067865710563002, 'num_leaves': 583, 'subsample': 0.9838980913418123, 'colsample_bytree': 0.41024542475425874, 'min_data_in_leaf': 24, 'max_bin': 1009, 'N_min': 223}. Best is trial 5 with value: 0.0924706819760521.




[I 2023-11-13 05:26:32,945] Trial 8 finished with value: 0.12066275517384141 and parameters: {'num_iterations': 506, 'learning_rate': 0.003610558517925883, 'num_leaves': 503, 'subsample': 0.8357564899256087, 'colsample_bytree': 0.5577794407381064, 'min_data_in_leaf': 41, 'max_bin': 636, 'N_min': 143}. Best is trial 5 with value: 0.0924706819760521.




[I 2023-11-13 05:26:35,768] Trial 9 finished with value: 0.0926791972162126 and parameters: {'num_iterations': 841, 'learning_rate': 0.0343750383775364, 'num_leaves': 262, 'subsample': 0.6546301885069689, 'colsample_bytree': 0.8536723870619523, 'min_data_in_leaf': 79, 'max_bin': 588, 'N_min': 242}. Best is trial 5 with value: 0.0924706819760521.




[I 2023-11-13 05:26:36,662] Trial 10 finished with value: 0.10410692163632802 and parameters: {'num_iterations': 348, 'learning_rate': 0.09301853175769124, 'num_leaves': 96, 'subsample': 0.5109697931592563, 'colsample_bytree': 0.11282595901846099, 'min_data_in_leaf': 62, 'max_bin': 901, 'N_min': 73}. Best is trial 5 with value: 0.0924706819760521.




[I 2023-11-13 05:26:39,394] Trial 11 finished with value: 0.09580366738259988 and parameters: {'num_iterations': 471, 'learning_rate': 0.009900178383337462, 'num_leaves': 680, 'subsample': 0.5061347734916404, 'colsample_bytree': 0.5535505801001438, 'min_data_in_leaf': 26, 'max_bin': 415, 'N_min': 193}. Best is trial 5 with value: 0.0924706819760521.




[I 2023-11-13 05:26:43,157] Trial 12 finished with value: 0.09125704099197136 and parameters: {'num_iterations': 624, 'learning_rate': 0.01726659391873211, 'num_leaves': 333, 'subsample': 0.6611146511745446, 'colsample_bytree': 0.6712620380927301, 'min_data_in_leaf': 32, 'max_bin': 137, 'N_min': 297}. Best is trial 12 with value: 0.09125704099197136.




[I 2023-11-13 05:26:45,073] Trial 13 finished with value: 0.09554998494979007 and parameters: {'num_iterations': 437, 'learning_rate': 0.018897365487316565, 'num_leaves': 339, 'subsample': 0.6875135939991652, 'colsample_bytree': 0.7037575898523076, 'min_data_in_leaf': 62, 'max_bin': 760, 'N_min': 181}. Best is trial 12 with value: 0.09125704099197136.




[I 2023-11-13 05:26:49,701] Trial 14 finished with value: 0.09174477256786875 and parameters: {'num_iterations': 694, 'learning_rate': 0.017549475440297318, 'num_leaves': 143, 'subsample': 0.6909048365078382, 'colsample_bytree': 0.6655064655885835, 'min_data_in_leaf': 32, 'max_bin': 459, 'N_min': 88}. Best is trial 12 with value: 0.09125704099197136.




[I 2023-11-13 05:26:52,965] Trial 15 finished with value: 0.09271435764064206 and parameters: {'num_iterations': 974, 'learning_rate': 0.015311001755256905, 'num_leaves': 78, 'subsample': 0.693493650522078, 'colsample_bytree': 0.4380703395110469, 'min_data_in_leaf': 56, 'max_bin': 301, 'N_min': 88}. Best is trial 12 with value: 0.09125704099197136.




[I 2023-11-13 05:26:56,331] Trial 16 finished with value: 0.09779916437985772 and parameters: {'num_iterations': 748, 'learning_rate': 0.005358871115849664, 'num_leaves': 205, 'subsample': 0.7312779601878401, 'colsample_bytree': 0.7962586106843726, 'min_data_in_leaf': 49, 'max_bin': 129, 'N_min': 33}. Best is trial 12 with value: 0.09125704099197136.




[I 2023-11-13 05:27:00,921] Trial 17 finished with value: 0.09216621048034371 and parameters: {'num_iterations': 626, 'learning_rate': 0.015875937203138752, 'num_leaves': 160, 'subsample': 0.6435836767789445, 'colsample_bytree': 0.887936795404512, 'min_data_in_leaf': 29, 'max_bin': 470, 'N_min': 90}. Best is trial 12 with value: 0.09125704099197136.




[I 2023-11-13 05:27:03,405] Trial 18 finished with value: 0.09956485906478847 and parameters: {'num_iterations': 791, 'learning_rate': 0.0053013902191050505, 'num_leaves': 409, 'subsample': 0.7318675470517225, 'colsample_bytree': 0.7119269748063629, 'min_data_in_leaf': 70, 'max_bin': 244, 'N_min': 111}. Best is trial 12 with value: 0.09125704099197136.




[I 2023-11-13 05:27:07,022] Trial 19 finished with value: 0.09307495311520789 and parameters: {'num_iterations': 618, 'learning_rate': 0.01433158942354488, 'num_leaves': 605, 'subsample': 0.6757026489988495, 'colsample_bytree': 0.6154850252989803, 'min_data_in_leaf': 40, 'max_bin': 442, 'N_min': 63}. Best is trial 12 with value: 0.09125704099197136.




[I 2023-11-13 05:27:16,364] Trial 20 finished with value: 0.09273689881533573 and parameters: {'num_iterations': 949, 'learning_rate': 0.05590362065496074, 'num_leaves': 194, 'subsample': 0.6229903755497035, 'colsample_bytree': 0.7552287339865498, 'min_data_in_leaf': 22, 'max_bin': 700, 'N_min': 47}. Best is trial 12 with value: 0.09125704099197136.




[I 2023-11-13 05:27:20,997] Trial 21 finished with value: 0.09219410492138835 and parameters: {'num_iterations': 646, 'learning_rate': 0.019408016569945508, 'num_leaves': 165, 'subsample': 0.6457584819727672, 'colsample_bytree': 0.8975326527516303, 'min_data_in_leaf': 30, 'max_bin': 481, 'N_min': 102}. Best is trial 12 with value: 0.09125704099197136.




[I 2023-11-13 05:27:26,601] Trial 22 finished with value: 0.09264595171674948 and parameters: {'num_iterations': 727, 'learning_rate': 0.012193452368382708, 'num_leaves': 297, 'subsample': 0.7036229424216388, 'colsample_bytree': 0.8676461274678319, 'min_data_in_leaf': 34, 'max_bin': 520, 'N_min': 170}. Best is trial 12 with value: 0.09125704099197136.




[I 2023-11-13 05:27:30,290] Trial 23 finished with value: 0.09189521558547031 and parameters: {'num_iterations': 564, 'learning_rate': 0.02133886311544852, 'num_leaves': 117, 'subsample': 0.6450783156262463, 'colsample_bytree': 0.9902144228892383, 'min_data_in_leaf': 20, 'max_bin': 349, 'N_min': 76}. Best is trial 12 with value: 0.09125704099197136.




[I 2023-11-13 05:27:33,188] Trial 24 finished with value: 0.09276381450061459 and parameters: {'num_iterations': 565, 'learning_rate': 0.020945842805481157, 'num_leaves': 97, 'subsample': 0.7196571050909275, 'colsample_bytree': 0.973824479486304, 'min_data_in_leaf': 23, 'max_bin': 278, 'N_min': 125}. Best is trial 12 with value: 0.09125704099197136.




[I 2023-11-13 05:27:37,315] Trial 25 finished with value: 0.0918283738721213 and parameters: {'num_iterations': 422, 'learning_rate': 0.026124870671760527, 'num_leaves': 213, 'subsample': 0.6651600462536382, 'colsample_bytree': 0.8009849668037612, 'min_data_in_leaf': 17, 'max_bin': 356, 'N_min': 65}. Best is trial 12 with value: 0.09125704099197136.




[I 2023-11-13 05:27:41,584] Trial 26 finished with value: 0.09328631308453623 and parameters: {'num_iterations': 386, 'learning_rate': 0.05618631363883549, 'num_leaves': 242, 'subsample': 0.7628760577305651, 'colsample_bytree': 0.6118900639740712, 'min_data_in_leaf': 10, 'max_bin': 369, 'N_min': 48}. Best is trial 12 with value: 0.09125704099197136.




[I 2023-11-13 05:27:43,609] Trial 27 finished with value: 0.09235661054750197 and parameters: {'num_iterations': 432, 'learning_rate': 0.02829464855282085, 'num_leaves': 475, 'subsample': 0.6810957525266055, 'colsample_bytree': 0.7500134435711973, 'min_data_in_leaf': 48, 'max_bin': 229, 'N_min': 299}. Best is trial 12 with value: 0.09125704099197136.




[I 2023-11-13 05:27:47,869] Trial 28 finished with value: 0.09157942736519138 and parameters: {'num_iterations': 755, 'learning_rate': 0.012489986951827887, 'num_leaves': 355, 'subsample': 0.5616432674942196, 'colsample_bytree': 0.7868598675160696, 'min_data_in_leaf': 37, 'max_bin': 507, 'N_min': 11}. Best is trial 12 with value: 0.09125704099197136.




[I 2023-11-13 05:27:51,396] Trial 29 finished with value: 0.0953910832944393 and parameters: {'num_iterations': 782, 'learning_rate': 0.007473023151315024, 'num_leaves': 354, 'subsample': 0.5526279708971408, 'colsample_bytree': 0.7200680196032792, 'min_data_in_leaf': 46, 'max_bin': 668, 'N_min': 22}. Best is trial 12 with value: 0.09125704099197136.




[I 2023-11-13 05:27:55,922] Trial 30 finished with value: 0.09351722274906639 and parameters: {'num_iterations': 678, 'learning_rate': 0.011514966703157686, 'num_leaves': 688, 'subsample': 0.5972683447020727, 'colsample_bytree': 0.643568823524632, 'min_data_in_leaf': 37, 'max_bin': 805, 'N_min': 213}. Best is trial 12 with value: 0.09125704099197136.




[I 2023-11-13 05:27:59,712] Trial 31 finished with value: 0.09310289428066873 and parameters: {'num_iterations': 328, 'learning_rate': 0.013946301954717933, 'num_leaves': 313, 'subsample': 0.6123706783685122, 'colsample_bytree': 0.8073272106627603, 'min_data_in_leaf': 18, 'max_bin': 510, 'N_min': 43}. Best is trial 12 with value: 0.09125704099197136.




[I 2023-11-13 05:28:04,908] Trial 32 finished with value: 0.0913346498172898 and parameters: {'num_iterations': 733, 'learning_rate': 0.025177461990892696, 'num_leaves': 405, 'subsample': 0.5614861219379723, 'colsample_bytree': 0.7965158685988937, 'min_data_in_leaf': 29, 'max_bin': 550, 'N_min': 58}. Best is trial 12 with value: 0.09125704099197136.




[I 2023-11-13 05:28:08,132] Trial 33 finished with value: 0.09167676240887736 and parameters: {'num_iterations': 719, 'learning_rate': 0.02339662211998348, 'num_leaves': 433, 'subsample': 0.5527901293032085, 'colsample_bytree': 0.7542001798087208, 'min_data_in_leaf': 44, 'max_bin': 539, 'N_min': 12}. Best is trial 12 with value: 0.09125704099197136.




[I 2023-11-13 05:28:11,156] Trial 34 finished with value: 0.09286880555773788 and parameters: {'num_iterations': 745, 'learning_rate': 0.04309562960939726, 'num_leaves': 416, 'subsample': 0.5361500191873759, 'colsample_bytree': 0.7576867158421738, 'min_data_in_leaf': 55, 'max_bin': 556, 'N_min': 11}. Best is trial 12 with value: 0.09125704099197136.




[I 2023-11-13 05:28:15,858] Trial 35 finished with value: 0.09228711796384513 and parameters: {'num_iterations': 792, 'learning_rate': 0.02359246119302104, 'num_leaves': 545, 'subsample': 0.5782394915691127, 'colsample_bytree': 0.8170426837445991, 'min_data_in_leaf': 43, 'max_bin': 626, 'N_min': 27}. Best is trial 12 with value: 0.09125704099197136.




[I 2023-11-13 05:28:20,226] Trial 36 finished with value: 0.09213531670540558 and parameters: {'num_iterations': 901, 'learning_rate': 0.026286629341727305, 'num_leaves': 450, 'subsample': 0.5392457669281077, 'colsample_bytree': 0.7174271148874326, 'min_data_in_leaf': 52, 'max_bin': 864, 'N_min': 273}. Best is trial 12 with value: 0.09125704099197136.




[I 2023-11-13 05:28:25,167] Trial 37 finished with value: 0.09271206180711351 and parameters: {'num_iterations': 831, 'learning_rate': 0.011898453669442713, 'num_leaves': 382, 'subsample': 0.5772633171431125, 'colsample_bytree': 0.8386341634474956, 'min_data_in_leaf': 36, 'max_bin': 532, 'N_min': 39}. Best is trial 12 with value: 0.09125704099197136.




[I 2023-11-13 05:28:30,364] Trial 38 finished with value: 0.09178144112657594 and parameters: {'num_iterations': 715, 'learning_rate': 0.038533215706315314, 'num_leaves': 492, 'subsample': 0.5321577049294065, 'colsample_bytree': 0.9102383173486016, 'min_data_in_leaf': 28, 'max_bin': 691, 'N_min': 10}. Best is trial 12 with value: 0.09125704099197136.




[I 2023-11-13 05:28:32,416] Trial 39 finished with value: 0.09428915444743072 and parameters: {'num_iterations': 678, 'learning_rate': 0.031006351236615604, 'num_leaves': 952, 'subsample': 0.5681993993387641, 'colsample_bytree': 0.7795919080209807, 'min_data_in_leaf': 95, 'max_bin': 626, 'N_min': 56}. Best is trial 12 with value: 0.09125704099197136.




[I 2023-11-13 05:28:36,276] Trial 40 finished with value: 0.09266432217712141 and parameters: {'num_iterations': 877, 'learning_rate': 0.01751816146649112, 'num_leaves': 550, 'subsample': 0.5991070298519694, 'colsample_bytree': 0.6843159648749152, 'min_data_in_leaf': 44, 'max_bin': 412, 'N_min': 157}. Best is trial 12 with value: 0.09125704099197136.




[I 2023-11-13 05:28:40,459] Trial 41 finished with value: 0.0920235467343741 and parameters: {'num_iterations': 701, 'learning_rate': 0.017227043973797585, 'num_leaves': 404, 'subsample': 0.6217397707025528, 'colsample_bytree': 0.6641538985385868, 'min_data_in_leaf': 33, 'max_bin': 478, 'N_min': 27}. Best is trial 12 with value: 0.09125704099197136.




[I 2023-11-13 05:28:44,077] Trial 42 finished with value: 0.09279200488665726 and parameters: {'num_iterations': 659, 'learning_rate': 0.02356664773586553, 'num_leaves': 316, 'subsample': 0.5561174843013478, 'colsample_bytree': 0.7629099069598503, 'min_data_in_leaf': 39, 'max_bin': 608, 'N_min': 87}. Best is trial 12 with value: 0.09125704099197136.




[I 2023-11-13 05:28:47,506] Trial 43 finished with value: 0.0933933380949478 and parameters: {'num_iterations': 599, 'learning_rate': 0.014011796934034095, 'num_leaves': 449, 'subsample': 0.5010653428935362, 'colsample_bytree': 0.6010857133351483, 'min_data_in_leaf': 31, 'max_bin': 583, 'N_min': 121}. Best is trial 12 with value: 0.09125704099197136.




[I 2023-11-13 05:28:49,216] Trial 44 finished with value: 0.09880163684797066 and parameters: {'num_iterations': 764, 'learning_rate': 0.010204896250381531, 'num_leaves': 264, 'subsample': 0.5965492095382589, 'colsample_bytree': 0.6815483919348952, 'min_data_in_leaf': 100, 'max_bin': 179, 'N_min': 246}. Best is trial 12 with value: 0.09125704099197136.




[I 2023-11-13 05:28:54,240] Trial 45 finished with value: 0.09219055680455508 and parameters: {'num_iterations': 721, 'learning_rate': 0.0182266208319415, 'num_leaves': 364, 'subsample': 0.521280640145218, 'colsample_bytree': 0.7342831352238637, 'min_data_in_leaf': 25, 'max_bin': 543, 'N_min': 138}. Best is trial 12 with value: 0.09125704099197136.




[I 2023-11-13 05:28:58,604] Trial 46 finished with value: 0.09176629852707127 and parameters: {'num_iterations': 835, 'learning_rate': 0.03199955710799063, 'num_leaves': 625, 'subsample': 0.5492578309161107, 'colsample_bytree': 0.8361232577076705, 'min_data_in_leaf': 36, 'max_bin': 394, 'N_min': 21}. Best is trial 12 with value: 0.09125704099197136.




[I 2023-11-13 05:29:05,753] Trial 47 finished with value: 0.09056746318625584 and parameters: {'num_iterations': 531, 'learning_rate': 0.022211533935262557, 'num_leaves': 517, 'subsample': 0.5781755393245138, 'colsample_bytree': 0.6449946295668693, 'min_data_in_leaf': 14, 'max_bin': 445, 'N_min': 72}. Best is trial 47 with value: 0.09056746318625584.




[I 2023-11-13 05:29:12,974] Trial 48 finished with value: 0.09079967145124086 and parameters: {'num_iterations': 495, 'learning_rate': 0.021682634436077933, 'num_leaves': 518, 'subsample': 0.573863767679367, 'colsample_bytree': 0.522560016144267, 'min_data_in_leaf': 13, 'max_bin': 763, 'N_min': 57}. Best is trial 47 with value: 0.09056746318625584.




[I 2023-11-13 05:29:23,836] Trial 49 finished with value: 0.09162191599463068 and parameters: {'num_iterations': 518, 'learning_rate': 0.03649469591354819, 'num_leaves': 679, 'subsample': 0.581687640971656, 'colsample_bytree': 0.504014763906, 'min_data_in_leaf': 11, 'max_bin': 1013, 'N_min': 76}. Best is trial 47 with value: 0.09056746318625584.




[I 2023-11-13 05:29:31,434] Trial 50 finished with value: 0.09078861092759134 and parameters: {'num_iterations': 529, 'learning_rate': 0.02009196414724007, 'num_leaves': 524, 'subsample': 0.5264480799653575, 'colsample_bytree': 0.5332861399111979, 'min_data_in_leaf': 14, 'max_bin': 970, 'N_min': 54}. Best is trial 47 with value: 0.09056746318625584.




[I 2023-11-13 05:29:38,510] Trial 51 finished with value: 0.09144099352184976 and parameters: {'num_iterations': 529, 'learning_rate': 0.020876343462495346, 'num_leaves': 528, 'subsample': 0.5294720971847681, 'colsample_bytree': 0.4914688909019992, 'min_data_in_leaf': 14, 'max_bin': 925, 'N_min': 55}. Best is trial 47 with value: 0.09056746318625584.




[I 2023-11-13 05:29:45,669] Trial 52 finished with value: 0.09150490166497734 and parameters: {'num_iterations': 537, 'learning_rate': 0.028633197926629963, 'num_leaves': 509, 'subsample': 0.519010539176704, 'colsample_bytree': 0.5100460028311042, 'min_data_in_leaf': 14, 'max_bin': 913, 'N_min': 59}. Best is trial 47 with value: 0.09056746318625584.




[I 2023-11-13 05:29:52,460] Trial 53 finished with value: 0.09168992397651352 and parameters: {'num_iterations': 457, 'learning_rate': 0.020604733684807863, 'num_leaves': 583, 'subsample': 0.5314127232219897, 'colsample_bytree': 0.5657428851851534, 'min_data_in_leaf': 14, 'max_bin': 956, 'N_min': 70}. Best is trial 47 with value: 0.09056746318625584.




[I 2023-11-13 05:29:57,443] Trial 54 finished with value: 0.09292544849937645 and parameters: {'num_iterations': 492, 'learning_rate': 0.015804397087169023, 'num_leaves': 520, 'subsample': 0.5039482026389246, 'colsample_bytree': 0.4484511828094499, 'min_data_in_leaf': 19, 'max_bin': 969, 'N_min': 96}. Best is trial 47 with value: 0.09056746318625584.




[I 2023-11-13 05:30:06,258] Trial 55 finished with value: 0.09123390177660595 and parameters: {'num_iterations': 531, 'learning_rate': 0.02108637392543227, 'num_leaves': 637, 'subsample': 0.5733953743315642, 'colsample_bytree': 0.5811422393028312, 'min_data_in_leaf': 13, 'max_bin': 806, 'N_min': 52}. Best is trial 47 with value: 0.09056746318625584.




[I 2023-11-13 05:30:17,613] Trial 56 finished with value: 0.09154995838545561 and parameters: {'num_iterations': 480, 'learning_rate': 0.02500765383453227, 'num_leaves': 628, 'subsample': 0.6287526910158195, 'colsample_bytree': 0.5913964714738305, 'min_data_in_leaf': 10, 'max_bin': 826, 'N_min': 35}. Best is trial 47 with value: 0.09056746318625584.




[I 2023-11-13 05:30:23,532] Trial 57 finished with value: 0.09180532835339303 and parameters: {'num_iterations': 543, 'learning_rate': 0.01562005446768493, 'num_leaves': 745, 'subsample': 0.5868072561558632, 'colsample_bytree': 0.6367570093532685, 'min_data_in_leaf': 20, 'max_bin': 774, 'N_min': 107}. Best is trial 47 with value: 0.09056746318625584.




[I 2023-11-13 05:30:28,451] Trial 58 finished with value: 0.09187402814330477 and parameters: {'num_iterations': 575, 'learning_rate': 0.01880052404193528, 'num_leaves': 576, 'subsample': 0.6036014511417707, 'colsample_bytree': 0.5345864360381731, 'min_data_in_leaf': 25, 'max_bin': 721, 'N_min': 50}. Best is trial 47 with value: 0.09056746318625584.




[I 2023-11-13 05:30:36,884] Trial 59 finished with value: 0.09131880354643854 and parameters: {'num_iterations': 607, 'learning_rate': 0.029559698615725202, 'num_leaves': 806, 'subsample': 0.5711656936419909, 'colsample_bytree': 0.563932640917747, 'min_data_in_leaf': 16, 'max_bin': 860, 'N_min': 212}. Best is trial 47 with value: 0.09056746318625584.




[I 2023-11-13 05:30:47,483] Trial 60 finished with value: 0.09125242191953192 and parameters: {'num_iterations': 613, 'learning_rate': 0.03130895058211525, 'num_leaves': 815, 'subsample': 0.6309174346713257, 'colsample_bytree': 0.5842292911274524, 'min_data_in_leaf': 16, 'max_bin': 976, 'N_min': 261}. Best is trial 47 with value: 0.09056746318625584.




[I 2023-11-13 05:30:57,255] Trial 61 finished with value: 0.09172456845282946 and parameters: {'num_iterations': 619, 'learning_rate': 0.03216496804467351, 'num_leaves': 817, 'subsample': 0.6341283979082728, 'colsample_bytree': 0.5709575267356213, 'min_data_in_leaf': 16, 'max_bin': 874, 'N_min': 277}. Best is trial 47 with value: 0.09056746318625584.




[I 2023-11-13 05:31:04,261] Trial 62 finished with value: 0.09242792895595867 and parameters: {'num_iterations': 595, 'learning_rate': 0.04254564473527171, 'num_leaves': 906, 'subsample': 0.607699632137436, 'colsample_bytree': 0.5883536639825996, 'min_data_in_leaf': 22, 'max_bin': 975, 'N_min': 291}. Best is trial 47 with value: 0.09056746318625584.




[I 2023-11-13 05:31:17,286] Trial 63 finished with value: 0.09084128472342205 and parameters: {'num_iterations': 635, 'learning_rate': 0.021336976569225802, 'num_leaves': 743, 'subsample': 0.6569451134588903, 'colsample_bytree': 0.6276094467449463, 'min_data_in_leaf': 12, 'max_bin': 848, 'N_min': 247}. Best is trial 47 with value: 0.09056746318625584.




[I 2023-11-13 05:31:27,823] Trial 64 finished with value: 0.09071390350775517 and parameters: {'num_iterations': 555, 'learning_rate': 0.02118259819372841, 'num_leaves': 743, 'subsample': 0.6519413717615045, 'colsample_bytree': 0.643254578836947, 'min_data_in_leaf': 13, 'max_bin': 819, 'N_min': 257}. Best is trial 47 with value: 0.09056746318625584.




[I 2023-11-13 05:31:38,702] Trial 65 finished with value: 0.09071058644958978 and parameters: {'num_iterations': 549, 'learning_rate': 0.021246074427332846, 'num_leaves': 740, 'subsample': 0.6407953750624056, 'colsample_bytree': 0.6304618765593748, 'min_data_in_leaf': 12, 'max_bin': 816, 'N_min': 258}. Best is trial 47 with value: 0.09056746318625584.




[I 2023-11-13 05:31:46,813] Trial 66 finished with value: 0.09085094472568843 and parameters: {'num_iterations': 401, 'learning_rate': 0.021580606240724436, 'num_leaves': 737, 'subsample': 0.6615000656587647, 'colsample_bytree': 0.6207072215524307, 'min_data_in_leaf': 12, 'max_bin': 812, 'N_min': 258}. Best is trial 47 with value: 0.09056746318625584.




[I 2023-11-13 05:31:56,159] Trial 67 finished with value: 0.09094803770255815 and parameters: {'num_iterations': 402, 'learning_rate': 0.022450899361226455, 'num_leaves': 739, 'subsample': 0.6597945268401414, 'colsample_bytree': 0.634042802272719, 'min_data_in_leaf': 10, 'max_bin': 732, 'N_min': 256}. Best is trial 47 with value: 0.09056746318625584.




[I 2023-11-13 05:32:00,751] Trial 68 finished with value: 0.0925696080629796 and parameters: {'num_iterations': 369, 'learning_rate': 0.013393839279070303, 'num_leaves': 757, 'subsample': 0.6540447408748138, 'colsample_bytree': 0.6308449947814839, 'min_data_in_leaf': 20, 'max_bin': 833, 'N_min': 231}. Best is trial 47 with value: 0.09056746318625584.




[I 2023-11-13 05:32:10,397] Trial 69 finished with value: 0.09085673013215861 and parameters: {'num_iterations': 456, 'learning_rate': 0.01612005404495074, 'num_leaves': 701, 'subsample': 0.6807052449346912, 'colsample_bytree': 0.6143582134695772, 'min_data_in_leaf': 12, 'max_bin': 781, 'N_min': 256}. Best is trial 47 with value: 0.09056746318625584.




[I 2023-11-13 05:32:12,390] Trial 70 finished with value: 0.09582714808562191 and parameters: {'num_iterations': 508, 'learning_rate': 0.01857650089974262, 'num_leaves': 870, 'subsample': 0.6418820179757734, 'colsample_bytree': 0.6934795282500283, 'min_data_in_leaf': 81, 'max_bin': 885, 'N_min': 279}. Best is trial 47 with value: 0.09056746318625584.




[I 2023-11-13 05:32:21,731] Trial 71 finished with value: 0.0908442005593471 and parameters: {'num_iterations': 449, 'learning_rate': 0.01508553226741249, 'num_leaves': 709, 'subsample': 0.6732257599175931, 'colsample_bytree': 0.6559407197928531, 'min_data_in_leaf': 12, 'max_bin': 755, 'N_min': 256}. Best is trial 47 with value: 0.09056746318625584.




[I 2023-11-13 05:32:33,144] Trial 72 finished with value: 0.0910625958665944 and parameters: {'num_iterations': 551, 'learning_rate': 0.027013500724373907, 'num_leaves': 772, 'subsample': 0.6715620306471092, 'colsample_bytree': 0.6525966779441964, 'min_data_in_leaf': 13, 'max_bin': 743, 'N_min': 265}. Best is trial 47 with value: 0.09056746318625584.




[I 2023-11-13 05:32:37,554] Trial 73 finished with value: 0.09274250338063472 and parameters: {'num_iterations': 406, 'learning_rate': 0.01487627827361351, 'num_leaves': 710, 'subsample': 0.6171364754820425, 'colsample_bytree': 0.6573821291649747, 'min_data_in_leaf': 22, 'max_bin': 844, 'N_min': 220}. Best is trial 47 with value: 0.09056746318625584.




[I 2023-11-13 05:32:44,484] Trial 74 finished with value: 0.09081277863961168 and parameters: {'num_iterations': 468, 'learning_rate': 0.02054599127785964, 'num_leaves': 659, 'subsample': 0.7036009121090151, 'colsample_bytree': 0.5409164176592607, 'min_data_in_leaf': 18, 'max_bin': 938, 'N_min': 245}. Best is trial 47 with value: 0.09056746318625584.




[I 2023-11-13 05:32:51,062] Trial 75 finished with value: 0.09108954712118775 and parameters: {'num_iterations': 466, 'learning_rate': 0.017409615911256154, 'num_leaves': 663, 'subsample': 0.6935659430347177, 'colsample_bytree': 0.5400374163457622, 'min_data_in_leaf': 18, 'max_bin': 904, 'N_min': 243}. Best is trial 47 with value: 0.09056746318625584.




[I 2023-11-13 05:32:56,095] Trial 76 finished with value: 0.09237630879455991 and parameters: {'num_iterations': 501, 'learning_rate': 0.02420850156685601, 'num_leaves': 659, 'subsample': 0.7103018695147507, 'colsample_bytree': 0.5375218660550928, 'min_data_in_leaf': 27, 'max_bin': 935, 'N_min': 200}. Best is trial 47 with value: 0.09056746318625584.




[I 2023-11-13 05:33:02,873] Trial 77 finished with value: 0.09100938804210149 and parameters: {'num_iterations': 443, 'learning_rate': 0.019791150332901013, 'num_leaves': 780, 'subsample': 0.6733683497746827, 'colsample_bytree': 0.7055403130417466, 'min_data_in_leaf': 16, 'max_bin': 673, 'N_min': 231}. Best is trial 47 with value: 0.09056746318625584.




[I 2023-11-13 05:33:11,280] Trial 78 finished with value: 0.09094144694860364 and parameters: {'num_iterations': 644, 'learning_rate': 0.013905978895550139, 'num_leaves': 597, 'subsample': 0.6985080504691474, 'colsample_bytree': 0.6053403775604315, 'min_data_in_leaf': 19, 'max_bin': 781, 'N_min': 249}. Best is trial 47 with value: 0.09056746318625584.




[I 2023-11-13 05:33:26,533] Trial 79 finished with value: 0.0908112474662479 and parameters: {'num_iterations': 558, 'learning_rate': 0.027154244871209758, 'num_leaves': 570, 'subsample': 0.6515887604858498, 'colsample_bytree': 0.6700470924398497, 'min_data_in_leaf': 10, 'max_bin': 1001, 'N_min': 269}. Best is trial 47 with value: 0.09056746318625584.


Best hyperparameters: {'num_iterations': 531, 'learning_rate': 0.022211533935262557, 'num_leaves': 517, 'subsample': 0.5781755393245138, 'colsample_bytree': 0.6449946295668693, 'min_data_in_leaf': 14, 'max_bin': 445, 'N_min': 72}
Best MAPE: 0.09056746318625584


In [10]:
X_TRAIN = pd.read_csv('data/x_train.csv')
Y_TRAIN = pd.read_csv('data/y_valid.csv')
display(Y_TRAIN.isna().value_counts())

單價   
False    2351
Name: count, dtype: int64