In [11]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_percentage_error
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

def load_data(path):
    return pd.read_csv(path)


def logarithm(data):
    return data.apply(lambda x: np.log(x))


def stacking():
    level_0 = list()
    level_0.append(
        (
            'xgb', XGBRegressor(
                objective='reg:squarederror',
                n_estimators=1000,
                learning_rate=0.05,
                max_depth=7,
                reg_alpha= 0.05,
                random_state=42
            )
        )
    )
    level_0.append(
        (
            'cat', CatBoostRegressor(
                iterations=1000,
                depth=10,
                learning_rate=0.087,
                l2_leaf_reg=0.0715564,
                subsample=0.7963,
                colsample_bylevel=0.94634,
                bagging_temperature=0.0709,
                border_count=232,
                random_strength=0.63275,
                verbose=False,
                random_state=42
            )
        )
    )
    level_0.append(
        (
            'lgbm', LGBMRegressor(
                num_iterations = 588,
                learning_rate = 0.018049943310703906,
                num_leaves = 829,
                subsample = 0.8920214447324074,
                colsample_bytree = 0.5330930972309851,
                min_data_in_leaf = 25,
                max_bin = 505,
                random_state=42
            )
        )
    )
    level_1 = Ridge(alpha=0.5)
    stack_model = StackingRegressor(estimators=level_0, final_estimator=level_1, cv=5)
    return stack_model

class BetaEncoder(object):
        
    def __init__(self, group):
        self.group = group
        self.stats = None
        
    # get counts from df
    def fit(self, df, target_col):
        self.prior_mean = np.mean(df[target_col]) 
        stats = df[[target_col, self.group]].groupby(self.group)
        stats = stats.agg(['sum', 'count'])[target_col]    
        stats.rename(columns={'sum': 'n', 'count': 'N'}, inplace=True)
        stats.reset_index(level=0, inplace=True)           
        self.stats = stats

    # extract posterior statistics
    def transform(self, df, stat_type, N_min=1):
        
        df_stats = pd.merge(df[[self.group]], self.stats, how='left')
        n = df_stats['n'].copy()
        N = df_stats['N'].copy()
        
        # fill in missing
        nan_indexs = np.isnan(n)
        n[nan_indexs] = self.prior_mean
        N[nan_indexs] = 1.0
        
        # prior parameters
        N_prior = np.maximum(N_min-N, 0)
        alpha_prior = self.prior_mean*N_prior
        beta_prior  = (1-self.prior_mean)*N_prior
        
        # posterior parameters
        alpha =  alpha_prior + n
        beta =  beta_prior  + N-n
        
        # calculate statistics
        if stat_type=='mean':
            num = alpha
            dem = alpha+beta
                    
        elif stat_type=='mode':
            num = alpha-1
            dem = alpha+beta-2
            
        elif stat_type=='median':
            num = alpha-1/3
            dem = alpha+beta-2/3
        
        elif stat_type=='var':
            num = alpha*beta
            dem = (alpha+beta)**2*(alpha+beta+1)
                    
        elif stat_type=='skewness':
            num = 2*(beta-alpha)*np.sqrt(alpha+beta+1)
            dem = (alpha+beta+2)*np.sqrt(alpha*beta)

        elif stat_type=='kurtosis':
            num = 6*(alpha-beta)**2*(alpha+beta+1) - alpha*beta*(alpha+beta+2)
            dem = alpha*beta*(alpha+beta+2)*(alpha+beta+3)
            
        # replace missing
        value = num/dem
        value[np.isnan(value)] = np.nanmedian(value)
        return value

In [20]:
# x_data = load_data(f"{os.getcwd()}/data/train_feat.csv")
# y_data = load_data(f"{os.getcwd()}/data/train_output.csv")
# y_train = load_data(f"{os.getcwd()}/data/y_train.csv")
# y_valid = load_data(f"{os.getcwd()}/data/y_valid.csv")
# public_train = load_data(f"{os.getcwd()}/data/test_feat.csv")
sub_data = load_data("/Users/abnerteng/Desktop/code/FW23_AICUP/data/public_private_submission_template.csv")
# # private_train = load_data(f"{os.getcwd()}/data/test_feat_v2.csv")
# x_train, x_valid, y_train, y_valid = train_test_split(x_data, y_data)
# print(
#     f"Train shape: {x_train.shape}, {y_train.shape}, \
#         Valid shape: {x_valid.shape}, {y_valid.shape}"
# )
# y_train, y_valid = logarithm(y_train), logarithm(y_valid)
# stack_model = stacking()
# stack_model.fit(x_train, y_train)
# y_pred = stack_model.predict(x_valid)
# y_pred, y_valid = np.exp(y_pred), np.exp(y_valid)
# mape = mean_absolute_percentage_error(y_valid, y_pred)
# print(f"MAPE: {mape * 100}")
# public_pred = np.exp(stack_model.predict(public_train))
# sub_data['predicted_price'] = public_pred
# sub_data.to_csv(f"{os.getcwd()}/data/public_submission_stack_v4.csv", index=False)


In [16]:
raw_data = pd.read_csv("/Users/abnerteng/Desktop/code/FW23_AICUP/data/training_data.csv")
# display(raw_data)

train = pd.read_csv("/Users/abnerteng/Desktop/code/FW23_AICUP/data/train_feat.csv")
# display(train.info())
feat_cols = ['土地面積','移轉層次','總樓層數','屋齡','建物面積','車位面積','車位個數','橫坐標','縱坐標','主建物面積','陽台面積',
                '附屬建物面積','N_lib_2000','avg_distances_高中','avg_distances_國小','avg_distances_火車','avg_distances_醫療',
                'avg_distances_公車','avg_distances_國中','avg_distances_大學','avg_distances_便利','avg_distances_AT',
                'avg_distances_金融','avg_distances_捷運','avg_distances_郵局',
                'avg_tax','density','edu_p']
# '縣市_台北市','縣市_台中市','縣市_台南市','縣市_新北市','縣市_高雄市','縣市_桃園市'
cat_cols = ['使用分區','主要用途','主要建材','建物型態','縣市']

# raw_data['縣市_鄉鎮市區'] = raw_data['縣市'] + '_' + raw_data['鄉鎮市區']

selected_X = train[feat_cols]
cat_X = raw_data[cat_cols+['單價']]
X = pd.concat([selected_X, cat_X], axis=1)
Y = pd.read_csv("/Users/abnerteng/Desktop/code/FW23_AICUP/data/train_output.csv")
Y = np.log(Y)

test = pd.read_csv("/Users/abnerteng/Desktop/code/FW23_AICUP/data/test_feat.csv")
private_org = pd.read_csv("/Users/abnerteng/Desktop/code/FW23_AICUP/data/private_dataset_org.csv") 
private_test = pd.read_csv("/Users/abnerteng/Desktop/code/FW23_AICUP/data/private_dataset.csv")
raw_test_data = pd.read_csv("/Users/abnerteng/Desktop/code/FW23_AICUP/data/public_dataset.csv")
# raw_test_data['縣市_鄉鎮市區'] = raw_test_data['縣市'] + '_' + raw_test_data['鄉鎮市區']
selected_X = test[feat_cols]
selected_private_X = private_test[feat_cols]
cat_X = raw_test_data[cat_cols]
cat_X_private = private_org[cat_cols]
X_test = pd.concat([selected_X, cat_X], axis=1)
private_X_test = pd.concat([selected_private_X, cat_X_private], axis=1)

In [17]:
N_min = 20

for col in cat_cols:
    # print(f"now at {col}")
    le = LabelEncoder()
    le.fit(np.concatenate([X[col], X_test[col]]))
    X[col] = le.transform(X[col])
    X_test[col] = le.transform(X_test[col])
    private_X_test[col] = le.transform(private_X_test[col])

x_train, x_valid, y_train, y_valid = train_test_split(X, Y, test_size=0.2, random_state=507)
x_train.reset_index(inplace=True, drop=True)
x_valid.reset_index(inplace=True, drop=True)
y_train.reset_index(inplace=True, drop=True)
y_valid.reset_index(inplace=True, drop=True)
y_valid = np.exp(y_valid)


for c in cat_cols:
    # fit encoder
    be = BetaEncoder(c)
    be.fit(x_train, '單價')
    # mean
    feature_name = f'{c}_mean'
    x_train[feature_name] = be.transform(x_train, 'mean', N_min)
    x_valid[feature_name] = be.transform(x_valid, 'mean', N_min)
    X_test[feature_name] = be.transform(X_test, 'mean', N_min)
    private_X_test[feature_name] = be.transform(private_X_test, 'mean', N_min)

x_train = x_train.drop(['單價']+cat_cols,axis=1)
x_valid = x_valid.drop(['單價']+cat_cols,axis=1)
X_test = X_test.drop(cat_cols,axis=1)
private_X_test = private_X_test.drop(cat_cols,axis=1)
# x_train : training data x
# y_train : training data target label. Already log
# x_valid : validation data x
# y_valid : validation data target label. Already exp
# X_test : public training data x

In [19]:
print(
    f"Train shape: {x_train.shape}, {y_train.shape}, \
        Valid shape: {x_valid.shape}, {y_valid.shape}"
)

stack_model = stacking()
stack_model.fit(x_train, y_train)
y_pred = stack_model.predict(x_valid)
y_pred = np.exp(y_pred)
mape = mean_absolute_percentage_error(y_valid, y_pred)
print(f"MAPE: {mape * 100}")
public_pred = np.exp(stack_model.predict(X_test))
private_pred = np.exp(stack_model.predict(private_X_test))
sub_data.loc[:5876, 'predicted_price'] = public_pred
sub_data.loc[5876:, 'predicted_price'] = private_pred
sub_data.to_csv(f"{os.getcwd()}/data/public_private_submission_stack_v2.csv", index=False)

Train shape: (9400, 33), (9400, 1),         Valid shape: (2351, 33), (2351, 1)


  y = column_or_1d(y, warn=True)


















MAPE: 9.03757021728155


ValueError: Must have equal len keys and value when setting with an iterable

In [29]:
import optuna
from optuna import Trial

def objective_xgb(trial, xt, yt, xv, yv):
    config = {
        'objective': 'reg:squarederror',
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
        'random_state': 42
    }
    model = XGBRegressor(**config)
    model.fit(xt, yt)
    y_pred = model.predict(xv)
    y_pred = np.exp(y_pred)
    mape = mean_absolute_percentage_error(yv, y_pred)
    return mape * 100


def objective_cat(trial, xt, yt, xv, yv):
    config = {
        'iterations': 1000,
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-8, 1.0),
        'subsample': trial.suggest_uniform('subsample', 0.1, 1.0),
        'verbose': False,
        'random_state': 42
    }
    model = CatBoostRegressor(**config)
    model.fit(xt, yt)
    y_pred = model.predict(xv)
    y_pred = np.exp(y_pred)
    mape = mean_absolute_percentage_error(yv, y_pred)
    return mape * 100


def objective_lgbm(trial, xt, yt, xv, yv):
    config = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 1.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 1.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'random_state': 42
    }
    model = LGBMRegressor(**config)
    model.fit(xt, yt)
    y_pred = model.predict(xv)
    y_pred = np.exp(y_pred)
    mape = mean_absolute_percentage_error(yv, y_pred)
    return mape * 100

In [30]:
xgb_study = optuna.create_study(direction='minimize')
xgb_study.optimize(lambda trial: objective_xgb(trial, x_train, y_train, x_valid, y_valid), n_trials=100)
cat_study = optuna.create_study(direction='minimize')
cat_study.optimize(lambda trail: objective_cat(trail, x_train, y_train, x_valid, y_valid), n_trials=100)
lgbm_study = optuna.create_study(direction='minimize')
lgbm_study.optimize(lambda trail: objective_lgbm(trail, x_train, y_train, x_valid, y_valid), n_trials=100)
print("=== XGB ===")
print(xgb_study.best_params)
print(xgb_study.best_value)
print(xgb_study.best_trial)
print("=== CAT ===")
print(cat_study.best_params)
print(cat_study.best_value)
print(cat_study.best_trial)
print("=== LGBM ===")
print(lgbm_study.best_params)
print(lgbm_study.best_value)
print(lgbm_study.best_trial)

[I 2023-11-13 16:40:35,259] A new study created in memory with name: no-name-cb370bb0-db40-4bfe-9ff3-800cef382af1
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
[I 2023-11-13 16:40:40,171] Trial 0 finished with value: 10.420857054125744 and parameters: {'n_estimators': 366, 'max_depth': 6, 'learning_rate': 0.4381908884694044, 'reg_alpha': 8.692306555367405e-08}. Best is trial 0 with value: 10.420857054125744.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
[I 2023-11-13 16:40:52,794] Trial 1 finished with value: 9.432456944583944 and parameters: {'n_estimators': 506, 'max_depth': 9, 'learning_rate': 0.06587986565979737, 'reg_alpha': 9.70099279755863e-07}. Best is trial 1 with value: 9.432456944583944.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
  'reg_alpha': trial.suggest_loguni

KeyboardInterrupt: 