In [1]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import *
from sklearn.linear_model import *
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.neural_network import *
import xgboost as xgb
import catboost as cat
from sklearn.preprocessing import StandardScaler, scale
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split, KFold
import lightgbm as lgb
import sklearn
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

train,test = pd.read_csv('train.csv'),pd.read_csv('test.csv')
data = pd.concat([train,test], ignore_index=True)
#some features are masked by 0, recover
data.loc[data.age==0, 'age'] = None
data.loc[data.charge_sensitivity==0, 'charge_sensitivity'] = data.charge_sensitivity.median()
data.internet_age = data.internet_age.apply(lambda x: x/12)

trn_data, test_data = data[:len(train)],data[len(train):]
trn_data.drop(['id','age',], axis=1, inplace=True)
del test_data['credit']
target = trn_data['credit']
del trn_data['credit']

X_trn, X_val, y_trn, y_val = train_test_split(trn_data, target, test_size=0.2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [2]:
from sklearn.ensemble import *
from sklearn.linear_model import *
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.neural_network import *

def base_model(regressor, X_trn, X_val, y_trn, y_val,K_fold=5,Scale=False):
    X_trn, y_trn = X_trn.to_numpy(), y_trn.to_numpy()
    X_val, y_val = X_val.to_numpy(), y_val.to_numpy()
    scaler = StandardScaler()
    if Scale:
        pipe = Pipeline([('PreTransformer', scaler),\
            ('Regressor', regressor)])
    else:
        pipe = Pipeline([('Regressor', regressor)])
    print(pipe)

    y_pred = np.zeros_like(y_val)
    folds = KFold(n_splits=K_fold,shuffle=True,random_state=0)
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_trn, y_trn)):
        print(f"\n======fold No.{fold_+1}======")
        print('CV train_sample size : %d' % len(trn_idx))
        trn_X, trn_y = X_trn[trn_idx], y_trn[trn_idx]
        val_X, val_y = X_trn[val_idx], y_trn[val_idx]
        #fit
        pipe.fit(trn_X, trn_y)
        #predict on k_fold splitted val_X
        val_y_pred = pipe.predict(val_X)
        print('CV MAE Loss: %.2f' % mean_absolute_error(val_y_pred, val_y))
        #predict on X_val
        y_pred += pipe.predict(X_val)
    #div K-fold number
    y_pred /= K_fold
    #final metric on val-set
    MAE = mean_absolute_error(y_pred, y_val)
    print('\nMAE Loss: %.5f' % MAE)
    print('Final Score: %.5f' % (1/(1+MAE)))

In [3]:
def lgb_model(X_trn, X_val, y_trn, y_val,K_fold=5):
    X_trn, y_trn = X_trn.to_numpy(), y_trn.to_numpy()
    X_val, y_val = X_val.to_numpy(), y_val.to_numpy()
    scaler = StandardScaler()
    scaler.fit(X_trn)
    X_trn = scaler.transform(X_trn)
    X_val = scaler.transform(X_val)
    y_pred = np.zeros_like(y_val)
    folds = KFold(n_splits=K_fold,shuffle=True,random_state=0)

    reg = lgb.LGBMRegressor(objective='regression',reg_alpha=0.6, n_estimators=10000)

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_trn, y_trn)):
        print(f"\n======fold No.{fold_+1}======")
        print('CV train_sample size : %d' % len(trn_idx))
        trn_X, trn_y = X_trn[trn_idx], y_trn[trn_idx]
        val_X, val_y = X_trn[val_idx], y_trn[val_idx]
        #fit
        reg.fit(trn_X, trn_y, eval_set = [(val_X,val_y)],early_stopping_rounds=500,verbose=10)
        #predict on k_fold splitted val_X
        val_y_pred = reg.predict(val_X, num_iteration = reg.best_iteration_)
        print('CV MAE Loss: %.2f' % mean_absolute_error(val_y_pred, val_y))
        #predict on X_val
        y_pred += reg.predict(X_val, num_iteration = reg.best_iteration_) / K_fold
    #final metric on val-set
    MAE = mean_absolute_error(y_pred, y_val)
    print('\nMAE Loss: %.5f' % MAE)
    print('Final Score: %.5f' % (1/(1+MAE)))
    return y_pred

In [5]:
# lgb_pred = lgb_model1(X_trn, X_val, y_trn, y_val,K_fold=5)

In [None]:
cat_params_v1 = {'depth': 6, 'learning_rate': 0.8, 'l2_leaf_reg': 2, 'num_boost_round': 10000, 'random_seed': 94,
                 'loss_function': 'MAE'}
def cat_model(X_trn, X_val, y_trn, y_val, K_fold=10, cat_params=cat_params_v1):
    X_trn, y_trn = X_trn.to_numpy(), y_trn.to_numpy()
    X_val, y_val = X_val.to_numpy(), y_val.to_numpy()
    y_pred = np.zeros_like(y_val)
    folds = KFold(n_splits=K_fold, shuffle=True, random_state=0)

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_trn, y_trn)):
        print(f"\n======fold No.{fold_+1}======")
        print('CV train_sample size : %d' % len(trn_idx))
        regor = cat.CatBoostRegressor(**cat_params)

        regor.fit(X_trn[trn_idx], y_trn[trn_idx], early_stopping_rounds=200, verbose_eval=1000,
                  use_best_model=True, eval_set=(X_trn[val_idx], y_trn[val_idx]))
        y_pred += regor.predict(X_val) / K_fold
    return y_pred


In [None]:
cat_pred = cat_model(X_trn, X_val, y_trn, y_val, K_fold=10, cat_params=cat_params_v1)

In [10]:
def xgb_sklearn(X_trn, X_val, y_trn, y_val, K_fold=5):
    X_trn, y_trn = X_trn.to_numpy(), y_trn.to_numpy()
    X_val, y_val = X_val.to_numpy(), y_val.to_numpy()
    y_pred = np.zeros_like(y_val)
    folds = KFold(n_splits=K_fold, shuffle=True, random_state=0)

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_trn, y_trn)):
        print(f"\n======fold No.{fold_+1}======")
        print('CV train_sample size : %d' % len(trn_idx))
        trn_X, trn_y = X_trn[trn_idx], y_trn[trn_idx]
        val_X, val_y = X_trn[val_idx], y_trn[val_idx]
        regor = xgb.XGBRegressor(n_estimators=1000, max_depth=6, learning_rate=0.004, subsample=0.5,
                                 colsample_bytree=0.5, reg_alpha=0.2, n_jobs=-1, verbosity=0)

        regor.fit(trn_X, trn_y, eval_metric='mae', eval_set = [(val_X,val_y)],
                  early_stopping_rounds=200, verbose=True)
        y_pred += regor.predict(X_val,
                                ntree_limit=regor.best_ntree_limit) / K_fold

    # final metric on val-set
    MAE = mean_absolute_error(y_pred, y_val)
    print('\nMAE Loss: %.5f' % MAE)
    print('Final Score: %.5f' % (1/(1+MAE)))
    return y_pred

In [9]:
xgb_pred = xgb_sklearn(X_trn, X_val, y_trn, y_val, K_fold=5)


CV train_sample size : 32000


XGBoostError: value 10 for Parameter verbosity exceed bound [0,3]
verbosity: Flag to print out detailed breakdown of runtime.

In [None]:
mean_absolute_error((lgb_pred + cat_pred + xgb_pred)/3, y_val)

def ensemble(X_val,y_val,models):
    X_val, y_val = X_val.to_numpy(), y_val.to_numpy()
    y_pred = np.zeros_like(y_val)
    for model in models:
        y_pred += model.predict(X_val,
                                num_iteration = model.best_iteration_) / len(models)
    # final metric on val-set
    MAE = mean_absolute_error(y_pred, y_val)
    print('\nMAE Loss: %.5f' % MAE)
    print('Final Score: %.5f' % (1/(1+MAE)))