## Importing libraries

In [1]:
import pandas as pd
import numpy as np
from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn import linear_model
from sklearn.linear_model import LinearRegression, Ridge, BayesianRidge
from sklearn.svm import SVR
from scipy.stats import randint
import lightgbm as ltb
from sklearn.ensemble import StackingRegressor

from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

## Loading data

In [2]:
X_train = pd.read_pickle('/content/drive/MyDrive/Project/X_train.pkl')
y_train = pd.read_pickle('/content/drive/MyDrive/Project/y_train.pkl')
X_test = pd.read_pickle('/content/drive/MyDrive/Project/X_test.pkl')
test_ids = pd.read_pickle('/content/drive/MyDrive/Project/test_ids.pkl')

## Training models

In [3]:
# function to find the best params
def tune_hyperparameters(training_data, training_targets, model, param_grid):

  random_search = RandomizedSearchCV(
      model,
      param_distributions=param_grid,
      scoring='neg_mean_squared_error',
      n_jobs=-1,
      n_iter=20,
      cv=10)

  random_search.fit(np.array(training_data), np.array(training_targets))
  print('Best hyperparameters for', model, 'are:')
  print(random_search.best_params_)
  print('\n')
  return random_search.best_params_

In [4]:
# hyper-parameter tuning for XGBRegressor
param_xgboost = {'learning_rate':[0.01, 0.05, 0.1, 0.5, 1],
                 'n_estimators': np.arange(100,1000,10),
                 'iterations': np.arange(100,1000,100),
                 'max_depth': np.arange(1,100,10),}

xgboost_best_params = tune_hyperparameters(X_train, y_train, XGBRegressor(), param_xgboost)

Best hyperparameters for XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1) are:
{'n_estimators': 450, 'max_depth': 41, 'learning_rate': 0.05, 'iterations': 600}




In [5]:
# hyper-parameter tuning for LGBMRegressor
params_LGBM = {'reg_lambda': [0.01, 0.05, 0.1, 0.2, 0.5, 1, 2],
                     'reg_alpha': [0.01, 0.05, 0.1, 0.2, 0.5, 1, 2],
                     'min_child_samples': randint(1, 100),
                     'subsample': [x / 10 for x in range(1, 10, 1)], # bagging_fraction
                     'subsample_freq': randint(1, 200), # bagging_freq
                     'num_leaves': randint(1, 200),
                     'max_depth': list(range(1, 15, 1)),
                     'max_bin': randint(1, 700),
                     'learning_rate': [x / 200 for x in range(1, 10, 1)],
                     'colsample_bytree': [x / 10 for x in range(1, 11, 1)]} # feature_fraction 
                        
                    
LGBM_best_params = tune_hyperparameters(X_train, y_train, ltb.LGBMRegressor(), params_LGBM)

Best hyperparameters for LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0) are:
{'colsample_bytree': 0.3, 'learning_rate': 0.04, 'max_bin': 222, 'max_depth': 11, 'min_child_samples': 1, 'num_leaves': 25, 'reg_alpha': 0.5, 'reg_lambda': 0.05, 'subsample': 0.7, 'subsample_freq': 173}




In [6]:
# hyper-parameter tuning for SVR
params_svr = {'kernel' : ['rbf'],
              'C' : list(range(1, 100, 1)),
              'epsilon' : [x / 2000 for x in range(1, 50, 1)],
              'gamma' : [x / 10000 for x in range(1, 50, 1)]}


svr_best_params = tune_hyperparameters(X_train, y_train, SVR(), params_svr)

Best hyperparameters for SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False) are:
{'kernel': 'rbf', 'gamma': 0.0003, 'epsilon': 0.0175, 'C': 6}




In [7]:
# hyper-parameter tuning for Lasso
Lasso_model = linear_model.Lasso()

params_lasso = {'alpha': [x / 25000 for x in range(1, 50, 1)],
          'tol': [0.0000001], 
          'max_iter': [3000]}

lasso_best_params = tune_hyperparameters(X_train, y_train, linear_model.Lasso(), params_lasso)

Best hyperparameters for Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False) are:
{'tol': 1e-07, 'max_iter': 3000, 'alpha': 0.00064}




In [8]:
# hyper-parameter tuning for ElasticNet
params_elasticNet = {'alpha': [x / 25000 for x in range(1, 25, 1)],
             'l1_ratio': [x / 100 for x in range(10, 100, 1)],
             'tol': [0.000001], 
             'max_iter': [4000]}
elasticNet_best_params = tune_hyperparameters(X_train, y_train, linear_model.ElasticNet(), params_elasticNet)


Best hyperparameters for ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False) are:
{'tol': 1e-06, 'max_iter': 4000, 'l1_ratio': 0.38, 'alpha': 0.00088}




In [9]:
# hyper-parameter tuning for BayesianRidge
params_br = {
    'alpha_1':np.arange(1, 1e5, 10),
    'alpha_2':np.arange(1, 1e5, 10),
    'lambda_1':np.arange(1, 1e5, 10),
    'lambda_2':np.arange(1, 1e5, 10)
}

br_best_params = tune_hyperparameters(X_train, y_train, BayesianRidge(), params_br)

Best hyperparameters for BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, alpha_init=None,
              compute_score=False, copy_X=True, fit_intercept=True,
              lambda_1=1e-06, lambda_2=1e-06, lambda_init=None, n_iter=300,
              normalize=False, tol=0.001, verbose=False) are:
{'lambda_2': 8601.0, 'lambda_1': 31471.0, 'alpha_2': 62851.0, 'alpha_1': 37461.0}




In [10]:
# define the base models
level0 = list()
level0.append(('lasso', linear_model.Lasso(**lasso_best_params)))
level0.append(('elasticNet', linear_model.ElasticNet(**elasticNet_best_params)))
level0.append(('xg_boost', XGBRegressor(**xgboost_best_params)))
level0.append(('lgbm', ltb.LGBMRegressor(**LGBM_best_params)))
level0.append(('svm', SVR(**svr_best_params)))
level0.append(('br', BayesianRidge(**br_best_params)))

# define meta learner model
level1 = linear_model.Lasso(**lasso_best_params)
# define the stacking ensemble
model = StackingRegressor(estimators=level0, final_estimator=level1, cv=5)
# fit the model on all available data
model.fit(X_train, y_train)
# make a prediction for one example
yhat = model.predict(X_test)




In [11]:
# predicting on test data
predictions = pd.concat([test_ids, pd.Series(np.expm1(yhat), name='SalePrice')], axis=1)
predictions.to_csv('submission.csv', index = False)
from google.colab import files
files.download("submission.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>