# Imports

In [38]:
# Hide warnings
import warnings
warnings.filterwarnings("ignore")

# Imports
import time
from IPython.display import display, Javascript
import itertools
import duckdb
import pandas as pd
import numpy as np
import math

# Plot
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import mutual_info_regression

# Data process
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

# Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import PowerTransformer

# Train
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score

# Models
import umap
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
#from catboost import CatBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Metrics
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from sklearn.metrics import r2_score

# Install sqlite as a extension of duckdb
#duckdb.install_extension('sqlite')

### Import data procesed

In [58]:
X = pd.read_csv('../data/processed/X_without_outliers_dupli.csv')
y = pd.read_csv('../data/processed/y_without_outliers_dupli.csv')

In [43]:
X_test = pd.read_csv('../data/processed/x_test.csv')

# Machine learning

## Fine-Tuning models

#### XGBBoost

In [15]:
#param_Rodri = {'colsample_bytree': 0.95, 'gamma': 0.14, 'learning_rate': 0.012, 'max_depth': 7, 'missing': np.inf, 
#               'n_estimators': 1130, 'subsample': 0.8}

param_grid_xgb = {'n_estimators': [950],  # Number of trees in the forest.
                  'max_depth': [5],  # Maximum depth of the trees.
                  'subsample': [1],
                  'colsample_bytree': [0.8],
                  'lambda': [0.7],
                  'gamma': [0.05],
                  'learning_rate': [0.035]
                 }

xgb_reg = XGBRegressor(random_state=0)

xgb_grid_search = GridSearchCV(xgb_reg, param_grid_xgb, cv=5, scoring='neg_root_mean_squared_error', return_train_score=True, verbose=1, n_jobs=-1)

xgb_grid_search.fit(X, y)

print('\n')
print('Best hyperparameters: ', xgb_grid_search.best_params_, '\n')
print('Best score: ', xgb_grid_search.best_score_, '\n')

Fitting 5 folds for each of 3 candidates, totalling 15 fits


Best hyperparameters:  {'colsample_bytree': 0.8, 'gamma': 0.05, 'lambda': 0.7, 'learning_rate': 0.035, 'max_depth': 5, 'n_estimators': 950, 'subsample': 1} 

Best score:  -541.5312819563311 



#### LGBM Regressor

In [20]:
param_grid_lgbm = {'num_leaves': [30], 
                   'learning_rate': [0.02] ,
                   'n_estimators': [1300],
                   'max_depth': [-1]
                  }

lgbm_reg = LGBMRegressor(random_state=0)

lgbm_grid_search = GridSearchCV(param_grid_lgbm, param_grid, cv=5, scoring='neg_root_mean_squared_error', return_train_score=True, verbose=1, n_jobs=-1)

lgbm_grid_search.fit(X, y)

print('\n')
print('Best hyperparameters: ', lgbm_grid_search.best_params_, '\n')
print('Best score: ', lgbm_grid_search.best_score_, '\n')

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002037 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1196
[LightGBM] [Info] Number of data points in the train set: 40455, number of used features: 10
[LightGBM] [Info] Start training from score 3928.444469


Best hyperparameters:  {'learning_rate': 0.02, 'max_depth': -1, 'n_estimators': 1300, 'num_leaves': 30} 

Best score:  -540.2685352100276 



#### ExtraTreesRegressor

In [27]:
param_grid = {'n_estimators': [70, 100, 150], 
              'max_depth': [None],
              'criterion': ['squared_error'],
              'max_features': ['sqrt'],
              'min_samples_leaf': [1, 2],
              'min_samples_split': [3, 4]
             }

extrees_reg = ExtraTreesRegressor(random_state=0)

extrees_grid_search = GridSearchCV(extrees_reg, param_grid, cv=5, scoring='neg_root_mean_squared_error', verbose=1, n_jobs=-1)

extrees_grid_search.fit(X, y)

print('\n')
print('Best hyperparameters: ', extrees_grid_search.best_params_, '\n')
print('Best score: ', extrees_grid_search.best_score_, '\n')

Fitting 5 folds for each of 12 candidates, totalling 60 fits


Best hyperparameters:  {'criterion': 'squared_error', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 100} 

Best score:  -623.2712676843757 



#### RandomForestRegressor

In [32]:
%%time
param_grid_rf = {'n_estimators': [1400],  # Number of trees in the forest.
                 'max_depth': [None],  # Maximum depth of the trees.
                 'min_samples_split': [7],  # Minimum number of samples required to split an internal node.
                 'min_samples_leaf': [1],  # Minimum number of samples required to be at a leaf node.
                 'max_features': [None]  # Number of features to consider when looking for the best split.
                }

rf_reg = RandomForestRegressor(random_state=0)

rf_grid_search = GridSearchCV(rf_reg, param_grid_rf, cv=5, scoring='neg_root_mean_squared_error', verbose=1, n_jobs=-1)

rf_grid_search.fit(X, y)

print('\n')
print('Best hyperparameters: ', rf_grid_search.best_params_, '\n')
print('Best score: ', rf_grid_search.best_score_, '\n')

Fitting 5 folds for each of 3 candidates, totalling 15 fits


Best hyperparameters:  {'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 7, 'n_estimators': 1400} 

Best score:  -559.8022665932598 

CPU times: total: 7min 45s
Wall time: 25min 20s


In [35]:
%%time
# Define the model
mlp = MLPRegressor(max_iter=100)

# Set up the parameter grid
param_grid_mlp = {'hidden_layer_sizes': [(50,100,50)],
                  'activation': ['relu'],
                  'solver': ['adam'],
                  'alpha': [0.05],
                 }

# Instantiate GridSearchCV
mlp_grid_search = GridSearchCV(mlp, param_grid_mlp, cv=5)

# Fit the GridSearchCV object
mlp_grid_search.fit(X, y)

# Access the results
print('\n')
print('Best hyperparameters: ', mlp_grid_search.best_params_, '\n')
print('Best score: ', mlp_grid_search.best_score_, '\n')



Best hyperparameters:  {'activation': 'relu', 'alpha': 0.05, 'hidden_layer_sizes': (50, 100, 50), 'solver': 'adam'} 

Best score:  0.9309345150809444 

CPU times: total: 2h 26min 15s
Wall time: 36min 37s


## Train the model
#### Estimators

In [36]:
param_grid_xgb = {'n_estimators': 950,  # Number of trees in the forest.
                  'max_depth': 5,  # Maximum depth of the trees.
                  'subsample': 1,
                  'colsample_bytree': 0.8,
                  'lambda': 0.7,
                  'gamma': 0.05,
                  'learning_rate': 0.035
                 }
param_grid_lgbm = {'num_leaves': 30, 
                   'learning_rate': 0.02 ,
                   'n_estimators': 1300,
                   'max_depth': -1
                  }
param_grid_rf = {'n_estimators': 1400,  # Number of trees in the forest.
                 'max_depth': None,  # Maximum depth of the trees.
                 'min_samples_split': 7,  # Minimum number of samples required to split an internal node.
                 'min_samples_leaf': 1,  # Minimum number of samples required to be at a leaf node.
                 'max_features': None  # Number of features to consider when looking for the best split.
                }
param_grid_mlp = {'hidden_layer_sizes': (50,100,50),
                  'activation': 'relu',
                  'solver': 'adam',
                  'alpha': 0.05,
                 }

### Stacking without final_estimator
#### Train stack model

In [59]:
%%time
xgb_model = XGBRegressor(**param_grid_xgb)
lgb_model = LGBMRegressor(**param_grid_lgbm)
rf_model = RandomForestRegressor(**param_grid_rf)
mlp_model = MLPRegressor(**param_grid_mlp)

estimators = [('xgb1', xgb_model),
              ('lgbm1', lgb_model),
              ('rf', rf_model),
              ('mlp', mlp_model)]
              #('extrees', extrees_model),]

stack_model = StackingRegressor(estimators=estimators, cv=None, n_jobs=-1, verbose=True, passthrough=True)   #cv=None

cv_results = []
cv_score = cross_val_score(stack_model, X, y, scoring="neg_root_mean_squared_error", cv=5)
cv_results.append(cv_score)

hyperparameters = stack_model.get_params()
cv_score_mean = abs(np.mean(cv_results))
print('Hyperparameters: ', hyperparameters, ' | cv_score_mean:', cv_score_mean)

stack_model.fit(X,y)
hyperparameters_all = stack_model.get_params()

Hyperparameters:  {'cv': None, 'estimators': [('xgb1', XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.8, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=0.05, grow_policy=None, importance_type=None,
             interaction_constraints=None, lambda=0.7, learning_rate=0.035,
             max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=5, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=950, n_jobs=None,
             num_parallel_tree=None, ...)), ('lgbm1', LGBMRegressor(learning_rate=0.02, n_estimators=1300, num_leaves=30)), ('rf', RandomForestRegressor(max_features=None, min_samples_split=7, n_estimators=1400)), ('mlp', MLPRegressor(alpha=0.05, hidd

#### Result

In [41]:
hyperparameters = stack_model.get_params()
cv_score_mean = abs(np.mean(cv_results))
print('Hyperparameters: ', hyperparameters, ' | cv_score_mean:', cv_score_mean)

stack_model.fit(X,y)
hyperparameters_all = stack_model.get_params()

Hyperparameters:  {'cv': None, 'estimators': [('lgbm1', XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.8, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=0.05, grow_policy=None, importance_type=None,
             interaction_constraints=None, lambda=0.7, learning_rate=0.035,
             max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=5, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=950, n_jobs=None,
             num_parallel_tree=None, ...)), ('xgb1', LGBMRegressor(learning_rate=0.02, n_estimators=1300, num_leaves=30)), ('rf', RandomForestRegressor(max_features=None, min_samples_split=7, n_estimators=1400)), ('mlp', MLPRegressor(alpha=0.05, hidd

#### Obtain the prediction

In [60]:
y_pred = stack_model.predict(X_test)
len(y_pred)

13485

In [61]:
y_pred_df = pd.DataFrame(y_pred)
y_pred_df.reset_index(inplace=True)
y_pred_df.columns = ['id', 'price']
y_pred_df.to_csv('../data/submisions/stacking_without_final_estimator_cv_remove_dupl_out.csv', index=False)

### Stacking with final_estimator

In [52]:
%%time
xgb_model = XGBRegressor(**param_grid_xgb)
lgb_model = LGBMRegressor(**param_grid_lgbm)
rf_model = RandomForestRegressor(**param_grid_rf)
mlp_model = MLPRegressor(**param_grid_mlp)

estimators = [('xgb1', xgb_model),
              ('lgbm1', lgb_model),
              ('rf', rf_model),
              ('mlp', mlp_model)]
              #('extrees', extrees_model),]

final_estimator = XGBRegressor()  # Definir el estimador final

stack_model_final_est = StackingRegressor(estimators=estimators, final_estimator=final_estimator,
                                cv=5, n_jobs=-1, verbose=True, passthrough=True)

cv_results = []
cv_score = cross_val_score(stack_model_final_est, X, y, scoring="neg_root_mean_squared_error", cv=5)
cv_results.append(cv_score)
cv_score_mean = abs(np.mean(cv_results))

stack_model_final_est.fit(X,y)
hyperparameters_stacking_2 = stack_model_final_est.get_params()
print('Hyperparameters: ', hyperparameters_stacking_2, ' | cv_score_mean:', cv_score_mean)

Hyperparameters:  {'cv': 5, 'estimators': [('xgb1', XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.8, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=0.05, grow_policy=None, importance_type=None,
             interaction_constraints=None, lambda=0.7, learning_rate=0.035,
             max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=5, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=950, n_jobs=None,
             num_parallel_tree=None, ...)), ('lgbm1', LGBMRegressor(learning_rate=0.02, n_estimators=1300, num_leaves=30)), ('rf', RandomForestRegressor(max_features=None, min_samples_split=7, n_estimators=1400)), ('mlp', MLPRegressor(alpha=0.05, hidden_

#### Obtain the prediction

In [53]:
y_pred_stack_wfe = stack_model_final_est.predict(X_test)
len(y_pred_stack_wfe)

13485

In [54]:
y_pred_stack_wfe_df = pd.DataFrame(y_pred_stack_wfe)
y_pred_stack_wfe_df.reset_index(inplace=True)
y_pred_stack_wfe_df.columns = ['id', 'price']
y_pred_stack_wfe_df.to_csv('../data/submisions/stacking_with_final_estimator.csv', index=False)

### Voting

In [48]:
%%time
xgb_model = XGBRegressor(**param_grid_xgb)
lgb_model = LGBMRegressor(**param_grid_lgbm)
rf_model = RandomForestRegressor(**param_grid_rf)
mlp_model = MLPRegressor(**param_grid_mlp)

estimators = [('xgb1', xgb_model),
              ('lgbm1', lgb_model),
              ('rf', rf_model),
              ('mlp', mlp_model)]
              #('extrees', extrees_model),]

voting_model = VotingRegressor(estimators=estimators, n_jobs=-1, verbose=True)

cv_results = []
cv_score = cross_val_score(voting_model, X, y, scoring="neg_root_mean_squared_error", cv=5)
cv_results.append(cv_score)

hyperparameters = voting_model.get_params()
cv_score_mean = abs(np.mean(cv_results))
print('Hyperparameters: ', hyperparameters, ' | cv_score_mean:', cv_score_mean)

voting_model.fit(X,y)
hyperparameters_voting = voting_model.get_params()
print('Hyperparameters: ', hyperparameters_voting)

Hyperparameters:  {'estimators': [('xgb1', XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.8, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=0.05, grow_policy=None, importance_type=None,
             interaction_constraints=None, lambda=0.7, learning_rate=0.035,
             max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=5, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=950, n_jobs=None,
             num_parallel_tree=None, ...)), ('lgbm1', LGBMRegressor(learning_rate=0.02, n_estimators=1300, num_leaves=30)), ('rf', RandomForestRegressor(max_features=None, min_samples_split=7, n_estimators=1400)), ('mlp', MLPRegressor(alpha=0.05, hidden_layer_siz

#### Obtain the prediction

In [50]:
y_pred_voting = voting_model.predict(X_test)
len(y_pred_voting)

13485

In [51]:
y_pred_voting_df = pd.DataFrame(y_pred_voting)
y_pred_voting_df.reset_index(inplace=True)
y_pred_voting_df.columns = ['id', 'price']
y_pred_voting_df.to_csv('../data/submisions/voting.csv', index=False)