# This notebook shows how SuloRegressor() beats the XGBRegressor() and LGBMRegressor() models, as well as a Voting ensemble, thanks to its superior design

### We are going to test it on a large dataset, using example code adapted from:
https://machinelearningmastery.com/weighted-average-ensemble-with-python/

Thanks to Jason Brownlee for his Machine Learning Mastery blogs. They are absolutely great!

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from featurewiz import SuloRegressor

  from pandas import MultiIndex, Int64Index


Imported version = 0.1.85.
from featurewiz import FeatureWiz
wiz = FeatureWiz(verbose=1)
X_train_selected = wiz.fit_transform(X_train, y_train)
X_test_selected = wiz.transform(X_test)
wiz.features  ### provides a list of selected features ###
                                


In [3]:
# evaluate SuloRegressor against standalone regression models and an equal-weight voting ensemble
from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import VotingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
# get a list of base models
# seeded LightGBM regressor; this single instance is what get_models() registers under 'LGBM'
lgbm = LGBMRegressor(n_estimators=100, random_state=0)
def get_models():
    """Return the (name, estimator) pairs benchmarked in this notebook.

    Returns
    -------
    list of (str, estimator) tuples
        'XGB'  -> a new XGBRegressor,
        'LGBM' -> the module-level ``lgbm`` instance (the same object on every call),
        'DT'   -> a new DecisionTreeRegressor.

    The XGBoost and decision-tree estimators are given ``random_state=0`` so
    that they are seeded like ``lgbm`` and the comparison can be re-run with
    the same configuration for every baseline.
    """
    return [
        ('XGB', XGBRegressor(random_state=0)),
        ('LGBM', lgbm),
        ('DT', DecisionTreeRegressor(random_state=0)),
    ]
 
# evaluate each base model
def evaluate_models(models, X_train, X_val, y_train, y_val):
    """Fit each model on the training data and score it on the evaluation data.

    Parameters
    ----------
    models : iterable of (name, estimator) pairs
        Each estimator is fitted in place via ``estimator.fit(X_train, y_train)``.
    X_train, y_train
        Data passed to ``fit``.
    X_val, y_val
        Data passed to ``predict`` and used as the reference targets.

    Returns
    -------
    list
        One RMSE value (square root of ``mean_squared_error``) per model,
        in the same order as ``models``.
    """
    def _fit_and_score(estimator):
        # train, predict on the evaluation split, then take the root of the MSE
        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_val)
        return np.sqrt(mean_squared_error(y_val, predictions))

    return [_fit_and_score(estimator) for _label, estimator in models]
# build the regression dataset (fixed seed)
X, y = make_regression(n_samples=10000, noise=1000, random_state=0)
# hold out 20% of the data as the test set
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.20, random_state=1
)
# hold out a further 20% of the remaining training data as a validation set
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.20, random_state=1
)

# baseline models: fit on the reduced training split, report validation RMSE
models = get_models()
scores = evaluate_models(models, X_train, X_val, y_train, y_val)
print(scores)

# SuloRegressor with a Ridge base estimator, fitted on the full training split
rr = Ridge()
ensemble = SuloRegressor(base_estimator=rr, n_estimators=None)
ensemble.fit(pd.DataFrame(X_train_full), pd.Series(y_train_full))
# score SuloRegressor on the held-out test set
yhat = ensemble.predict(X_test)
score = np.sqrt(mean_squared_error(y_test, yhat))
print('SuloRegressor RMSE: %.3f' % score)

# refit every standalone model on the full training split and score it on the test set
scores = evaluate_models(models, X_train_full, X_test, y_train_full, y_test)
for (label, _estimator), rmse in zip(models, scores):
    print('>%s: %.3f' % (label, rmse))

# equal-weight voting ensemble over the same base models, scored on the test set
ensemble = VotingRegressor(estimators=models)
ensemble.fit(X_train_full, y_train_full)
yhat = ensemble.predict(X_test)
score = np.sqrt(mean_squared_error(y_test, yhat))
print('Voting RMSE: %.3f' % score)

[1131.474087915824, 1069.183635117781, 1435.372818461376]
Input data shapes: X = (8000, 100), y = (8000,)
No GPU available on this device. Using CPU for lightgbm and others.
    Number of estimators used in SuloRegressor = 5
No HPT tuning performed since base estimator is given by input...
    Fold 1: Average OOF Error (smaller is better): 995.101
    Fold 2: Average OOF Error (smaller is better): 1008.726
    Fold 3: Average OOF Error (smaller is better): 998.311
    Fold 4: Average OOF Error (smaller is better): 999.055
    Fold 5: Average OOF Error (smaller is better): 1012.118
Time Taken: 0 (seconds)
SuloRegressor RMSE: 1016.472
>XGB: 1091.456
>LGBM: 1037.188
>DT: 1483.353
Voting RMSE: 1103.755


In [6]:
# show the shape of the full training split returned by the first train_test_split call
X_train_full.shape

(8000, 100)