# This notebook shows how SuloClassifier beats both a single model and a Voting classifier, due to its superior design

### We are going to test it on a large synthetic dataset, using the example code provided at:
[https://machinelearningmastery.com/weighted-average-ensemble-with-python/](https://machinelearningmastery.com/weighted-average-ensemble-with-python/)
Thanks to Jason Brownlee for his Machine Learning Mastery blogs. He is absolutely great!

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from featurewiz import SuloClassifier

Imported version = 0.1.60.
from featurewiz import FeatureWiz
wiz = FeatureWiz(verbose=1)
X_train_selected = wiz.fit_transform(X_train, y_train)
X_test_selected = wiz.transform(X_test)
wiz.features  ### provides a list of selected features ###
                                


In [6]:
# evaluate a weighted average ensemble for classification compared to base model
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from lightgbm import LGBMClassifier

# base learners used for the standalone and voting comparisons
# the LightGBM estimator is created once at module level, so every list
# returned by get_models() refers to this same object
lgbm = LGBMClassifier(random_state=0, n_estimators=100)
def get_models():
	"""Return the base learners as a list of (name, estimator) tuples.

	The logistic-regression and Gaussian naive-Bayes estimators are created
	anew on each call; the 'LGBM' entry is always the module-level ``lgbm``
	instance.
	"""
	return [
		('lr', LogisticRegression()),
		('LGBM', lgbm),
		('bayes', GaussianNB()),
	]
 
# evaluate each base model
def evaluate_models(models, X_train, X_val, y_train, y_val):
	"""Fit every model on the training data and score it on the validation data.

	Parameters
	----------
	models : iterable of (name, estimator) pairs; only the estimator is used.
	X_train, y_train : data passed to each estimator's ``fit``.
	X_val, y_val : data used for ``predict`` and for the accuracy comparison.

	Returns
	-------
	list
		One ``accuracy_score`` value per model, in the order of ``models``.

	Each estimator is fitted in place, so the objects held in ``models`` are
	left trained on ``X_train`` / ``y_train`` after the call.
	"""
	def _fit_and_score(estimator):
		# train first, then compare validation predictions with the truth
		estimator.fit(X_train, y_train)
		return accuracy_score(y_val, estimator.predict(X_val))

	return [_fit_and_score(estimator) for _, estimator in models]
 
# ---------------------------------------------------------------------------
# Experiment: compare SuloClassifier against each standalone base model and
# against an equally weighted soft-voting ensemble, all scored on one test set.
# ---------------------------------------------------------------------------
# define a synthetic binary-classification dataset: 100,000 rows, 50 features
# (40 informative, 5 redundant), with a fixed seed so the data is reproducible
X, y = make_classification(n_samples=100000, n_features=50, n_informative=40, 
                           n_redundant=5, random_state=7)
# split dataset into train and test sets (50% held out for the final test)
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.50, random_state=1)
# split the full train set into train and validation sets (33% for validation)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.33, random_state=1)
# create the base models
models = get_models()
# fit each base model on the train split and score it on the validation split
scores = evaluate_models(models, X_train, X_val, y_train, y_val)
print(scores)
# create the ensemble
# NOTE(review): the validation scores above were the weights of the original
# weighted VotingClassifier (kept commented out below); the SuloClassifier
# call that replaced it does not receive them.
#ensemble = VotingClassifier(estimators=models, voting='soft', weights=scores)
# presumably base_estimator=None / n_estimators=None let SuloClassifier pick
# its own base estimator and ensemble size -- confirm against featurewiz docs
ensemble = SuloClassifier(base_estimator=None, n_estimators=None)
# fit the ensemble on the full training dataset (train + validation rows)
ensemble.fit(X_train_full, y_train_full)
# make predictions on test set
yhat = ensemble.predict(X_test)
# evaluate predictions
score = accuracy_score(y_test, yhat)
# NOTE(review): this label is left over from the weighted-average example;
# the number printed here is the SuloClassifier test accuracy.
print('Weighted Avg Accuracy: %.3f' % (score*100))
# evaluate each standalone model: the same estimator objects are refitted on
# the full training set and scored on the test set; `scores` is rebound here
scores = evaluate_models(models, X_train_full, X_test, y_train_full, y_test)
for i in range(len(models)):
	print('>%s: %.3f' % (models[i][0], scores[i]*100))
# evaluate equal weighting: soft voting over the base models, no weights given
# (`ensemble`, `yhat` and `score` are rebound, replacing the SuloClassifier results)
ensemble = VotingClassifier(estimators=models, voting='soft')
ensemble.fit(X_train_full, y_train_full)
yhat = ensemble.predict(X_test)
score = accuracy_score(y_test, yhat)
print('Voting Accuracy: %.3f' % (score*100))

[0.8548484848484849, 0.9620606060606061, 0.8383030303030303]
Class weights used in classifier are: {0: 1, 1: 1}
Number of estimators = 4
Finding best params for base estimator using random search...
    best score is : 0.9830395311985007
    best estimator is : LGBMClassifier(boosting_type='goss', is_unbalance=True,
               learning_rate=0.5434049417909654, max_depth=10, metric='auc',
               n_estimators=200, num_class=1, num_leaves=398,
               objective='binary', scale_pos_weight=None)
    best Params is : {'learning_rate': 0.5434049417909654, 'n_estimators': 200, 'num_leaves': 398}
Time Taken for random search: 52 (seconds)
    base estimator = LGBMClassifier(boosting_type='goss', is_unbalance=True,
               learning_rate=0.5434049417909654, max_depth=10, metric='auc',
               n_estimators=200, num_class=1, num_leaves=398,
               objective='binary', scale_pos_weight=None)
    Fold 1: OOF Score: 98%
    Fold 2: OOF Score: 98%
    Fold 3: OOF