In [5]:

# evaluate a weighted average ensemble for classification compared to base model
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
import numpy as np
import pandas as pd
# get a list of base models
def get_models():
    models = list()
    models.append(('lr', LogisticRegression()))
    models.append(('cart', DecisionTreeClassifier()))
    models.append(('bayes', GaussianNB()))
    return models
# evaluate each base model
def evaluate_models(models, X_train, X_val, y_train, y_val):
    # fit and evaluate the models
    scores = list()
    for name, model in models:
        # fit the model
        model.fit(X_train, y_train)
        # evaluate the model
        yhat = model.predict(X_val)
        acc = accuracy_score(y_val, yhat)
        # store the performance
        scores.append(acc)
        # report model performance
    return scores
# generate grid for weights
def generate_combinations(num_dimensions, step):
    # Define the ranges for each dimension (0 to 1 in steps of 'step')
    ranges = np.arange(0, 1 + step, step)
    # Create a meshgrid of all combinations
    meshgrid = np.meshgrid(*[ranges] * num_dimensions)
    # Stack the grid points into a single array
    combinations = np.vstack([x.flatten() for x in meshgrid]).T
    # Remove the combination (0, 0, 0)
    combinations = combinations[~np.all(combinations == 0, axis=1)]
    # Filter out combinations where the sum is not exactly 1
    valid_combinations = []
    for combo in combinations:
        if np.isclose(np.sum(combo), 1.0):
            valid_combinations.append(combo)
    
    return np.array(valid_combinations)
# for more granular search, make the step parameter closer
combinations = generate_combinations(num_dimensions=3, step=0.1)
# define dataset
X, y = make_classification(n_samples=10000, n_features=20, n_informative=15, n_redundant=5, random_state=7)
# split dataset into train and test sets
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.50, random_state=1)
# split the full train set into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.33, random_state=1)
# create the base models
models = get_models()
# fit and evaluate each model
scores = evaluate_models(models, X_train, X_val, y_train, y_val)
print(scores)


[0.8896969696969697, 0.8648484848484849, 0.8812121212121212]


In [6]:
# Grid Searching different model weights
ensemble_score = []
i = 1
for i, comb in enumerate(combinations, start=1):
    print(f"Processing combination {i}/{len(combinations)}", end="\r")
    weights = comb
    ensemble = VotingClassifier(estimators=models, voting='soft', weights=weights)
    # fit the ensemble on the training dataset
    ensemble.fit(X_train_full, y_train_full)
    # make predictions on test set
    y_pred = ensemble.predict(X_test)
    # evaluate predictions
    score = accuracy_score(y_test, y_pred)   
    ensemble_score.append(score)
df_scores = pd.DataFrame({'name': list(combinations), 'score': ensemble_score,})
print(df_scores.sort_values(by='score', ascending=False).iloc[0])

name     [0.4, 0.4, 0.2]6/66
score             0.9092
Name: 42, dtype: object


In [7]:
# evaluate each standalone model
scores = evaluate_models(models, X_train_full, X_test, y_train_full, y_test)
for i in range(len(models)):
    print('>%s: %.3f' % (models[i][0], scores[i]*100))
# evaluate equal weighting
ensemble = VotingClassifier(estimators=models, voting='soft')
ensemble.fit(X_train_full, y_train_full)
y_pred = ensemble.predict(X_test)
score = accuracy_score(y_test, y_pred)
print('Voting Accuracy: %.3f' % (score*100))

>lr: 87.800
>cart: 88.700
>bayes: 87.300
Voting Accuracy: 90.720
