# Meta Learning Algorithms

In [8]:
import time
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import sklearn.datasets as ds
import sklearn.model_selection as cv
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, cross_validate
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    BaggingClassifier,
    ExtraTreesClassifier,
    RandomForestClassifier,
    VotingClassifier,    
    GradientBoostingClassifier
)
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

import warnings
warnings.filterwarnings("ignore", category = DeprecationWarning)  # To ignore warnings due to deprecated functions

%matplotlib inline

In [9]:
#LOAD DE DATASET
smoking = pd.read_csv(
    "./Dataset/Standardized_Smoking.csv",  # We can select a local file or pass an url
    sep = ',', 
)
X = smoking.iloc[:, :-1]
y = smoking.loc[:, "smoking"]
undersampler = RandomUnderSampler(sampling_strategy="majority", random_state=1)
print(f"We have {smoking.shape[0]} samples and {smoking.shape[1]} features.")  # Shape returns (nº samples, nº features)
print(X)
print(y)

We have 55494 samples and 25 features.
         gender       age    height    weight     waist    sightL    sightR  \
0     -1.319926 -0.344157 -1.050078 -0.457567 -0.079491  0.618616  0.021820   
1     -1.319926 -0.344157 -0.506364 -0.457567 -0.111843 -0.607480 -1.210286   
2      0.757618  0.900109  0.581062 -0.457567 -0.219682 -0.607480 -0.594233   
3      0.757618 -0.344157  0.037349  0.322412  0.643036  1.538188  1.561952   
4     -1.319926 -0.344157 -1.050078 -0.457567  0.427356  0.005568  0.021820   
...         ...       ...       ...       ...       ...       ...       ...   
55489 -1.319926 -0.344157  0.581062 -0.067577 -0.758881 -0.300956 -0.286207   
55490 -1.319926  0.070598 -0.506364 -1.237546 -1.298080  0.618616  0.637872   
55491 -1.319926  0.900109 -0.506364 -1.237546 -1.459840  0.005568  0.637872   
55492  0.757618  1.314865  0.037349 -0.457567 -0.435362 -0.607480  0.021820   
55493  0.757618  0.900109 -0.506364 -0.067577  0.319516 -0.300956 -0.902260   

         hea

In [10]:
# Define a function to measure execution's time. It will be use as a 'decorator'
# The idea behind this is just to use it to estimate the time it will take for the
# full code to run if you want to know before training with the complete dataset
# using a subset of size known in relationship with the full dataset.
def compute_executions_time(function):
    def wrapper():
        start_time = time.time()  # init measuring time
        function()  # execute function
        print(f"\n{time.time() - start_time} seconds")  # print execution time
    return wrapper

# Majority Voting

In [25]:
@compute_executions_time
def execute_voting_scheme_different_estimators_grid_search_and_cv(cv=10):
    naive_bayes = GaussianNB()

    clf2 = KNeighborsClassifier(
        n_neighbors = 27,
        weights = 'distance'
    )
    clf3 = DecisionTreeClassifier(
    criterion="entropy",
    min_samples_split=2,
    min_impurity_decrease=0.0,
    min_samples_leaf=1,
    max_depth=42
)
    
    for clf, label in zip([naive_bayes, clf2, clf3], ["Naive Bayes","Knn", "Dec. Tree", ]):
        pipeline = Pipeline(steps=[('undersampler', undersampler), ('model', clf)])
        scores = cross_validate(
            pipeline, 
            X, 
            y, 
            cv = cv, 
            scoring=["f1", "recall", "precision"]
        )
        mean_f1 = scores['test_f1'].mean()
        mean_recall = scores['test_recall'].mean()
        mean_precision = scores['test_precision'].mean()
        
        print(f"Algorithm:={label}")
        print(f"Mean F1 Score: {mean_f1:.3f}")
        print(f"Mean Recall: {mean_recall:.3f}")
        print(f"Mean Precision: {mean_precision:.3f}")
        print("-" * 30)
    
    voting_classifier = VotingClassifier(
        estimators=[
            ("nb", naive_bayes),
            ("knn", clf2),
            ("dt", clf3)
        ],
        voting="hard"
    )
    pipeline = Pipeline(steps=[('undersampler', undersampler), ('model', voting_classifier)])
    scores = cross_validate(
        pipeline,
        X,
        y,
        cv = cv,
        scoring=["f1", "recall", "precision"]
    )
    mean_f1 = scores['test_f1'].mean()
    mean_recall = scores['test_recall'].mean()
    mean_precision = scores['test_precision'].mean()
    
    print(f"Algorithm:= Majority Voting HARD")
    print(f"Mean F1 Score: {mean_f1:.3f}")
    print(f"Mean Recall: {mean_recall:.3f}")
    print(f"Mean Precision: {mean_precision:.3f}")
    print("-" * 30)

    voting_classifier2 = VotingClassifier(
        estimators=[
            ("nb", naive_bayes),
            ("knn", clf2),
            ("dt", clf3)
        ],
        voting="soft",
        weights=[1,2,2]
    )
    pipeline = Pipeline(steps=[('undersampler', undersampler), ('model', voting_classifier2)])
    scores = cross_validate(
        pipeline,
        X,
        y,
        cv = cv,
        scoring=["f1", "recall", "precision"]
    )
    mean_f1 = scores['test_f1'].mean()
    mean_recall = scores['test_recall'].mean()
    mean_precision = scores['test_precision'].mean()
    
    print(f"Algorithm:= Majority Voting SOFT")
    print(f"Mean F1 Score: {mean_f1:.3f}")
    print(f"Mean Recall: {mean_recall:.3f}")
    print(f"Mean Precision: {mean_precision:.3f}")
    print("-" * 30)



execute_voting_scheme_different_estimators_grid_search_and_cv()

Algorithm:=Naive Bayes
Mean F1 Score: 0.679
Mean Recall: 0.850
Mean Precision: 0.565
------------------------------
Algorithm:=Knn
Mean F1 Score: 0.765
Mean Recall: 0.915
Mean Precision: 0.658
------------------------------
Algorithm:=Dec. Tree
Mean F1 Score: 0.725
Mean Recall: 0.796
Mean Precision: 0.665
------------------------------
Algorithm:= Majority Voting HARD
Mean F1 Score: 0.765
Mean Recall: 0.922
Mean Precision: 0.655
------------------------------
Algorithm:= Majority Voting SOFT
Mean F1 Score: 0.753
Mean Recall: 0.832
Mean Precision: 0.688
------------------------------

17.901648998260498 seconds


# Bagging

## WITH KNN

### MAX FEATURES NOT FIXED

In [26]:
dt = DecisionTreeClassifier(
    criterion="entropy",
    min_samples_split=2,
    min_impurity_decrease=0.0,
    min_samples_leaf=1,
    max_depth=42
)
gb = GaussianNB()
knn = KNeighborsClassifier(
    n_neighbors=27,
    weights='distance'
)
for nest in [1, 2, 5, 10, 20, 50, 100, 200]:
    model = BaggingClassifier(
            estimator = knn,
            n_estimators = nest,
            #max_features=0.35
        )
    pipeline = Pipeline(steps=[('undersampler', undersampler), ('model', model)])
    
    scores = cross_validate(
        pipeline, 
        X, 
        y, 
        cv = 10, 
        scoring=["f1", "recall", "precision"]
    )
    mean_f1 = scores['test_f1'].mean()
    mean_recall = scores['test_recall'].mean()
    mean_precision = scores['test_precision'].mean()
    
    print(f"n_estimators={nest}")
    print(f"Mean F1 Score: {mean_f1:.3f}")
    print(f"Mean Recall: {mean_recall:.3f}")
    print(f"Mean Precision: {mean_precision:.3f}")
    print("-" * 30)

n_estimators=1
Mean F1 Score: 0.732
Mean Recall: 0.869
Mean Precision: 0.632
------------------------------
n_estimators=2
Mean F1 Score: 0.751
Mean Recall: 0.894
Mean Precision: 0.648
------------------------------
n_estimators=5
Mean F1 Score: 0.760
Mean Recall: 0.911
Mean Precision: 0.653
------------------------------
n_estimators=10
Mean F1 Score: 0.763
Mean Recall: 0.914
Mean Precision: 0.655
------------------------------
n_estimators=20
Mean F1 Score: 0.764
Mean Recall: 0.916
Mean Precision: 0.656
------------------------------
n_estimators=50
Mean F1 Score: 0.765
Mean Recall: 0.917
Mean Precision: 0.657
------------------------------
n_estimators=100
Mean F1 Score: 0.765
Mean Recall: 0.917
Mean Precision: 0.657
------------------------------
n_estimators=200
Mean F1 Score: 0.765
Mean Recall: 0.918
Mean Precision: 0.657
------------------------------


### MAX FEATUERS FIXED

In [13]:
dt = DecisionTreeClassifier(
    criterion="entropy",
    min_samples_split=2,
    min_impurity_decrease=0.0,
    min_samples_leaf=1,
    max_depth=42
)
gb = GaussianNB()
knn = KNeighborsClassifier(
    n_neighbors=27,
    weights='distance'
)
for nest in [1, 2, 5, 10, 20, 50, 100, 200]:
    model = BaggingClassifier(
            estimator = knn,
            n_estimators = nest,
            max_features=0.35
        )
    pipeline = Pipeline(steps=[('undersampler', undersampler), ('model', model)])
    
    scores = cross_validate(
        pipeline, 
        X, 
        y, 
        cv = 10, 
        scoring=["f1", "recall", "precision"]
    )
    mean_f1 = scores['test_f1'].mean()
    mean_recall = scores['test_recall'].mean()
    mean_precision = scores['test_precision'].mean()
    
    print(f"n_estimators={nest}")
    print(f"Mean F1 Score: {mean_f1:.3f}")
    print(f"Mean Recall: {mean_recall:.3f}")
    print(f"Mean Precision: {mean_precision:.3f}")
    print("-" * 30)

n_estimators=1
Mean F1 Score: 0.689
Mean Recall: 0.813
Mean Precision: 0.599
------------------------------
n_estimators=2
Mean F1 Score: 0.735
Mean Recall: 0.870
Mean Precision: 0.637
------------------------------
n_estimators=5
Mean F1 Score: 0.757
Mean Recall: 0.915
Mean Precision: 0.647
------------------------------
n_estimators=10
Mean F1 Score: 0.763
Mean Recall: 0.929
Mean Precision: 0.648
------------------------------
n_estimators=20
Mean F1 Score: 0.766
Mean Recall: 0.945
Mean Precision: 0.646
------------------------------
n_estimators=50
Mean F1 Score: 0.765
Mean Recall: 0.946
Mean Precision: 0.643
------------------------------
n_estimators=100
Mean F1 Score: 0.766
Mean Recall: 0.949
Mean Precision: 0.643
------------------------------
n_estimators=200
Mean F1 Score: 0.765
Mean Recall: 0.946
Mean Precision: 0.643
------------------------------


## WITH DECISION TREES


### MAX FEATURES NOT FIXED

In [14]:
dt = DecisionTreeClassifier(
    criterion="entropy",
    min_samples_split=2,
    min_impurity_decrease=0.0,
    min_samples_leaf=1,
    max_depth=42
)
gb = GaussianNB()
knn = KNeighborsClassifier(
    n_neighbors=27,
    weights='distance'
)
for nest in [1, 2, 5, 10, 20, 50, 100, 200]:
    model = BaggingClassifier(
            estimator = dt,
            n_estimators = nest,
            #max_features=0.35
        )
    pipeline = Pipeline(steps=[('undersampler', undersampler), ('model', model)])
    
    scores = cross_validate(
        pipeline, 
        X, 
        y, 
        cv = 10, 
        scoring=["f1", "recall", "precision"]
    )
    mean_f1 = scores['test_f1'].mean()
    mean_recall = scores['test_recall'].mean()
    mean_precision = scores['test_precision'].mean()
    
    print(f"n_estimators={nest}")
    print(f"Mean F1 Score: {mean_f1:.3f}")
    print(f"Mean Recall: {mean_recall:.3f}")
    print(f"Mean Precision: {mean_precision:.3f}")
    print("-" * 30)

n_estimators=1
Mean F1 Score: 0.683
Mean Recall: 0.753
Mean Precision: 0.625
------------------------------
n_estimators=2
Mean F1 Score: 0.645
Mean Recall: 0.594
Mean Precision: 0.708
------------------------------
n_estimators=5
Mean F1 Score: 0.748
Mean Recall: 0.843
Mean Precision: 0.672
------------------------------
n_estimators=10
Mean F1 Score: 0.761
Mean Recall: 0.835
Mean Precision: 0.699
------------------------------
n_estimators=20
Mean F1 Score: 0.774
Mean Recall: 0.877
Mean Precision: 0.692
------------------------------
n_estimators=50
Mean F1 Score: 0.779
Mean Recall: 0.903
Mean Precision: 0.685
------------------------------
n_estimators=100
Mean F1 Score: 0.780
Mean Recall: 0.911
Mean Precision: 0.682
------------------------------
n_estimators=200
Mean F1 Score: 0.780
Mean Recall: 0.915
Mean Precision: 0.680
------------------------------


### MAX FEATURES FIXED

In [15]:
dt = DecisionTreeClassifier(
    criterion="entropy",
    min_samples_split=2,
    min_impurity_decrease=0.0,
    min_samples_leaf=1,
    max_depth=42
)
gb = GaussianNB()
knn = KNeighborsClassifier(
    n_neighbors=27,
    weights='distance'
)
for nest in [1, 2, 5, 10, 20, 50, 100, 200]:
    model = BaggingClassifier(
            estimator = dt,
            n_estimators = nest,
            max_features=0.35
        )
    pipeline = Pipeline(steps=[('undersampler', undersampler), ('model', model)])
    
    scores = cross_validate(
        pipeline, 
        X, 
        y, 
        cv = 10, 
        scoring=["f1", "recall", "precision"]
    )
    mean_f1 = scores['test_f1'].mean()
    mean_recall = scores['test_recall'].mean()
    mean_precision = scores['test_precision'].mean()
    
    print(f"n_estimators={nest}")
    print(f"Mean F1 Score: {mean_f1:.3f}")
    print(f"Mean Recall: {mean_recall:.3f}")
    print(f"Mean Precision: {mean_precision:.3f}")
    print("-" * 30)

n_estimators=1
Mean F1 Score: 0.643
Mean Recall: 0.721
Mean Precision: 0.581
------------------------------
n_estimators=2
Mean F1 Score: 0.596
Mean Recall: 0.536
Mean Precision: 0.675
------------------------------
n_estimators=5
Mean F1 Score: 0.722
Mean Recall: 0.813
Mean Precision: 0.649
------------------------------
n_estimators=10
Mean F1 Score: 0.736
Mean Recall: 0.797
Mean Precision: 0.683
------------------------------
n_estimators=20
Mean F1 Score: 0.757
Mean Recall: 0.854
Mean Precision: 0.680
------------------------------
n_estimators=50
Mean F1 Score: 0.765
Mean Recall: 0.898
Mean Precision: 0.667
------------------------------
n_estimators=100
Mean F1 Score: 0.769
Mean Recall: 0.921
Mean Precision: 0.661
------------------------------
n_estimators=200
Mean F1 Score: 0.769
Mean Recall: 0.929
Mean Precision: 0.657
------------------------------


## WITH NAIVE BAYES

### MAX FEATURES NOT FIXED

In [16]:
dt = DecisionTreeClassifier(
    criterion="entropy",
    min_samples_split=2,
    min_impurity_decrease=0.0,
    min_samples_leaf=1,
    max_depth=42
)
gb = GaussianNB()
knn = KNeighborsClassifier(
    n_neighbors=27,
    weights='distance'
)
for nest in [1, 2, 5, 10, 20, 50, 100, 200]:
    model = BaggingClassifier(
            estimator = gb,
            n_estimators = nest,
            #max_features=0.35
        )
    pipeline = Pipeline(steps=[('undersampler', undersampler), ('model', model)])
    
    scores = cross_validate(
        pipeline, 
        X, 
        y, 
        cv = 10, 
        scoring=["f1", "recall", "precision"]
    )
    mean_f1 = scores['test_f1'].mean()
    mean_recall = scores['test_recall'].mean()
    mean_precision = scores['test_precision'].mean()
    
    print(f"n_estimators={nest}")
    print(f"Mean F1 Score: {mean_f1:.3f}")
    print(f"Mean Recall: {mean_recall:.3f}")
    print(f"Mean Precision: {mean_precision:.3f}")
    print("-" * 30)

n_estimators=1
Mean F1 Score: 0.679
Mean Recall: 0.851
Mean Precision: 0.564
------------------------------
n_estimators=2
Mean F1 Score: 0.676
Mean Recall: 0.840
Mean Precision: 0.566
------------------------------
n_estimators=5
Mean F1 Score: 0.679
Mean Recall: 0.851
Mean Precision: 0.565
------------------------------
n_estimators=10
Mean F1 Score: 0.679
Mean Recall: 0.849
Mean Precision: 0.565
------------------------------
n_estimators=20
Mean F1 Score: 0.678
Mean Recall: 0.847
Mean Precision: 0.565
------------------------------
n_estimators=50
Mean F1 Score: 0.679
Mean Recall: 0.849
Mean Precision: 0.565
------------------------------
n_estimators=100
Mean F1 Score: 0.679
Mean Recall: 0.849
Mean Precision: 0.565
------------------------------
n_estimators=200
Mean F1 Score: 0.679
Mean Recall: 0.849
Mean Precision: 0.566
------------------------------


### MAX FEATURES FIXED

In [17]:
dt = DecisionTreeClassifier(
    criterion="entropy",
    min_samples_split=2,
    min_impurity_decrease=0.0,
    min_samples_leaf=1,
    max_depth=42
)
gb = GaussianNB()
knn = KNeighborsClassifier(
    n_neighbors=27,
    weights='distance'
)
for nest in [1, 2, 5, 10, 20, 50, 100, 200]:
    model = BaggingClassifier(
            estimator = gb,
            n_estimators = nest,
            max_features=0.35
        )
    pipeline = Pipeline(steps=[('undersampler', undersampler), ('model', model)])
    
    scores = cross_validate(
        pipeline, 
        X, 
        y, 
        cv = 10, 
        scoring=["f1", "recall", "precision"]
    )
    mean_f1 = scores['test_f1'].mean()
    mean_recall = scores['test_recall'].mean()
    mean_precision = scores['test_precision'].mean()
    
    print(f"n_estimators={nest}")
    print(f"Mean F1 Score: {mean_f1:.3f}")
    print(f"Mean Recall: {mean_recall:.3f}")
    print(f"Mean Precision: {mean_precision:.3f}")
    print("-" * 30)

n_estimators=1
Mean F1 Score: 0.614
Mean Recall: 0.730
Mean Precision: 0.563
------------------------------
n_estimators=2
Mean F1 Score: 0.667
Mean Recall: 0.824
Mean Precision: 0.563
------------------------------
n_estimators=5
Mean F1 Score: 0.653
Mean Recall: 0.780
Mean Precision: 0.565
------------------------------
n_estimators=10
Mean F1 Score: 0.671
Mean Recall: 0.829
Mean Precision: 0.565
------------------------------
n_estimators=20
Mean F1 Score: 0.679
Mean Recall: 0.854
Mean Precision: 0.565
------------------------------
n_estimators=50
Mean F1 Score: 0.679
Mean Recall: 0.850
Mean Precision: 0.566
------------------------------
n_estimators=100
Mean F1 Score: 0.678
Mean Recall: 0.846
Mean Precision: 0.566
------------------------------
n_estimators=200
Mean F1 Score: 0.678
Mean Recall: 0.844
Mean Precision: 0.566
------------------------------


# Random Forest

### MAX FEATURES NOT FIXED

In [18]:
@compute_executions_time
def execute_random_forest_with_different_estimators_and_cv(cv=10):
    for n_trees in [1, 2, 5, 10, 20, 50, 100, 200]:
        random_forest_classifier = RandomForestClassifier(
            n_estimators = n_trees,
            #max_features=0.35
        )
        pipeline = Pipeline(steps=[('undersampler', undersampler), ('model', random_forest_classifier)])
        scores = cross_validate(
            estimator = pipeline,
            X = X,
            y = y,
            cv=cv,
            scoring=["f1", "recall", "precision"]
        )
        mean_f1 = scores['test_f1'].mean()
        mean_recall = scores['test_recall'].mean()
        mean_precision = scores['test_precision'].mean()
        
        print(f"n_estimators={n_trees}")
        print(f"Mean F1 Score: {mean_f1:.3f}")
        print(f"Mean Recall: {mean_recall:.3f}")
        print(f"Mean Precision: {mean_precision:.3f}")
        print("-" * 30)
        
execute_random_forest_with_different_estimators_and_cv();

n_estimators=1
Mean F1 Score: 0.671
Mean Recall: 0.743
Mean Precision: 0.613
------------------------------
n_estimators=2
Mean F1 Score: 0.639
Mean Recall: 0.584
Mean Precision: 0.706
------------------------------
n_estimators=5
Mean F1 Score: 0.745
Mean Recall: 0.840
Mean Precision: 0.669
------------------------------
n_estimators=10
Mean F1 Score: 0.760
Mean Recall: 0.834
Mean Precision: 0.697
------------------------------
n_estimators=20
Mean F1 Score: 0.772
Mean Recall: 0.881
Mean Precision: 0.688
------------------------------
n_estimators=50
Mean F1 Score: 0.778
Mean Recall: 0.912
Mean Precision: 0.679
------------------------------
n_estimators=100
Mean F1 Score: 0.779
Mean Recall: 0.923
Mean Precision: 0.675
------------------------------
n_estimators=200
Mean F1 Score: 0.779
Mean Recall: 0.927
Mean Precision: 0.673
------------------------------

181.67126631736755 seconds


### MAX FEATUERS FIXED

In [19]:
@compute_executions_time
def execute_random_forest_with_different_estimators_and_cv(cv=10):
    for n_trees in [1, 2, 5, 10, 20, 50, 100, 200]:
        random_forest_classifier = RandomForestClassifier(
            n_estimators = n_trees,
            max_features=0.35
        )
        pipeline = Pipeline(steps=[('undersampler', undersampler), ('model', random_forest_classifier)])
        scores = cross_validate(
            estimator = pipeline,
            X = X,
            y = y,
            cv=cv,
            scoring=["f1", "recall", "precision"]
        )
        mean_f1 = scores['test_f1'].mean()
        mean_recall = scores['test_recall'].mean()
        mean_precision = scores['test_precision'].mean()
        
        print(f"n_estimators={n_trees}")
        print(f"Mean F1 Score: {mean_f1:.3f}")
        print(f"Mean Recall: {mean_recall:.3f}")
        print(f"Mean Precision: {mean_precision:.3f}")
        print("-" * 30)
        
execute_random_forest_with_different_estimators_and_cv();

n_estimators=1
Mean F1 Score: 0.679
Mean Recall: 0.751
Mean Precision: 0.620
------------------------------
n_estimators=2
Mean F1 Score: 0.644
Mean Recall: 0.591
Mean Precision: 0.709
------------------------------
n_estimators=5
Mean F1 Score: 0.747
Mean Recall: 0.844
Mean Precision: 0.670
------------------------------
n_estimators=10
Mean F1 Score: 0.760
Mean Recall: 0.831
Mean Precision: 0.700
------------------------------
n_estimators=20
Mean F1 Score: 0.774
Mean Recall: 0.878
Mean Precision: 0.692
------------------------------
n_estimators=50
Mean F1 Score: 0.776
Mean Recall: 0.903
Mean Precision: 0.681
------------------------------
n_estimators=100
Mean F1 Score: 0.778
Mean Recall: 0.915
Mean Precision: 0.678
------------------------------
n_estimators=200
Mean F1 Score: 0.780
Mean Recall: 0.921
Mean Precision: 0.677
------------------------------

319.6738233566284 seconds


# EXTRA TREES

In [20]:
@compute_executions_time
def execute_extra_tree_with_different_estimators_and_cv(cv=10):
    for n_trees in [1, 2, 5, 10, 20, 50, 100, 200]:
        extra_trees = ExtraTreesClassifier(
            n_estimators = n_trees,
            max_features=0.35
        )
        pipeline = Pipeline(steps=[('undersampler', undersampler), ('model', extra_trees)])
        scores = cross_validate(
            estimator = pipeline,
            X = X,
            y = y,
            cv=cv,
            scoring=["f1", "recall", "precision"]
        )
        mean_f1 = scores['test_f1'].mean()
        mean_recall = scores['test_recall'].mean()
        mean_precision = scores['test_precision'].mean()
        
        print(f"n_estimators={n_trees}")
        print(f"Mean F1 Score: {mean_f1:.3f}")
        print(f"Mean Recall: {mean_recall:.3f}")
        print(f"Mean Precision: {mean_precision:.3f}")
        print("-" * 30)
        
execute_extra_tree_with_different_estimators_and_cv();

n_estimators=1
Mean F1 Score: 0.717
Mean Recall: 0.788
Mean Precision: 0.658
------------------------------
n_estimators=2
Mean F1 Score: 0.703
Mean Recall: 0.674
Mean Precision: 0.742
------------------------------
n_estimators=5
Mean F1 Score: 0.755
Mean Recall: 0.846
Mean Precision: 0.682
------------------------------
n_estimators=10
Mean F1 Score: 0.761
Mean Recall: 0.835
Mean Precision: 0.699
------------------------------
n_estimators=20
Mean F1 Score: 0.772
Mean Recall: 0.877
Mean Precision: 0.690
------------------------------
n_estimators=50
Mean F1 Score: 0.776
Mean Recall: 0.905
Mean Precision: 0.680
------------------------------
n_estimators=100
Mean F1 Score: 0.779
Mean Recall: 0.917
Mean Precision: 0.677
------------------------------
n_estimators=200
Mean F1 Score: 0.779
Mean Recall: 0.922
Mean Precision: 0.675
------------------------------

130.08190751075745 seconds


# ADA BOOST

In [21]:
@compute_executions_time
def execute_ada_boost_classifier_for_different_classifiers(cv=10):
    warnings.filterwarnings("ignore", category=FutureWarning)
    for n_estimators in [1, 2, 5, 10, 20, 50, 100, 200]:
        ada_boos_classifier = AdaBoostClassifier(
            n_estimators=n_estimators
        )
        pipeline = Pipeline(steps=[('undersampler', undersampler), ('model', ada_boos_classifier)])
        scores = cross_validate(
            pipeline,
            X,
            y,
            cv = cv,
            scoring=["f1", "recall", "precision"]
        )
        mean_f1 = scores['test_f1'].mean()
        mean_recall = scores['test_recall'].mean()
        mean_precision = scores['test_precision'].mean()
        
        print(f"n_estimators={n_estimators}")
        print(f"Mean F1 Score: {mean_f1:.3f}")
        print(f"Mean Recall: {mean_recall:.3f}")
        print(f"Mean Precision: {mean_precision:.3f}")
        print("-" * 30)

execute_ada_boost_classifier_for_different_classifiers()

n_estimators=1
Mean F1 Score: 0.702
Mean Recall: 0.958
Mean Precision: 0.554
------------------------------
n_estimators=2
Mean F1 Score: 0.702
Mean Recall: 0.958
Mean Precision: 0.554
------------------------------
n_estimators=5
Mean F1 Score: 0.706
Mean Recall: 0.940
Mean Precision: 0.565
------------------------------
n_estimators=10
Mean F1 Score: 0.709
Mean Recall: 0.900
Mean Precision: 0.585
------------------------------
n_estimators=20
Mean F1 Score: 0.712
Mean Recall: 0.880
Mean Precision: 0.598
------------------------------
n_estimators=50
Mean F1 Score: 0.717
Mean Recall: 0.874
Mean Precision: 0.607
------------------------------
n_estimators=100
Mean F1 Score: 0.717
Mean Recall: 0.871
Mean Precision: 0.609
------------------------------
n_estimators=200
Mean F1 Score: 0.717
Mean Recall: 0.870
Mean Precision: 0.610
------------------------------

88.95444440841675 seconds


# Gradient Boosting

In [22]:
@compute_executions_time
def execute_gradient_boosting_classifier_for_different_classifiers(cv=10):
    warnings.filterwarnings("ignore", category=FutureWarning)
    for n_estimators in [1, 2, 5, 10, 20, 50, 100, 200]:
        gradient = GradientBoostingClassifier(
            n_estimators=n_estimators
        )
        pipeline = Pipeline(steps=[('undersampler', undersampler), ('model', gradient)])
        scores = cross_validate(
            pipeline,
            X,
            y,
            cv = cv,
            scoring=["f1", "recall", "precision"]
        )
        mean_f1 = scores['test_f1'].mean()
        mean_recall = scores['test_recall'].mean()
        mean_precision = scores['test_precision'].mean()
        
        print(f"n_estimators={n_estimators}")
        print(f"Mean F1 Score: {mean_f1:.3f}")
        print(f"Mean Recall: {mean_recall:.3f}")
        print(f"Mean Precision: {mean_precision:.3f}")
        print("-" * 30)

execute_gradient_boosting_classifier_for_different_classifiers()

n_estimators=1
Mean F1 Score: 0.704
Mean Recall: 0.949
Mean Precision: 0.559
------------------------------
n_estimators=2
Mean F1 Score: 0.704
Mean Recall: 0.950
Mean Precision: 0.559
------------------------------
n_estimators=5
Mean F1 Score: 0.705
Mean Recall: 0.953
Mean Precision: 0.559
------------------------------
n_estimators=10
Mean F1 Score: 0.705
Mean Recall: 0.954
Mean Precision: 0.559
------------------------------
n_estimators=20
Mean F1 Score: 0.705
Mean Recall: 0.951
Mean Precision: 0.560
------------------------------
n_estimators=50
Mean F1 Score: 0.715
Mean Recall: 0.919
Mean Precision: 0.585
------------------------------
n_estimators=100
Mean F1 Score: 0.718
Mean Recall: 0.897
Mean Precision: 0.599
------------------------------
n_estimators=200
Mean F1 Score: 0.720
Mean Recall: 0.884
Mean Precision: 0.608
------------------------------

207.97303462028503 seconds
