<a href="https://colab.research.google.com/github/AbrahamOtero/MLiB/blob/main/6_MetaModels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Voting

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split

from sklearn.pipeline import make_pipeline

url = 'https://raw.githubusercontent.com/AbrahamOtero/MLiB/main/datasets/diabetes.csv'

diabetes = pd.read_csv(url)

# The features (every column but the last), still unscaled
X_raw = diabetes.iloc[:, :-1]
# The class (last column)
y = diabetes.iloc[:, -1]

# Split BEFORE scaling, so that no statistic computed from the test rows
# can reach the models through the preprocessing step.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X_raw, y, random_state=42)

# The scaler learns mean/std from the training rows only and is then reused,
# unchanged, on the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Standardized copy of the complete data set. It keeps the name X because the
# cross-validation cells further down the notebook read it.
X = StandardScaler().fit_transform(X_raw)

# Ensemble of three different model families trained on the same training rows.
voting_clf = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression(random_state=42)),
        ('rf', RandomForestClassifier(random_state=42)),
        ('svc', SVC(random_state=42))
    ]
)
voting_clf.fit(X_train, y_train)

# dual='auto' avoids the generation of warnings.
# It chooses the algorithm to use for training depending on the size of the data set and the number of features.
# NOTE(review): svm_clf is not referenced again in this cell; kept in case another cell relies on it.
svm_clf = LinearSVC(dual='auto', random_state=42)

# Cross-validate scaler + ensemble as a single pipeline on the raw features:
# the scaler is refitted on the training part of every fold, so the held-out
# part never influences the preprocessing. cross_val_score works on clones,
# so the voting_clf fitted above is left untouched.
cross_val_score(make_pipeline(StandardScaler(), voting_clf), X_raw, y, cv=10, scoring="accuracy")

array([0.72727273, 0.79220779, 0.77922078, 0.7012987 , 0.74025974,
       0.79220779, 0.79220779, 0.83116883, 0.68421053, 0.82894737])

In [None]:
# Test-set accuracy of each individual model inside the fitted voting ensemble.
for label, fitted_model in voting_clf.named_estimators_.items():
    test_accuracy = fitted_model.score(X_test, y_test)
    print(label, "=", test_accuracy)

lr = 0.7291666666666666
rf = 0.734375
svc = 0.7291666666666666


## Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bagging ensemble: 500 decision trees, each one trained on 100 sampled rows,
# using every available CPU core (n_jobs=-1).
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    n_jobs=-1,
    random_state=42,
)

# 10-fold cross-validated accuracy of the ensemble.
cross_val_score(bag_clf, X, y, scoring="accuracy", cv=10)

array([0.76623377, 0.81818182, 0.76623377, 0.68831169, 0.71428571,
       0.79220779, 0.79220779, 0.81818182, 0.72368421, 0.81578947])

### Random Forests

A Random Forest is essentially a bagging ensemble of decision trees, with the added twist that each split only considers a random subset of the features:


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 500 trees, each limited to at most 16 leaves, trained in parallel
# on every available CPU core (n_jobs=-1).
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
    random_state=42,
)

# 10-fold cross-validated accuracy of the forest.
cross_val_score(rnd_clf, X, y, scoring="accuracy", cv=10)

array([0.76623377, 0.76623377, 0.76623377, 0.68831169, 0.7012987 ,
       0.81818182, 0.77922078, 0.83116883, 0.69736842, 0.80263158])

Feature importance

In [None]:
# cross_val_score only fits clones, so the forest itself must be fitted here
# before its feature importances can be read.
rnd_clf.fit(X, y)

# The features are every column of the data set except the last one (the class).
feature_names = diabetes.columns[:-1]
for name, importance in zip(feature_names, rnd_clf.feature_importances_):
    print(round(importance, 2), name)

0.07 Pregnancies
0.38 Glucose
0.04 BloodPressure
0.04 SkinThickness
0.06 Insulin
0.18 BMI
0.08 DiabetesPedigreeFunction
0.15 Age


## Boosting

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost over 30 decision stumps (trees of depth 1) with a learning rate of 0.5.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=30,
    learning_rate=0.5,
    random_state=42,
)

# 10-fold cross-validated accuracy of the boosted ensemble.
cross_val_score(ada_clf, X, y, scoring="accuracy", cv=10)

array([0.72727273, 0.79220779, 0.77922078, 0.66233766, 0.71428571,
       0.76623377, 0.81818182, 0.80519481, 0.73684211, 0.81578947])

### Gradient Boosting

**GradientBoostingClassifier**


In [None]:
from sklearn.ensemble import GradientBoostingClassifier


# Deliberately tiny gradient-boosting model: only 3 trees of depth 2,
# with a learning rate of 1.0.
gbrt = GradientBoostingClassifier(
    max_depth=2,
    n_estimators=3,
    learning_rate=1.0,
    random_state=42,
)

# 10-fold cross-validated accuracy of the boosted trees.
cross_val_score(gbrt, X, y, scoring="accuracy", cv=10)



array([0.71428571, 0.77922078, 0.76623377, 0.63636364, 0.72727273,
       0.76623377, 0.77922078, 0.80519481, 0.69736842, 0.78947368])

## Stacking

In [None]:
from sklearn.ensemble import StackingClassifier

from sklearn.pipeline import make_pipeline

# Stacked ensemble: the three base models feed a random forest that learns how
# to combine them.
stacking_clf = StackingClassifier(
    estimators=[
        ('lr', LogisticRegression(random_state=42)),
        ('rf', RandomForestClassifier(random_state=42)),
        ('svc', SVC(probability=True, random_state=42))
    ],
    final_estimator=RandomForestClassifier(random_state=43),
    cv=5  # number of cross-validation folds
)

# X was standardized with the statistics of the whole data set, so scoring on
# it directly lets the held-out rows of each fold influence the scaling that
# lr and svc depend on. Refitting a StandardScaler inside every fold removes
# that leak: standardizing already-standardized columns again gives the same
# values as standardizing the raw columns on that fold.
# cross_val_score works on clones, so stacking_clf itself stays unfitted.
cross_val_score(make_pipeline(StandardScaler(), stacking_clf), X, y, cv=10, scoring="accuracy")




array([0.72727273, 0.77922078, 0.71428571, 0.66233766, 0.77922078,
       0.77922078, 0.79220779, 0.80519481, 0.73684211, 0.75      ])