__Test the random forest class using the following protocol:__ 
1. Use the iris.csv dataset (note: the code cell below actually loads breast-bin.csv — confirm which dataset is intended) 
2. Split the data into train and test sets 
3. Create the RandomForestClassifier model 
4. Train the model. What is the score of the model on the test set?

In [1]:
import os

import numpy as np
import pandas as pd
from si.Data.dataset import Dataset
from si.IO.CSV import read_csv
from si.model_selection.split import train_test_split
from si.models.random_forest_classifier import RandomForestClassifier

# Root folder that holds the course datasets. Set the SIB_DATASETS_DIR environment
# variable to run this notebook on another machine; the fallback is the original
# author's local checkout, so existing behaviour is unchanged there.
DATASETS_DIR = os.environ.get('SIB_DATASETS_DIR', r'C:\Users\bruna\PycharmProjects\SIB\datasets')

# Load breast-bin.csv into the notebook-wide `breast_bin` dataset.
# label=True / features=True are forwarded to read_csv as in the original call
# (presumably: the file has a header row and a label column — confirm in si.IO.CSV).
breast_bin = read_csv(os.path.join(DATASETS_DIR, 'breast_bin', 'breast-bin.csv'), sep=",", label=True, features=True)

In [2]:
# Hold out 33% of the samples for testing; the fixed random_state keeps the split repeatable.
train, test = train_test_split(
    breast_bin,
    test_size=0.33,
    random_state=42,
)

# Forest configuration, one hyperparameter per line so each is easy to tune.
model = RandomForestClassifier(
    n_estimators=100,
    max_features=None,
    min_sample_split=2,
    max_depth=10,
    mode='gini',
    seed=42,
)

# Last expression of the cell, so the value returned by fit is what the cell displays.
model.fit(train)


<si.models.random_forest_classifier.RandomForestClassifier at 0x1d7168d0650>

In [3]:
# Evaluate the fitted forest on the held-out split and report the result.
# NOTE(review): the recorded output of this cell is
#   ValueError: Number of features must match the number of columns in X
# so no score was actually printed. Nothing in this notebook raises that message;
# presumably it comes from inside the si package — TODO trace and fix it there.
score = model.score(test)

print(f"Accuracy on the test set: {score:.2f}")

ValueError: Number of features must match the number of columns in X

__Test the StackingClassifier model using the following protocol:__
1. Use the breast-bin.csv dataset 
2. Split the data into train and test sets 
3. Create a KNNClassifier model 
4. Create a LogisticRegression model
5. Create a DecisionTree model 
6. Create a second KNNClassifier model (final model) 
7. Create a StackingClassifier model using the previous classifiers. The second KNNClassifier model must be used as the final model.
8. Train the StackingClassifier model. What is the score of the model on the test set?

In [16]:
from si.models.knn_classifier import KNNClassifier
from sklearn.preprocessing import StandardScaler
from si.models.logistic_regression import LogisticRegression
from si.ensemble.stacking_classifier import StackingClassifier
from si.models.decision_tree_classifier import DecisionTreeClassifier
from si.statistics.euclidean_distance import euclidean_distance

In [9]:
# Standardization: replace the feature matrix with sklearn's StandardScaler output.
# NOTE(review): the scaler is fit on the full dataset and the train/test split only
# happens in the next cell, so test samples contribute to the scaling statistics.
# Consider fitting the scaler on the training split only.
# NOTE(review): this overwrites breast_bin.X in place, so any cell that uses
# breast_bin sees raw or scaled features depending on execution order.
breast_bin.X = StandardScaler().fit_transform(breast_bin.X)
# Display the scaled matrix (numpy abbreviates the repr)
breast_bin.X

array([[ 0.20885295, -0.69912815, -0.74242297, ..., -1.0000359 ,
        -0.61132565, -0.34418721],
       [-0.8578253 , -0.69912815, -0.74242297, ..., -0.58991542,
        -0.61132565, -0.34418721],
       [-0.8578253 , -0.69912815, -0.74242297, ..., -0.17979494,
        -0.61132565, -0.34418721],
       ...,
       [ 0.20885295, -0.37139715, -0.40592217, ..., -1.0000359 ,
        -0.61132565,  0.23892607],
       [-0.8578253 , -0.04366616, -0.40592217, ..., -0.17979494,
        -0.61132565, -0.34418721],
       [ 0.91997179,  0.93952681,  0.94008103, ...,  1.46068699,
        -0.61132565, -0.34418721]])

In [10]:
# Split the dataset into train and test partitions for the stacking experiment.
# NOTE(review): test_size and random_state are left to train_test_split's defaults,
# unlike the explicit values used for the random forest split above — confirm the
# defaults give a reproducible split.
train_dataset, test_dataset = train_test_split(breast_bin)
# Display the true test labels so they can be compared with the predictions below
test_dataset.y

array([0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 1., 1., 0., 0., 0.,
       0., 1., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0.,
       0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 1., 1., 0., 0., 1., 1.,
       0., 1., 0., 0., 1., 0., 0., 1., 0., 1., 1., 1., 0., 0., 1., 0., 0.,
       1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0.,
       1., 1., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 1., 1., 0.,
       1., 1., 0., 1., 0., 0., 0., 1., 1., 1., 0., 0., 0., 1., 0., 1., 1.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1.,
       0., 0., 1.])

In [18]:
# Initialize the three base learners (KNN, logistic regression, decision tree)
# and the final model that is handed to the StackingClassifier.
knn = KNNClassifier(k=3)
lg_model = LogisticRegression(l2_penalty=1, alpha=0.001, max_iter=1000)
dt_model = DecisionTreeClassifier()
# Second KNN, used as the final model; the distance function is passed explicitly here
final_model = KNNClassifier(k=2, distance=euclidean_distance)

In [19]:
# Initialize the stacking classifier: the list holds the three base models and
# the second argument is the second KNN, used as the final model
stacking = StackingClassifier([knn, lg_model, dt_model], final_model)

In [20]:
# Train the ensemble on the training partition; the cell output is the repr of
# the value returned by fit (a StackingClassifier instance)
stacking.fit(train_dataset)

<si.ensemble.stacking_classifier.StackingClassifier at 0x1d71b24fb90>

In [21]:
# Predict labels for the test partition; compare with the test_dataset.y array displayed earlier
stacking.predict(test_dataset)

array([0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 1., 1., 0., 0., 0.,
       0., 1., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0.,
       0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 1.,
       0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 1., 1., 0., 0., 1., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0.,
       1., 1., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 1., 1., 0.,
       1., 1., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 1., 0., 1., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1.,
       0., 0., 1.])

In [25]:
# Score the ensemble on the test partition. Note that this rebinds the notebook-wide
# name `score`, which the random forest section also assigns.
# The recorded value 0.96402877... equals 134/139, which matches the 5 disagreements
# between the displayed predictions and true labels, i.e. it is consistent with accuracy.
score = stacking.score(test_dataset)
print(f"Score of the StackingClassifier on the test set: {score}")

Score of the StackingClassifier on the test set: 0.9640287769784173
