__Test the StackingClassifier model using the following protocol:__
1. Use the breast-bin.csv dataset 
2. Split the data into train and test sets 
3. Create a KNNClassifier model 
4. Create a LogisticRegression model
5. Create a DecisionTree model 
6. Create a second KNNClassifier model (final model) 
7. Create a StackingClassifier model using the previous classifiers. The second KNNClassifier model must be used as the final model.
8. Train the StackingClassifier model. What is the score of the model on the test set?

In [1]:
import os

from sklearn.preprocessing import StandardScaler

from si.ensemble.stacking_classifier import StackingClassifier
from si.IO.CSV import read_csv
from si.model_selection.cross_validate import k_fold_cross_validation
from si.model_selection.grid_search import grid_search_cv
from si.model_selection.split import train_test_split
from si.models.decision_tree_classifier import DecisionTreeClassifier
from si.models.knn_classifier import KNNClassifier
from si.models.logistic_regression import LogisticRegression
from si.statistics.euclidean_distance import euclidean_distance
# Dataset root: taken from the SIB_DATASETS_DIR environment variable when it is set;
# otherwise the original local checkout path is used so existing setups keep working.
DATASETS_DIR = os.environ.get("SIB_DATASETS_DIR", r'C:\Users\bruna\PycharmProjects\SIB\datasets')

# Load the breast-bin dataset (comma-separated, with a header row of feature names and a label column).
breast_bin = read_csv(os.path.join(DATASETS_DIR, 'breast_bin', 'breast-bin.csv'), sep=",", label=True, features=True)

In [9]:
# Standardize the feature matrix: fit a scaler on breast_bin.X and overwrite it with the scaled values.
# NOTE(review): the scaler is fitted on the full dataset before the train/test split further down,
# so test-set statistics influence the transform — consider fitting on the training split only.
feature_scaler = StandardScaler()
breast_bin.X = feature_scaler.fit_transform(breast_bin.X)
breast_bin.X

array([[ 0.20885295, -0.69912815, -0.74242297, ..., -1.0000359 ,
        -0.61132565, -0.34418721],
       [-0.8578253 , -0.69912815, -0.74242297, ..., -0.58991542,
        -0.61132565, -0.34418721],
       [-0.8578253 , -0.69912815, -0.74242297, ..., -0.17979494,
        -0.61132565, -0.34418721],
       ...,
       [ 0.20885295, -0.37139715, -0.40592217, ..., -1.0000359 ,
        -0.61132565,  0.23892607],
       [-0.8578253 , -0.04366616, -0.40592217, ..., -0.17979494,
        -0.61132565, -0.34418721],
       [ 0.91997179,  0.93952681,  0.94008103, ...,  1.46068699,
        -0.61132565, -0.34418721]])

In [10]:
# Split the dataset into a training set and a held-out test set.
# NOTE(review): no split size or seed is passed, so this relies on train_test_split's
# defaults — confirm they make the split deterministic.
train_dataset, test_dataset = train_test_split(breast_bin)
# Display the labels of the test set.
test_dataset.y

array([0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 1., 1., 0., 0., 0.,
       0., 1., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0.,
       0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 1., 1., 0., 0., 1., 1.,
       0., 1., 0., 0., 1., 0., 0., 1., 0., 1., 1., 1., 0., 0., 1., 0., 0.,
       1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0.,
       1., 1., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 1., 1., 0.,
       1., 1., 0., 1., 0., 0., 0., 1., 1., 1., 0., 0., 0., 1., 0., 1., 1.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1.,
       0., 0., 1.])

In [18]:
# Base learners for the ensemble (KNN, logistic regression, decision tree),
# plus a second KNN that is used as the final model of the stack.
knn = KNNClassifier(k=3)
lg_model = LogisticRegression(
    l2_penalty=1,
    alpha=0.001,
    max_iter=1000,
)
dt_model = DecisionTreeClassifier()
final_model = KNNClassifier(
    k=2,
    distance=euclidean_distance,
)

In [19]:
# Initialize the stacking classifier: combine the three base classifiers
# and pass the second KNN as the final model, as the protocol requires.
stacking = StackingClassifier([knn, lg_model, dt_model], final_model)

In [20]:
# Train the stacking ensemble on the training split.
stacking.fit(train_dataset)

<si.ensemble.stacking_classifier.StackingClassifier at 0x1d71b24fb90>

In [21]:
# Predict labels for the test split (compare with test_dataset.y displayed earlier).
stacking.predict(test_dataset)

array([0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 1., 1., 0., 0., 0.,
       0., 1., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0.,
       0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 1.,
       0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 1., 1., 0., 0., 1., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0.,
       1., 1., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 1., 1., 0.,
       1., 1., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 1., 0., 1., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1.,
       0., 0., 1.])

In [25]:
# Evaluate the fitted ensemble on the test split and report the resulting score.
score = stacking.score(test_dataset)
print(f"Score of the StackingClassifier on the test set: {score}")

Score of the StackingClassifier on the test set: 0.9640287769784173


__Test the grid_search_cv function using the following protocol:__
1. Use the breast-bin.csv dataset 
2. Create a LogisticRegression model
3. Perform grid search using the following hyperparameters:
   - l2_penalty: 1, 10
   - alpha: 0.001, 0.0001
   - max_iter: 1000, 2000
4. Use 3 folds for the cross validation. 
5. Which scores do you obtain? What are the best score and best hyperparameters?

In [2]:
# Baseline: 5-fold cross-validation of a LogisticRegression built with no explicit hyperparameters.
# NOTE(review): the execution count (In [2]) shows this cell ran before the standardization
# cell (In [9]); on a top-to-bottom re-run breast_bin.X is already standardized at this point,
# so the scores may differ from the recorded output.
LG = LogisticRegression()
scores = k_fold_cross_validation(LG, breast_bin, cv=5)
# Display the per-fold scores (`scores` is reassigned by the grid-search cell that follows).
scores

[0.6330935251798561,
 0.6474820143884892,
 0.6762589928057554,
 0.7266187050359713,
 0.5971223021582733]

In [3]:
# Fresh model instance to be tuned by the grid search.
LG = LogisticRegression()

# Hyperparameter grid: 2 x 3 x 6 = 36 combinations.
# NOTE(review): this is wider than the protocol stated above, which lists only
# alpha in {0.001, 0.0001} and max_iter in {1000, 2000}.
parameter_grid = {
    'l2_penalty': (1, 10),
    'alpha': (1e-3, 1e-4, 1e-5),
    'max_iter': tuple(range(1000, 7000, 1000)),
}

# Grid search with 3-fold cross-validation over every combination in the grid.
scores = grid_search_cv(LG, breast_bin, hyperparameter_grid=parameter_grid, cv=3)

# Display the full result returned by the grid search.
scores

{'scores': [0.6551724137931035,
  0.6566091954022989,
  0.6580459770114943,
  0.6551724137931035,
  0.6551724137931034,
  0.6551724137931035,
  0.6551724137931035,
  0.6566091954022988,
  0.6566091954022988,
  0.6566091954022989,
  0.6566091954022989,
  0.6566091954022989,
  0.6566091954022989,
  0.6566091954022989,
  0.6551724137931035,
  0.6551724137931035,
  0.6551724137931035,
  0.6551724137931035,
  0.6551724137931035,
  0.6566091954022989,
  0.6566091954022989,
  0.6566091954022988,
  0.6551724137931034,
  0.6551724137931035,
  0.6551724137931035,
  0.6580459770114943,
  0.6566091954022989,
  0.6566091954022988,
  0.6566091954022988,
  0.6566091954022989,
  0.6551724137931034,
  0.6566091954022989,
  0.6566091954022989,
  0.6580459770114943,
  0.6551724137931035,
  0.6566091954022989],
 'hyperparameters': [{'l2_penalty': 1, 'alpha': 0.001, 'max_iter': 1000},
  {'l2_penalty': 1, 'alpha': 0.001, 'max_iter': 2000},
  {'l2_penalty': 1, 'alpha': 0.001, 'max_iter': 3000},
  {'l2_penalt

In [4]:
# Show only the list of scores from the grid-search result (one entry per evaluated combination).
scores['scores']

[0.6551724137931035,
 0.6566091954022989,
 0.6580459770114943,
 0.6551724137931035,
 0.6551724137931034,
 0.6551724137931035,
 0.6551724137931035,
 0.6566091954022988,
 0.6566091954022988,
 0.6566091954022989,
 0.6566091954022989,
 0.6566091954022989,
 0.6566091954022989,
 0.6566091954022989,
 0.6551724137931035,
 0.6551724137931035,
 0.6551724137931035,
 0.6551724137931035,
 0.6551724137931035,
 0.6566091954022989,
 0.6566091954022989,
 0.6566091954022988,
 0.6551724137931034,
 0.6551724137931035,
 0.6551724137931035,
 0.6580459770114943,
 0.6566091954022989,
 0.6566091954022988,
 0.6566091954022988,
 0.6566091954022989,
 0.6551724137931034,
 0.6566091954022989,
 0.6566091954022989,
 0.6580459770114943,
 0.6551724137931035,
 0.6566091954022989]