# Notebook overview 
The objective of this notebook is to familiarize yourself with the most popular tools used for Machine Learning in Python:

* Numpy
* Pandas
* Sklearn

In [None]:
import numpy as np
import pandas as pd

from collections import Counter

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [None]:
# Fixed seed shared by every randomized step in this notebook so results are reproducible.
# Always pass this value wherever a seed / random_state is requested.
SEED = 2024

# Exploring the IRIS dataset

In [None]:
# Load the Iris dataset. The returned object is accessed like a dictionary in the
# following cells (`.keys()`, `["data"]`, `["target"]`, ...); `build_dataframe`
# is later expected to turn it into a pandas DataFrame.
iris_dataset = load_iris()

In [None]:
# List the top-level entries available on the loaded dataset object.
iris_dataset.keys()

In [None]:
# Show the "data" entry in full (no slicing is applied).
# NOTE(review): presumably the numeric feature matrix, one row per sample — confirm against DESCR.
iris_dataset["data"]

In [None]:
# Show the "target" entry. Per the `build_dataframe` docstring these are encoded
# numbers rather than class names such as `setosa`.
iris_dataset["target"]

In [None]:
# print() keeps the value visible even if it is None (a bare None expression renders no cell output).
# NOTE(review): "frame" is presumably None because load_iris() was called without as_frame=True — confirm.
print(iris_dataset["frame"])

In [None]:
# Show the class labels; the `build_dataframe` docstring mentions `setosa` as one of them.
iris_dataset["target_names"]

In [None]:
# Print the dataset description.
# NOTE(review): presumably a multi-line string, hence print() so line breaks are rendered — confirm.
print(iris_dataset["DESCR"])

In [None]:
# Show the feature names; `build_dataframe` is expected to use them as column names.
iris_dataset["feature_names"]

In [None]:
# NOTE(review): presumably the name of the file the dataset was loaded from — confirm.
iris_dataset["filename"]

In [None]:
def build_dataframe(dataset: dict) -> pd.DataFrame:
    """
    Build a pandas DataFrame from a dataset dictionary such as `iris_dataset`.

    Parameters
    ----------
    dataset : dict
        Dictionary-like object providing these entries:
        - "data": 2-D array of feature values, one row per sample.
        - "feature_names": one column label per feature.
        - "target": integer class code for every sample.
        - "target_names": class labels, indexed by the integer code.

    Returns
    -------
    pd.DataFrame
        One column per feature (named after `feature_names`) plus a final
        column called `target` holding the class *names* (`setosa`, etc.),
        not the encoded numbers.

    Raises
    ------
    KeyError
        If one of the required entries is missing from `dataset`.
    """
    frame = pd.DataFrame(dataset["data"], columns=dataset["feature_names"])

    # Decode all class codes at once: indexing the array of names with the
    # integer codes yields the matching name for every row.
    class_names = np.asarray(dataset["target_names"])
    class_codes = np.asarray(dataset["target"])
    frame["target"] = class_names[class_codes]

    return frame

In [None]:
# Sanity check: build the DataFrame and verify it has 150 rows and 5 columns.
df = build_dataframe(iris_dataset)
assert df.shape == (150, 5)
# Sorted column names and sorted distinct target values; both are printed again in the final cell.
answer_columns =  sorted(df.columns)
answer_unique_targets = sorted(df["target"].unique())

print("Columns", answer_columns)
print("Targets", answer_unique_targets)

# Preparing the dataset for training
Now that we have our dataset (df) ready, we can proceed to prepare it for Machine Learning. For this we will:

* Split it into two sets: training and testing.
* Create a pipeline to normalize our dataset and use SVM for classification.

In [None]:
# Separate the label column from the feature columns WITHOUT mutating `df`.
# `df.pop("target")` would remove the column in place, so running this cell a
# second time would raise a KeyError because "target" is already gone.
y = df["target"].copy()
# `drop` returns a new DataFrame, so X is independent of `df`.
X = df.drop(columns="target")

## Splitting the dataset into train and test

Split the dataset into train and test sets using the function `train_test_split` (remember the seed!).
Make sure that the test set contains 20% of the total rows (see the `test_size` parameter).

In [None]:
# Write your code here

In [None]:
# Verify the split sizes: 120 training rows and 30 test rows, with 4 feature columns.
assert X_train.shape == (120, 4)
assert X_test.shape == (30, 4)
assert y_train.shape == (120,)
assert y_test.shape == (30,)

# Sorted index labels of the test targets, i.e. which rows ended up in the test split.
answer_y_test = sorted(y_test.index)
print("y_test index", answer_y_test)

## Generate Sklearn Pipeline
Before proceeding you should take a closer look at [Sklearn pipelines](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html)

Let's create a pipeline where the first step is a Standard Scaler and the second step is an SVM classifier

In [None]:
""" 
Create a pipeline where the first step is a `StandardScaler` (use the name 'scaler') and the second one 
an SVM classifier `SVC` (use the name 'model' and remember the SEED!)
"""

# Write your code here

In [None]:
# Each entry of `pipe.steps` is indexed as [0] -> step name and [1] -> step object.
# Check the step names first ...
assert pipe.steps[0][0] == "scaler"
assert pipe.steps[1][0] == "model"

# ... then the type of the object held by each step.
assert isinstance(pipe.steps[0][1], StandardScaler)
assert isinstance(pipe.steps[1][1], SVC)

## Training the model
Now it is time to train the model!

In [None]:
"""
Finally, we are ready to train the model. Use the training dataset
to train the model and predict the test dataset using the pipeline.
The predictions for the test dataset should be stored in the variable `y_pred`
Also, calculate the accuracy of the model on both the train and test sets and save them
as `acc_train` and `acc_test`.
"""

# Write your code here

In [None]:
# Compare both accuracies with the reference values; np.allclose tolerates
# floating-point rounding differences.
assert np.allclose(acc_train, 0.9833333333333333)
assert np.allclose(acc_test, 0.9666666666666667)
# Tally how many test samples were predicted for each class.
answer_predictions = Counter(y_pred)

print("Prediction count", answer_predictions)

In [None]:
# Final summary: emit every recorded answer on its own line, in a fixed order.
# print() already converts each argument with str(), so no explicit cast is needed.
print(
    answer_columns,
    answer_predictions,
    answer_y_test,
    answer_unique_targets,
    sep="\n",
)