# Model Iris Dataset

Create a k-nearest neighbors classification model from the Iris dataset.

## Acknowledgements and Citations:
Data Creator(s):
- R.A. Fisher

Data Donor(s):
- Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)

Data Source(s):
- scikit-learn: https://scikit-learn.org/stable/datasets/toy_dataset.html#iris-dataset

In [None]:
import pickle

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [None]:
# check out the dvc-versioned dataset into the workspace
# (-f forces the checkout so existing local files are replaced without prompting)
!python -m dvc checkout -f

In [None]:
# load the checked out iris dataset and preview the first rows
iris_parquet_path = "./data/iris.parquet"
df = pd.read_parquet(iris_parquet_path)
df.head()

In [None]:
# print a summary of the dataframe (columns, dtypes, non-null counts)
df.info()

In [None]:
# build the modeling frame: an independent copy of the data
# with the "species" column removed
df_modeling = df.drop(columns=["species"]).copy()
df_modeling

In [None]:
# standardize the four measurement columns with StandardScaler
# NOTE(review): the scaler is fit on the full dataset, i.e. before the
# train/test split, so test rows contribute to the scaling statistics.
feature_columns = [
    "sepal length (cm)",
    "sepal width (cm)",
    "petal length (cm)",
    "petal width (cm)",
]
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X=df_modeling[feature_columns])

# wrap the scaled array in a dataframe (columns become positional: 0..3)
df_scaled = pd.DataFrame(scaled_values)
df_scaled.head()

In [None]:
# hold out 30% of the shuffled rows for testing; the rest is used for training
target_labels = df_modeling["target"]
X_train, X_test, y_train, y_test = train_test_split(
    df_scaled,
    target_labels,
    test_size=0.3,
    shuffle=True,
)
len(X_train)

In [None]:
# experiment with n_neighbors to find a good fit
#
# Fits one distance-weighted KNN classifier per n_neighbors in 2..20, scores
# each on the test split, and keeps the best-scoring candidate whose accuracy
# stays below the cutoff (scores at or above it are treated as too perfect).
knn_p = 2  # Minkowski power parameter shared by every candidate
score_cutoff = 0.97  # candidates scoring at or above this are rejected

selected_n_neighbors = 2
selected_p = knn_p
selected_knn_model = KNeighborsClassifier()  # unfitted placeholder
best_score = 0.0
for i in range(2, 21):

    # create, fit, and score knn classifier with variable n_neighbors
    knn = KNeighborsClassifier(
        n_neighbors=i, p=knn_p, weights="distance", metric="minkowski"
    )
    knn.fit(X_train, y_train)
    score = knn.score(X_test, y_test)

    # if current score is better than our best score,
    # and below the cutoff, store it for use
    if best_score < score < score_cutoff:
        selected_n_neighbors = i
        selected_p = knn_p
        selected_knn_model = knn
        # remember the new best so a later, weaker candidate cannot replace it
        best_score = score

# fail loudly instead of carrying the unfitted placeholder into later cells
if best_score == 0.0:
    raise RuntimeError(
        "no KNN candidate scored above 0.0 and below "
        f"{score_cutoff}; re-run the train/test split or adjust the cutoff"
    )

print("selected_n_neighbors: ", selected_n_neighbors)

In [None]:
# save the selected KNN model to disk as a pickle file
# (the path must match the file that is added to dvc afterwards)
filename = "./data/iris_knn_model.pkl"
with open(filename, "wb") as f:
    pickle.dump(selected_knn_model, f)

In [None]:
# track the saved model file with dvc
!python -m dvc add data/iris_knn_model.pkl

In [None]:
# push the dvc-tracked data (including the newly added model) to remote storage
!python -m dvc push