In [None]:
%reload_ext nb_black

In [None]:
import warnings

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
def print_vif(x):
    """Print the variance inflation factor (VIF) of each column of ``x``.

    ``sm.add_constant`` is applied to ``x`` first, so any intercept column it
    adds is included in the printed results alongside the original features.

    :param x: input features to check for multicollinearity. This is assumed
        to be a pandas.DataFrame.
    :return: None. The VIFs are printed as a pandas Series indexed by column name.
    """
    # add_constant can trigger a numpy FutureWarning about .ptp; warnings are
    # suppressed for this one call only.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        design = sm.add_constant(x)

    # One VIF per column, in column order.
    design_values = design.values
    vif_by_column = pd.Series(
        [
            variance_inflation_factor(design_values, col_idx)
            for col_idx in range(design.shape[1])
        ],
        index=design.columns,
    )

    print("VIF results\n-------------------------------")
    print(vif_by_column)
    print("-------------------------------\n")

In [None]:
# Names of the input feature columns, in the order they are assigned when the
# raw data file is read.
feat_names = [
    "area",
    "perimeter",
    "compactness",
    "length",
    "width",
    "asymmetry",
    "length_groove",
]

# Kept as a one-element list so it can be combined with feat_names below.
target_name = ["species"]

# Full set of column names: every feature followed by the target.
col_names = [*feat_names, *target_name]

In [None]:
# Source of the data (more information about it is available on this page):
# https://archive.ics.uci.edu/ml/datasets/seeds
data_path = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/00236/seeds_dataset.txt"
)

# No header row is read from the file (header=None); the column names come
# from col_names instead. The separator is a regular expression, which is why
# the python parsing engine is requested explicitly.
seeds = pd.read_csv(
    data_path,
    sep="\t+",
    header=None,
    names=col_names,
    engine="python",
)
seeds.head()

* The `species` column maps to the names: `{1: 'Kama', 2: 'Rosa', 3: 'Canadian'}`
* Replace the numbers in the `species` column with the species names

## Viz

* Create a pair plot of the data colored by species.
* Interpret what you see
    * Do we seem to have predictive features to separate the classes?
    * Which classes do you expect to be the most likely to be confused? Which are the least likely to be confused?

* Create a heatmap of the correlations between the features

* Assess multicollinearity
    * Even though we aren't drawing a line, highly correlated features aren't good for KNN: with 2 redundant features, we would be double counting the same distance between neighbors

## Model prep

* Prep the data for a KNN model to be fit

* Fit a model using `KNeighborsClassifier()`
    * Identify the best values of the `n_neighbors` and `weights` hyperparameters using `GridSearchCV()`

* What are the best parameters?

* How is the model performing?