In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, log_loss
from sklearn.svm import SVC, LinearSVC, NuSVC


In [None]:
# Load the training table (it carries the `species` label column used later)
# and preview its first five rows.
df_train = pd.read_csv(filepath_or_buffer="datasets/feuilles/dataset.csv")
df_train.head(n=5)


In [None]:
# Load the companion test table and preview its first five rows.
# NOTE(review): presumably this file has no `species` column — confirm.
df_test = pd.read_csv(filepath_or_buffer="datasets/feuilles/test.csv")
df_test.head(n=5)


In [None]:
# Missing-value count per column, smallest first (isna is the alias of isnull).
(
    df_train
    .isna()
    .sum()
    .sort_values()
)


In [None]:
# Encode the species names as integer class ids; `le` is kept so the ids can
# be mapped back to names through `le.classes_`.
le = LabelEncoder()
le.fit(df_train["species"])
labels = le.transform(df_train["species"])
labels


In [None]:
# Class names known to the encoder, shown together with how many there are.
classes = [*le.classes_]
(len(classes), classes)


In [None]:
# Remove the columns that must not be fed to the classifiers: the target
# (`species`, already encoded into `labels` above) and the `id` column.
#
# The frames are rebound instead of mutated with inplace=True, and
# errors="ignore" makes the cell idempotent: re-running it after the columns
# are already gone is a no-op instead of raising KeyError.
df_train = df_train.drop(columns=["species", "id"], errors="ignore")
df_test = df_test.drop(columns=["id"], errors="ignore")


In [None]:
# Feature matrix (every remaining column of df_train) and encoded target vector.
X, y = df_train.values, labels


In [None]:
# Single stratified 75/25 hold-out split.
# random_state is pinned so that the train/test partition — and therefore every
# score reported below — is identical after "Restart Kernel -> Run All".
# Without it each run shuffles differently and results are not comparable.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
split.get_n_splits(X, y)


In [None]:
# `split` is configured with n_splits=1, so the generator yields exactly one
# (train, test) pair of index arrays; take it directly instead of looping.
train_index, test_index = next(split.split(X, y))
X_train, y_train = X[train_index], y[train_index]
X_test, y_test = X[test_index], y[test_index]


In [None]:
# Baseline: k-nearest-neighbours with k=5, fitted on the training part and
# scored on the held-out part. `fit` returns the estimator itself.
knc = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

pred = knc.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=pred)
acc


In [None]:
# 5-fold grid search over the neighbour count (2, 4, ..., 14) and the two
# weighting schemes of KNeighborsClassifier.
params = {"n_neighbors": range(2, 15, 2), "weights": ["uniform", "distance"]}
gs = GridSearchCV(KNeighborsClassifier(), param_grid=params, cv=5).fit(X_train, y_train)

# Keep only the summary columns: timing and per-split columns are dropped in
# one call so the displayed table stays readable.
res = pd.DataFrame(gs.cv_results_)
res = res.drop(columns=[col for col in res.columns if "time" in col or "split" in col])
res


In [None]:
# Mean cross-validation score against n_neighbors, one panel per weighting
# scheme found in the grid-search results.
weights = res.param_weights.unique()

# One panel per weighting scheme actually present, rather than a hard-coded 2.
fig, axs = plt.subplots(1, len(weights), sharey=True, figsize=(12, 5))

# fig.axes is always a list, even when plt.subplots returned a single Axes.
for ax, weight in zip(fig.axes, weights):
    rows = res.loc[res.param_weights == weight]
    # Read the x values from the results instead of repeating the grid's
    # range(2, 15, 2), so the plot cannot drift out of sync with the search.
    # Local names are used on purpose: the previous `x` / `y` overwrote the
    # global label vector `y`, breaking any re-run of the split cell.
    neighbors = rows["param_n_neighbors"].astype(int)
    scores = rows["mean_test_score"]
    ax.plot(neighbors, scores)
    ax.set_title(f"scores for {weight} per neighbors")
    ax.set_xlabel("n_neighbors")
    ax.set_ylabel("mean_test_score")


In [None]:
# Parameter combination selected by the KNN grid search fitted above
# (`gs` still refers to the KNeighborsClassifier search at this point).
gs.best_params_


# SVM

In [None]:
# 5-fold grid search for SVC over C = 1..6 and three kernels.
# Note: this rebinds `params`, `gs` and `res` from the KNN search.
params = {"C": range(1, 7), "kernel": ["rbf", "poly", "linear"]}
gs = GridSearchCV(SVC(), param_grid=params, cv=5).fit(X_train, y_train)

# Drop timing and per-split columns in one call; `res` itself stays in grid
# order, only the displayed view below is sorted by rank.
res = pd.DataFrame(gs.cv_results_)
res = res.drop(columns=[col for col in res.columns if "time" in col or "split" in col])
res.sort_values(by="rank_test_score")


In [None]:
# Mean cross-validation score against C, one panel per kernel found in the
# SVM grid-search results.
kernels = res.param_kernel.unique()

fig, axs = plt.subplots(1, len(kernels), sharey=True, figsize=(12, 5))

# fig.axes is always a list, so this also works when only one kernel is
# present (plt.subplots then returns a bare Axes and `axs[i]` would fail).
for ax, kernel in zip(fig.axes, kernels):
    rows = res.loc[res.param_kernel == kernel]
    # Plot against the real C values from the results; the previous code used
    # positional indices 0..n-1, so the x axis was off from the searched C.
    # Local names also avoid overwriting the global label vector `y`.
    c_values = rows["param_C"].astype(float)
    scores = rows["mean_test_score"]
    ax.plot(c_values, scores)
    ax.set_title(f"scores for {kernel}")
    ax.set_xlabel("C")
    ax.set_ylabel("mean_test_score")
