# Iris dataset

In [93]:
import pandas as pd 
import numpy as np 
np.random.seed = 2021
from sklearn.datasets import load_iris

iris = load_iris()
X, y, labels, feature_names  = iris.data, iris.target, iris.target_names, iris['feature_names']
df_iris= pd.DataFrame(X, columns= feature_names) 
df_iris['label'] =  y
features_dict = {k:v for k,v in  enumerate(labels)}
df_iris['label_names'] = df_iris.label.apply(lambda x: features_dict[x])

df_iris


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label,label_names
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2,virginica
146,6.3,2.5,5.0,1.9,2,virginica
147,6.5,3.0,5.2,2.0,2,virginica
148,6.2,3.4,5.4,2.3,2,virginica


# Spliting to traing and test

In [95]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df_iris)

X_train = df_train[['sepal length (cm)','sepal width (cm)','petal length (cm)','petal width (cm)']]
y_train = df_train['label']

X_test = df_test[['sepal length (cm)','sepal width (cm)','petal length (cm)','petal width (cm)']]
y_test = df_test['label']

#Scaling
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)


# KNN 

In [97]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# Крос-валідація
k_values = range(1, 20)
cv_scores = []

for k in k_values:
    сlassifier = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(сlassifier, X_train, y_train, cv=5, scoring='accuracy')  # 5 кратна крос-валідація
    cv_scores.append(scores.mean())
    
k_best = k_values[np.argmax(cv_scores)]
сlassifier = KNeighborsClassifier(n_neighbors=k_best)
сlassifier.fit(X_train, y_train)
score_best = сlassifier.score(X_test, y_test)

print ('The best k = {} , score = {}'.format(k_best, score_best ))

The best k = 12 , score = 0.9473684210526315


# Synthetic dataset

In [99]:
from sklearn.datasets import make_blobs
np.random.seed= 2021
X_D2, y_D2 = make_blobs(n_samples = 300, n_features = 2, centers = 8,
                       cluster_std = 1.3, random_state = 4)
y_D2 = y_D2 % 2

# Preparing data

In [101]:
X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2)

from sklearn.preprocessing import MinMaxScaler 
scaler= MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# KNN regressor

In [103]:
from sklearn.neighbors import KNeighborsRegressor
k_values = range(1, 20)
cv_scores = []

for k in k_values:
    regressor = KNeighborsRegressor(n_neighbors=k)
    scores = cross_val_score(regressor, X_train, y_train, cv=20, scoring='r2')
    cv_scores.append(scores.mean())
    
k_best = k_values[np.argmax(cv_scores)]
regressor = KNeighborsRegressor(n_neighbors=k_best)
regressor.fit(X_train, y_train)
score_best = regressor.score(X_test, y_test)

print ('The best k = {} , score = {}'.format(k_best,score_best ))

The best k = 13 , score = 0.7777908709082798


# Visualization

In [105]:
import ML_mst as mst

mst.plot_decision_boundary(regressor, X_train, y_train, X_test=X_test, y_test= y_test,
                               title='KNN regrestion K= {}, score = {:.2f}'.format(k_best, score_best))


Call prediction for all grid values (precision of drawing = 0.01, you may configure to speed up e.g. precision=0.05)
