# K-Nearest Neighbors

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Exploratory Data Analysis and Visualization

In [None]:
# Raw observations: two numeric feature arrays and one binary (0/1) label
# array. All three must have the same length to form the DataFrame below.
first_team = np.array([
    4.3, 2.5, 5.7, 6.1, 7.4, 3.4, 3.1, 6.3, 5.5, 7.7,
    6.6, 6.6, 5.4, 5.2, 8.3, 4.2, 3, 4.9, 8.5, 7.2,
    6.6, 6.3, 3.3, 5.4, 3.4, 9.3, 7.8, 2.4, 6.4, 2.5,
    3.6, 4.6, 3.3, 9.7, 5, 5.7, 6.8, 5.9, 5, 2.5,
    6.2, 6.7, 6.1, 4.6, 5.4, 4.8, 7.8, 7.2, 8.3, 5,
    3.5, 6.1, 6.9, 6.5, 6.2, 9.3, 2.7, 3.9, 8.4, 3.7,
    5, 3.9, 4.1, 6.2, 7.1, 2.6, 3.3, 1.8, 4.1, 6.3,
])
second_team = np.array([
    3.9, 6.3, 3.9, 6.2, 3.4, 7.5, 6.8, 4.9, 8.6, 3.5,
    2.9, 4.5, 7.2, 5.7, 5.4, 9.3, 7.7, 6.8, 5.6, 4.2,
    3.7, 7.7, 7.5, 2.9, 5.3, 6.4, 3.5, 5.6, 6.4, 5,
    6.9, 5.4, 6.5, 4.8, 5.3, 7.2, 3.2, 7.2, 5.3, 4.8,
    6.5, 6.1, 7.7, 7.3, 5.1, 6.5, 2.3, 3.9, 4.1, 4.6,
    7.8, 2.5, 6, 2.5, 6.8, 6.6, 6.2, 7.7, 3.4, 8.6,
    8.4, 8.8, 7.1, 5.1, 4.7, 6.3, 6.2, 5, 4, 7.7,
])
scoring_player = np.array([
    1, 0, 1, 0, 1, 0, 0, 1, 0, 1,
    1, 0, 0, 1, 1, 0, 0, 0, 1, 1,
    0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
    0, 1, 0, 1, 0, 0, 1, 0, 0, 0,
    0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
    0, 1, 0, 1, 0, 1, 0, 0, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
])

# One column per array, in the order features first, label last.
df = pd.DataFrame(
    dict(
        first_team=first_team,
        second_team=second_team,
        scoring_player=scoring_player,
    )
)
# Preview the first rows.
df.head()

In [None]:
# Plot the two feature columns against each other, colouring each point by
# its scoring_player label, to see how separable the two classes are.
sns.scatterplot(data=df, x='first_team', y='second_team', hue='scoring_player')

## Train and Test Data Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate the label column (y) from the remaining feature columns (X).
y = df['scoring_player']
X = df.drop(columns='scoring_player')

In [None]:
# Hold out 30% of the rows as a test set. random_state=42 fixes the shuffle
# so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Scaling Data

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# StandardScaler rescales each feature to zero mean and unit variance once
# fitted; this only creates the (still unfitted) scaler object.
scaler = StandardScaler()

In [None]:
# Learn the scaling parameters from the training split only, then apply that
# same transformation to the test split, so no test-set statistics leak into
# the preprocessing.
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

## Creating the model

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Baseline classifier: each prediction copies the label of the single
# nearest training point (K=1).
model = KNeighborsClassifier(n_neighbors=1)

In [None]:
# Fit the classifier on the scaled training features and their labels.
model.fit(scaled_X_train, y_train)

## Predictions on the test data

In [None]:
# Predict labels for the scaled test features.
y_pred = model.predict(scaled_X_test)

## Inspecting the Test Data

In [None]:
# Put the test features and test labels back side by side (column-wise) in a
# single frame so they can be inspected and plotted together.
full_test = pd.concat([X_test, y_test], axis=1)

In [None]:
# Number of rows in the test split.
len(full_test)

In [None]:
# Same feature-vs-feature scatter as for the full data, restricted to the
# test rows and coloured by their true labels.
sns.scatterplot(data=full_test, x='first_team', y='second_team', hue='scoring_player')

## Model Performance

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

### Accuracy

In [None]:
# Fraction of test labels that were predicted correctly.
accuracy_score(y_test, y_pred)

### Confusion matrix

In [None]:
# Counts of (true class, predicted class) pairs; in scikit-learn's layout
# rows are the true classes and columns are the predicted classes.
confusion_matrix(y_test, y_pred)

### Classification report

In [None]:
# Per-class precision, recall, F1-score and support on the test split.
print(classification_report(y_test, y_pred))

## Methods for choosing K value

### Elbow method

In [None]:
# For every candidate K from 1 to 14, fit a fresh KNN classifier on the
# scaled training data and record its misclassification rate
# (1 - accuracy) on the scaled test data.
# NOTE: K is being compared on the test split here, so the error of the K
# picked this way is an optimistic estimate of generalisation error.
test_error_rates = [
    1 - accuracy_score(
        y_test,
        KNeighborsClassifier(n_neighbors=k)
        .fit(scaled_X_train, y_train)
        .predict(scaled_X_test),
    )
    for k in range(1, 15)
]

In [None]:
# Elbow plot: test error rate as a function of K.
# The x-axis is derived from how many error rates were recorded (K starts
# at 1) instead of repeating the loop's hard-coded range, so the plot stays
# aligned with the data if the range of K values searched is changed.
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(test_error_rates) + 1), test_error_rates, label='Test Error')
plt.legend()
plt.ylabel('Error Rate')
plt.xlabel("K Value")

### Full Cross-Validation Grid Search Method

In [None]:
# Fresh, unfitted estimators to be used as pipeline steps.
# NOTE(review): this rebinds the name `scaler`, so any scaler instance
# fitted earlier in the notebook is no longer reachable under that name.
scaler = StandardScaler()
knn = KNeighborsClassifier()

In [None]:
# List the names of the KNN estimator's parameters, i.e. the names that can
# be tuned in a parameter grid.
knn.get_params().keys()

In [None]:
# Pipeline steps as (name, estimator) pairs, applied in this order. The step
# names are what parameter-grid keys use as a prefix.
operations = [('scaler', scaler), ('knn', knn)]

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Chain the steps: fitting the pipeline fits the scaler, transforms the data,
# and then fits the classifier on the scaled output.
pipe = Pipeline(operations)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate neighbour counts to search over: 1 through 19.
k_values = list(range(1, 20))

In [None]:
# Grid keys use the "<step name>__<parameter>" form, so this targets the
# n_neighbors parameter of the pipeline step named 'knn'.
param_grid = {'knn__n_neighbors': k_values}

In [None]:
# Exhaustive search over the grid using 5-fold cross-validation, scoring each
# candidate by accuracy.
full_cv_classifier = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')

In [None]:
# Run the search on the unscaled training data; scaling happens inside the
# pipeline. With GridSearchCV's default refit=True, the best configuration is
# refitted on all of X_train afterwards.
full_cv_classifier.fit(X_train, y_train)

In [None]:
# Show every parameter of the best pipeline found, including the selected
# knn__n_neighbors value.
full_cv_classifier.best_estimator_.get_params()

In [None]:
# Mean cross-validated accuracy of each grid candidate, one entry per K.
full_cv_classifier.cv_results_['mean_test_score']

In [None]:
# Plot mean cross-validated accuracy against K; 'o-' draws a marker at each
# candidate K joined by a solid line.
scores = full_cv_classifier.cv_results_['mean_test_score']
plt.plot(k_values, scores, 'o-')
plt.xlabel("K")
plt.ylabel("Accuracy")

## Final model evaluation

In [None]:
# Evaluate the tuned model on the held-out test split. The unscaled X_test is
# passed because the fitted search object predicts with its best pipeline,
# which applies the scaling step itself.
full_pred = full_cv_classifier.predict(X_test)
print(classification_report(y_test, full_pred))