# K Nearest Neighbors with Python

You've been given a classified data set from a company! They've hidden the feature column names but have given you the data and the target classes. 


# Step - 0

Import Libraries

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

# Step - 1

Get the data.

In [None]:
# Load the data set; the first column of the file is used as the row index.
df = pd.read_csv("Classified Data",index_col=0)
# Preview the first rows to check that the file was parsed as expected.
df.head()

# Step - 2

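Standardize the variables. KNN classifies an observation by looking at the observations nearest to it, so any variables that are on a large scale will have a much larger effect on the distance between the observations, and hence on the KNN classifier, than variables that are on a small scale.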

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the feature columns only; the label
# column 'TARGET CLASS' is left out.
# NOTE(review): the scaler is fitted on the full data set, before the
# train/test split, so the held-out rows also influence the scaling.
scaler = StandardScaler()
scaler.fit(df.drop(columns='TARGET CLASS'))

# Apply the learned scaling to those same feature columns.
scaled_features = scaler.transform(df.drop(columns='TARGET CLASS'))

In [None]:
# Wrap the scaled array in a DataFrame so it can be inspected by column name.
# The column labels are obtained by dropping the label column explicitly
# (instead of slicing off the last column), so they stay correct wherever
# 'TARGET CLASS' sits in the file. The original row index is kept so that the
# rows of df_scaled still line up with the rows of df.
df_scaled = pd.DataFrame(scaled_features,
                         columns=df.columns.drop('TARGET CLASS'),
                         index=df.index)
df_scaled.head()

# Step - 3

Prepare data for the algorithm. Remember that we are trying to come up with a model to predict whether an observation belongs to the **TARGET CLASS** or not. We'll start with k=1.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Hold out 30% of the rows for evaluation. A fixed random_state makes the
# split -- and therefore every metric and plot computed from it --
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_features, df['TARGET CLASS'], test_size=0.30, random_state=42)

# Baseline model: classify each observation by its single nearest neighbour.
knn = KNeighborsClassifier(n_neighbors=1)

knn.fit(X_train, y_train)

In [None]:
# Predict the class label of every observation in the held-out test set
# using the fitted classifier.

pred = knn.predict(X_test)

# Step - 4

Evaluation of the results.

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Counts of test observations for each (actual class, predicted class) pair.
print(confusion_matrix(y_true=y_test, y_pred=pred))

In [None]:
# Per-class text summary of how the predictions compare with the true labels.
print(classification_report(y_true=y_test, y_pred=pred))

## Choosing the right K value.

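The elbow method is used to pick the right K value: fit a model for each K in a range, plot the error rate against K, and choose a K at which the error rate has dropped and levelled off.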

In [None]:
# Misclassification rate on the test set for K = 1..39; error_rate[0]
# belongs to K=1, error_rate[1] to K=2, and so on.
# Will take some time to get the results.
#
# Each candidate model exists only inside the comprehension, so the `knn`
# fitted in an earlier cell is not overwritten by a throwaway K=39 model.
# NOTE(review): K is selected here using the test set, so the error rate
# reported afterwards for the chosen K is likely optimistic -- consider
# cross-validating on the training data instead.
error_rate = [
    np.mean(
        KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train).predict(X_test)
        != y_test
    )
    for k in range(1, 40)
]

In [None]:
# Elbow plot: test error rate against the number of neighbours K (1..39).
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    range(1, 40),
    error_rate,
    color='green',
    linestyle='dashed',
    marker='o',
    markerfacecolor='yellow',
    markersize=10,
)
ax.set_title('Error Rate vs. K Value')
ax.set_xlabel('K')
ax.set_ylabel('Error Rate')

From the graph we can observe that K = 23 gives a low error rate. The exact curve depends on how the data was split into training and test sets, so you can try other values from the graph, but let's go with 23.

In [None]:
# Refit with the K read off the elbow plot (K=23) and evaluate it on the
# same test split as before.
knn = KNeighborsClassifier(n_neighbors=23).fit(X_train, y_train)
pred = knn.predict(X_test)

# A single print call: with sep='\n', each '\n' argument produces the same
# two blank lines that a separate print('\n') would.
print(
    'FOR K=23',
    '\n',
    confusion_matrix(y_test, pred),
    '\n',
    classification_report(y_test, pred),
    sep='\n',
)

--- 
                                    THE END