## Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler 
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,accuracy_score
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

## Pre-processing Stage

### Read The Original Dataset

In [None]:
# Load the raw dataset and take a first look at it.
data = pd.read_csv('Social_Network_Ads.csv')
print(data.describe())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would add a stray "None" line to the output.
data.info()
data.head()

### Feature Engineering

In [None]:
# Keep the three candidate features plus the target; any other columns of
# `data` are left out of the working frame.
# .copy() makes my_df an independent DataFrame: later cells assign into its
# columns, and doing that on a slice of `data` raises SettingWithCopyWarning.
my_df = data[['Gender','Age','EstimatedSalary','Purchased']].copy()
my_df.head()

### Label Encoding

In [None]:
# Replace the Gender column with the integer codes produced by LabelEncoder
# so the column can be used as a numeric feature.
gender_codes = LabelEncoder().fit_transform(my_df['Gender'])
my_df['Gender'] = gender_codes
my_df.head()

### Standardization

In [None]:
# Standardize the two continuous features so neither dominates the distance
# computation used by k-NN.
# NOTE(review): the scaler is fit on every row, before the train/test split
# further down, so the test rows contribute to the scaling statistics.
# Consider fitting it on the training rows only.
numeric_cols = ['Age','EstimatedSalary']
scaled_values = StandardScaler().fit_transform(my_df[numeric_cols])
my_df[numeric_cols] = scaled_values
my_df.head()

## Split the data into Training and Testing

In [None]:
# Features are the first three columns (Gender, Age, EstimatedSalary);
# the target is the fourth column (Purchased).
X = my_df.iloc[:, 0:3].values
# Index the target with a scalar (3, not [3]) so Y is 1-D, shape (n_samples,).
# A column vector of shape (n_samples, 1) makes scikit-learn emit a
# DataConversionWarning in fit() and breaks boolean masks such as
# X_set[y_set == j, 0] in the plotting cells.
Y = my_df.iloc[:, 3].values
# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=7)
print("Size of all data is ",len(X))
print("Size of training data is ",len(x_train))
print("Size of testing data is ",len(x_test))

## Training Phase

In [None]:
# Fit a k-nearest-neighbours classifier on the training split.
k = 5  # number of neighbours that vote on each prediction
knn_model = KNeighborsClassifier(n_neighbors=k)
# np.ravel() hands fit() a 1-D label vector. A column vector of shape (n, 1)
# makes scikit-learn emit a DataConversionWarning; ravel is a no-op when the
# labels are already 1-D.
knn_model.fit(x_train, np.ravel(y_train))

## Testing Phase

In [None]:
# Predict a class label for every row of the held-out test split.
y_pred = knn_model.predict(X=x_test)
y_pred

### Calculating the Accuracy of the Model

In [None]:
# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('the accuracy of the model is ', accuracy)

In [None]:
# Confusion matrix of the test predictions; in scikit-learn's convention the
# rows are the true classes and the columns are the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

## Visualization

### Visualising the Training set results

In [None]:
# Decision regions of the trained classifier over the training points.
#
# knn_model was fit on three features (Gender, Age, EstimatedSalary), so a
# 2-D Age/Salary map only exists once Gender is held fixed. One panel is
# drawn per encoded Gender value; each panel shows the model's regions for
# that value and only the points that share it.
GENDER_COL, AGE_COL, SALARY_COL = 0, 1, 2
class_cmap = ListedColormap(('red', 'green'))

# np.ravel() guarantees 1-D labels so the boolean masks below index rows.
X_set, y_set = x_train, np.ravel(y_train)

# Dense grid over the (standardized) Age / Salary plane, padded by 1 unit.
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, AGE_COL].min() - 1, stop=X_set[:, AGE_COL].max() + 1, step=0.01),
    np.arange(start=X_set[:, SALARY_COL].min() - 1, stop=X_set[:, SALARY_COL].max() + 1, step=0.01))

gender_values = np.unique(X_set[:, GENDER_COL])
fig, axes = plt.subplots(1, len(gender_values),
                         figsize=(6 * len(gender_values), 5), squeeze=False)
for ax, gender in zip(axes[0], gender_values):
    # Build full 3-feature rows for the grid: fixed Gender, varying Age/Salary.
    grid = np.column_stack([np.full(X1.size, gender), X1.ravel(), X2.ravel()])
    ax.contourf(X1, X2, knn_model.predict(grid).reshape(X1.shape),
                alpha=0.75, cmap=class_cmap)
    ax.set_xlim(X1.min(), X1.max())
    ax.set_ylim(X2.min(), X2.max())
    same_gender = X_set[:, GENDER_COL] == gender
    for i, j in enumerate(np.unique(y_set)):
        mask = same_gender & (y_set == j)
        # color= (not c=) so the single RGBA tuple is not mistaken for
        # per-point values to be colour-mapped.
        ax.scatter(X_set[mask, AGE_COL], X_set[mask, SALARY_COL],
                   color=class_cmap(i), label=j)
    ax.set_title('K-NN (Training set), Gender = %g' % gender)
    ax.set_xlabel('Age')
    ax.set_ylabel('Estimated Salary')
    ax.legend()
plt.tight_layout()
plt.show()


### Visualising the Test set results

In [None]:
# Decision regions of the trained classifier over the test points.
#
# knn_model was fit on three features (Gender, Age, EstimatedSalary), so a
# 2-D Age/Salary map only exists once Gender is held fixed. One panel is
# drawn per encoded Gender value; each panel shows the model's regions for
# that value and only the points that share it.
GENDER_COL, AGE_COL, SALARY_COL = 0, 1, 2
class_cmap = ListedColormap(('red', 'green'))

# np.ravel() guarantees 1-D labels so the boolean masks below index rows.
X_set, y_set = x_test, np.ravel(y_test)

# Dense grid over the (standardized) Age / Salary plane, padded by 1 unit.
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, AGE_COL].min() - 1, stop=X_set[:, AGE_COL].max() + 1, step=0.01),
    np.arange(start=X_set[:, SALARY_COL].min() - 1, stop=X_set[:, SALARY_COL].max() + 1, step=0.01))

gender_values = np.unique(X_set[:, GENDER_COL])
fig, axes = plt.subplots(1, len(gender_values),
                         figsize=(6 * len(gender_values), 5), squeeze=False)
for ax, gender in zip(axes[0], gender_values):
    # Build full 3-feature rows for the grid: fixed Gender, varying Age/Salary.
    grid = np.column_stack([np.full(X1.size, gender), X1.ravel(), X2.ravel()])
    ax.contourf(X1, X2, knn_model.predict(grid).reshape(X1.shape),
                alpha=0.75, cmap=class_cmap)
    ax.set_xlim(X1.min(), X1.max())
    ax.set_ylim(X2.min(), X2.max())
    same_gender = X_set[:, GENDER_COL] == gender
    for i, j in enumerate(np.unique(y_set)):
        mask = same_gender & (y_set == j)
        # color= (not c=) so the single RGBA tuple is not mistaken for
        # per-point values to be colour-mapped.
        ax.scatter(X_set[mask, AGE_COL], X_set[mask, SALARY_COL],
                   color=class_cmap(i), label=j)
    ax.set_title('K-NN (Test set), Gender = %g' % gender)
    ax.set_xlabel('Age')
    ax.set_ylabel('Estimated Salary')
    ax.legend()
plt.tight_layout()
plt.show()

### Test with different values of k

In [140]:
# Compare test-set accuracy for k = 1..14.
# The loop uses its own names so it does not overwrite knn_model, y_pred or
# accuracy from the cells above; otherwise the notebook would be left holding
# the k = 14 model rather than the one that was trained and plotted.
# NOTE(review): k is being picked by looking at test accuracy, so the score
# reported for the chosen k is optimistic; cross-validating on the training
# split would give a cleaner selection.
train_labels = np.ravel(y_train)  # 1-D labels avoid DataConversionWarning in fit()
for k_candidate in range(1, 15):
    candidate_model = KNeighborsClassifier(n_neighbors=k_candidate)
    candidate_model.fit(x_train, train_labels)
    candidate_accuracy = accuracy_score(y_test, candidate_model.predict(x_test))
    print('when k = %d : accuracy of the model is ' % k_candidate, candidate_accuracy)

  This is separate from the ipykernel package so we can avoid doing imports until


when k = 1 : accuracy of the model is  0.84
when k = 2 : accuracy of the model is  0.86
when k = 3 : accuracy of the model is  0.89
when k = 4 : accuracy of the model is  0.91
when k = 5 : accuracy of the model is  0.93
when k = 6 : accuracy of the model is  0.93
when k = 7 : accuracy of the model is  0.91
when k = 8 : accuracy of the model is  0.92
when k = 9 : accuracy of the model is  0.91
when k = 10 : accuracy of the model is  0.92
when k = 11 : accuracy of the model is  0.92
when k = 12 : accuracy of the model is  0.91
when k = 13 : accuracy of the model is  0.9
when k = 14 : accuracy of the model is  0.89


In [None]:
## According to the results above, k = 5 gives the highest test accuracy (0.93, tied with k = 6), so we select k = 5