## dataset
- The data files train.csv and test.csv contain gray-scale images of hand-drawn digits, from zero through nine.

    Each image is 28 pixels high and 28 pixels wide, for a total of 784 pixels. Each pixel has a single pixel-value associated with it, indicating the lightness or darkness of that pixel, with higher numbers meaning darker. This pixel-value is an integer between 0 and 255, inclusive.

In [53]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import make_blobs

In [54]:
# Load the training CSV; the path is relative to the notebook's working directory.
data = pd.read_csv("./../data_sets/digit-recognizer/train.csv")

In [55]:
# Preview the first rows: a `label` column followed by pixel0 ... pixel783.
data.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [56]:
# (rows, columns) of the loaded frame.
data.shape

(42000, 785)

In [57]:
# Column 0 holds the digit label; every remaining column is one pixel intensity.
y = data.iloc[:, 0].to_numpy()
X = data.iloc[:, 1:].to_numpy()

In [58]:
# Feature matrix shape, then label vector shape (one per line).
print(X.shape, y.shape, sep="\n")

(42000, 784)
(42000,)


In [59]:
from sklearn.model_selection import train_test_split

In [60]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [61]:
# Shapes of the train features/labels, then the test features/labels (one per line).
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

(33600, 784)
(33600,)
(8400, 784)
(8400,)


In [62]:
def draw(img,y):
    """Display one flattened digit image with ``y`` as the figure title.

    img -> array with 784 values; it is reshaped to 28x28 before plotting
    y   -> value shown as the title (e.g. the true or predicted label)
    """
    grid = img.reshape(28, 28)
    # Without an explicit cmap, imshow colours even single-channel data,
    # so "gray" is requested to render the digit as grayscale.
    plt.imshow(grid, cmap="gray")
    plt.title(y, color="white")
    plt.show()

    

In [63]:
def dist(X1,X2):
    """Return the Euclidean (L2) distance between two equally shaped arrays.

    X1, X2 -> array-likes that broadcast against each other
    returns -> float64 scalar, sqrt(sum((X1 - X2) ** 2)) over all elements

    The difference is computed in float64 so that unsigned integer inputs
    (e.g. uint8 image pixels) cannot wrap around when subtracted or squared.
    """
    diff = np.asarray(X1, dtype=np.float64) - np.asarray(X2, dtype=np.float64)
    return np.sqrt(np.sum(diff * diff))

In [78]:
def knn(X,y,Q,k=5):
    """
    Classify one query sample by majority vote among its k nearest neighbours.

    X -> (n_samples, n_features) array of training samples, e.g. (33600, 784)
    y -> (n_samples,) array of training labels
    Q -> query sample, shape (n_features,) or (1, n_features)
    k -> int, number of neighbours that vote (capped at n_samples)

    Returns the winning label with the dtype of ``y`` (labels are no longer
    coerced to float). Neighbours are ranked by Euclidean distance, with ties
    in distance broken by the smaller label; ties in the vote are also
    resolved in favour of the smaller label.

    Raises ValueError if the training set is empty, if X and y have different
    lengths, or if k < 1.
    """
    train = np.asarray(X)
    labels = np.ravel(np.asarray(y))
    n_samples = train.shape[0]

    if n_samples == 0:
        raise ValueError("knn needs at least one training sample")
    if labels.shape[0] != n_samples:
        raise ValueError("X and y must contain the same number of samples")
    if k < 1:
        raise ValueError("k must be a positive integer")

    # One row per sample so a single broadcasted subtraction handles all of them.
    train = train.reshape(n_samples, -1)
    query = np.asarray(Q).reshape(1, -1)

    # Compute in float64: unsigned pixel dtypes (uint8) would otherwise wrap around.
    diff = np.subtract(train, query, dtype=np.float64)
    np.square(diff, out=diff)  # in place, avoids a second (n_samples, n_features) temporary
    dists = np.sqrt(diff.sum(axis=1))

    # lexsort uses the LAST key as primary: order by distance, then by label.
    nearest = np.lexsort((labels, dists))[:k]

    # np.unique returns sorted classes, so argmax picks the smallest label on a tied vote.
    classes, votes = np.unique(labels[nearest], return_counts=True)
    return classes[np.argmax(votes)]


    
    

## idea
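- Images of the same digit have similar pixel values, so the Euclidean distance between their pixel vectors is small (close to 0). k-NN therefore labels a query image by majority vote among its k nearest training images.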

In [79]:
# Sanity check: classify a single held-out image (test row 69) with the default k=5.
knn(X_train,y_train,X_test[69])

3.0

### Accuracy

In [80]:
def accuracy(X_test,y_test,limit=100,k=5):
    """Return the fraction of test samples that ``knn`` labels correctly.

    X_test -> (n_test, n_features) array of samples to classify
    y_test -> (n_test,) array of true labels
    limit  -> maximum number of leading test samples to evaluate; it is
              clamped to the number of samples available, and ``None``
              means "evaluate all of them"
    k      -> number of neighbours passed on to ``knn``

    The notebook-level ``X_train`` / ``y_train`` are used as the training set.

    Raises ValueError when there is nothing to evaluate (empty test set or
    limit <= 0) instead of silently returning NaN.
    """
    n_available = min(len(X_test), len(y_test))
    n_eval = n_available if limit is None else min(limit, n_available)
    if n_eval <= 0:
        raise ValueError("accuracy needs at least one test sample to evaluate")

    preds = np.array([knn(X_train, y_train, X_test[i], k) for i in range(n_eval)])
    # Divide by the number of samples actually scored, not by the requested limit.
    return np.mean(preds == np.asarray(y_test[:n_eval]))
        

In [81]:
# Accuracy on the first 100 test images (the default `limit`), not the full test split.
accuracy(X_test,y_test)

0.97