## Using Decision Tree and Nearest Neighbor Methods in a Handwritten Digit Problem

In [None]:
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.datasets import load_digits #dataset
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the handwritten digits dataset shipped with scikit-learn.
data = load_digits()
X = data.data    # feature matrix: each row is one image flattened into a vector
y = data.target  # label for each row of X

# Reshape the first row back into its 8x8 pixel matrix to inspect it.
X[0].reshape(8, 8)

### Drawing the digits

In [None]:
# Show the first ten samples side by side, each row of X reshaped back into an 8x8 image.
f, axes = plt.subplots(nrows=1, ncols=10, figsize=(16, 4))
for ax, flat_image in zip(axes, X[:10]):
    ax.imshow(flat_image.reshape(8, 8))

### Train and hold-out split

In [None]:
# Set 30% of the samples aside as the hold-out set (X_holdout, y_holdout);
# the remaining 70% is used for training. random_state is fixed so the split is repeatable.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=17,
)

In [None]:
# Baseline models with arbitrarily chosen (untuned) hyperparameters.
# fit() returns the estimator itself, so the tree can be built and trained in one expression.
tree = DecisionTreeClassifier(random_state=17, max_depth=5).fit(X_train, y_train)

knn = KNeighborsClassifier(n_neighbors=10)
# Last expression of the cell: trains kNN and displays the fitted estimator.
knn.fit(X_train, y_train)

### Accuracy scores of Decision Tree and Nearest Neighbor Methods

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Predict the hold-out labels with both fitted models.
tree_pred = tree.predict(X_holdout)
knn_pred = knn.predict(X_holdout)

# accuracy_score is documented to return a float. A plain Python float has no
# .round() method (only NumPy scalars do), so round with the built-in round()
# instead; the printed text is the same as before.
tree_acc = round(float(accuracy_score(y_holdout, tree_pred)), 2)
knn_acc = round(float(accuracy_score(y_holdout, knn_pred)), 2)
print(f"Tree: {tree_acc}", f"kNN: {knn_acc}")

### Cross-validation

In [None]:
from sklearn.model_selection import GridSearchCV, cross_val_score

In [None]:
# Hyperparameter grid for the decision tree: every (max_depth, max_features)
# combination below is evaluated by the search.
tree_params = {
    "max_depth": [1, 2, 3, 5, 10, 20, 25, 30, 40, 50, 64],
    "max_features": [1, 2, 3, 5, 10, 20, 30, 50, 64],
}

# Grid search with 5-fold cross-validation, starting from the tree defined earlier;
# n_jobs=-1 runs the fits in parallel and verbose=True prints progress.
tree_grid = GridSearchCV(
    estimator=tree,
    param_grid=tree_params,
    cv=5,
    n_jobs=-1,
    verbose=True,
)

# Last expression of the cell: runs the search on the training split and displays the result.
tree_grid.fit(X_train, y_train)

In [None]:
# Compare the best cross-validation score found by the tree grid search with the
# mean 5-fold cross-validation score of a 1-nearest-neighbor classifier on the same training data.
tree_cv_score = tree_grid.best_score_.round(2)
knn_cv_scores = cross_val_score(KNeighborsClassifier(n_neighbors=1), X_train, y_train, cv=5)
knn_cv_score = knn_cv_scores.mean().round(2)
print(f"Tree: {tree_cv_score}", f"kNN: {knn_cv_score}")

#### Result: The Nearest Neighbor method works better than the Decision Tree in this case, reaching a cross-validation accuracy of about 99%.