In [1]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
# Load the digits dataset as a (features, labels) pair, then hold out 25% of
# the samples for testing. shuffle=False keeps the original sample order, so
# the split is deterministic.
X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=False,
)

In [3]:
def write_ans(name, ans, verbose=True):
    """Save an answer to ``<name>.txt`` and optionally echo it.

    Parameters
    ----------
    name : str
        Base name (or path without extension) of the output file; ``.txt``
        is appended.
    ans : object
        Value to store; it is written as ``str(ans)``.
    verbose : bool, default True
        When true, ``ans`` is also printed before the file is written.

    Returns
    -------
    None
    """
    if verbose:
        print(ans)

    # Mode 'w' truncates any previous answer stored under the same name.
    with open(f"{name}.txt", 'w') as answer_file:
        answer_file.write(str(ans))

In [4]:
class kNN:
    """Brute-force k-nearest-neighbours classifier (squared Euclidean metric).

    The model memorises the training set in ``fit``; ``predict`` assigns each
    query the most frequent label among its ``k`` closest training samples.
    When several labels are tied for the most votes, the smallest label (in
    sorted order) wins.

    Parameters
    ----------
    k : int, default 4
        Number of neighbours that vote for each query.

    Attributes
    ----------
    data_pool : tuple
        ``(X, y)`` as stored by ``fit``; absent until ``fit`` is called.
    """

    # Cap on the number of float64 elements in the temporary
    # (batch, n_train, n_features) difference tensor built by ``predict``.
    # Queries are processed in batches sized to stay under this cap so memory
    # use does not grow with the number of queries.
    _MAX_TEMP_ELEMENTS = 2 ** 22

    def __init__(self, k=4):
        self.k = k

    def fit(self, X, y):
        """Memorise the training set.

        Parameters
        ----------
        X : array-like of shape (n_train, n_features)
            Training features.
        y : array-like of shape (n_train,)
            Training labels.
        """
        # asarray is a no-op for ndarrays and lets plain lists be used too.
        self.data_pool = (np.asarray(X), np.asarray(y))

    def fit_predict(self, X, y):
        """Fit on ``(X, y)`` and return the predictions for the same ``X``."""
        self.fit(X, y)
        return self.predict(X)

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_queries, n_features)
            Query samples.

        Returns
        -------
        numpy.ndarray of shape (n_queries,)
            Predicted labels, with the dtype of the training labels.

        Raises
        ------
        ValueError
            If the model has not been fitted, or if ``k`` is not between 1
            and the number of training samples.
        """
        if not hasattr(self, 'data_pool'):
            raise ValueError('not fitted model')

        # Work in float64: subtracting unsigned-integer features (e.g. uint8
        # pixels) would otherwise wrap around and corrupt the distances.
        train_X = np.asarray(self.data_pool[0], dtype=np.float64)
        train_y = np.asarray(self.data_pool[1])
        queries = np.asarray(X, dtype=np.float64)

        n_train = train_X.shape[0]
        if not 1 <= self.k <= n_train:
            raise ValueError(
                f'k must be between 1 and {n_train}, got {self.k}'
            )

        # Encode labels as 0..n_classes-1 so votes can be counted for any
        # label type; ``classes`` is sorted, so argmax ties go to the
        # smallest label.
        classes, codes = np.unique(train_y, return_inverse=True)

        n_queries = queries.shape[0]
        predictions = np.empty(n_queries, dtype=classes.dtype)

        per_query = max(1, n_train * train_X.shape[1])
        batch = max(1, self._MAX_TEMP_ELEMENTS // per_query)

        for start in range(0, n_queries, batch):
            chunk = queries[start:start + batch]               # (B, D)
            diff = train_X[None, :, :] - chunk[:, None, :]     # (B, n_train, D)
            sq_dist = np.sum(diff ** 2, axis=2)                # (B, n_train)

            # Partial sort: the first k columns hold the k nearest samples
            # (in no particular order, which is all that voting needs).
            nearest = np.argpartition(sq_dist, self.k - 1, axis=1)[:, :self.k]

            # votes[i, c] = number of the k neighbours of query i with class c.
            votes = np.zeros((chunk.shape[0], classes.size), dtype=np.intp)
            rows = np.arange(chunk.shape[0])[:, None]
            np.add.at(votes, (rows, codes[nearest]), 1)

            predictions[start:start + batch] = classes[votes.argmax(axis=1)]

        return predictions

In [5]:
# 1-nearest-neighbour model memorising the training split.
knn = kNN(1)
knn.fit(X_train, y_train)

In [6]:
# Task 1: fraction of test samples misclassified by the 1-NN model.
knn_test_error = np.mean(knn.predict(X_test) != y_test)
write_ans('task1', knn_test_error)

0.03777777777777778


In [7]:
from sklearn.ensemble import RandomForestClassifier

In [8]:
# Random forest with 1000 trees. The seed is fixed so that the fitted model,
# and therefore the task-2 error written below, is reproducible after a
# kernel restart; without random_state each run trains a different forest.
clf = RandomForestClassifier(n_estimators=1000, random_state=0)
clf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=1000)

In [12]:
# Task 2: fraction of test samples misclassified by the random forest.
forest_test_error = np.mean(clf.predict(X_test) != y_test)
write_ans('task2', forest_test_error)

0.06444444444444444
