In [None]:
from collections import Counter
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
import joblib
from mlops2023 import classifiers

In [5]:
%%writefile train.py
"""Fit the KNN classifier on the MNIST training CSVs and persist it with joblib."""
import joblib
import numpy as np
import pandas as pd

from mlops2023 import classifiers

# Destination of the serialized, fitted classifier.
file_path = './trained_model.joblib'


def _load_columns(csv_path, column_selector):
    """Read ``csv_path`` and return the selected column(s) as a NumPy array."""
    return np.array(pd.read_csv(csv_path).iloc[:, column_selector])


# Features: every column except the first one of the training CSV.
X = _load_columns('./data/mnist_train.csv', slice(1, None))
# Targets: the second column of the target CSV.
y = _load_columns('./data/mnist_target_train.csv', 1)

clf = classifiers.KNNClassifier(10, weight_samples=False)
clf.fit(X, y)

joblib.dump(clf, file_path)
print("Модель успешно обучена и сохранена в", file_path)

Overwriting train.py


In [9]:
%%writefile infer.py
"""Load the persisted classifier, predict the MNIST test CSV, save and score the result."""
import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score

# Serialized model to load.
file_path = './trained_model.joblib'
# Output CSV for the predictions.
# NOTE(review): `relut_file` looks like a typo of `result_file`; the name is kept as-is.
relut_file = './prediction.csv'


def _load_columns(csv_path, column_selector):
    """Read ``csv_path`` and return the selected column(s) as a NumPy array."""
    return np.array(pd.read_csv(csv_path).iloc[:, column_selector])


# Features: every column except the first one; targets: the second column.
X_test = _load_columns('./data/mnist_test.csv', slice(1, None))
y_test = _load_columns('./data/mnist_target_test.csv', 1)

# NOTE(review): joblib.load unpickles the file — only load model files you trust.
knn_model = joblib.load(file_path)
y_pred = knn_model.predict(X_test)

# Predictions are written one per row, formatted as integers.
np.savetxt(relut_file, y_pred, delimiter=',', fmt='%d')
print('Classifier Accuracy =', accuracy_score(y_test, y_pred))
print('Предсказания сохранены в', relut_file)

Overwriting infer.py


In [17]:
%%writefile classifiers.py
from collections import Counter
import numpy as np
class KNNClassifier(object):
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    ``fit`` only memorises the training data. ``predict`` computes, for every
    query row, the distance to every stored row and lets the labels of the
    ``k_neighbours`` closest rows vote.

    Args:
        k_neighbours: Number of nearest training samples taking part in the
            vote. Must be at least 1.
        weight_samples: If true, the neighbour at rank ``r`` (0 = closest)
            votes with weight ``weight_param ** r``; otherwise every
            neighbour counts once.
        weight_param: Base of the exponential rank weighting.

    Raises:
        ValueError: If ``k_neighbours`` is smaller than 1.
    """

    def __init__(self, k_neighbours: int, weight_samples: bool = True, weight_param = 0.8):
        # Fail early: with no neighbours there is nothing to vote on and
        # predict() would only crash later with an obscure error.
        if k_neighbours < 1:
            raise ValueError(f"k_neighbours must be >= 1, got {k_neighbours}")

        self._k_neighbours = k_neighbours
        self._weight_samples = weight_samples
        self._weight_param = weight_param

        self._X, self._y = None, None

    def fit(self, X: np.array, y: np.array) -> None:
        """Store the training samples ``X`` and their labels ``y`` unchanged."""
        self._X = X
        self._y = y

    def predict(self, X: np.array) -> np.array:
        """Predict a label for every row of ``X``.

        Args:
            X: Query samples, one per row, with the same number of columns as
                the training data.

        Returns:
            A float array of shape ``(n_queries, 1)`` holding the predicted
            labels (labels must therefore be numeric).

        Raises:
            AssertionError: If ``fit`` has not been called yet.
        """
        # Kept as an assert so callers that catch AssertionError keep working.
        assert (self._X is not None and self._y is not None), "Model is not fitted yet!"

        # Convert once, outside the loop. Non-float inputs are promoted to
        # float64 so that unsigned integers (e.g. uint8 pixels) cannot wrap
        # around in the subtraction / squaring below.
        train = self._as_float(self._X)
        labels = np.asarray(self._y)
        queries = self._as_float(X)

        ys_pred: np.array = np.zeros(shape=(queries.shape[0], 1))  # Predictions matrix allocation

        for sample_id, query in enumerate(queries):
            distances = np.sqrt(np.sum((train - query) ** 2, axis=1))
            # A stable sort breaks distance ties by training index, which is
            # the same order the previous dict-based sort produced.
            nearest = np.argsort(distances, kind="stable")[:self._k_neighbours]
            ys_pred[sample_id] = self._decision_rule(labels[nearest].tolist())

        return ys_pred

    @staticmethod
    def _as_float(data) -> np.ndarray:
        """Return ``data`` as an array, promoting non-floating dtypes to float64."""
        arr = np.asarray(data)
        if not np.issubdtype(arr.dtype, np.floating):
            arr = arr.astype(np.float64)
        return arr

    @staticmethod
    def _sort_dict(unsort_dict, ascending=False):
        """Return a new dict with the items of ``unsort_dict`` ordered by value.

        NOTE: ``ascending`` is ignored; the order is always ascending. The
        parameter is kept only so existing call signatures keep working.
        No longer used by ``predict``; retained for code outside this class
        that may still call it.
        """
        return dict(sorted(unsort_dict.items(), key=lambda item: item[1]))

    def _get_nearest_class(self, sorted_distances: dict) -> int:
        """Vote among the first ``k_neighbours`` keys of ``sorted_distances``.

        ``sorted_distances`` maps training-sample index to distance and must
        already be ordered by distance. No longer used by ``predict``;
        retained for code outside this class that may still call it.
        """
        labels = np.asarray(self._y)  # index directly instead of rebuilding a dict per neighbour
        nearest = list(sorted_distances)[:self._k_neighbours]
        return self._decision_rule([labels[index] for index in nearest])

    def _decision_rule(self, labels_top_k) -> int:
        """Pick the winning label from ``labels_top_k`` (ordered closest first).

        With ``weight_samples`` the label with the largest summed weight wins,
        otherwise the most frequent label wins. In both modes a tie goes to
        the label that appears first in ``labels_top_k``.
        """
        if self._weight_samples:
            # Weights decay exponentially with the neighbour's rank.
            weights = np.power(self._weight_param, np.arange(len(labels_top_k)))

            # Accumulate the total weight collected by each class.
            class_weights = {}
            for label, weight in zip(labels_top_k, weights):
                class_weights[label] = class_weights.get(label, 0) + weight

            # The class with the largest accumulated weight wins.
            return max(class_weights, key=class_weights.get)

        return Counter(labels_top_k).most_common(1)[0][0]

Writing classifiers.py


In [14]:
# Build an OptimizedKNNClassifier (constructed with 10 and weight_samples=False),
# fit it on X / y and print its accuracy on X_test / y_test.
# NOTE(review): OptimizedKNNClassifier, X, y, X_test and y_test are not defined or
# imported in any visible cell — confirm where they come from, otherwise this cell
# fails after Restart Kernel -> Run All.
sk_knn_clf = OptimizedKNNClassifier(10, weight_samples=False)
sk_knn_clf.fit(X, y)

y_pred = sk_knn_clf.predict(X_test)
print('OptimizedKNNClassifier Accuracy =', accuracy_score(y_test, y_pred))

OptimizedKNNClassifier Accuracy = 0.9888888888888889


In [50]:
# Persist the fitted classifier so the following cells can reload it from disk.
# NOTE(review): the recorded output ['knn_model.joblib'] is what joblib.dump returns,
# so the dump call appears to have been removed from this cell after it ran; it is
# restored here. Presumably the saved object is sk_knn_clf — confirm.
file_path = "knn_model.joblib"
joblib.dump(sk_knn_clf, file_path)

['knn_model.joblib']

In [5]:
# Reload the persisted classifier from disk.
# NOTE(review): joblib.load unpickles the file — only load model files you trust.
knn_model = joblib.load("knn_model.joblib")

In [6]:
# Predict with the reloaded model and print its accuracy for comparison with the
# value printed before saving.
# NOTE(review): X_test and y_test are not defined in any visible cell — confirm
# their origin so this cell survives Restart Kernel -> Run All.
y_pred = knn_model.predict(X_test)
print('OptimizedKNNClassifier Accuracy =', accuracy_score(y_test, y_pred))

OptimizedKNNClassifier Accuracy = 0.9888888888888889
