In [50]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

class Point:

    def __init__(self, axis):
        self.axis = np.array(axis)

    def distance(self, other):
        if not isinstance(other, Point):
            other = Point(other)
        return sum((self - other) ** 2) ** 0.5

    def to_numpy(self):
        return self.axis

    def to_list(self):
        return self.axis.tolist()

    def __add__(self, other):
        if isinstance(other, Point):
            return Point(self.axis + other.axis)
        return Point(self.axis + np.array(other))

    def __radd__(self, other):
        return self.__add__(other)

    def __sub__(self, other):
        if isinstance(other, Point):
            return Point(self.axis - other.axis)
        return Point(self.axis - np.array(other))

    def __rsub__(self, other):
        return self.__sub__(other)

    def __mul__(self, other):
        if isinstance(other, Point):
            return Point(self.axis * other.axis)
        return Point(self.axis * np.array(other))

    def __rmul__(self, other):
        return self.__mul__(other)

    def __truediv__(self, other):
        if isinstance(other, Point):
            return Point(self.axis / other.axis)
        return Point(self.axis / np.array(other))

    def __rtruediv__(self, other):
        return self.__truediv__(other)

    def __floordiv__(self, other):
        if isinstance(other, Point):
            return Point(self.axis // other.axis)
        return Point(self.axis // np.array(other))

    def __rfloordiv__(self, other):
        return self.__floordiv__(other)

    def __pow__(self, power, modulo=None):
        if modulo:
            return self.axis ** power % modulo
        return self.axis ** power

    def __eq__(self, other):
        if isinstance(other, Point):
            return max(self.axis == other.axis)
        return max(self.axis == other)

    def __getitem__(self, item):
        return self.axis[item]

    def __repr__(self):
        return f'Point{tuple(self.axis)}'


In [23]:
from operator import itemgetter


class MyKNeighborsClassifier:

    def __init__(self, k=3):
        self.k = int(k)
        self._fit_data = []

    def fit(self, x, y):
        assert len(x) == len(y)
        self._fit_data = [(Point(coordinates), label) for coordinates, label in zip(x, y)]

    def predict(self, x):
        predicts = []
        for coordinates in x:
            predict_point = Point(coordinates)

            distances = []
            for data_point, data_label in self._fit_data:
                distances.append((predict_point.distance(data_point), data_label))
        
            distances = sorted(distances, key=itemgetter(0))[:self.k]
        
            predicts.append([max(distances, key=itemgetter(1))[1]])

        return predicts


In [24]:
def load_data(path):
    marks_df = pd.read_csv(path)
    return marks_df

In [25]:
data = load_data("datasets/clean_tmdb.csv")

X = data.iloc[:, :-1]
y = data.iloc[:, -1]
X = X.to_numpy()
y = y.to_numpy()

In [26]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [55]:
model = MyKNeighborsClassifier(k=5)
model.fit(X_train, y_train)

predicted_classes = model.predict(X_train)
train_accuracy = accuracy_score(predicted_classes, y_train.flatten())
train_report = classification_report(predicted_classes, y_train.flatten())

predicted_classes = model.predict(X_test)
test_accuracy = accuracy_score(predicted_classes, y_test.flatten())
test_report = classification_report(predicted_classes, y_test.flatten())

print("My knn:")
print("\nTrain:\n\naccuracy: {}".format(train_accuracy))
print("report:")
print(train_report)
print('*'*55)
print("\nTest:\n\naccuracy: {}".format(test_accuracy))
print("report:")
print(test_report)


My knn:

Train:

accuracy: 0.6488502175264139
report:
              precision    recall  f1-score   support

           0       0.18      1.00      0.31       251
           1       1.00      0.62      0.76      2967

    accuracy                           0.65      3218
   macro avg       0.59      0.81      0.54      3218
weighted avg       0.94      0.65      0.73      3218

*******************************************************

Test:

accuracy: 0.6126182965299685
report:
              precision    recall  f1-score   support

           0       0.12      0.75      0.20       102
           1       0.97      0.60      0.74      1483

    accuracy                           0.61      1585
   macro avg       0.54      0.68      0.47      1585
weighted avg       0.92      0.61      0.71      1585



In [53]:
# Using scikit-learn
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)

predicted_classes = model.predict(X_train)
train_accuracy = accuracy_score(predicted_classes, y_train.flatten())
train_report = classification_report(predicted_classes, y_train.flatten())

predicted_classes = model.predict(X_test)
test_accuracy = accuracy_score(predicted_classes, y_test.flatten())
test_report = classification_report(predicted_classes, y_test.flatten())

print("Scikit-learn:")
print("\nTrain:\n\naccuracy: {}".format(train_accuracy))
print("report:")
print(train_report)
print('*'*55)
print("\nTest:\n\naccuracy: {}".format(test_accuracy))
print("report:")
print(test_report)

Scikit-learn:

Train:

accuracy: 0.7921068986948415
report:
              precision    recall  f1-score   support

           0       0.77      0.75      0.76      1402
           1       0.81      0.82      0.82      1816

    accuracy                           0.79      3218
   macro avg       0.79      0.79      0.79      3218
weighted avg       0.79      0.79      0.79      3218

*******************************************************

Test:

accuracy: 0.6971608832807571
report:
              precision    recall  f1-score   support

           0       0.66      0.63      0.65       692
           1       0.72      0.75      0.74       893

    accuracy                           0.70      1585
   macro avg       0.69      0.69      0.69      1585
weighted avg       0.70      0.70      0.70      1585

