# Programming Assignment 1
## KNN

Aluno: Francisco Edyvalberty Alenquer Cordeiro \
Matrícula: 518659


# Imports

In [2]:
import numpy as np
import matplotlib.pyplot as plt


# Utility Functions

## Metrics

In [3]:
def accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Both inputs are flattened before they are compared. Without this, a
    column vector of shape (n, 1) compared against a flat vector of shape
    (n,) is broadcast by NumPy into an (n, n) matrix and the resulting
    score is meaningless (it can even exceed 1).

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels; must hold as many elements as ``y_true``.

    Returns
    -------
    float
        Number of matching positions divided by the number of labels.

    Raises
    ------
    ValueError
        If the inputs do not hold the same number of elements.
    """
    true_labels = np.asarray(y_true).ravel()
    predicted_labels = np.asarray(y_pred).ravel()

    if true_labels.size != predicted_labels.size:
        raise ValueError(
            'y_true and y_pred must have the same number of elements')

    hits = true_labels == predicted_labels
    return hits.sum() / true_labels.size


def recall(y_true, y_pred):
    """Return the recall of the positive class (label ``1``).

    Recall is the share of truly positive samples that were also predicted
    as positive.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; the positive class is encoded as ``1``.
    y_pred : array-like
        Predicted labels, with as many elements as ``y_true``.

    Returns
    -------
    float
        True positives divided by the number of positive samples, or
        ``0.0`` when ``y_true`` holds no positive sample (instead of the
        NaN that a division by zero would produce).
    """
    true_labels = np.asarray(y_true).ravel()
    predicted_labels = np.asarray(y_pred).ravel()

    is_positive = true_labels == 1
    n_positive = is_positive.sum()
    if n_positive == 0:
        # Nothing to recall: avoid 0/0 and the RuntimeWarning it raises.
        return 0.0

    true_positives = (predicted_labels[is_positive] == 1).sum()
    return true_positives / n_positive


def precision(y_true, y_pred):
    """Return the precision of the positive class (label ``1``).

    Precision is the share of samples predicted as positive that are
    truly positive.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; the positive class is encoded as ``1``.
    y_pred : array-like
        Predicted labels, with as many elements as ``y_true``.

    Returns
    -------
    float
        True positives divided by the number of positive predictions, or
        ``0.0`` when nothing was predicted as positive (instead of the NaN
        that a division by zero would produce).
    """
    true_labels = np.asarray(y_true).ravel()
    predicted_labels = np.asarray(y_pred).ravel()

    predicted_positive = predicted_labels == 1
    n_predicted_positive = predicted_positive.sum()
    if n_predicted_positive == 0:
        # No positive prediction was made: avoid 0/0 and its RuntimeWarning.
        return 0.0

    true_positives = (true_labels[predicted_positive] == 1).sum()
    return true_positives / n_predicted_positive


def f1_score(y_true, y_pred):
    """Return the F1 score: the harmonic mean of precision and recall.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    float
        ``2 * P * R / (P + R)``, where ``P`` and ``R`` come from
        ``precision`` and ``recall``. When both are zero the score is
        ``0.0`` rather than the NaN that 0/0 would give. A NaN precision
        or recall still propagates to the result.
    """
    precision_score = precision(y_true, y_pred)
    recall_score = recall(y_true, y_pred)

    denominator = precision_score + recall_score
    if denominator == 0:
        # No true positive at all: define F1 as 0 instead of dividing 0 by 0.
        return 0.0

    return 2 * (precision_score * recall_score) / denominator


## Standardization


In [4]:
class StandardScaler:
    """Standardize features column-wise to zero mean and unit variance.

    Statistics are computed along axis 0 of the array given to ``fit`` or
    ``fit_transform`` and reused by ``transform`` / ``inverse_transform``.

    Attributes set by fitting:
        mean  - per-column mean of the fitted data.
        std   - per-column (population) standard deviation of the fitted data.
        scale - ``std`` with zeros replaced by 1.0; this is the divisor
                actually used, so constant columns map to 0 instead of
                NaN/inf.
    """

    def __init__(self):
        self.fitted = False

    def _check_fitted(self):
        """Raise if the scaler has not learned its statistics yet."""
        if not self.fitted:
            raise RuntimeError('Scaler not fitted!')

    def fit(self, data):
        """Learn the per-column mean and standard deviation of ``data``.

        Returns the scaler itself so calls can be chained.
        """
        self.mean = data.mean(axis=0)
        self.std = data.std(axis=0)
        # A constant column has zero spread; dividing by it would produce
        # NaN (0/0). Use a neutral divisor of 1 for such columns.
        self.scale = np.where(self.std == 0, 1.0, self.std)
        self.fitted = True
        return self

    def fit_transform(self, data):
        """Fit on ``data`` and return its standardized version."""
        return self.fit(data).transform(data)

    def transform(self, data):
        """Standardize ``data`` with the statistics learned while fitting.

        Raises:
            RuntimeError: if the scaler has not been fitted.
        """
        self._check_fitted()
        return (data - self.mean) / self.scale

    def inverse_transform(self, scaled_data):
        """Map standardized values back to the original feature scale.

        Raises:
            RuntimeError: if the scaler has not been fitted.
        """
        self._check_fitted()
        return (scaled_data * self.scale) + self.mean


## Cross Validation

In [5]:
def kfolds_cross_validation(data, n_folds=10, shuffle=False, random_state=12894):
    """Yield ``(train_idx, test_idx)`` index arrays for k-fold cross validation.

    The row indices of ``data`` are cut into ``n_folds`` nearly equal
    slices. Each slice is used once as the test fold while the remaining
    indices form the training fold.

    Parameters
    ----------
    data : numpy.ndarray
        Dataset; only ``data.shape[0]`` (the number of rows) is used.
    n_folds : int, default 10
        Number of folds to generate.
    shuffle : bool, default False
        Whether to shuffle the row indices before slicing.
    random_state : int, default 12894
        Seed for the shuffle. It is applied to a local generator, so the
        global NumPy random state is left untouched. The permutation for a
        given seed is the same as ``np.random.seed(seed)`` followed by
        ``np.random.shuffle``.

    Yields
    ------
    tuple of numpy.ndarray
        ``(train_idx, test_idx)`` for each fold, in fold order.
    """
    indexes = np.arange(data.shape[0])

    if shuffle:
        # Honour the caller's seed (it used to be hard-coded) without
        # reseeding NumPy's process-wide generator.
        np.random.RandomState(random_state).shuffle(indexes)

    slices = np.array_split(indexes, n_folds)

    for test_idx in slices:
        # Keep the (possibly shuffled) order of the remaining indices.
        train_idx = indexes[~np.isin(indexes, test_idx)]
        yield train_idx, test_idx


## Train Test Split

In [6]:
def train_test_split(data, train_size_perc, random_seed=264852):
    """Randomly split a 2-D dataset into train/test features and targets.

    The rightmost column of ``data`` is taken as the target variable and
    every other column as a feature.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array with the target in the last column.
    train_size_perc : float
        Fraction of rows assigned to the training set; the row count is
        truncated to an integer.
    random_seed : int, default 264852
        Seed passed to ``np.random.seed`` before sampling the train rows.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, X_test, y_train, y_test)``; the targets are returned as
        single-column 2-D arrays.
    """
    n_rows = data.shape[0]
    n_train = int(train_size_perc * n_rows)
    all_rows = np.arange(0, n_rows, 1)

    np.random.seed(random_seed)
    picked_rows = np.random.choice(all_rows, n_train, replace=False)

    # Every row that was not sampled for training goes to the test set,
    # in ascending index order.
    is_held_out = np.ones(n_rows, dtype=bool)
    is_held_out[picked_rows] = False
    held_out_rows = all_rows[is_held_out]

    train_part = data[picked_rows]
    test_part = data[held_out_rows]

    return (
        train_part[:, :-1],
        test_part[:, :-1],
        train_part[:, [-1]],
        test_part[:, [-1]],
    )


# Task 1 - KNN

In [7]:
# Read the comma-separated kc2.csv file into a NumPy array, report its
# dimensions and preview the first three rows.
data = np.genfromtxt('../data/kc2.csv', delimiter=',')
print('Shape:', np.shape(data))
data[0:3, :]


Shape: (522, 22)


array([[1.1000000e+00, 1.4000000e+00, 1.4000000e+00, 1.4000000e+00,
        1.3000000e+00, 1.3000000e+00, 1.3000000e+00, 1.3000000e+00,
        1.3000000e+00, 1.3000000e+00, 1.3000000e+00, 1.3000000e+00,
        2.0000000e+00, 2.0000000e+00, 2.0000000e+00, 2.0000000e+00,
        1.2000000e+00, 1.2000000e+00, 1.2000000e+00, 1.2000000e+00,
        1.4000000e+00, 0.0000000e+00],
       [1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00,
        1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00,
        1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00,
        1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00,
        1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00,
        1.0000000e+00, 1.0000000e+00],
       [4.1500000e+02, 5.9000000e+01, 5.0000000e+01, 5.1000000e+01,
        1.1590000e+03, 8.4113100e+03, 1.0000000e-02, 1.0353000e+02,
        8.1240000e+01, 8.7084858e+05, 2.8000000e+00, 4.8380480e+04,
        3.5900000e+02, 3.5000000e+01, 

In [8]:
# Send 80% of the rows to the training set with a fixed seed, then report
# the shape of each resulting array.
X_train, X_test, y_train, y_test = train_test_split(
    data, 0.8, random_seed=12354)

for label, part in (('X_train', X_train), ('y_train', y_train),
                    ('X_test', X_test), ('y_test', y_test)):
    print(f'{label} shape:', part.shape)


X_train shape: (417, 21)
y_train shape: (417, 1)
X_test shape: (105, 21)
y_test shape: (105, 1)


In [9]:
# Query point (first training row, kept 2-D) and the full set of training
# rows it will be compared against.
X1, X2 = X_train[[0], :], X_train


def euclidian_distance(i, j):
    """Return the row-wise Euclidean distance between ``i`` and ``j``.

    The inputs are subtracted with NumPy broadcasting, so a single row can
    be compared against many rows at once. The squared differences are
    summed along axis 1, giving one distance per row.
    """
    difference = i - j
    squared_sum = np.square(difference).sum(axis=1)
    return np.sqrt(squared_sum)


# Distance from the query row to every training row (one value per row).
distances = euclidian_distance(i=X1, j=X2)

distances


array([0.00000000e+00, 1.85715446e+02, 1.31874903e+01, 1.48942070e+04,
       6.96351936e+03, 4.28472815e+02, 4.56378094e+02, 1.31874903e+01,
       1.70764357e+04, 1.27353938e+02, 7.12161793e+01, 2.86089874e+04,
       5.35803375e+03, 1.31874903e+01, 1.31874903e+01, 1.27353938e+02,
       3.63290984e+03, 6.77588948e+03, 1.73044625e+04, 1.51600078e+02,
       1.30368507e+04, 6.00587676e+03, 4.00105463e+02, 1.99806106e+01,
       2.01550192e+01, 7.51968257e+01, 1.03875642e+02, 2.01053426e+01,
       1.31874903e+01, 1.03875642e+02, 1.32631030e+01, 1.31874903e+01,
       4.29784642e+03, 1.03875642e+02, 3.27543246e+03, 1.47479413e+04,
       8.82329683e+03, 1.54307862e+04, 1.61407005e+03, 2.41271521e+03,
       4.48900098e+03, 2.26786981e+04, 2.72245320e+05, 4.89843990e+03,
       8.54252176e+03, 6.71799098e+03, 2.49466110e+01, 1.44569853e+04,
       2.99435634e+04, 9.57546737e+03, 5.47437402e+04, 4.10592401e+01,
       1.11739176e+02, 5.54756354e+02, 1.60202824e+03, 4.72365935e+04,
      

In [41]:
class MyKNN():
    """Brute-force k-nearest-neighbours classifier.

    ``fit`` stores the training data; ``predict`` computes the distance
    from each query row to every stored row and returns the most frequent
    label among the ``K`` closest ones.
    """

    def __init__(self, k_neighbours):
        """Store the number of neighbours that vote on each prediction.

        Raises:
            ValueError: if ``k_neighbours`` is smaller than 1.
        """
        if k_neighbours < 1:
            # A zero K leaves no voters; a negative K would silently slice
            # "all but the last |K|" neighbours.
            raise ValueError('k_neighbours must be at least 1')

        self.fitted = False
        self.K = k_neighbours

    def euclidian_distance(self, i, j):
        """Return the row-wise Euclidean distances as a column vector."""
        distances = np.sqrt(np.sum((i - j)**2, axis=1))
        return distances.reshape(-1, 1)

    def fit(self, X, y):
        """Memorize the training features ``X`` and labels ``y``.

        ``y`` may be a flat vector or a single-column array; it is stored
        as a column so it can be stacked next to the distances.
        """
        self.X = X
        self.y = np.asarray(y).reshape(-1, 1)
        self.fitted = True

    def get_more_frequently(self, arr):
        """Return the most frequent value of ``arr``.

        Ties go to the smallest value, because ``np.unique`` returns sorted
        values and ``np.argmax`` picks the first maximum.
        """
        unique, counts = np.unique(arr, return_counts=True)
        return unique[np.argmax(counts)]

    def predict(self, X, distance='euclidian'):
        """Predict a label for every row of ``X``.

        Parameters
        ----------
        X : numpy.ndarray
            Query samples, one per row. A single 1-D sample is also
            accepted and treated as one row.
        distance : str, default 'euclidian'
            Distance metric; only ``'euclidian'`` is implemented.

        Returns
        -------
        numpy.ndarray
            Column vector of shape (n_samples, 1) with the predicted labels.

        Raises
        ------
        RuntimeError
            If ``fit`` has not been called.
        ValueError
            If ``distance`` is not a supported metric.
        """
        if not self.fitted:
            raise RuntimeError('Model not fitted!')
        if distance != 'euclidian':
            raise ValueError(f'Unsupported distance: {distance!r}')

        preds = []
        for sample in np.atleast_2d(X):
            distances = self.euclidian_distance(sample, self.X)

            # Pair each distance with its label, then order by distance.
            knn = np.hstack([distances, self.y])
            knn = knn[knn[:, 0].argsort()]

            preds.append(self.get_more_frequently(knn[:self.K, 1]))

        return np.array(preds).reshape(-1, 1)


# Sanity check: train the custom classifier with 3 neighbours and predict
# the first ten training rows.
my_knn = MyKNN(k_neighbours=3)
my_knn.fit(X=X_train, y=y_train)
my_knn.predict(X_train[:10, :])


array([[0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.]])

In [42]:
from sklearn.neighbors import KNeighborsClassifier

# Reference implementation from scikit-learn, used to cross-check MyKNN on
# the same ten rows.
sk_knn = KNeighborsClassifier(3)
# y_train is a column vector; flatten it so scikit-learn receives the 1-D
# label array it expects and does not emit a data-conversion warning.
sk_knn.fit(X_train, y_train.ravel())
sk_knn.predict(X_train[0:10, :])


  return self._fit(X, y)


array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0.])