# Models
## KNN
Lets first create a K-NN model. During "training" it saves all dataframes in an internal storrage
During testing, it calculates the distance between the input and all stored samples and selects the (majority vote) label of the K nearest neighbors.

A good K can be found by using k-folds cross validation

In [1]:
import numpy as np
import pandas as pd
import math
from src.preprocess_data import preprocess_dataset, pca_dataset
import datetime
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score
import torch
from torch.utils.data import DataLoader, TensorDataset
from collections import deque

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
class KNN():
    MAX_MATRIX_ENTRIES = 2e8

    def __init__(self, num_cols: int, k: int) -> None:
        """Creates an empty KNN classifier

        Args:
            num_cols (int): The number of columns (features) the data is going to have, including the label.
            k (int): The number of neighbours that should be considered.
        """
        self.data = np.empty([0, num_cols])
        self.shape = self.data.shape
        self.k = k
        self.__verbose = False
        self.use_dims = None
        self.__pc = None
        return

    def __len__(self) -> int:
        return len(self.data)

    def __call__(self, *args, **kwds):
        return self.top_k(*args, **kwds)

    def verbose(self, verbose=True) -> None:
        """Sets the model to the verbose mode, where it comments on what it is doing.

        Args:
            verbose (bool, optional): Defaults to True.
        """
        self.__verbose = True
        return None

    def set_reduce_dimensions(self, num_leading_dim: int) -> None:
        """Sets KNN to use PCA in order to reduce the dimensionality of the data.

        Args:
            num_leading_dim (int): The number of PCs that should be used for the trasnformation.
        """
        self.use_dims = num_leading_dim
        return

    def get_pca(self):
        if self.use_dims is None:
            raise Exception("PCA can only be performed after setting a number of dimensions for reduction. Use ```KNN.set_reduce_dimensions```")
        if self.__pc is None:
            self.__pca(None)
        return self.__pc.components_
    
    def store_data(self, df: pd.DataFrame) -> None:
        """Stores a single dataframe into the internal storage

        Args:
            df (pd.DataFrame): The data to be saved.
        """
        self.data = np.concatenate([self.data, df.to_numpy()])
        self.shape = self.data.shape

        if self.__verbose:
            print("New shape is {}.".format(self.shape))
        # Reset internal
        self.__dat = None
        self.__lab = None
        self.__pc = None
        return

    def set_k(self, k: int) -> None:
        """Set another k hyperparameter

        Args:
            k (int): The new k
        """
        self.k = k

    def __pca(self, data: np.ndarray) -> np.ndarray:
        """Performs PCA on the internal saved data to calculate the transformed data"""
        # Check if saved PCs are available
        
        if self.__pc is None:
            # Calculate PCs
            self.__pc = PCA(n_components = self.use_dims).fit(self.data[:,:-1])
            
            self.data = np.concatenate([self.__pc.transform(self.data[:,:-1]), self.data[:,-1, None] ], axis = 1)
            self.shape = self.data.shape
            
        if data is None:
            return   
        
        # Transform data
        data = self.__pc.transform(data)
    
        return data

    def top_k(self, datapoint: np.ndarray) -> np.ndarray:
        """Calculates the label using the cosine similarity between each datapoint and the stored data points

        Args:
            datapoint (np.narray): An array of shape [N,C], where N is the number of datasamples (rows) and C the number of features (columns), not including the label

        Returns:
            np.ndarray: An array of shape [N], containing the label prediction 
        """
        

        datapoint = datapoint.copy()

        if self.use_dims is not None:
            datapoint = self.__pca(datapoint)
        
        assert datapoint.shape[1] == self.shape[1]-1
        
        
        # Pepare internal data representation for calculation
        if self.__dat == None:

            # Remove label column
            self.__dat = self.data[:, :-1]
            self.__lab = self.data[:, -1].astype(int)

            # Normalize each row to length 1
            self.__dat = self.__dat / \
                np.linalg.norm(self.__dat, axis=1, keepdims=True)

        datapoint = datapoint / \
            np.linalg.norm(datapoint, axis=1, keepdims=True)

        # Split input array into smaller ones if N*n is to big
        step_size = max(1, int(KNN.MAX_MATRIX_ENTRIES // len(self)))
        labels_accumulator = []

        max_i = math.ceil(len(datapoint) / step_size)

        if self.__verbose:
            print("Starts calculating on self.data ({}) and input ({}).".format(
                self.shape, datapoint.shape))
            _st_t = datetime.datetime.now()

        for i in range(max_i):

            if self.__verbose:
                _en_t = datetime.datetime.now()
                _t = _en_t - _st_t
                if i == 0:
                    _t_ex = datetime.timedelta(seconds=0)
                else:
                    _t_ex = _t * (max_i/i)

                print("Finsihed datapoints {}/{} ({}/{})    {}m{}s/{}m{}s".format(
                    i * step_size, len(datapoint),
                    i, max_i,
                    _t.seconds//60, _t.seconds % 60,
                    _t_ex.seconds//60, _t_ex.seconds % 60,
                ),
                    end="\r")

            # Calculate dot product of matrices
            cos_sim = datapoint[i*step_size:(i+1)*step_size] @ self.__dat.T

            # Get indices of highest value
            ind = np.argpartition(cos_sim, -self.k, axis=1)[:, -self.k:]

            # Get corresponding labels from internal data
            labels = np.array([self.__lab[ind[i]] for i in range(len(ind))])

            # Do majority vote for each input data point
            labels = np.array([np.argmax(np.bincount(labels[i]))
                              for i in range(len(labels))])

            labels_accumulator.append(labels)

        if self.__verbose:
            _en_t = datetime.datetime.now()
            _t = _en_t - _st_t
            _t_ex = _t

            print("Finsihed datapoints {}/{} ({}/{})    {}m{}s/{}m{}s".format(
                len(datapoint), len(datapoint),
                max_i, max_i,
                _t.seconds//60, _t.seconds % 60,
                _t_ex.seconds//60, _t_ex.seconds % 60,
            ),)

        return np.concatenate(labels_accumulator)


def calculate_accuraccy(true: np.ndarray, pred: np.ndarray) ->  float:
    """Calculates the accuraccy of a prediction in refference to the ground-truth

    Args:
        true (np.ndarray): Ground-truth of shape [N]
        pred (np.ndarray): Prediction of shape [N]

    Returns:
        float: The accuracy over all datasamples
    """
    assert true.shape == pred.shape

    p = true-pred
    wrong = np.count_nonzero(p)
    correct = len(true) - wrong

    return correct/len(true)* 100

def precission_recall(true: np.ndarray, pred: np.ndarray) ->  tuple:
    """Calculates the accuraccy of a prediction in refference to the ground-truth per class

    Args:
        true (np.ndarray): Ground-truth of shape [N]
        pred (np.ndarray): Prediction of shape [N]

    Returns:
        Tuple: precission and recall
    """
    
    precission = precision_score(true,pred, average = None)
    recall =        recall_score(true,pred, average = None)

    return (precission * 100,recall * 100)


print(calculate_accuraccy(
    np.array([0,1,1,2,2,2,2,2]),
    np.array([0,1,2,1,2,2,2,0])
      ))
print(precission_recall(
    np.array([0,1,1,2,2,2,2,2]),
    np.array([0,1,2,1,2,2,2,0])
      ))

62.5
(array([50., 50., 75.]), array([100.,  50.,  60.]))


In [6]:
if 0:
    dfs = preprocess_dataset("condensed", 10, False, persons=[1, 2], )


Finished p1_d1. Took 0 minutes, 17 seconds
Finished p1_d2. Took 0 minutes, 19 seconds
Finished p2_d1. Took 0 minutes, 17 seconds
Finished p2_d2. Took 0 minutes, 18 seconds


In [7]:
if 0:
    knn = KNN(len(dfs[list(dfs.keys())[0]].columns), 5)
    knn.shape
    for k in dfs.keys():
        knn.store_data(dfs[k])
        print(knn.shape)
    print("Training completed")
    
    
    dp = dfs["p1_d1"].to_numpy()[:1000,]

    knn.set_reduce_dimensions(5)
    knn.get_pca()
    
    knn.verbose()
    
    print("\n\nStart testing")
    y = knn(dp[:,:-1])
    print("Accuracy {}  Recall {}".format(*calculate_accuraccy(
        y, dp[:, -1])))


(420758, 17)
(857298, 17)
(1273277, 17)
(1720523, 17)
Training completed


Start testing
Starts calculating on self.data ((1720523, 6)) and input ((1000, 5)).
Finsihed datapoints 1000/1000 (9/9)    0m53s/0m53s


ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

### Analyze performance
Show how good we can predict one day by the previous day per person

Problems: It takes too long because data is too high-dimensional $~(|4e5| \cdot |4e5|) = ~ 10^{11}$ matrix values that have to be computed

Ideas to improve performance:
- Use tensors on GPU → Not practicable with current memory usage

Ideas to reduce dimensionality: 
- 1. Make bigger time windows (Reduce rows) → From 10 (= 0.1s) to 3000 (=30s) "Reason: Non reading can also have short glimpses of reading. Like in original study"
- 2. Use PCA on stored data and input data (Reduce columns)

In [20]:
if 0:
    dfs = preprocess_dataset(
    "condensed", 3000, overlapping=False, persons=list(range(1, 11, 1)))


Finished p1_d1. Took 0 minutes, 15 seconds
Finished p1_d2. Took 0 minutes, 17 seconds
Finished p2_d1. Took 0 minutes, 14 seconds
Finished p2_d2. Took 0 minutes, 19 seconds
Finished p3_d1. Took 0 minutes, 15 seconds
Finished p3_d2. Took 0 minutes, 15 seconds
Finished p4_d1. Took 0 minutes, 14 seconds
Finished p4_d2. Took 0 minutes, 16 seconds
Finished p5_d1. Took 0 minutes, 16 seconds
Finished p5_d2. Took 0 minutes, 12 seconds
Finished p6_d1. Took 0 minutes, 16 seconds
Finished p6_d2. Took 0 minutes, 14 seconds
Finished p7_d1. Took 0 minutes, 11 seconds
Finished p7_d2. Took 0 minutes, 12 seconds
Finished p8_d1. Took 0 minutes, 17 seconds
Finished p8_d2. Took 0 minutes, 22 seconds
Finished p9_d1. Took 0 minutes, 20 seconds
Finished p9_d2. Took 0 minutes, 11 seconds
Finished p10_d1. Took 0 minutes, 13 seconds
Finished p10_d2. Took 0 minutes, 14 seconds


In [3]:
def overall_acc_for_k(_dfs, k):
    ov_acc = []

    for p in range(1,11, 1):
        df = _dfs["p{}_d2".format(p)]
        knn = KNN(len(df.columns), k)
        # print("")
        # knn.verbose()

        knn.store_data(_dfs["p{}_d1".format(p)])

        pred = knn(df.to_numpy()[:, :-1])
        acc = calculate_accuraccy(pred, df.to_numpy()[:, -1])

        #print("Accuraccy for p = {} is {:.2f}".format(p,acc))
        ov_acc.append(acc)


    print("\nOverall accuraccy in predicting the second day was {:.2f}({:.2f}) for K = {}".format(
        np.mean(ov_acc), np.std(ov_acc), k))
    
    return ov_acc


def log_and_plot_KNN(_dfs, name):
    log = pd.DataFrame()
    for K in [1, 2, 3, 5, 7, 10, 15]:
        acc = overall_acc_for_k(_dfs, K)
        
        log = pd.concat([log, pd.DataFrame({
            "K" : [str(K) for _ in range(len(acc))], "acc" : acc
        })])
        
    sns.boxplot(data=log, x="acc", y="K")

    sns.despine(trim = True)

    plt.grid(axis = "x", alpha = 0.6)
    plt.savefig("res/{}.png".format(name))
    plt.savefig("res/{}.pdf".format(name))
    plt.close()
    return log

For all k the accuraccy in predicting day 2 from day 1 per person is around 58% for $1<k\leq15$

In [4]:
if 1:
    log = log_and_plot_KNN(_dfs = preprocess_dataset(
    "condensed",normalize= True),
                name="KNN_norm")

    log = log_and_plot_KNN(_dfs = preprocess_dataset(
        "condensed", normalize= False),
                    name="KNN_raw")

    log = log_and_plot_KNN(_dfs = pca_dataset(
        preprocess_dataset(
        "condensed", normalize= True), num_pc=2),
                    name="KNN_norm_pca2")
    log = log_and_plot_KNN(_dfs = pca_dataset(
        preprocess_dataset(
        "condensed", normalize= True), num_pc=5),
                    name="KNN_norm_pca5")
    log = log_and_plot_KNN(_dfs = pca_dataset(
        preprocess_dataset(
        "condensed",normalize= True), num_pc=10),
                    name="KNN_norm_pca10")

    log = log_and_plot_KNN(_dfs = pca_dataset(
        preprocess_dataset(
        "condensed", normalize= False), num_pc=2),
                    name="KNN_raw_pca2")
    log = log_and_plot_KNN(_dfs = pca_dataset(
        preprocess_dataset(
        "condensed", normalize= False), num_pc=5),
                    name="KNN_raw_pca5")
    log = log_and_plot_KNN(_dfs = pca_dataset(
        preprocess_dataset(
        "condensed", normalize= False), num_pc=10),
                    name="KNN_raw_pca10")


TypeError: preprocess_dataset() got an unexpected keyword argument 'overlapping'

| **Normalized** | **PCA** | **Best Acc**   | K        |
|----------------|---------|----------------|----------|
|        +       | None    | 58             | 7/**10**/15  |
|        +       | 2       | 62             | 7/10/**15**   |
|        +       | 5       | 57             | 7/10/15   |
|        +       | 10       | 57             | **2**/10/15   |
|        -       | None       | 59            | 2/10/**15**   |
|        -       | 2       | 60             | 2/5/7/**10**/15   |
|        -       | 5       | 58             | **2**   |
|        -       | 10       | 61             | **2**   |


# NN
We now try simple linear neural networks instead

In [6]:
dfs = preprocess_dataset(
    "condensed", 100, overlapping=False, persons=list(range(1, 11, 1)))
for key in ["p1_d1", "p1_d2"]:
     value = dfs.pop(key)
     dfs_test = {key : value }

In [15]:
class Linear_Model(torch.nn.Module):
    def __init__(self, in_features = 16, out_features = 4, num_hidden = []) -> None:
        super().__init__()
        
        
        # Add input and output to layers
        num_hidden = [in_features] + num_hidden + [out_features]
        layers = []
        
        for i in range(len(num_hidden)-1):
            layers.append(
                torch.nn.Linear(in_features=num_hidden[i], out_features= num_hidden[i+1]))
            layers.append(torch.nn.ReLU())
            
        # Remove last ReLu
        del layers[-1]
            
    
        self.model = torch.nn.Sequential(*layers)
        
    def forward(self,x):
        return self.model(x)


def train_epoch(model : torch.nn.Module, dataloader : torch.utils.data.DataLoader, optimizer : torch.optim.Optimizer, loss_function = torch.nn.CrossEntropyLoss(), verbose = False) -> float:
    """Trains the model on the dataset

    Args:
        model (torch.nn.Module): The model to be trained.
        dataloader (torch.utils.data.DataLoader): The dataset.
        optimizer (torch.optim.Optimizer): The optimizer for the given model

    Returns:
        float: The accuraccy of the model
    """
    
    __device = next(model.parameters()).device
    model.train()
    correct, total, acc = 0,0, 0
    loss_window = deque()
    loss_window.extend([0]*5)

    
    for itr, (datas,labels) in enumerate(dataloader):
        if verbose:
            print("Training. Batch {}/{}. Running Acc: {:.2f}%     Loss window: {:.4f}".format(itr,len(dataloader), acc, sum(list(loss_window))/len(loss_window)), end = "\r")
        optimizer.zero_grad()

        # Predict
        output = model(datas.to(__device))
        
        # Metrics
        _, pred = output.cpu().max(1, keepdims=True)
        correct += pred.eq(labels).sum().item()
        total += len(datas)
        acc = correct / total * 100
        
        # Update
        loss = loss_function(output, labels.squeeze(-1))
        
        loss_window.append(loss)
        loss_window.popleft()
        
        
        loss.backward()
        optimizer.step()
        
    if verbose:
        print("Training. Batch {}/{}. Running Acc: {:.2f}%".format(itr + 1,len(dataloader), acc))
        
    return acc

def test_epoch(model : torch.nn.Module, dataloader : torch.utils.data.DataLoader, verbose = False) -> float : 
    """Evaluates the model on the dataset 

    Args:
        model (torch.nn.Module): The model to be evaluated.
        dataloader (torch.utils.data.DataLoader): The dataset.

    Returns:
        float: The accuraccy of the model
    """
    model.eval()
    correct, total, acc = 0,0, 0
   

    for itr, (datas,labels) in enumerate(dataloader):
        if verbose:
            print("Training. Batch {}/{}. Running Acc: {:.2f}%".format(itr,len(dataloader), acc), end = "\r")
       
        # Predict
        output = model(datas)
        
        # Metrics
        _, pred = output.cpu().max(1, keepdims=True)
        correct += pred.eq(labels).sum().item()
        total += len(datas)
        acc = correct / total * 100
        
    if verbose:
        print("Training. Batch {}/{}. Running Acc: {:.2f}%".format(itr + 1,len(dataloader), acc))
        
    return acc

    
def create_dataloader(datasets : dict, batch_size : int, shuffle : bool = True, ) -> torch.utils.data.DataLoader:
    """Creates a single dataloader from all data provided in datasets

    Args:
        datasets (dict): A dictionary of datasets
        batch_size (int): The batch size forwarded to the data_loader
        shuffle (bool, optional): If the samples sould be reshuffled each epoch. Defaults to True.
        
    Returns:
        torch.utils.data.DataLoader: The dataloader
    """
    # Get datasets 
    datasets = list(datasets.values())
    
    # Transform to numpy
    datasets = [ds.to_numpy() if ds is pd.DataFrame else ds for ds in datasets]
    
    # Concat into single numpy matrix
    datasets = np.concatenate(datasets)
    
    # Split data from label
    data = torch.from_numpy(datasets[:,:-1].astype(np.float32))
    labels = torch.from_numpy(datasets[:,-1, None].astype(np.int64))
    
    # Construct TensorDataset
    dataloader = DataLoader(TensorDataset(data, labels), batch_size = batch_size, shuffle = shuffle, num_workers=0)
    
    return dataloader



BATCH_SIZE = 256

def evaluate_model(model : torch.nn.Module, num_epochs, dfs = None):
    """Evaluates a model, by training/testing in a "leave-one-participant-out" fashion

    Args:
        model (torch.nn.Module)
    """

    if dfs is None:
        dfs = dfs = preprocess_dataset( "condensed", 100, overlapping=False,)
    
    # Iterate over all candidates to leave out
    for leave_out in range(1,11,1) :
        keys = ["p{}_d1".format(leave_out),"p{}_d2".format(leave_out)]
        
        loader_train = create_dataloader(dfs_,BATCH_SIZE,)
        loader_test = create_dataloader(dfs_test,BATCH_SIZE,)



model = Linear_Model(num_hidden=[])


loader_train = create_dataloader(dfs_,BATCH_SIZE,)
loader_test = create_dataloader(dfs_test,BATCH_SIZE,)

optim = torch.optim.Adam(model.parameters(),lr = 1e-2, weight_decay=1e-5)


print("Start Training")
for batch in range(5):
    print("Epoch {}".format(batch))
    train_epoch(model,loader_train,optim, verbose =True)
    test_epoch(model,loader_test, verbose =True)

Start Training
Batch 0
Training. Batch 2789/2789. Running Acc: 65.94%     Loss window: 0.9843
Training. Batch 171/171. Running Acc: 64.17%
Batch 1
Training. Batch 2789/2789. Running Acc: 65.95%     Loss window: 0.9527
Training. Batch 171/171. Running Acc: 64.17%
Batch 2
Training. Batch 2789/2789. Running Acc: 65.95%     Loss window: 0.9607
Training. Batch 171/171. Running Acc: 64.17%
Batch 3
Training. Batch 2789/2789. Running Acc: 65.95%     Loss window: 0.9569
Training. Batch 171/171. Running Acc: 64.16%
Batch 4
Training. Batch 2789/2789. Running Acc: 65.95%     Loss window: 0.9126
Training. Batch 171/171. Running Acc: 64.17%


### Things we want to investigate
- Linear model on condensed dataset
    - Training acc ~70%, Testing acc ~60% → more layers do not introduce more generalization, only overfitting
- Linear model on condensed dataset (unnormalized)

- In person prediction: Linear model vs KNN

- Linear on concatenated data
- LSTM on concatenated data
- CNN on concatenated data