In [24]:
import os
from typing import List, Tuple

import numpy as np
import pandas as pd
import glob

from src.NoXi.preprocessing.labels_preprocessing import load_all_labels_by_paths

In [25]:
# Root folder holding the regression annotations, and the folder intended for the
# classification version of those annotations.
path_to_labels=r'D:\Databases\NoXi\NoXi_annotations_reliable_gold_standard_regression_with_additional_train_data'
output_path=r'D:\Databases\NoXi\NoXi_annotations_reliable_gold_standard_classification_with_additional_train_data'

# Collect every annotation*.txt located exactly three directory levels below the root.
# glob is called without recursive=True, so each '**' matches a single path component.
all_label_paths=glob.glob(os.path.join(path_to_labels, *(['**']*3), 'annotation*.txt'))
# Load the label data for every discovered annotation file.
all_labels=load_all_labels_by_paths(all_label_paths)

In [26]:
# Peek at the label data of the fourth loaded annotation file.
list(all_labels.values())[3]

array([0.482118, 0.484025, 0.48365 , ..., 0.670662, 0.649756, 0.641281],
      dtype=float32)

In [27]:
# Display the annotation file paths that the glob pattern collected.
all_label_paths

['D:\\Databases\\NoXi\\NoXi_annotations_reliable_gold_standard_regression_with_additional_train_data\\English\\dev\\034_2016-04-07_Nottingham\\annotation_expert.txt',
 'D:\\Databases\\NoXi\\NoXi_annotations_reliable_gold_standard_regression_with_additional_train_data\\English\\dev\\034_2016-04-07_Nottingham\\annotation_novice.txt',
 'D:\\Databases\\NoXi\\NoXi_annotations_reliable_gold_standard_regression_with_additional_train_data\\English\\test\\028_2016-04-06_Nottingham\\annotation_expert.txt',
 'D:\\Databases\\NoXi\\NoXi_annotations_reliable_gold_standard_regression_with_additional_train_data\\English\\test\\028_2016-04-06_Nottingham\\annotation_novice.txt',
 'D:\\Databases\\NoXi\\NoXi_annotations_reliable_gold_standard_regression_with_additional_train_data\\English\\train\\026_2016-04-06_Nottingham\\annotation_expert.txt',
 'D:\\Databases\\NoXi\\NoXi_annotations_reliable_gold_standard_regression_with_additional_train_data\\English\\train\\026_2016-04-06_Nottingham\\annotation_novic

<h2> Labels softening </h2>

In [53]:
# PROVED
def convert_to_soft_one_hot_encoding(value:float, class_values:Tuple[float,...])->np.ndarray:
    """Converts the given value to a soft one-hot encoding vector over the provided class_values.

    The labels are softened using the following procedure:
    Suppose we have 5 classes - 0, 0.25, 0.5, 0.75, 1.
    If the label lies between two classes, its probability mass is split between them according to its closeness to each class.
    For example, the value 0.7 lies between the classes 0.5 and 0.75. To calculate the weights:
    Calculate the distance between the two classes: 0.75 - 0.5 = 0.25.
    Calculate the "location" of the point on this segment by subtracting the value of the lower class from it: 0.7 - 0.5 = 0.2.
    The weight of the "right" (upper) class is the location divided by the distance: 0.2/0.25 = 0.8.
    The weight of the "left" (lower) class is the remainder: 1. - 0.8 = 0.2.
    Thus, the soft one-hot encoding of the point 0.7 is [0. 0. 0.2 0.8 0.].
    A value that coincides with a class gets the full weight on that class.

    :param value: float
            value of the point to convert to the soft one-hot encoding. Must lie in [0, 1]
            and within [class_values[0], class_values[-1]].
    :param class_values: Tuple[float,...]
            the values which classes can be equal to, sorted in ascending order
    :return: np.ndarray
            soft one-hot encoding, one element per class (the elements sum to 1)
    :raises ValueError: if value is outside [0, 1], is NaN, lies outside the range covered
            by class_values, or class_values is empty
    """
    # check the requirements for the value variable
    # (ValueError is a subclass of Exception, so callers catching Exception keep working)
    if value<0 or value>1:
        raise ValueError("The value of the variable \"value\" is more than 1 or less than 0")
    if len(class_values)==0:
        raise ValueError("The variable \"class_values\" should contain at least one class value")
    # written as a positive range check so that NaN (for which every comparison is False) is rejected too
    if not (class_values[0]<=value<=class_values[-1]):
        raise ValueError("The value %s lies outside the range covered by class_values [%s, %s]"
                         %(value, class_values[0], class_values[-1]))
    # create one-hot encoding vector with zeros
    one_hot_vector=np.zeros(len(class_values))
    # find the index of the first class, which is not less than the value (always exists after the range check)
    idx_right_class=next(i for i,v in enumerate(class_values) if value<=v)
    if idx_right_class==0:
        # the value coincides with the first class: there is no left neighbour, so do not
        # index class_values[-1] (it would wrap around to the last class)
        one_hot_vector[0]=1.
        return one_hot_vector
    idx_left_class=idx_right_class-1
    # calculate the "distance" as it is described in the function description
    # (positive for ascending class_values, since class_values[idx_left_class] < value)
    distance=class_values[idx_right_class]-class_values[idx_left_class]
    # calculate "location" as it is described in the function description
    location=value-class_values[idx_left_class]
    # calculate the probability values of the one-hot encoding vector (for nearest right and left from the point classes)
    one_hot_vector[idx_right_class]=location/distance
    one_hot_vector[idx_left_class]=1.-one_hot_vector[idx_right_class]
    return one_hot_vector


In [74]:
# TODO: Function for converting all labels to soft ones (except for the test and validation sets)
# TODO: Function for converting all test and validation labels to categorical ones (in one-hot encoding form) - for edge points (for example, 0.375), try the Hamming window, or just the mode over a fixed-size window.

In [73]:
# Show the (path, labels) pair of the eleventh loaded annotation file.
[*all_labels.items()][10]

('D:\\Databases\\NoXi\\NoXi_annotations_reliable_gold_standard_regression_with_additional_train_data\\English\\train\\030_2016-04-06_Nottingham\\annotation_expert.txt',
 array([0.5 , 0.5 , 0.5 , ..., 0.75, 0.75, 0.75], dtype=float32))

In [69]:
# Soft one-hot encode every value of `var` against the five classes 0, 0.25, 0.5, 0.75, 1.
# NOTE(review): `var` is not defined in any visible cell - presumably one label array taken
# from all_labels; confirm and define it explicitly so the notebook survives Restart & Run All.
a= [convert_to_soft_one_hot_encoding(x, (0., 0.25, 0.5, 0.75, 1.)) for x in var]

In [72]:
# Display the soft one-hot encodings.
# NOTE(review): the recorded output is a 2-D ndarray, whereas cell In [69] builds a list -
# presumably `a` was converted in a cell that is no longer present; confirm.
a

array([[0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       ...,
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.]])