In [61]:
import os
from typing import List, Tuple, Dict

import numpy as np
import pandas as pd
import glob

from scipy.stats import stats

from src.NoXi.preprocessing.labels_preprocessing import load_all_labels_by_paths

In [62]:
# load labels
# NOTE(review): both paths are absolute, machine-specific Windows locations - adjust them before running elsewhere.
path_to_labels=r'E:\Databases\NoXi\NoXi_annotations_reliable_gold_standard_regression_with_additional_train_data'
# NOTE(review): output_path is not used by any cell visible here; presumably the target of the pending "save function" TODO - confirm.
output_path=r'E:\Databases\NoXi\NoXi_annotations_reliable_gold_standard_classification_with_additional_train_data'

# Collect annotation files located exactly three directory levels below path_to_labels.
# glob.glob is called without recursive=True, so every '**' matches a single path component (same as '*').
all_label_paths=glob.glob(os.path.join(path_to_labels,'**','**','**','annotation*.txt'))
# Same pattern, restricted to the 'train' / 'dev' / 'test' directory on the second level.
all_train_label_paths=glob.glob(os.path.join(path_to_labels,'**','train','**','annotation*.txt'))
all_dev_label_paths=glob.glob(os.path.join(path_to_labels,'**','dev','**','annotation*.txt'))
all_test_label_paths=glob.glob(os.path.join(path_to_labels,'**','test','**','annotation*.txt'))


# Load the annotations; judging by the displayed output, each result is a dict mapping file path -> 1d float32 array.
all_labels=load_all_labels_by_paths(all_label_paths)
all_train_labels=load_all_labels_by_paths(all_train_label_paths)
all_dev_labels=load_all_labels_by_paths(all_dev_label_paths)
all_test_labels=load_all_labels_by_paths(all_test_label_paths)


In [63]:
all_train_labels

{'E:\\Databases\\NoXi\\NoXi_annotations_reliable_gold_standard_regression_with_additional_train_data\\English\\train\\026_2016-04-06_Nottingham\\annotation_expert.txt': array([0.402416, 0.405325, 0.405499, ..., 0.630929, 0.636819, 0.636116],
       dtype=float32),
 'E:\\Databases\\NoXi\\NoXi_annotations_reliable_gold_standard_regression_with_additional_train_data\\English\\train\\026_2016-04-06_Nottingham\\annotation_novice.txt': array([0.250783, 0.289286, 0.290491, ..., 0.429497, 0.427062, 0.427217],
       dtype=float32),
 'E:\\Databases\\NoXi\\NoXi_annotations_reliable_gold_standard_regression_with_additional_train_data\\English\\train\\027_2016-04-06_Nottingham\\annotation_expert.txt': array([0.5, 0.5, 0.5, ..., 0.5, 0.5, 0.5], dtype=float32),
 'E:\\Databases\\NoXi\\NoXi_annotations_reliable_gold_standard_regression_with_additional_train_data\\English\\train\\027_2016-04-06_Nottingham\\annotation_novice.txt': array([0.5   , 0.5   , 0.5   , ..., 0.6875, 0.6875, 0.625 ], dtype=float3

<h2> Labels softening (train) </h2>

In [64]:

def convert_to_soft_one_hot_encoding(value:float, class_values:Tuple[float,...])->np.ndarray:
    """Converts given value to the soft one-hot encoding vector, taking into account provided class_values.

    The probability mass is split between the two classes the value lies between,
    proportionally to how close the value is to each of them.
    Example with 5 classes - 0, 0.25, 0.5, 0.75, 1 - and the value 0.7:
    The two neighbouring classes are 0.5 ("left") and 0.75 ("right").
    The distance between them is 0.75 - 0.5 = 0.25.
    The "location" of the point on this segment is the value minus the left class: 0.7 - 0.5 = 0.2.
    The weight of the "right" class is location/distance = 0.2/0.25 = 0.8, the weight of the "left" class is 1. - 0.8 = 0.2.
    Thus, the soft one-hot encoding vector for the point 0.7 is [0. 0. 0.2 0.8 0.].
    A value that coincides with a class gets the whole mass on that class.

    :param value: float
            value of the point to convert to the soft one-hot encoding
    :param class_values: Tuple[float,...]
            the values, which classes can be equal to, in ascending order
    :return: np.ndarray
            soft one-hot encoding, 1d array of length len(class_values)
    :raises ValueError: if class_values is empty, if value is NaN or outside [0, 1],
            or if value lies outside the range covered by class_values
    """
    if len(class_values)==0:
        raise ValueError("class_values must contain at least one class")
    # check the requirements for the value variable
    # (negated form on purpose: NaN fails every comparison and must be rejected here as well)
    if not (0<=value<=1):
        raise ValueError("The value of the variable \"value\" is more than 1 or less than 0")
    # the value has to lie between the smallest and the largest class, otherwise there is nothing to interpolate
    if value<class_values[0] or value>class_values[-1]:
        raise ValueError("The value %s lies outside the range of class_values [%s, %s]"
                         % (value, class_values[0], class_values[-1]))
    # create one-hot encoding vector with zeros
    one_hot_vector=np.zeros(len(class_values))
    # find the index of the first class, which is not smaller than the value
    idx_right_class=next(i for i,v in enumerate(class_values) if value<=v)
    if idx_right_class==0:
        # the value coincides with the first class: there is no left neighbour,
        # so do not wrap around to index -1 - put the whole mass on the first class
        one_hot_vector[0]=1.
        return one_hot_vector
    idx_left_class=idx_right_class-1
    # distance between the two neighbouring classes
    distance=class_values[idx_right_class]-class_values[idx_left_class]
    # location of the value on the segment between the neighbouring classes
    location=value-class_values[idx_left_class]
    # the closer the value to the right class, the bigger the weight of the right class
    one_hot_vector[idx_right_class]=location/distance
    one_hot_vector[idx_left_class]=1.-one_hot_vector[idx_right_class]
    return one_hot_vector


In [65]:
# TODO: Function for converting all labels to the soft ones (except for the test and validation set) done
# TODO: Function for converting all test and validation labels to the categorical ones (in one-hot encoding form) - for edge points (for example, 0.375) try the Hamming window, or just a mode over a fixed-size window.
# TODO: save function

In [66]:
# converting all labels in training set to the soft one-hot encodings
def convert_all_labels_to_soft_one_hot_encoding_by_paths(labels:Dict[str, np.ndarray], class_values:Tuple[float,...])->Dict[str, np.ndarray]:
    """Converts all labels presented in the Dict[path->values] to the soft one-hot encodings using provided class_values.

    The passed dictionary is not modified: a new dictionary is built and returned,
    so the function can be called repeatedly on the same input.

    :param labels: Dict[str, np.ndarray]
            labels in the format Dict[path->np.ndarray], every array is 1d and holds one value per frame
    :param class_values: Tuple[float,...]
            Possible values of the classes, ranked in ascended order
    :return: Dict[str, np.ndarray]
            new dictionary with the same keys; every value is a 2d array of the shape
            (number of frames, len(class_values)) with the soft one-hot encodings
    """
    num_classes=len(class_values)
    converted_labels={}
    for key, values in labels.items():
        encodings=[convert_to_soft_one_hot_encoding(x, class_values) for x in values]
        # explicit reshape keeps the result 2d even if the file contains no frames at all
        converted_labels[key]=np.array(encodings, dtype=float).reshape(len(encodings), num_classes)
    return converted_labels

In [67]:
# values the five classes correspond to, in ascending order
class_values=(0, 0.25, 0.5, 0.75, 1.)
# soft one-hot encode every training annotation
all_train_labels_converted=convert_all_labels_to_soft_one_hot_encoding_by_paths(all_train_labels, class_values)

In [68]:
all_train_labels_converted

{'E:\\Databases\\NoXi\\NoXi_annotations_reliable_gold_standard_regression_with_additional_train_data\\English\\train\\026_2016-04-06_Nottingham\\annotation_expert.txt': array([[0.        , 0.39033604, 0.60966396, 0.        , 0.        ],
        [0.        , 0.37870002, 0.62129998, 0.        , 0.        ],
        [0.        , 0.37800395, 0.62199605, 0.        , 0.        ],
        ...,
        [0.        , 0.        , 0.47628403, 0.52371597, 0.        ],
        [0.        , 0.        , 0.45272398, 0.54727602, 0.        ],
        [0.        , 0.        , 0.45553589, 0.54446411, 0.        ]]),
 'E:\\Databases\\NoXi\\NoXi_annotations_reliable_gold_standard_regression_with_additional_train_data\\English\\train\\026_2016-04-06_Nottingham\\annotation_novice.txt': array([[0.        , 0.99686801, 0.00313199, 0.        , 0.        ],
        [0.        , 0.84285605, 0.15714395, 0.        , 0.        ],
        [0.        , 0.83803594, 0.16196406, 0.        , 0.        ],
        ...,
      

<h2> Labels hardening (dev and test) </h2>

In [98]:
# PROVED
# function for calculating the one-hot encoding vector of a point according to its neighborhood
def get_one_hot_encoding_according_to_neighborhood(array:np.ndarray, point_idx:int)->np.ndarray:
    """ calculate hard one-hot encoding based on the neighborhood of the point.
    The class is calculated as the mode of the classes in the whole window, which is presented by array.
    The point itself and all unresolved neighbors (rows, which contain only zeros) do not take part in the voting.
    If several classes are equally frequent, the class with the smallest index is chosen.

    :param array: np.ndarray
            2d np.ndarray of the shape (window length, number of classes),
            representing the "array" of the one-hot encoding vectors
    :param point_idx: int
            index (inside the array) of the point, for which the one-hot encoding should be calculated.
    :return: np.ndarray
            1d np.ndarray of the length array.shape[1] with the one-hot encoding of the most frequent class.
            If no neighbor is able to vote (the window contains only the point itself and/or unresolved rows),
            the vector of zeros is returned, that is, the point stays unresolved.
    """
    # save the length of the one-hot encoding vectors
    length_one_hot=array.shape[1]
    # delete the point from the consideration of evaluation
    neighbors=np.delete(array, point_idx, axis=0)
    # delete all points, which can be edge points (then their hard encodings contain only zeros)
    neighbors=neighbors[~np.all(neighbors == 0, axis=1)]
    result=np.zeros((length_one_hot,))
    # nobody to vote: the mode of an empty set is undefined, leave the point unresolved
    if neighbors.shape[0]==0:
        return result
    # convert the one-hot encodings to the class numbers and count the votes for every class
    votes=np.bincount(np.argmax(neighbors, axis=1), minlength=length_one_hot)
    # argmax returns the first maximum, therefore ties are resolved in favor of the smallest class index
    result[int(np.argmax(votes))]=1.
    return result

def convert_points_according_to_neighborhood(array:np.ndarray, point_idxes:np.ndarray, neighborhood_length:int)->np.ndarray:
    """Replaces the requested points with the mode of the window around each of them.

    The points are processed one after another on a copy of the input, so a point,
    which has already been converted, takes part in the windows of the points processed after it.

    :param array: np.ndarray
            array of the one-hot encoding vectors (one row per point). It is not modified.
    :param point_idxes: np.ndarray
            indexes of the rows, which should be recalculated.
    :param neighborhood_length: int
            half of the window size: that many neighbors are taken from the left and
            from the right side of the point (fewer near the borders of the array).
    :return: np.ndarray
            copy of the array with the recalculated rows
    """
    converted=array.copy()
    last_row=converted.shape[0]-1
    for target in point_idxes:
        window_start=target-neighborhood_length
        window_end=target+neighborhood_length
        if window_start<0:
            # window is cut on the left side: it starts at row 0, so the point keeps its own index inside it
            window_start=0
            position_in_window=target
        else:
            # full left part of the window: the point is neighborhood_length rows away from the window start
            position_in_window=neighborhood_length
            if window_end>last_row:
                # window is cut on the right side
                window_end=last_row
        window=converted[window_start:window_end+1]
        converted[target]=get_one_hot_encoding_according_to_neighborhood(window, position_in_window)

    return converted



In [71]:
# converting dev and test labels to the hard one-hot encodings
def convert_labels_to_hard_one_hot_encodings(labels:Dict[str, np.ndarray], class_values:Tuple[float,...], neighborhood_length:int=10)->Dict[str, np.ndarray]:
    """Converts all labels presented in the Dict[path->values] to the hard one-hot encodings using provided class_values.

    For the validation and test sets we need to have hard one-hot encodings.
    Every value is assigned to the closest class (for example, for value 0.4 the class 0.5 will be chosen).
    It is not possible to do it for the "edge points" such as 0.375, which lie right in the middle
    between two classes (0.25 and 0.5). Such points get the most frequent class of their neighborhood:
    the mode over neighborhood_length frames before and neighborhood_length frames after the point.

    The passed dictionary is not modified, a new one is returned.

    :param labels: Dict[str, np.ndarray]
            labels in the format Dict[path->np.ndarray], every array is 1d and holds one value per frame
    :param class_values: Tuple[float,...]
            Possible values of the classes, ranked in ascended order
    :param neighborhood_length: int
            half of the window (in frames), which is used to resolve the edge points. 10 by default.
    :return: Dict[str, np.ndarray]
            converted to the hard one-hot encodings labels, every value is a 2d array
            of the shape (number of frames, len(class_values))
    """
    num_classes=len(class_values)
    converted_labels={}
    for key, values in labels.items():
        # convert to the soft one-hot encoding vectors
        soft=[convert_to_soft_one_hot_encoding(x, class_values) for x in values]
        soft=np.array(soft, dtype=float).reshape(len(soft), num_classes)
        # np.round rounds halves to the nearest even number, thus 0.5 becomes 0:
        # ordinary points turn into hard one-hot vectors, while the edge points
        # (both weights are exactly 0.5) turn into the rows containing only zeros
        hard=np.round(soft)
        # the edge points are exactly the rows, which have lost all their mass during rounding
        edge_points_idx=np.flatnonzero(~hard.any(axis=1))
        # change all "edge points" based on the mode of the window around them
        if edge_points_idx.size>0:
            hard=convert_points_according_to_neighborhood(hard, edge_points_idx, neighborhood_length)
        converted_labels[key]=hard
    return converted_labels

In [103]:
# tests for convert_points_according_to_neighborhood function
# Row comments give the class index of the row (argmax). "unresolved" marks rows that np.round turns into
# all-zero vectors (0.5 is rounded to 0); the number after "->" is the class expected after the conversion
# with neighborhood_length=3, when only this single row is converted.
array=np.array([[0., 0, 1., 0, 0],      # 2
               [1., 0, 0., 0, 0],       # 0
               [1., 0, 0, 0, 0],        # 0
               [1, 0, 0., 0, 0],        # 0
               [0, 1., 0, 0, 0],        # 1
               [0, 0, 0.5, 0.5, 0],     # unresolved -> 0
               [0, 0., 0.5, 0.5, 0],    # unresolved -> 0
               [0, 0, 0.5, 0.5, 0],     # unresolved -> 1
               [0, 0, 1., 0, 0],        # 2
               [0, 0, 0.5, 0.5, 0.],    # unresolved -> 2
               [0, 0, 0, 1., 0]]        # 3
               )
# every unresolved row is converted on its own (the rounded input is rebuilt for each call)
# and the resulting class of that row is printed
point_idxes=np.array([5])
print(np.argmax(convert_points_according_to_neighborhood(np.round(array), point_idxes, neighborhood_length=3), axis=1)[5])

point_idxes=np.array([6])
print(np.argmax(convert_points_according_to_neighborhood(np.round(array), point_idxes, neighborhood_length=3), axis=1)[6])

point_idxes=np.array([7])
print(np.argmax(convert_points_according_to_neighborhood(np.round(array), point_idxes, neighborhood_length=3), axis=1)[7])

point_idxes=np.array([9])
print(np.argmax(convert_points_according_to_neighborhood(np.round(array), point_idxes, neighborhood_length=3), axis=1)[9])

0
0
1
2


In [108]:
# second test of convert_points_according_to_neighborhood, this time with neighborhood_length=5
# Row comments give the class index of the row (argmax). "unresolved" marks rows that np.round turns into
# all-zero vectors; the number after "->" is the class expected after the conversion.
array=np.array([[0., 0.5, 0.5, 0, 0],       # unresolved -> 3
               [0., 0, 0., 1, 0],           # 3
               [1., 0, 0, 0, 0],            # 0
               [0., 0, 0., 1, 0],           # 3
               [0.5, 0.5, 0, 0, 0],         # unresolved -> 3
               [0, 0, 0, 1., 0],            # 3
               [0, 0., 0, 0, 1.],           # 4
               [0, 0, 0, 1., 0],            # 3
               [0, 0, 0., 1, 0],            # 3
               [0, 0.5, 0.5, 0., 0.],       # unresolved -> 3
               [0, 0.5, 0.5, 0., 0],        # unresolved -> 3
               [1, 0, 0, 0., 0.],           # 0
               [0, 0, 1, 0., 0.],           # 2
               [0, 0, 0, 0., 1.],           # 4
               [0, 1, 0, 0., 0.],           # 1
               [0, 0, 0, 0.5, 0.],          # unresolved -> 0
               [1, 0, 0, 0., 0.],           # 0
               [0, 0, 0, 0.5, 0.5]],        # unresolved -> 0 (because the previous unresolved row turned out to be 0)
               )
# first convert every unresolved row on its own and print its resulting class
point_idxes=np.array([0])
print(np.argmax(convert_points_according_to_neighborhood(np.round(array), point_idxes, neighborhood_length=5), axis=1)[0])

point_idxes=np.array([4])
print(np.argmax(convert_points_according_to_neighborhood(np.round(array), point_idxes, neighborhood_length=5), axis=1)[4])

point_idxes=np.array([9])
print(np.argmax(convert_points_according_to_neighborhood(np.round(array), point_idxes, neighborhood_length=5), axis=1)[9])

point_idxes=np.array([10])
print(np.argmax(convert_points_according_to_neighborhood(np.round(array), point_idxes, neighborhood_length=5), axis=1)[10])

point_idxes=np.array([15])
print(np.argmax(convert_points_according_to_neighborhood(np.round(array), point_idxes, neighborhood_length=5), axis=1)[15])

point_idxes=np.array([17])
print(np.argmax(convert_points_according_to_neighborhood(np.round(array), point_idxes, neighborhood_length=5), axis=1)[17])

# classes before the conversion (argmax of an all-zero row is 0, so unresolved rows show up as 0 here) ...
print(np.argmax(np.round(array), axis=1),'\n')
# ... and after converting all unresolved rows in one call, where earlier conversions feed the later windows
print(np.argmax(convert_points_according_to_neighborhood(np.round(array), np.array([0,4,9,10,15,17]), neighborhood_length=5), axis=1))

3
3
3
3
0
0
[0 3 0 3 0 3 4 3 3 0 0 0 2 4 1 0 0 0] 

[3 3 0 3 3 3 4 3 3 3 3 0 2 4 1 0 0 0]
