In [1]:
import glob
import os
from typing import Dict
from typing import List
from typing import Optional

import numpy as np
import pandas as pd

<h1>Split the data into train/dev/test</h1>

In [2]:
# Root directory with the NoXi annotation files. NOTE(review): hardcoded absolute Windows path - adjust it to the local setup.
path_to_labels=r"D:\Databases\NoXi\NoXi_annotations_reliable"

'''
The annotation files are located in the directory path_to_labels, where they are pre-sorted by language: English, French, German, and Others.
Within these directories there are subdirectories with names like "026_2016-04-06_Nottingham". However, some of them may contain several annotation files, because the authors have not
calculated "the gold standard" of the annotations. Therefore, we need to average them according to the confidence score (every annotation contains the label itself and a confidence score, which can be seen as the confidence of the rater in the annotation). Moreover, the videos within each language have to be split into train/dev/test. Therefore, the sequence of actions is the following:
1) Identify all video filenames within every language
2) Randomly split the video filenames within every language into train/dev/test
3) Process the annotations of every subset:
    - identify novice_ and expert_ annotations and separate them into different lists
    - average the labels according to their confidence
4) Save the processed annotations in the given directory path_to_sorted_labels
'''

'\nTHe annotation files are located in directory path_to_labels. Then, they are pre-sorted on languages - ENglish, French, German, and Others.\nWithin these directories there are subdirectories with names like "026_2016-04-06_Nottingham". However, some of them could contain several annotation files, because the authors have not\ncalculated "the gold standard" of annotations. Therefore, we need to average them depending on the confidence score (every annotation contains the label itsefl and the confidence score, which can be seen as confidence of the rater in the annotation). Moreover, the sorting on train/dev/test within one language should be done. Therefore, the consequence of the actions is the following:\n1) Identify all video filenames within every language\n2) Separate randomly some video filenames within every language\n3) Process separated annotations:\n    - identify novice_ and expert_ annotations and separate them in different lists\n    - average the labels according to the

In [3]:
# Identifying video filenames within language
def get_abs_paths_to_video_filenames(path_to_dir:str)->List[str]:
    """Returns the paths to every video directory located directly inside the provided directory.

    The result is sorted. glob itself gives no guarantee about the order of the returned entries,
    so without sorting the result (and every random split built on top of it) could differ
    between machines and runs.

    :param path_to_dir: str
            path to directory with annotations for videos
    :return: List[str]
            Sorted list of paths to every non-hidden entry located directly in path_to_dir
            (normally the video directories of one language). The paths are absolute only if
            path_to_dir itself is absolute.
    """
    # '*' matches only the direct children of path_to_dir. The former '**' pattern matched exactly
    # the same entries, because glob treats '**' like '*' unless recursive=True is passed.
    video_filenames=sorted(glob.glob(os.path.join(path_to_dir,'*')))
    return video_filenames

def get_abs_paths_to_video_filename_for_every_language(path_to_dir:str)->Dict[str,List[str]]:
    """Returns the paths to every video directory in NoXi grouped by language.

    Every non-hidden entry located directly in path_to_dir is treated as a language directory;
    its name becomes the key of the resulting dictionary.

    :param path_to_dir: str
            Path to dir with all annotations pre-separated on languages
    :return: Dict[str, List[str]]
            Dictionary with the structure Dict[language->List_of_paths_to_video_filenames].
            The languages are inserted in sorted order.
    """
    # sorted() fixes the order of the languages in the dictionary; glob gives no ordering guarantee
    languages=sorted(glob.glob(os.path.join(path_to_dir,"*")))
    abs_paths={}
    for language in languages:
        # os.path.basename understands the path separator of the current OS. Searching for '\\'
        # by hand works only on Windows: on other systems the whole path would become the key.
        language_name=os.path.basename(os.path.normpath(language))
        abs_paths[language_name]=get_abs_paths_to_video_filenames(language)
    return abs_paths



In [4]:
# Collect the entries of every language sub-directory of path_to_labels: Dict[language->List[paths]]
paths_with_lang=get_abs_paths_to_video_filename_for_every_language(path_to_labels)

In [5]:
paths_with_lang

{'English': ['D:\\Databases\\NoXi\\NoXi_annotations_reliable\\English\\026_2016-04-06_Nottingham',
  'D:\\Databases\\NoXi\\NoXi_annotations_reliable\\English\\027_2016-04-06_Nottingham',
  'D:\\Databases\\NoXi\\NoXi_annotations_reliable\\English\\028_2016-04-06_Nottingham',
  'D:\\Databases\\NoXi\\NoXi_annotations_reliable\\English\\029_2016-04-06_Nottingham',
  'D:\\Databases\\NoXi\\NoXi_annotations_reliable\\English\\030_2016-04-06_Nottingham',
  'D:\\Databases\\NoXi\\NoXi_annotations_reliable\\English\\034_2016-04-07_Nottingham',
  'D:\\Databases\\NoXi\\NoXi_annotations_reliable\\English\\052_2016-04-12_Nottingham'],
 'French': ['D:\\Databases\\NoXi\\NoXi_annotations_reliable\\French\\001_2016-03-17_Paris',
  'D:\\Databases\\NoXi\\NoXi_annotations_reliable\\French\\002_2016-03-17_Paris',
  'D:\\Databases\\NoXi\\NoXi_annotations_reliable\\French\\003_2016-03-17_Paris',
  'D:\\Databases\\NoXi\\NoXi_annotations_reliable\\French\\004_2016-03-18_Paris',
  'D:\\Databases\\NoXi\\NoXi_annot

In [6]:
# separation of train/dev/test
def separate_on_train_dev_test(elements:List[str], train_prop:float=0.7, dev_prop:float=0.2, test_prop:float=0.1,
                               seed:Optional[int]=None)->List[List[str]]:
    """Randomly separates elements of the list on train/dev/test subsets using provided proportions.

    The elements are shuffled and then cut in two places: the first round(n*train_prop) shuffled
    elements form the train subset, the next round(n*dev_prop) elements form the dev subset and
    everything that is left forms the test subset. Because of the rounding, the test subset can be
    empty for short lists.

    :param elements: List[str]
            Elements needed to be separated on train/dev/test
    :param train_prop: float
            proportion in the forming train subset
    :param dev_prop: float
            proportion in the forming dev subset
    :param test_prop: float
            proportion in the forming test subset. It is used only to validate that the proportions
            sum up to 1; the test subset itself is simply the remainder.
    :param seed: Optional[int]
            seed for the shuffling. If provided, the same input always gives the same split.
            If None (default), the global NumPy random state is used, exactly as before.
    :return: List[List[str]]
            List of three lists in the order [train, dev, test].
    :raises ValueError: if the proportions do not sum up to 1.
    """
    if not np.isclose(train_prop+dev_prop+test_prop, 1.):
        raise ValueError("The sum of all probabilities should be equal to 1. Your sum is %f"%(train_prop+dev_prop+test_prop))
    elements=np.array(elements)
    num_elements=elements.shape[0]
    # generate permutations of indexes to get randomly chosen train/dev/test subsets
    if seed is None:
        # global generator: a np.random.seed() call made by the caller is still respected
        permutations=np.random.permutation(num_elements)
    else:
        # private generator: reproducible split that does not touch the global random state
        permutations=np.random.RandomState(seed).permutation(num_elements)
    # calculate the cut positions for train/dev/test subsets according to the given proportions
    train_prop_idx=int(np.round(num_elements*train_prop))
    dev_prop_idx=int(np.round(num_elements*dev_prop))+train_prop_idx

    print('Elements shape:%i, train_idx:%i, dev_idx:%i'%(num_elements, train_prop_idx, dev_prop_idx))

    # slicing beyond the end of the permutation simply gives a shorter (possibly empty) subset
    subsets_of_indexes=(
        permutations[:train_prop_idx],              # train
        permutations[train_prop_idx:dev_prop_idx],  # dev
        permutations[dev_prop_idx:],                # test
    )
    result_list=[elements[needed_idx].tolist() for needed_idx in subsets_of_indexes]
    return result_list

In [7]:
# Try the split on the English entries; the default proportions (0.7/0.2/0.1) are used. The result is random.
separate_on_train_dev_test(paths_with_lang['English'])

Elements shape:7, train_idx:5, dev_idx:6


[['D:\\Databases\\NoXi\\NoXi_annotations_reliable\\English\\029_2016-04-06_Nottingham',
  'D:\\Databases\\NoXi\\NoXi_annotations_reliable\\English\\034_2016-04-07_Nottingham',
  'D:\\Databases\\NoXi\\NoXi_annotations_reliable\\English\\026_2016-04-06_Nottingham',
  'D:\\Databases\\NoXi\\NoXi_annotations_reliable\\English\\052_2016-04-12_Nottingham',
  'D:\\Databases\\NoXi\\NoXi_annotations_reliable\\English\\030_2016-04-06_Nottingham'],
 ['D:\\Databases\\NoXi\\NoXi_annotations_reliable\\English\\027_2016-04-06_Nottingham'],
 ['D:\\Databases\\NoXi\\NoXi_annotations_reliable\\English\\028_2016-04-06_Nottingham']]

In [8]:
from src.NoXi.preprocessing.labels_preprocessing import load_all_labels_by_paths, clean_labels, \
    average_from_several_labels


# separate all annotations on 2 categories: novice and expert
def separate_annotation_filenames_on_novice_expert(path_to_dir:str)->Dict[str,List[str]]:
    """Collects the annotation files of the provided directory and groups them by novice/expert.

    A file belongs to a group if its name contains the group name ('novice' or 'expert') and ends
    with '.annotation~'. Only files located directly in path_to_dir are considered.

    :param path_to_dir: str
            path to dir with the annotation files to be grouped
    :return: Dict[str,List[str]]
            Dict with exactly two keys, 'novice' and 'expert'. Every value is the list of paths
            (path_to_dir joined with the filename) matched for this group; it can be empty.
    """
    patterns_by_group=(
        ('novice', '*novice*.annotation~'),
        ('expert', '*expert*.annotation~'),
    )
    return {
        group: glob.glob(os.path.join(path_to_dir, pattern))
        for group, pattern in patterns_by_group
    }

# labels averaging withing one entity (novice or expert)
def get_averaged_labels_for_one_entity(list_of_filenames_with_abs_paths:List[str])->np.ndarray:
    """Loads the provided label files, cleans them, trims them to a common length and averages them.

    :param list_of_filenames_with_abs_paths: List[str]
            absolute paths to the files with labels
    :return: np.ndarray
            averaged labels as returned by average_from_several_labels
            (NOTE(review): the exact type and shape are defined by that helper - confirm there)
    """
    # the loader is expected to return Dict[str, pd.DataFrame] (one entry per file)
    loaded=load_all_labels_by_paths(list_of_filenames_with_abs_paths)
    # replace every entry with its cleaned version (clean_labels is meant to remove the NaN values)
    for name in list(loaded.keys()):
        loaded[name]=clean_labels(loaded[name])
    # the annotation sequences can differ in length, therefore all of them are cut to the shortest one
    shortest=int(np.min(np.array([len(sequence) for sequence in loaded.values()])))
    for name in list(loaded.keys()):
        loaded[name]=loaded[name][:shortest]
    # the averaging itself is delegated to the project helper
    return average_from_several_labels(list(loaded.values()))

<h1>General function that combines all of the previous ones</h1>

In [9]:
from typing import Tuple

'''
The annotation files are located in the directory path_to_labels, where they are pre-sorted by language: English, French, German, and Others.
Within these directories there are subdirectories with names like "026_2016-04-06_Nottingham". However, some of them may contain several annotation files, because the authors have not
calculated "the gold standard" of the annotations. Therefore, we need to average them according to the confidence score (every annotation contains the label itself and a confidence score, which can be seen as the confidence of the rater in the annotation). Moreover, the videos within each language have to be split into train/dev/test. Therefore, the sequence of actions is the following:
1) Identify all video filenames within every language
2) Randomly split the video filenames within every language into train/dev/test
3) Process the annotations of every subset:
    - identify novice_ and expert_ annotations and separate them into different lists
    - average the labels according to their confidence
4) Save the processed annotations in the given directory path_to_sorted_labels
'''
def general_function(path_to_labels:str, output_path:str, train_prop:float, dev_prop:float, test_prop:float)->None:
    """Splits the videos of every language on train/dev/test and saves the averaged labels of every video.

    For every language found in path_to_labels the video entries are randomly separated on
    train/dev/test. Then, for every subset, the novice and expert labels of every video are
    averaged and saved. The resulting layout is:
        output_path/<language>/<train|dev|test>/<video name>/annotation_<novice|expert>.txt

    :param path_to_labels: str
            path to dir with all annotations pre-separated on languages
    :param output_path: str
            path to dir, where the processed labels will be saved. Created if does not exist.
    :param train_prop: float
            proportion of videos (within one language) in the train subset
    :param dev_prop: float
            proportion of videos (within one language) in the dev subset
    :param test_prop: float
            proportion of videos (within one language) in the test subset
    :return: None
    """
    paths_to_video_files_with_languages=get_abs_paths_to_video_filename_for_every_language(path_to_labels) # Dict[language->List[paths]]
    # exist_ok=True already covers the "directory exists" case, no separate check is needed
    os.makedirs(output_path, exist_ok=True)
    # the order corresponds to the order of the subsets returned by separate_on_train_dev_test
    subset_names=("train", "dev", "test")
    # go through all languages
    for language, video_filenames in paths_to_video_files_with_languages.items():
        # create directory for language if does not exist
        os.makedirs(os.path.join(output_path, language), exist_ok=True)
        # divide videos on train/dev/test
        train_dev_test_filenames=separate_on_train_dev_test(video_filenames, train_prop, dev_prop, test_prop)
        # process the subsets one by one; a (possibly empty) directory is created for every subset
        for subset_name, subset_filenames in zip(subset_names, train_dev_test_filenames):
            subset_output_path=os.path.join(output_path, language, subset_name)
            os.makedirs(subset_output_path, exist_ok=True)
            separate_average_and_save_all_labels_in_subset_of_videofiles(set_of_video_filenames=subset_filenames,
                                                                         output_path=subset_output_path)
            # prints "TRAIN ENDED", "DEV ENDED", "TEST ENDED"
            print("%s ENDED"%subset_name.upper())




def save_annotation_in_txt_file(path:str, filename:str, data:np.ndarray)->None:
    """Saves the provided array as a text file path/filename.txt.

    :param path: str
            directory, where the file will be saved. Created (with all parents) if does not exist.
    :param filename: str
            name of the file without extension; ".txt" is appended
    :param data: np.ndarray
            data to save. Every value is written with 6 digits after the decimal point.
    :return: None
    """
    # exist_ok=True makes a preceding os.path.exists() check redundant (and removes the race between check and creation)
    os.makedirs(path, exist_ok=True)
    full_path=os.path.join(path, filename+".txt")
    np.savetxt(full_path, data, fmt="%.6f" )

def separate_average_and_save_all_labels_in_subset_of_videofiles(set_of_video_filenames:List[str], output_path:str)->None:
    """Averages and saves the novice and expert labels for every video of the provided subset.

    For every video directory two files are written:
        output_path/<video name>/annotation_novice.txt
        output_path/<video name>/annotation_expert.txt

    :param set_of_video_filenames: List[str]
            paths to the video directories with the annotation files
    :param output_path: str
            path to dir, where the sub-directory for every video will be created
    :return: None
    """
    # go through all subset
    for video_filename in set_of_video_filenames:
        print("%s in process..."%video_filename)
        separated_novice_expert=separate_annotation_filenames_on_novice_expert(video_filename)
        averaged_labels_novice=get_averaged_labels_for_one_entity(separated_novice_expert['novice'])
        averaged_labels_expert=get_averaged_labels_for_one_entity(separated_novice_expert['expert'])
        # Take the last component of the path in OS-independent way. Searching for "\\" by hand works
        # only on Windows: elsewhere the whole (absolute) path would be kept, and os.path.join would
        # then discard output_path, so the results would be written into the source directory.
        only_video_filename=os.path.basename(os.path.normpath(video_filename))
        # novice and expert labels of one video are saved in the same directory
        video_output_path=os.path.join(output_path, only_video_filename)
        save_annotation_in_txt_file(video_output_path, "annotation_novice", averaged_labels_novice)
        save_annotation_in_txt_file(video_output_path, "annotation_expert", averaged_labels_expert)





In [11]:
# Input directory with the annotations pre-separated on languages.
# NOTE(review): hardcoded absolute Windows paths - adjust them to the local setup.
path_to_labels=r"D:\Databases\NoXi\NoXi_annotations_reliable"
# Directory, where the averaged labels will be saved (created by general_function if does not exist)
output_path=r"D:\Databases\NoXi\NoXi_annotations_reliable_gold_standard_regression"
# Proportions of the train/dev/test subsets within every language; they should sum up to 1
train_prop=0.7
dev_prop=0.2
test_prop=0.1


# Run the whole pipeline: split on train/dev/test, average the labels, save them
general_function(path_to_labels=path_to_labels, output_path=output_path, train_prop=train_prop, dev_prop=dev_prop, test_prop=test_prop)

Elements shape:7, train_idx:5, dev_idx:6
D:\Databases\NoXi\NoXi_annotations_reliable\English\027_2016-04-06_Nottingham in process...
HELLO, labels_shape: (16750, 2)
Do you have nans after fillna method? Answer: False
HELLO, labels_shape: (16750, 2)
Do you have nans after fillna method? Answer: False
D:\Databases\NoXi\NoXi_annotations_reliable\English\034_2016-04-07_Nottingham in process...
HELLO, labels_shape: (16469, 2)
Do you have nans after fillna method? Answer: False
HELLO, labels_shape: (16469, 2)
Do you have nans after fillna method? Answer: False
D:\Databases\NoXi\NoXi_annotations_reliable\English\026_2016-04-06_Nottingham in process...
HELLO, labels_shape: (18051, 2)
Do you have nans after fillna method? Answer: False
HELLO, labels_shape: (18051, 2)
Do you have nans after fillna method? Answer: False
D:\Databases\NoXi\NoXi_annotations_reliable\English\029_2016-04-06_Nottingham in process...
HELLO, labels_shape: (20568, 2)
Do you have nans after fillna method? Answer: False
HEL