In [79]:
from __future__ import print_function

import gc
import os
import json
import math
from pathlib import Path

import numpy as np
import pandas as pd

from tqdm import tqdm

# Insert path to .skeleton files here
path = 'data/videos/nturgb+d_skeletons/'
train_dest_path = 'data/processed/train/'
test_dest_path = 'data/processed/test/'

# Extraction et conversion des données

In [5]:
def distance_matrix(x, y, z, matrix=None, reduce=True):
    """
    Compute distance matrix.
    
    Args:
        x, y, z: coordonates of points
         matrix: matrix to update, if already built
         reduce: reduce matrix coeffs in [0, 1]
     
    Return:
        matrix: distance matrix
    """
    assert x.size == y.size == z.size
    nb_pt = x.size
    if matrix is None:
        matrix = np.zeros((25, 25), dtype="float32")
    
    for i in range(nb_pt):
        for j in range(nb_pt):
            matrix[i, j] = math.sqrt((x[i] - x[j])**2 + (y[i] - y[j])**2 + (z[i] - z[j])**2)
            
    if reduce:
        matrix = (matrix - matrix.min()) / matrix.max()
        
    return matrix

In [6]:
"""
Author: Henry Powell
Institution: Institute of Neuroscience and Psychology, Glasgow University, Scotland.

Python script for formatting the NTU RGB+D Skeletons data set into a format suitable for most LSTM RNNs. The aim is to
take each .skeletons file and compress it into a 3D numpy array with [samples, time-steps, features] as its dimensions.
The final data set will thus be a [56,881, max(len(samples(data_files)))=600, 12*25=300] numpy array. The data has
been left normal (i.e. not normalized) for the sake of flexibility although it is generally recommended to normalize
the data at some stage in the preprocessing.
"""

# Keep track of total files processed
total_files = 0

# List of class numbers labels from data_set
NTU_classes = [c for c in range(1, 49)]


def filter_missing_samples():
    """ Function to filter out all of the samples from the data set that have no data in them.

        Returns: list object containing str(filenames) of all files with no data

    """
    # List of files with missing data.
    missing = np.load("data/utils/missing_samples.npy")

    missing_skeleton = [path + i + '.skeleton' for i in missing]
    missing = missing_skeleton
    del missing_skeleton
    gc.collect()
    return missing


def load_files(path, missing, fix_total_files=False, prop_files=100, batch_type='train', drop_first=False):

    """
    :param path: Path to the data set.
    :param missing: List of files with no data.
    :param fix_total_files: Specify the number of data files you want to process up to 56,881
    :param prop_files: What proportion of the fix_total_files you want to load.
    :param batch_type: Splits the loaded files into either 80% of total files if batch_type = 'train', or 20% of
                       total files if batch_type = 'test'.
    :param drop_first: Stop function from iterating over .CD file if there is one present in the directory.
    :return: List of .skeleton files as posixpath objects
    """

    directory = Path(path)
    
    # Store files as list to be iterated through
    # Suppression des classes avec interaction de squelettes
    files = [p for p in directory.iterdir() if p.is_file() and str(p) not in missing and not(int(str(p)[48:51]) > 49)] 

    # You may have a .CD file hidden in this folder. This drops this from [files] so that the code doesn't run over it.
    if drop_first:
        files.pop(0)
    else:
        pass

    if fix_total_files:
        files = files[:fix_total_files]
    else:
        files = files

    # Number of total files before dropping if files_batch_prop < 100
    total_num_files = len(files)
    file_percentage = (total_num_files / 100) * prop_files

    # Drop proportion of files you don't want to process
    if prop_files == 100:
        files = files
    elif prop_files != 100 and batch_type == 'train':
        files = files[:int(file_percentage)]
    elif prop_files != 100 and batch_type == 'test':
        files = files[int(file_percentage):]
    elif prop_files > 100 or prop_files < 0:
        raise Exception('files_batch_prop should be an integer between 0 and 100. You gave {}'.format(prop_files))
    gc.collect()

    return files


def get_classes(files, one_hot=False, subset=False):

    """

    :param files: list of .skeleton files to be processed (must be posixPath object)
    :param one_hot: translate classes to a one-hot encoding
    :param subset: specify that you are using a the binary subset of the dataset (Action 1 and Action 3 (A001 & A003)
    :return: list of classes

    """

    files = [str(f) for f in files]
    class_list = list()
    class_index = files[0].find('A0')

    for i in range(len(files)):
        class_list.append(files[i][class_index+2:class_index+4])
    del class_index

    class_list = [int(c)-1 for c in class_list]
    class_list = np.array(class_list)

    if one_hot:
        # One-hot encode integers to make suitable for LSTM
        class_list = keras.utils.to_categorical(class_list)

    else:
        pass

    gc.collect()
    return class_list


def process_raw_data(files, dest_path, save_as_ndarray=False, three_d=True, derivative=False):

    """
    :param files: list of .skeleton files to be processed (must be posixPath object)
    :param save_as_ndarray: set to True to save the outputted data to an ndarray in the current directory
    :param derivative: add feature engineered columns to the output. Adds first derivative calculations to each
                       position point in x,y,z dimensions.
    :param three_d: set to False if you only want the three d position features for each time frame
    :return: np.array of dimension (samples, time_steps, features)

    """

    # This variable tracks how many files have been formatted and added to the new data set
    progress = 0
    file_nb = len(files)
    loaded = list()
    errors = list()

    # Iteration loop which formats the .skeleton files.
    for file in files:
        if not(os.path.isfile(dest_path + str(file)[31:51] + ".npy")):
            try:
                features = list()
                row = list()

                data = pd.read_csv(file, header=None)
                # on ne conserve que les données dont la taille démontre qu'il s'agit de nombres flottants
                data['length'] = data[0].apply(lambda x: len(str(x)))
                cond = data['length'] > 10
                data = data[cond]
                # ré-indexage
                data = data.reset_index(drop=True)
                # suppression des en-têtes de frame
                data = data[data.index % 26 != 0]
                # supression de la column de tri par la longueur
                data = data.drop(columns=['length'])
                # ré-indexage
                data = data.reset_index(drop=True)
                # formattage des données numériques
                data = data[0].str.split(" ", expand=True)
                data = data.fillna(method='bfill')
                if three_d:
                    data = data.drop(columns=[3, 4, 5, 6, 7, 8, 9, 10, 11])
                # noms des variables
                data.columns = ["x", "y", "z"]

                X = np.array(data.x, dtype="float32")
                Y = np.array(data.y, dtype="float32")
                Z = np.array(data.z, dtype="float32")

                frames = len(data.index) // 25
                for i in range(frames):
                    x = X[i*25:(i+1)*25]
                    y = Y[i*25:(i+1)*25]
                    z = Z[i*25:(i+1)*25]
                    features.append(distance_matrix(x, y, z))

                del data
                gc.collect()

                features = np.array(features)

                loaded.append(features)

                if save_as_ndarray:
                    np.save(os.path.join(dest_path, str(file)[31:51]), features)

                # Sanity check to ensure all the matrices are of the right dimension (Uncomment the below to make check)
                # print(features.shape)

            except:
                errors.append(file)
            
        progress += 1
        perc = progress/(file_nb/100)
        print(f'Samples Processed: {progress}/{file_nb} - - - Percentage Complete = {perc:.2f}%', end='\r')

        if progress == total_files:
            print(f'Samples Processed: {progress}/{file_nb} - - - Percentage Complete = {100}%')
            
    loaded = np.array(loaded)

    return loaded, errors


def preprocess_training(training_split_size=80, fix_total_files=1000, sanity=False, save=False, one_hot=False):

    print('Processing Training Set')

    missing = filter_missing_samples()
    files = load_files(path, missing, prop_files=training_split_size,
                       batch_type='train', fix_total_files=fix_total_files)
    classes = get_classes(files, one_hot=one_hot)
    
    mapping = [np.array([str(f)[31:51], c]) for f, c in zip(files, classes)]
    pd.DataFrame(mapping, columns=["file", "label"]).to_csv("data/utils/trainset.csv")
    
    loaded, errors = process_raw_data(files, train_dest_path, save_as_ndarray=True)
    
    print(f"{len(errors)} corrupted files.")
    with open("data/utils/train_errors.json", "w+") as file:
        json.dump([str(e) for e in errors], file)

    if save:
        np.save('skeletons_array_train_S', loaded)
        np.save('skeletons_array_train_labels_S', classes)
    else:
        pass

    # Sanity check to ensure resulting matrix is of the right shape
    print('Final training data dimensions: {}'.format(loaded.shape))

    if sanity:
        print('One-hot classes matrix:\n', classes)
    else:
        pass

    print('Final training labels dimensions: {} \n'.format(classes.shape))

    return loaded, classes


def preprocess_test(training_split_size=80, fix_total_files=1000, sanity=False, save=True):

    print('Processing Test Set')
    missing = filter_missing_samples()
    files = load_files(path, missing, prop_files=training_split_size,
                       batch_type='test', fix_total_files=fix_total_files)
    classes = get_classes(files)
    
    mapping = [np.array([str(f)[31:51], c]) for f, c in zip(files, classes)]
    pd.DataFrame(mapping, columns=["file", "label"]).to_csv("data/utils/testset.csv")
    
    loaded, errors = process_raw_data(files, test_dest_path, save_as_ndarray=True)
    
    print(f"{len(errors)} corrupted files.")
    with open("data/utils/test_errors.json", "w+") as file:
        json.dump([str(e) for e in errors], file)

    if save:
        np.save('skeletons_array_test_S', loaded)
        np.save('skeletons_array_test_labels_S', classes)
    else:
        pass

    # Sanity check to ensure resulting matrix is of the right shape
    print('Final Training data dimensions: {}'.format(loaded.shape))

    if sanity:
        print('One-hot classes matrix:\n', classes)
    else:
        pass

    print('Final test labels dimensions: {} \n'.format(classes.shape))

    return loaded, classes


def get_test_train(training_split_size=80, fix_total_files=60, sanity=False, save=True):

    preprocess_training(training_split_size=training_split_size, fix_total_files=fix_total_files, sanity=sanity, save=save)
    preprocess_test(training_split_size=training_split_size, fix_total_files=fix_total_files, sanity=sanity, save=save)


get_test_train(training_split_size=80, fix_total_files=False, sanity=True, save=False)

Processing Training Set
144 corrupted files.7161/37161 - - - Percentage Complete = 100.00%
Final training data dimensions: (0,)
One-hot classes matrix:
 [ 0  1  2 ... 16 17 18]
Final training labels dimensions: (37161,) 

Processing Test Set
13 corrupted files.9291/9291 - - - Percentage Complete = 100.00%
Final Training data dimensions: (485,)
One-hot classes matrix:
 [19 20 21 ... 46 47 48]
Final test labels dimensions: (9291,) 



# Nettoyage des données

## Training

In [72]:
train_dir = Path(train_dest_path)
train_files = [str(p)[21:41] for p in train_dir.iterdir() if p.is_file()]

In [54]:
trainset = pd.read_csv("data/utils/trainset.csv", sep=",")
trainset.columns = ["index", "file", "label"]
trainset.drop(columns=["index"], inplace=True)
len(trainset)

37161

In [83]:
filtered_train = trainset.copy()
for i, file in tqdm(enumerate(trainset.file)):
    if not(file in train_files):
        filtered_train.drop([i], axis=0, inplace=True)    

37161it [00:11, 3314.04it/s]


In [84]:
len(filtered_train)

37017

In [88]:
filtered_train["set"] = ["train" for _ in range(len(filtered_train))]
filtered_train.head()

Unnamed: 0,file,label,set
0,S001C001P001R001A001,0,train
1,S001C001P001R001A002,1,train
2,S001C001P001R001A003,2,train
3,S001C001P001R001A004,3,train
4,S001C001P001R001A005,4,train


## Testing

In [108]:
test_dir = Path(test_dest_path)
test_files = [str(p)[20:40] for p in test_dir.iterdir() if p.is_file()]

In [109]:
testset = pd.read_csv("data/utils/testset.csv", sep=",")
testset.columns = ["index", "file", "label"]
testset.drop(columns=["index"], inplace=True)
len(testset)

9291

In [110]:
filtered_test = testset.copy()
for i, file in tqdm(enumerate(testset.file)):
    if not(file in test_files):
        filtered_test.drop([i], axis=0, inplace=True)   

9291it [00:00, 13645.44it/s]


In [111]:
len(filtered_test)

9278

In [112]:
filtered_test["set"] = ["validation" for _ in range(len(filtered_test))]
filtered_test.head()

Unnamed: 0,file,label,set
0,S014C001P008R001A020,19,validation
1,S014C001P008R001A021,20,validation
2,S014C001P008R001A022,21,validation
3,S014C001P008R001A023,22,validation
4,S014C001P008R001A024,23,validation


## Format PyTorch Dataset

In [128]:
dataset = {
    "train": [f for f in filtered_train.file],
    "validation": [f for f in filtered_test.file]
}

concat = pd.concat([filtered_train, filtered_test], axis=0)

labels = {f: l for f, l in zip(concat.file, concat.label)}

In [133]:
with open("dataset.json", "w+") as file:
    json.dump(dataset, file)
with open("labels.json", "w+") as file:
    json.dump(labels, file)

stats = {
    "train_len": len(filtered_train),
    "test_len": len(filtered_test)
}

with open("datastats.json", "w+") as file:
    json.dump(stats, file)