In [None]:
import numpy as np
import pandas as pd
import re
import os
import h5py

In [None]:
# Root directory that holds the dataset download ('Archive'), the 'metadata'
# folder and the generated HDF5 archives. It defaults to the original location;
# set the DEAM_DATA_ROOT environment variable to run the notebook elsewhere.
DATA_ROOT = os.environ.get('DEAM_DATA_ROOT', '/Users/canchel/Desktop')

# Directory with the per-second annotations averaged per song.
DYNAMIC_ANNOTATIONS_DIR = os.path.join(
    DATA_ROOT,
    'Archive',
    'annotations',
    'annotations averaged per song',
    'dynamic (per second annotations)',
)

VALENCE_SOURCE_PATH = os.path.join(DYNAMIC_ANNOTATIONS_DIR, 'valence.csv')
AROUSAL_SOURCE_PATH = os.path.join(DYNAMIC_ANNOTATIONS_DIR, 'arousal.csv')

METADATA_2013_PATH = os.path.join(DATA_ROOT, 'metadata', 'metadata_2013.csv')

# Directory with one '<song_id>.csv' feature file per song.
FEATURE_FILES_DIR = os.path.join(DATA_ROOT, 'Archive', 'features')

# Archive of raw features and annotations, one group per song.
ARCHIVE_OUTPUT_PATH = os.path.join(DATA_ROOT, 'deam_samples.hdf5')

# Archive with the same layout but normalized features.
NORMALIZED_ARCHIVE_OUTPUT_PATH = os.path.join(DATA_ROOT, 'normalized_deam_samples.hdf5')

In [None]:
# Read both annotation tables the same way and index them by the 'song_id'
# column so that a song's row can be fetched with .loc[song_id].
valence_df, arousal_df = (
    pd.read_csv(annotation_path).set_index('song_id')
    for annotation_path in (VALENCE_SOURCE_PATH, AROUSAL_SOURCE_PATH)
)

In [None]:
def _non_null_row(frame, song_id):
    """Return the non-NaN entries of the row labelled song_id as a plain dict."""
    return dict(frame.loc[song_id].dropna())


# Maps every song id found in the valence table to a (valence, arousal) pair
# of {column name: value} dicts from which missing annotations were dropped.
va_dicts = {
    song_id: (_non_null_row(valence_df, song_id), _non_null_row(arousal_df, song_id))
    for song_id in valence_df.index.values
}

In [None]:
# Metadata table read from METADATA_2013_PATH and indexed by 'song_id';
# t_shift reads the segment start time of a song from it.
metadata_2013_raw = pd.read_csv(METADATA_2013_PATH)
metadata_2013_df = metadata_2013_raw.set_index('song_id')

def _min_sec_to_seconds(value):
    """Convert a 'min.sec' value (e.g. 1.05 for 1 min 5 s) to whole seconds.

    The value is rendered with exactly two decimals before it is split, so a
    start time that pandas parsed as the float 0.3 (written 0.30 in the CSV)
    gives 30 seconds instead of 3, and a whole number of minutes such as 2
    is accepted as well.
    NOTE(review): this assumes seconds are always written with two digits --
    confirm against the metadata file.

    Raises ValueError when the value cannot be interpreted (e.g. NaN).
    """
    minutes, seconds = map(int, '{:.2f}'.format(float(value)).split('.'))
    return minutes * 60 + seconds


def t_shift(song_index, metadata_df=None):
    """Return the offset that is added to an annotation timestamp to obtain
    the frameTime of the matching row in the song's feature file.

    Parameters
    ----------
    song_index : int
        Song id. Ids up to 1000 are looked up in the metadata table; the
        offset is the segment start time in seconds minus one. All other
        ids use a fixed offset of -1.
    metadata_df : pandas.DataFrame, optional
        Table indexed by song id with a 'start of the segment (min.sec)'
        column. Defaults to the notebook-level ``metadata_2013_df``.

    Raises
    ------
    KeyError
        If ``song_index`` is at most 1000 but missing from the metadata.
    """
    if song_index <= 1000:
        if metadata_df is None:
            metadata_df = metadata_2013_df
        segment_start = metadata_df.loc[song_index, 'start of the segment (min.sec)']
        return _min_sec_to_seconds(segment_start) - 1
    return -1

# Extracts the number embedded in an annotation column name: a run of
# lowercase letters/underscores, then digits, then a lowercase letter.
# Raw string so that '\d' is not treated as a (deprecated) string escape.
timestamp_re = re.compile(r'[a-z_]+(\d+)[a-z]')


def key_mapping_func(x):
    """Re-key an annotation dict by the number in each column name divided by
    1000 (presumably milliseconds to seconds), sorted in ascending order."""
    return dict(sorted({int(timestamp_re.findall(key)[0]) / 1e3 : value for key, value in x.items()}.items()))


# Open explicitly for writing: h5py 3 opens files read-only by default, which
# makes every assignment below fail. 'w' also starts from an empty file, so
# re-running this cell does not collide with datasets from an earlier run.
with h5py.File(ARCHIVE_OUTPUT_PATH, 'w') as out_file:
    for song_index, (valence_dict, arousal_dict) in va_dicts.items():
        print('processing', song_index, end='\r')
        if len(valence_dict) != len(arousal_dict):
            print(song_index, 'removed because of unmatched valence arousal sample count')
            continue
        valence_dict, arousal_dict = key_mapping_func(valence_dict), key_mapping_func(arousal_dict)
        # Keep only the columns whose name ends in 'mean', indexed by frame time.
        song_feature_df = (
            pd.read_csv(os.path.join(FEATURE_FILES_DIR, str(song_index) + '.csv'), sep=';')
            .set_index('frameTime')
            .filter(regex='mean$', axis=1)
        )
        try:
            # The offset is the same for every timestamp of a song, so look it
            # up once. It stays inside the try block because a song missing
            # from the metadata raises KeyError just like a missing frame time.
            shift = t_shift(song_index)
            feature_rows = [song_feature_df.loc[timestamp + shift].to_numpy() for timestamp in valence_dict]
        except KeyError:
            print(song_index, 'removed because of csv key error')
            continue
        # One group per song: a feature row per annotated timestamp, plus the
        # valence and arousal values as column vectors.
        group_name = str(song_index)
        out_file[group_name + '/features'] = np.array(feature_rows)
        out_file[group_name + '/valence'] = np.array([*valence_dict.values()]).reshape(-1, 1)
        out_file[group_name + '/arousal'] = np.array([*arousal_dict.values()]).reshape(-1, 1)
print('done')

In [None]:
# Load the feature matrix of every song stored in ARCHIVE_OUTPUT_PATH.
# The file is opened read-only explicitly so that a missing archive raises
# an error instead of depending on the h5py version's default mode.
feature_list = []
with h5py.File(ARCHIVE_OUTPUT_PATH, 'r') as in_file:
    feature_list.extend(np.array(in_file[key + '/features']) for key in in_file.keys())

In [None]:
# z-score normalization
# Per-feature mean and standard deviation over the rows of all songs together.
concatenated_matrix = np.concatenate(feature_list, axis=0)
mean_vector = np.mean(concatenated_matrix, axis=0)
std_vector = np.std(concatenated_matrix, axis=0)
# A constant feature has zero standard deviation; divide by 1 there so it
# becomes 0 instead of NaN/inf.
safe_std_vector = np.where(std_vector == 0, 1.0, std_vector)

# The source archive is only read; the output is opened with 'w' because
# h5py 3 defaults to read-only and because a re-run must not collide with
# existing datasets. The min-max cell writes to the same path, so whichever
# of the two cells runs last determines the file's contents.
with h5py.File(ARCHIVE_OUTPUT_PATH, 'r') as in_file, \
        h5py.File(NORMALIZED_ARCHIVE_OUTPUT_PATH, 'w') as out_file:
    for key in in_file.keys():
        features = np.array(in_file[key + '/features'])
        # Broadcasting applies the per-feature statistics to every row.
        # (The previous code multiplied the vectors by the row count instead
        # of repeating them, which scaled the mean and std by that count.)
        out_file[key + '/features'] = (features - mean_vector) / safe_std_vector
        out_file[key + '/valence'] = np.array(in_file[key + '/valence'])
        out_file[key + '/arousal'] = np.array(in_file[key + '/arousal'])

In [None]:
# min-max normalization
# Per-feature minimum and maximum over the rows of all songs together.
concatenated_matrix = np.concatenate(feature_list, axis=0)
minimum_vector = np.min(concatenated_matrix, axis=0)
maximum_vector = np.max(concatenated_matrix, axis=0)
# A constant feature has max == min; divide by 1 there so it becomes 0
# instead of NaN.
range_vector = maximum_vector - minimum_vector
safe_range_vector = np.where(range_vector == 0, 1.0, range_vector)

# Holds the raw (un-normalized) features of the last song processed below.
# It is not used in this cell; kept so the name stays available afterwards.
saved_features = None

# The source archive is only read; the output is opened with 'w' because
# h5py 3 defaults to read-only and because a re-run must not collide with
# existing datasets. The z-score cell writes to the same path, so whichever
# of the two cells runs last determines the file's contents.
with h5py.File(ARCHIVE_OUTPUT_PATH, 'r') as in_file, \
        h5py.File(NORMALIZED_ARCHIVE_OUTPUT_PATH, 'w') as out_file:
    for key in in_file.keys():
        saved_features = np.array(in_file[key + '/features'])
        # Broadcasting applies the per-feature bounds to every row.
        out_file[key + '/features'] = (saved_features - minimum_vector) / safe_range_vector
        out_file[key + '/valence'] = np.array(in_file[key + '/valence'])
        out_file[key + '/arousal'] = np.array(in_file[key + '/arousal'])