In [25]:
import os
import numpy as np
import argparse
from itertools import product

# Constants definitions
# Size of the token axis (axis 2) of the content tensor built by
# preprocess_data: slot 0 receives the SOS token and the remaining slots
# receive one feature value each.
MAX_SIMU_TOKENS = 5 

class PitchToken:
    """Special token ids written to channel 0 (last axis) of the content tensor."""

    SOS = 0  # Start of sequence, example value

class DurationToken:
    """Special token ids written to channel 1 (last axis) of the content tensor."""

    SOS = 0  # Start of sequence, example value

# Exclusive upper bound of the random integers drawn for every feature
# (values fall in [0, N_DISCRETE_VALUES)).
N_DISCRETE_VALUES = 128  
# Number of samples (first array axis) generated for the demo run.
N_SAMPLES = 100  
# Number of timesteps per sample (second array axis) for the demo run.
N_TIMESTEPS = 100  

def generate_sample_data(num_samples, n_timesteps, seed=42):
    """
    Generate four random integer feature arrays for demonstration purposes.

    Args:
        num_samples: Number of samples (rows) in each array.
        n_timesteps: Number of timesteps (columns) in each array.
        seed: Seed for the private random generator. The default (42)
            reproduces the values this function has always returned.
            Pass None for non-reproducible data.

    Returns:
        Tuple ``(emotions, locations, activities, modes)``; each element is
        an integer array of shape ``(num_samples, n_timesteps)`` with values
        in ``[0, N_DISCRETE_VALUES)``.
    """
    # Use a private RandomState instead of np.random.seed(): it yields the
    # same stream as the seeded global generator, but calling this function
    # no longer resets the process-wide NumPy RNG behind the caller's back.
    rng = np.random.RandomState(seed)
    shape = (num_samples, n_timesteps)

    # Draw order matters for reproducibility: emotions, locations,
    # activities, modes.
    emotions = rng.randint(0, N_DISCRETE_VALUES, shape)
    locations = rng.randint(0, N_DISCRETE_VALUES, shape)
    activities = rng.randint(0, N_DISCRETE_VALUES, shape)
    modes = rng.randint(0, N_DISCRETE_VALUES, shape)
    return emotions, locations, activities, modes

def preprocess_data(data, n_bars, resolution):
    """
    Preprocess the sample data to create content and structure tensors.

    A content tensor of shape ``(num_samples, n_timesteps, MAX_SIMU_TOKENS, 2)``
    (int16) and a structure tensor of shape ``(num_samples, n_timesteps)``
    (bool, all True) are built, then cut into sliding windows along the
    time axis.

    Args:
        data: Sequence of 2-D integer arrays, all of shape
            ``(num_samples, n_timesteps)``, one per feature. Feature ``k``
            is written to token slot ``k + 1`` of channel 0; slot 0 holds
            ``PitchToken.SOS``. At most ``MAX_SIMU_TOKENS - 1`` features fit;
            unused slots stay 0.
        n_bars: Number of bars per window (positive integer).
        resolution: Timesteps per bar; also the hop between consecutive
            window starts (positive integer).

    Returns:
        List of ``(seq_c_tensor, seq_s_tensor)`` tuples, one per window of
        ``n_bars * resolution`` timesteps. The tensors are slices (views) of
        the full content/structure tensors. The list is empty when the
        window is longer than the data.

    Raises:
        ValueError: If ``n_bars`` or ``resolution`` is not positive, if more
            features are supplied than there are token slots, or if the
            feature arrays do not all share the same shape.
    """
    if n_bars <= 0 or resolution <= 0:
        raise ValueError(
            f"n_bars and resolution must be positive, got n_bars={n_bars}, "
            f"resolution={resolution}"
        )
    # Slot 0 is reserved for the SOS token, so one slot fewer is available.
    if len(data) > MAX_SIMU_TOKENS - 1:
        raise ValueError(
            f"got {len(data)} feature arrays but only "
            f"{MAX_SIMU_TOKENS - 1} token slots are available"
        )

    num_samples, n_timesteps = data[0].shape
    length = n_timesteps

    # Initialize content and structure tensors
    c_tensor = np.zeros((num_samples, length, MAX_SIMU_TOKENS, 2), np.int16)

    # Channel 0: SOS in slot 0, then one feature per slot. Whole-array
    # assignment replaces the former per-element Python loops.
    c_tensor[:, :, 0, 0] = PitchToken.SOS
    for slot, feature in enumerate(data, start=1):
        feature = np.asarray(feature)
        if feature.shape != (num_samples, length):
            raise ValueError(
                f"feature array {slot - 1} has shape {feature.shape}, "
                f"expected {(num_samples, length)}"
            )
        c_tensor[:, :, slot, 0] = feature

    # Channel 1: SOS token for every slot (duration placeholder).
    c_tensor[:, :, :, 1] = DurationToken.SOS

    # Every timestep is marked active in the structure tensor.
    s_tensor = np.ones((num_samples, length), dtype=bool)

    # Apply sliding window to generate sequences
    window = n_bars * resolution
    sequences = []
    for start in range(0, length - window + 1, resolution):
        seq_c_tensor = c_tensor[:, start:start + window, :, :]
        seq_s_tensor = s_tensor[:, start:start + window]
        sequences.append((seq_c_tensor, seq_s_tensor))
        print(f"seq_c_tensor shape: {seq_c_tensor.shape}, seq_s_tensor shape: {seq_s_tensor.shape}")
    return sequences

def save_preprocessed_data(filepath, sequences):
    """
    Write every preprocessed window to a single ``.npz`` archive.

    For the window at position ``i`` in ``sequences``, the content tensor is
    stored under the key ``seq_c_<i>`` and the structure tensor under
    ``seq_s_<i>``.

    Args:
        filepath: Destination passed straight to ``np.savez``.
        sequences: Iterable of ``(seq_c_tensor, seq_s_tensor)`` pairs.
    """
    def _named_tensors():
        # Yield (key, array) pairs in the order they should enter the archive.
        for i, (content, structure) in enumerate(sequences):
            yield f'seq_c_{i}', content
            yield f'seq_s_{i}', structure

    # Each key becomes its own entry inside the npz file.
    np.savez(filepath, **dict(_named_tensors()))



In [26]:
# Window configuration held in a Namespace (no command line is parsed here):
# each window spans n_bars * resolution = 16 timesteps and consecutive
# windows start `resolution` (8) timesteps apart.
args = argparse.Namespace(n_bars=2, resolution=8)

# Generate sample data: four random feature arrays, each of shape
# (N_SAMPLES, N_TIMESTEPS).
sample_data = generate_sample_data(N_SAMPLES, N_TIMESTEPS)

# Preprocess the data into sliding windows; one shape line is printed per
# window.
sequences = preprocess_data(sample_data, args.n_bars, args.resolution)

# Save the preprocessed data; the relative path resolves against the current
# working directory, and windows are stored under keys seq_c_<i> / seq_s_<i>.
save_preprocessed_data("preprocessed_data.npz", sequences)

seq_c_tensor shape: (100, 16, 5, 2), seq_s_tensor shape: (100, 16)
seq_c_tensor shape: (100, 16, 5, 2), seq_s_tensor shape: (100, 16)
seq_c_tensor shape: (100, 16, 5, 2), seq_s_tensor shape: (100, 16)
seq_c_tensor shape: (100, 16, 5, 2), seq_s_tensor shape: (100, 16)
seq_c_tensor shape: (100, 16, 5, 2), seq_s_tensor shape: (100, 16)
seq_c_tensor shape: (100, 16, 5, 2), seq_s_tensor shape: (100, 16)
seq_c_tensor shape: (100, 16, 5, 2), seq_s_tensor shape: (100, 16)
seq_c_tensor shape: (100, 16, 5, 2), seq_s_tensor shape: (100, 16)
seq_c_tensor shape: (100, 16, 5, 2), seq_s_tensor shape: (100, 16)
seq_c_tensor shape: (100, 16, 5, 2), seq_s_tensor shape: (100, 16)
seq_c_tensor shape: (100, 16, 5, 2), seq_s_tensor shape: (100, 16)
