In [6]:
import torch
import pandas as pd
import os
from tqdm import tqdm

In [7]:
# Fixed sequence length used for padding/truncation:
# 135 for the 95th percentile, or 90 for the 90th percentile.
MAX_SEQ_LEN = 135

# Sequences shorter than MAX_SEQ_LEN are padded with PAD_VALUE;
# sequences longer than MAX_SEQ_LEN are truncated to MAX_SEQ_LEN.
PAD_VALUE = -1

In [8]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

## Truncation and padding on GPU

In [9]:
# Names of the truncation strategies understood by pad_or_truncate_tensor.
truncate_methods = [
    'starting',
    'ending',
    'middle',
    'random sampling',
    'uniform sampling',
]

In [10]:
# === Padding ===
def pad_tensor(tensor, max_length, pad_value):
    """Pad (or clip) ``tensor`` along dim 0 to exactly ``max_length`` entries.

    Args:
        tensor: Tensor with at least one dimension; dim 0 is the one that is
            padded. Any trailing dimensions are kept unchanged.
        max_length: Required size of dim 0 in the result.
        pad_value: Fill value used for the appended entries.

    Returns:
        ``tensor[:max_length]`` (a slice of the input) when the input already
        has ``max_length`` or more entries. Otherwise a new tensor consisting
        of the input followed by ``max_length - tensor.shape[0]`` entries
        filled with ``pad_value``.
    """
    current_len = tensor.shape[0]
    if current_len >= max_length:
        return tensor[:max_length]

    pad_size = max_length - current_len
    # new_full inherits the input's dtype and device, so the result has the
    # same dtype whether or not padding was needed (letting torch.full infer
    # the dtype from pad_value would, e.g., turn an int32 input into int64).
    # Using shape[1:] instead of shape[1] supports inputs of any rank >= 1.
    padding = tensor.new_full((pad_size, *tensor.shape[1:]), pad_value)
    return torch.cat([tensor, padding], dim=0)


# === Truncation + Padding ===
def pad_or_truncate_tensor(tensor, max_length, pad_value, method="uniform sampling", generator=None):
    """Bring ``tensor`` to exactly ``max_length`` entries along dim 0.

    Inputs with ``max_length`` or fewer entries are handed to ``pad_tensor``.
    Longer inputs are reduced with the selected ``method``:

    * ``"starting"``: keep the first ``max_length`` entries.
    * ``"ending"``: keep the last ``max_length`` entries.
    * ``"middle"``: keep a contiguous window of ``max_length`` entries around
      the midpoint of the sequence.
    * ``"random sampling"``: keep a contiguous window of ``max_length``
      entries whose start offset is drawn with ``torch.randint``.
    * ``"uniform sampling"``: keep ``max_length`` entries at evenly spaced
      positions between the first and the last entry (positions come from
      ``torch.linspace`` truncated to integers).

    Args:
        tensor: Tensor whose dim 0 is the sequence dimension.
        max_length: Required size of dim 0 in the result.
        pad_value: Fill value passed to ``pad_tensor`` for short inputs.
        method: One of the strategy names listed above.
        generator: Optional ``torch.Generator`` forwarded to ``torch.randint``
            so that ``"random sampling"`` can be made reproducible. ``None``
            (the default) uses the global random number generator.

    Returns:
        A tensor with ``max_length`` entries along dim 0.

    Raises:
        ValueError: If ``method`` is not a known strategy. The check runs for
            every call, including inputs that only need padding, so a
            misspelled method name cannot go unnoticed.
    """
    valid_methods = ("starting", "ending", "middle", "random sampling", "uniform sampling")
    if method not in valid_methods:
        raise ValueError(f"Unknown truncation method: {method}")

    seq_len = tensor.shape[0]
    if seq_len <= max_length:
        return pad_tensor(tensor, max_length, pad_value)

    if method == "starting":
        return tensor[:max_length]

    if method == "ending":
        # Slice from an explicit start offset: tensor[-max_length:] would
        # return the whole tensor when max_length == 0.
        return tensor[seq_len - max_length:]

    if method == "middle":
        start = seq_len // 2 - max_length // 2
        return tensor[start:start + max_length]

    if method == "random sampling":
        # The upper bound of randint is exclusive, hence the + 1.
        start = torch.randint(0, seq_len - max_length + 1, (1,), generator=generator).item()
        return tensor[start:start + max_length]

    # Remaining case: "uniform sampling".
    indices = torch.linspace(0, seq_len - 1, steps=max_length).long()
    return tensor[indices]

In [11]:
# === Input / output locations for the parquet files ===
input_dir = 'train_landmark_files_normalized'        # walked to collect the source .parquet files
output_dir = 'train_landmark_files_final_prepared'   # padded/truncated files are written here

In [12]:
# Recursively collect the path of every .parquet file found under input_dir.
dataset6 = []
for dirname, _, filenames in os.walk(input_dir):
    dataset6.extend(
        os.path.join(dirname, filename)
        for filename in filenames
        if filename.endswith('.parquet')
    )

SAVE PADDED AND TRUNCATED DATA

This preprocessed data can be used to train all three models: Transformer, LSTM, and RNN.

In [13]:
# === Process each file ===
# Every input parquet file is padded/truncated to MAX_SEQ_LEN rows and written
# to output_dir under the same <parent folder>/<file name> layout.
chosen_method = "uniform sampling"  # Change this to use a different truncation method

empty_files = []  # inputs skipped because they contain no rows

for directory in tqdm(dataset6):
    # Derive the parent folder name and the file name with os.path rather than
    # splitting on "/", because the paths were built with os.path.join and the
    # separator is platform dependent.
    frame_num = os.path.basename(os.path.dirname(directory))
    file_name = os.path.basename(directory)
    output_file_path = os.path.join(output_dir, frame_num, file_name)

    # Read parquet file
    df = pd.read_parquet(directory)
    if df.empty:
        # Skip empty files to avoid errors, but remember them so the skip is
        # visible once the loop finishes.
        empty_files.append(directory)
        continue

    # Convert to a float32 tensor directly on the target device
    data_tensor = torch.tensor(df.to_numpy(), dtype=torch.float32, device=device)

    # Pad or truncate on that device
    processed_tensor = pad_or_truncate_tensor(data_tensor, MAX_SEQ_LEN, PAD_VALUE, method=chosen_method)

    # Convert back to DataFrame on CPU, keeping the original column names
    processed_df = pd.DataFrame(processed_tensor.cpu().numpy(), columns=df.columns)

    # Save, creating the destination folder on first use
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
    processed_df.to_parquet(output_file_path)

if empty_files:
    print(f"Skipped {len(empty_files)} empty file(s); see `empty_files` for their paths.")

100%|██████████| 94477/94477 [3:22:44<00:00,  7.77it/s]  
