# Proyecto 2
Ariela Mishaan (22052), Alina Carías (22539), Diego Soto (22737), Ignacio Méndez (22613) y Marcos Díaz

## Librerías

## Carga de datos

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree lazily and print the full path of every file.
walked = os.walk('/kaggle/input')
for full_path in (
    os.path.join(root, name)
    for root, _subdirs, names in walked
    for name in names
):
    print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/asl-fingerspelling/supplemental_metadata.csv
/kaggle/input/asl-fingerspelling/character_to_prediction_index.json
/kaggle/input/asl-fingerspelling/train.csv
/kaggle/input/asl-fingerspelling/supplemental_landmarks/371169664.parquet
/kaggle/input/asl-fingerspelling/supplemental_landmarks/369584223.parquet
/kaggle/input/asl-fingerspelling/supplemental_landmarks/1682915129.parquet
/kaggle/input/asl-fingerspelling/supplemental_landmarks/775880548.parquet
/kaggle/input/asl-fingerspelling/supplemental_landmarks/2100073719.parquet
/kaggle/input/asl-fingerspelling/supplemental_landmarks/1650637630.parquet
/kaggle/input/asl-fingerspelling/supplemental_landmarks/1471096258.parquet
/kaggle/input/asl-fingerspelling/supplemental_landmarks/86446671.parquet
/kaggle/input/asl-fingerspelling/supplemental_landmarks/897287709.parquet
/kaggle/input/asl-fingerspelling/supplemental_landmarks/333606065.parquet
/kaggle/input/asl-fingerspelling/supplemental_landmarks/2057261717.parquet
/kaggle/inpu

In [3]:
# Load a single train-landmarks parquet file and report its dimensions.
#dataset_df = pd.read_csv('/kaggle/input/asl-fingerspelling/train.csv')
landmarks_path = '/kaggle/input/asl-fingerspelling/train_landmarks/1019715464.parquet'
df = pd.read_parquet(landmarks_path)
print(f"Full train dataset shape is {df.shape}")

Full train dataset shape is (161461, 1630)


## Preprocesamiento

In [None]:
import os
import json
import gc
import glob
from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# --- Input locations ---
INPUT_DIR = "/kaggle/input/asl-fingerspelling"
TRAIN_CSV = INPUT_DIR + "/train.csv"

# Column-selection mode read by select_columns(); that function also
# recognises "hands+face_ref".
SELECTION_MODE = "hands"

# Frame-count limits per sequence.
# NOTE(review): presumably the minimum length to keep a sequence and the
# fixed length sequences are padded/truncated to — confirm where they are used.
MIN_FRAMES = 8
MAX_FRAMES = 64

# NOTE(review): presumably the train/validation/test split fractions and the
# seed for the split — confirm against the splitting code.
TRAIN_PCT = 0.70
VAL_PCT = 0.15
TEST_PCT = 0.15
RANDOM_STATE = 42

# --- Output ---
OUT_DIR = "/kaggle/working/preprocessed"
# NOTE(review): presumably the number of sequences written per .npz shard — confirm.
SAMPLES_PER_SHARD = 2000

# When not None, only the first DEBUG_MAX_PARQUETS parquet files are processed.
DEBUG_MAX_PARQUETS = None

In [None]:
def list_parquet_files(train_df: pd.DataFrame) -> List[str]:
    """Return the absolute paths of the parquet files referenced by *train_df*.

    The distinct values of the ``path`` column are sorted, joined onto
    ``INPUT_DIR`` (after stripping any leading ``/``), and kept only when the
    resulting path exists on disk.
    """
    existing: List[str] = []
    for rel_path in sorted(train_df["path"].unique()):
        candidate = os.path.join(INPUT_DIR, rel_path.lstrip("/"))
        # Safety net: skip entries whose file is not actually present.
        if os.path.exists(candidate):
            existing.append(candidate)
    return existing


def select_columns(parquet_columns: List[str],
                   selection_mode: "str | None" = None) -> List[str]:
    """Choose which parquet columns to keep for preprocessing.

    ``sequence_id`` and ``frame`` are always kept, together with every
    left/right hand coordinate column. Landmark columns are named
    ``<axis>_<part>_<index>`` (for example ``x_left_hand_0`` or
    ``z_right_hand_20``), so the prefixes matched here include the
    underscores.

    Args:
        parquet_columns: Column names as they appear in the parquet schema.
        selection_mode: Selection mode to apply. When ``None`` (the default)
            the module-level ``SELECTION_MODE`` is used. The value
            ``"hands+face_ref"`` additionally keeps a few face reference
            points.

    Returns:
        The kept column names, in the same order as ``parquet_columns``.
    """
    mode = SELECTION_MODE if selection_mode is None else selection_mode
    axes = ("x", "y", "z")

    keep = {"sequence_id", "frame"}

    # Hands: every coordinate column of the left and right hand.
    hand_prefixes = tuple(
        f"{axis}_{side}_hand_" for axis in axes for side in ("left", "right")
    )
    keep.update(c for c in parquet_columns if c.startswith(hand_prefixes))

    if mode == "hands+face_ref":
        # A handful of face landmarks used as a rough positional reference.
        # NOTE(review): per the original author's note these are approximately
        # the nose, nose tip and the two eyes — confirm against the MediaPipe
        # face landmark map.
        face_ref_idx = (1, 4, 33, 263)
        # Names absent from the schema are harmless: the final filter only
        # returns columns that exist in parquet_columns.
        keep.update(f"{axis}_face_{i}" for axis in axes for i in face_ref_idx)

    # Return in parquet order.
    return [c for c in parquet_columns if c in keep]


def zscore_per_participant(df_seq: pd.DataFrame, participant_id: int,
                           stats_dict: Dict[int, Tuple[np.ndarray, np.ndarray]]) -> pd.DataFrame:
    """Standardise the landmark columns of *df_seq* with per-participant stats.

    ``stats_dict`` maps a participant id to a ``(mu, sigma)`` pair. When the
    participant has an entry, every column other than ``sequence_id`` and
    ``frame`` is replaced by ``(value - mu) / (sigma + 1e-8)``, computed in
    float32 and written back into *df_seq* itself. When there is no entry the
    frame is returned untouched.

    Returns:
        The same ``df_seq`` object that was passed in.
    """
    if participant_id in stats_dict:
        mean, std = stats_dict[participant_id]
        meta_cols = {"sequence_id", "frame"}
        feature_cols = [name for name in df_seq.columns if name not in meta_cols]
        values = df_seq[feature_cols].to_numpy(dtype=np.float32)
        # The small epsilon keeps the division finite when a std is zero.
        df_seq.loc[:, feature_cols] = (values - mean) / (std + 1e-8)
    return df_seq


def pad_or_truncate(arr: np.ndarray, max_len: int) -> np.ndarray:
    """Force a 2-D array to exactly ``max_len`` rows.

    An array that already has ``max_len`` rows is returned as is; a longer one
    is cut to its first ``max_len`` rows; a shorter one is extended with
    zero-filled rows of the same dtype.
    """
    n_rows, n_cols = arr.shape
    shortfall = max_len - n_rows

    if shortfall == 0:
        return arr
    if shortfall < 0:
        return arr[:max_len]

    # Too short: append zero rows at the end.
    filler = np.zeros((shortfall, n_cols), dtype=arr.dtype)
    return np.concatenate((arr, filler), axis=0)


def to_tensor(df_seq: pd.DataFrame) -> np.ndarray:
    """Convert one sequence frame into a float32 matrix ordered by ``frame``.

    Rows are sorted by the ``frame`` column; every column except
    ``sequence_id`` and ``frame`` becomes a column of the returned array.
    """
    meta_cols = {"sequence_id", "frame"}
    ordered = df_seq.sort_values("frame")
    feature_cols = [name for name in ordered.columns if name not in meta_cols]
    return ordered[feature_cols].to_numpy(dtype=np.float32)


def save_shard(X_list: List[np.ndarray], y_list: List[str], pid_list: List[int],
               split_name: str, shard_id: int):
    """Write one shard of sequences to a compressed ``.npz`` file.

    The file goes to ``{OUT_DIR}/{split_name}/shard_{shard_id:03d}.npz``; the
    split directory is created if needed. Three arrays are stored: ``X`` (the
    arrays of ``X_list`` stacked along a new first axis), ``y`` (the labels)
    and ``participant`` (the ids as int32). A one-line summary is printed.

    NOTE(review): the stacked ``X`` is presumably [N, MAX_FRAMES, D], which
    requires every array in ``X_list`` to share one shape — confirm in the caller.
    """
    split_dir = f"{OUT_DIR}/{split_name}"
    os.makedirs(split_dir, exist_ok=True)
    out_path = f"{split_dir}/shard_{shard_id:03d}.npz"

    payload = {
        "X": np.stack(X_list, axis=0),
        "y": np.array(y_list),
        "participant": np.array(pid_list, dtype=np.int32),
    }
    np.savez_compressed(out_path, **payload)
    print(f"[{split_name}] guardado {out_path} con {len(X_list)} secuencias")



In [None]:
# Load the training metadata CSV.
train_df = pd.read_csv(TRAIN_CSV)
print("train.csv ->", train_df.shape, "columnas:", list(train_df.columns))

# Fail fast when a column the rest of the pipeline reads is absent.
required_cols = {"path", "file_id", "sequence_id", "participant_id", "phrase"}
missing = required_cols.difference(train_df.columns)
if missing:
    raise ValueError(f"Faltan columnas en train.csv: {missing}")

# Keep a single row per sequence_id and report how many rows were dropped.
before = len(train_df)
train_df = train_df.drop_duplicates(subset=["sequence_id"])
removed = before - len(train_df)
print(f"Duplicados por sequence_id eliminados: {removed}")

# Resolve the parquet files referenced by the CSV (optionally capped for debugging).
parquet_paths = list_parquet_files(train_df)
if DEBUG_MAX_PARQUETS is not None:
    parquet_paths = parquet_paths[:DEBUG_MAX_PARQUETS]

print("Parquets a procesar:", len(parquet_paths))
assert parquet_paths, "No se encontraron archivos .parquet"

# Read only the schema of the first parquet to decide which columns to keep.
import pyarrow.parquet as pq
first_schema = pq.ParquetFile(parquet_paths[0]).schema
first_cols = first_schema.names
selected_cols = select_columns(first_cols)
print(f"Total columnas en parquet[0]: {len(first_cols)} | Seleccionadas: {len(selected_cols)}")

## Exploración

In [5]:
# Preview the first rows of the landmarks DataFrame `df`.
#dataset_df.head()
df.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0_level_0,frame,x_face_0,x_face_1,x_face_2,x_face_3,x_face_4,x_face_5,x_face_6,x_face_7,x_face_8,...,z_right_hand_11,z_right_hand_12,z_right_hand_13,z_right_hand_14,z_right_hand_15,z_right_hand_16,z_right_hand_17,z_right_hand_18,z_right_hand_19,z_right_hand_20
sequence_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1975433633,0,0.578892,0.578482,0.582906,0.572686,0.57903,0.582115,0.59143,0.498995,0.595773,...,-0.253687,-0.291687,-0.123892,-0.195255,-0.249135,-0.284375,-0.12505,-0.187797,-0.224827,-0.249662
1975433633,1,0.577563,0.578528,0.582916,0.57276,0.57909,0.582197,0.591687,0.497613,0.596104,...,,,,,,,,,,
1975433633,2,0.576181,0.576949,0.581346,0.572293,0.577725,0.581191,0.59155,0.497113,0.596477,...,-0.233556,-0.267114,-0.116145,-0.179053,-0.223861,-0.253004,-0.12209,-0.173169,-0.200727,-0.219106
1975433633,3,0.575575,0.577569,0.581769,0.572443,0.578289,0.581652,0.591728,0.496759,0.596573,...,-0.190909,-0.218471,-0.093956,-0.149982,-0.188452,-0.211573,-0.102759,-0.147642,-0.168562,-0.180578
1975433633,4,0.577907,0.577628,0.582295,0.57293,0.578345,0.581873,0.592467,0.49817,0.597383,...,,,,,,,,,,


Cada entrada en train.csv tiene el nombre del archivo (path), el id del archivo (file_id), el id de la secuencia u oración (sequence_id), el id del participante (participant_id) y la frase que se representa con el lenguaje de señas (phrase). El file_id indica el archivo que contiene los datos de los landmarks para cada frase y el sequence_id es el índice único de una secuencia dentro de cada archivo de landmarks.

In [9]:
# Collect the column labels of `df` into a plain list and print them.
columnas = list(df.columns)
print(columnas)

['frame', 'x_face_0', 'x_face_1', 'x_face_2', 'x_face_3', 'x_face_4', 'x_face_5', 'x_face_6', 'x_face_7', 'x_face_8', 'x_face_9', 'x_face_10', 'x_face_11', 'x_face_12', 'x_face_13', 'x_face_14', 'x_face_15', 'x_face_16', 'x_face_17', 'x_face_18', 'x_face_19', 'x_face_20', 'x_face_21', 'x_face_22', 'x_face_23', 'x_face_24', 'x_face_25', 'x_face_26', 'x_face_27', 'x_face_28', 'x_face_29', 'x_face_30', 'x_face_31', 'x_face_32', 'x_face_33', 'x_face_34', 'x_face_35', 'x_face_36', 'x_face_37', 'x_face_38', 'x_face_39', 'x_face_40', 'x_face_41', 'x_face_42', 'x_face_43', 'x_face_44', 'x_face_45', 'x_face_46', 'x_face_47', 'x_face_48', 'x_face_49', 'x_face_50', 'x_face_51', 'x_face_52', 'x_face_53', 'x_face_54', 'x_face_55', 'x_face_56', 'x_face_57', 'x_face_58', 'x_face_59', 'x_face_60', 'x_face_61', 'x_face_62', 'x_face_63', 'x_face_64', 'x_face_65', 'x_face_66', 'x_face_67', 'x_face_68', 'x_face_69', 'x_face_70', 'x_face_71', 'x_face_72', 'x_face_73', 'x_face_74', 'x_face_75', 'x_face_76',

68 documentos de train landmarks
* x_face 468 columnas
* x_left_hand 21 columnas
* x_pose 33 columnas
* x_right_hand 21 columnas
* y_face 468 columnas
* y_left_hand 21 columnas
* y_pose 33 columnas
* y_right_hand 21 columnas
* z_face 468 columnas
* z_left_hand 21 columnas
* z_pose 33 columnas
* z_right_hand 21 columnas


In [None]:
# Locate the sample supplemental-landmarks file. The Kaggle input path is
# tried first; the original local download path is kept as a fallback so the
# cell still runs on the machine it was written on.
rutas_candidatas = [
    '/kaggle/input/asl-fingerspelling/supplemental_landmarks/86446671.parquet',
    '/Users/arielamishaancohen/Downloads/asl-fingerspelling/supplemental_landmarks/86446671.parquet',
]
# If neither path exists, keep the first so read_parquet reports a clear error.
archivo = next(
    (ruta for ruta in rutas_candidatas if os.path.exists(ruta)),
    rutas_candidatas[0],
)

# Read the parquet file.
df = pd.read_parquet(archivo)
# Show the first rows.
df.head()

In [None]:
# Print every column label of the currently loaded DataFrame.
print(df.columns.tolist())

Cada archivo parquet contiene varias secuencias (frases), identificadas por su sequence_id y divididas en frames: cada frame es una fila y en las columnas se encuentra la información de las coordenadas (x, y, z) de los landmarks.