In [10]:
import numpy as np
import pandas as pd
import random

random.seed(42)
def parse_coordinate(val):
    if isinstance(val, str):
        clean_val = val.replace('[', '').replace(']', '').replace(',', ' ').replace('\n', ' ').strip()
        return np.fromstring(clean_val, sep=' ')
    return np.array(val)

def preprocess_dataset(df):
    """
    Converts the DataFrame of lists into a 3D Numpy Tensor.
    Output Shape: (n_samples, 240, 207)
    """
    n_samples = len(df)
    n_frames = 240
    
    # identify feature columns (excluding any ID/Label columns)
    # assuming all columns that end in _x, _y, _z are features
    feature_cols = [c for c in df.columns if c.endswith(('_x', '_y', '_z'))]
    n_features = len(feature_cols) # Should be 207
    
    print(f"Processing {n_samples} samples with {n_features} features over {n_frames} frames...")
    
    # Initialize empty 3D tensor
    X = np.zeros((n_samples, n_frames, n_features))
    
    for i in range(n_samples):
        row = df.iloc[i]
        for j, col_name in enumerate(feature_cols):
            # Parse the list of 240 floats
            val = parse_coordinate(row[col_name])
            
            # Safety check for length
            if len(val) == n_frames:
                X[i, :, j] = val
            else:
                # Handle edge case where a list might be short (pad with zeros?)
                # For now, just truncating or padding to fit
                valid_len = min(len(val), n_frames)
                X[i, :valid_len, j] = val[:valid_len]
                
    return X, feature_cols



In [11]:
# LOAD DATA EXAMPLE
df = pd.read_csv('train.csv')
X_train_unclean, feature_names = preprocess_dataset(df)

# X_train.shape should now be (345, 240, 207)
X_train_unclean.shape

Processing 345 samples with 207 features over 240 frames...


(345, 240, 207)

In [12]:
def clean_nans(X):
    """
    Fills NaN values in the 3D tensor (Samples, Time, Features).
    Strategy: 
    1. Forward Fill (propagate last valid observation forward).
    2. Backward Fill (use next valid observation to fill initial NaNs).
    3. Fill remaining (if a joint is missing for the entire shot) with 0.
    """
    # Check if there are actually NaNs to avoid unnecessary processing
    if not np.isnan(X).any():
        print("No NaNs found. Data is clean.")
        return X

    print(f"NaNs detected. Cleaning {X.shape[0]} samples...")
    
    # Iterate through every sample (shot)
    for i in range(X.shape[0]):
        # Extract the (240 frames x 207 features) matrix for this shot
        sample_data = X[i, :, :]
        
        # Convert to Pandas DataFrame to use its fill methods
        df_sample = pd.DataFrame(sample_data)
        
        # 1. Forward Fill (takes care of NaNs in the middle or end)
        df_sample = df_sample.ffill(axis=0)
        
        # 2. Backward Fill (takes care of NaNs at the very start, frame 0)
        df_sample = df_sample.bfill(axis=0)
        
        # 3. Final safety net: If a column is 100% NaN, fill with 0
        df_sample = df_sample.fillna(0.0)
        
        # Assign the cleaned data back to the numpy array
        X[i, :, :] = df_sample.values
    

    # Double check
    assert not np.isnan(X).any(), "Error: NaNs still exist!"
    print("Cleaning complete.")

    return X

# ---------------------------------------------------------
# USAGE
# ---------------------------------------------------------
# Assuming you already ran preprocess_dataset and have X_train
# X_train, feature_cols = preprocess_dataset(df)



In [13]:
def center_skeleton(X, feature_cols):
    """
    Subtracts the mid_hip position from all other joints for every frame.
    Assumes layout: X shape (Samples, Frames, Features)
    """
    # 1. Identify indices for mid_hip x, y, z
    try:
        hip_x_idx = feature_cols.index('mid_hip_x')
        hip_y_idx = feature_cols.index('mid_hip_y')
        hip_z_idx = feature_cols.index('mid_hip_z')
    except ValueError:
        print("Error: mid_hip columns not found!")
        return X

    print("Centering skeleton relative to mid_hip...")
    
    # X shape: (N, 240, 207)
    X_centered = X.copy()
    
    # We need to iterate over features in triplets (x, y, z)
    # This assumes features are ordered: nose_x, nose_y, nose_z, ...
    # A safer way is to loop through columns and find matching dimension
    
    for i, col in enumerate(feature_cols):
        if col.endswith('_x'):
            # Subtract hip_x from this column
            X_centered[:, :, i] -= X[:, :, hip_x_idx]
        elif col.endswith('_y'):
            # Subtract hip_y from this column
            X_centered[:, :, i] -= X[:, :, hip_y_idx]
        elif col.endswith('_z'):
            # Subtract hip_z from this column
            X_centered[:, :, i] -= X[:, :, hip_z_idx]
    
    print("Centering complete.")
    return X_centered

In [14]:
from sklearn.preprocessing import OneHotEncoder

def add_participant_features(X_3d, df, encoder=None):
    """
    1. One-Hot Encodes the participant_id.
    2. Repeats this encoding 240 times (for each frame).
    3. Appends it to the skeleton data.
    
    Returns: 
        X_combined: Shape (N, 240, 207 + n_participants)
        encoder: The fitted encoder object
    """
    # 1. Extract IDs and Reshape for Sklearn (requires 2D array)
    # Ensure we fill NaNs if any exist (though IDs shouldn't have NaNs)
    ids = df['participant_id'].fillna('Unknown').values.reshape(-1, 1)
    
    if encoder is None:
        # TRAIN MODE: Fit the encoder on training IDs
        # handle_unknown='ignore' ensures if Test has a new person, it becomes all 0s (safe)
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        ids_encoded = encoder.fit_transform(ids)
    else:
        # TEST MODE: Transform using existing logic
        ids_encoded = encoder.transform(ids)
        
    # ids_encoded shape is (N_samples, N_participants) e.g., (345, 10)
    
    # 2. Broadcast (Repeat) across time steps
    n_samples = X_3d.shape[0]
    n_frames = X_3d.shape[1] # 240
    n_id_features = ids_encoded.shape[1]
    
    # Create a 3D block of IDs: (N, 240, N_participants)
    # We use np.tile to repeat the ID vector for every frame
    ids_3d = np.tile(ids_encoded[:, np.newaxis, :], (1, n_frames, 1))
    
    # 3. Concatenate with Skeleton Data
    # X_3d is (N, 240, 207)
    # Combined is (N, 240, 207 + N_participants)
    X_combined = np.concatenate([X_3d, ids_3d], axis=2)
    
    return X_combined, encoder

In [15]:
#PIPELINE
from sklearn.preprocessing import StandardScaler

def processing_pipeline_with_id(df, skel_scaler=None, id_encoder=None):
    # 1. Preprocess & Clean
    X_3d, feature_names = preprocess_dataset(df)
    X_clean = clean_nans(X_3d)
    X_centered = center_skeleton(X_clean, feature_names)
    
    # 2. Scale Skeleton (StandardScaler)
    N, T, F = X_centered.shape
    X_flat = X_centered.reshape(N * T, F)
    
    if skel_scaler is None:
        skel_scaler = StandardScaler()
        X_scaled_flat = skel_scaler.fit_transform(X_flat)
    else:
        X_scaled_flat = skel_scaler.transform(X_flat)
        
    X_skel_final = X_scaled_flat.reshape(N, T, F)

    # 3. Add Participant ID
    X_final, id_encoder = add_participant_features(X_skel_final, df, id_encoder)

    return X_final, skel_scaler, id_encoder


In [16]:
import tensorflow as tf
from tensorflow.keras import layers, models

# ---------------------------------------------------------
# 1. BUILD THE BASE MODEL (Regression)
# ---------------------------------------------------------
def create_base_model(input_shape):
    model = models.Sequential()
    
    # Layer 1: LSTM to process the time-series
    model.add(layers.LSTM(64, input_shape=input_shape, return_sequences=False))
    model.add(layers.BatchNormalization())
    model.add(layers.Dropout(0.3))
    
    # Layer 2: Dense Layer
    model.add(layers.Dense(32, activation='relu'))
    model.add(layers.BatchNormalization())
    model.add(layers.Dropout(0.3))

    # Layer 3: Output Layer (Linear for regression)
    model.add(layers.Dense(1, activation='linear'))

    # Compile
    opt = tf.keras.optimizers.Adam(learning_rate=0.01)
    model.compile(optimizer=opt, loss='mse', metrics=['mae'])
    
    return model

# ---------------------------------------------------------
# 2. TRAIN MODELS FUNCTION
# ---------------------------------------------------------
def train_all_models(y_train_df, X_train, target_columns=None, epochs=20, batch_size=16):
    """
    Trains separate regression models for each target column.
    
    Args:
        y_train_df: DataFrame containing ONLY the target labels (or all labels).
        X_train: The preprocessed 3D numpy array (Samples, Frames, Features).
        target_columns: List of strings (column names to predict).
    """
    
    # Default targets
    if target_columns is None:
        target_columns = ['angle', 'depth', 'left_right']
        
    # Infer input shape
    input_shape = (X_train.shape[1], X_train.shape[2])
    
    trained_models = {}

    for target in target_columns:
        print(f"\n" + "="*40)
        print(f"Training model for target: {target}")
        print("="*40)
        
        # 1. Get the target data (Y) for this specific column
        # We use the passed 'y_train_df' instead of 'df'
        if len(y_train_df) != len(X_train):
            print(f"Warning: y_train_df has {len(y_train_df)} rows but X_train has {len(X_train)}. Truncating.")
            # Use a different variable name for the numpy array to avoid overwriting the argument
            current_y = y_train_df[target].values[:len(X_train)]
        else:
            current_y = y_train_df[target].values
        
        # 2. Create fresh model
        model = create_base_model(input_shape)
        
        # 3. Train
        history = model.fit(
            X_train, 
            current_y,  # <--- Use the specific numpy array for this target
            epochs=epochs, 
            batch_size=batch_size, 
            validation_split=0.2,
            verbose=1
        )
        
        # 4. Save
        trained_models[f"shot_{target}"] = model
        
        final_mae = history.history['mae'][-1]
        print(f"--> Done. Final MAE for {target}: {final_mae:.4f}")
        
    return trained_models

In [17]:
# ---------------------------------------------------------
# LOAD RAW DATA
# ---------------------------------------------------------

# Load your training CSV
print("Loading train.csv...")
train_df = pd.read_csv('train.csv')

# --- TRAIN ---
print("Processing Train...")
# Pass None for both scalers to create them
X_train_final, my_skel_scaler, my_id_encoder = processing_pipeline_with_id(
    train_df, 
    skel_scaler=None, 
    id_encoder=None
)

print(f"New Input Shape: {X_train_final.shape}") 
# Shape will be larger now! e.g., (345, 240, 207 + NumberOfParticipants)

# Train your models exactly as before (The function handles the new shape automatically)
trained_models = train_all_models(train_df, X_train_final)



# --- TEST ---

print('loading test.csv...')
test_df = pd.read_csv('test.csv')
print("Processing Test...")
# Pass the scalers/encoders we just learned
X_test_final, _, _ = processing_pipeline_with_id(
    test_df, 
    skel_scaler=my_skel_scaler, 
    id_encoder=my_id_encoder
)

# Predict...

print("\nAll models trained successfully!")

Loading train.csv...
Processing Train...
Processing 345 samples with 207 features over 240 frames...
NaNs detected. Cleaning 345 samples...
Cleaning complete.
Centering skeleton relative to mid_hip...
Centering complete.
New Input Shape: (345, 240, 212)

Training model for target: angle
Epoch 1/20


  super().__init__(**kwargs)


[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 56ms/step - loss: 2165.0178 - mae: 46.2703 - val_loss: 1465.6930 - val_mae: 38.0796
Epoch 2/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 48ms/step - loss: 1991.9305 - mae: 44.4266 - val_loss: 1167.6091 - val_mae: 33.9515
Epoch 3/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 48ms/step - loss: 1583.5510 - mae: 39.5602 - val_loss: 798.8459 - val_mae: 27.9586
Epoch 4/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 50ms/step - loss: 1086.0554 - mae: 32.5109 - val_loss: 325.1088 - val_mae: 17.5688
Epoch 5/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 48ms/step - loss: 534.3041 - mae: 22.1649 - val_loss: 64.4316 - val_mae: 7.2824
Epoch 6/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 48ms/step - loss: 155.8820 - mae: 10.8231 - val_loss: 25.0379 - val_mae: 3.4963
Epoch 7/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

In [18]:
# ---------------------------------------------------------
# STEP 4: GENERATE SUBMISSION (Immediately after training)
# ---------------------------------------------------------
print("Loading test.csv...")
test_df = pd.read_csv('test.csv')

# A. Process Test Data using the SAME scaler from Step 2
X_test_final, _, _ = processing_pipeline_with_id(
    test_df, 
    skel_scaler=my_skel_scaler, 
    id_encoder=my_id_encoder)

# B. Generate Predictions & Scale to 0-1
submission_data = {'id': test_df['id']}

# Manual Scaler Params (from your competition rules)
scaling_params = {
    'angle':      {'min': 30,  'max': 60},
    'depth':      {'min': -12, 'max': 30},
    'left_right': {'min': -16, 'max': 16}
}

tasks = [
    {'name': 'angle',      'col': 'scaled_angle',      'model_key': 'shot_angle'},
    {'name': 'depth',      'col': 'scaled_depth',      'model_key': 'shot_depth'},
    {'name': 'left_right', 'col': 'scaled_left_right', 'model_key': 'shot_left_right'}
]

for task in tasks:
    model = trained_models[task['model_key']]
    
    # Predict raw value (e.g. 45 degrees)
    raw_preds = model.predict(X_test_final, verbose=0).flatten()
    
    # Scale to 0-1 range
    p = scaling_params[task['name']]
    scaled_preds = (raw_preds - p['min']) / (p['max'] - p['min'])
    
    # Clip and Save
    submission_data[task['col']] = np.clip(scaled_preds, 0, 1)

# C. Save
pd.DataFrame(submission_data).to_csv('submission.csv', index=False)
print("Saved submission.csv")

Loading test.csv...
Processing 113 samples with 207 features over 240 frames...
NaNs detected. Cleaning 113 samples...
Cleaning complete.
Centering skeleton relative to mid_hip...
Centering complete.
Saved submission.csv
