# ===============================================================
# 📗 NOTEBOOK 2: Data Preparation & Analysis
# Load landmarks, prepare sequences, create train/val/test splits
# ===============================================================

In [None]:
# --- 1: Setup ---

!pip install tensorflow pandas numpy matplotlib seaborn scikit-learn

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import warnings
# Suppress every Python warning for the rest of the notebook session
warnings.filterwarnings('ignore')

# Mount Google Drive at /content/drive (only works inside Google Colab)
from google.colab import drive
drive.mount('/content/drive')

print("‚úÖ Setup complete!")


In [None]:
# --- 2: Load Metadata ---

def load_dataset(landmarks_path='/content/landmarks_data',
                 metadata_path='/content/metadata/sentence_dataset_metadata.csv',
                 sentence_mapping_path='/content/metadata/sentence_mapping.json'):
    """Load the video metadata table and the optional sentence mapping.

    Args:
        landmarks_path: Kept for backward compatibility; this function does
            not read it.
        metadata_path: CSV file (read as UTF-8 with optional BOM) that must
            contain a ``success`` column.
        sentence_mapping_path: JSON file holding the sentence mapping.

    Returns:
        ``(success_df, sentence_mapping)`` where ``success_df`` holds the
        metadata rows whose ``success`` value equals ``True`` and
        ``sentence_mapping`` is the parsed JSON content, or ``None`` when the
        file is missing or cannot be read/parsed.
    """
    metadata = pd.read_csv(metadata_path, encoding='utf-8-sig')
    print(f"üìä Metadata loaded: {len(metadata)} entries")

    # Element-wise comparison on the column, hence `== True` rather than `is True`
    success_df = metadata[metadata['success'] == True]  # noqa: E712
    print(f"‚úÖ Successful: {len(success_df)} videos")

    # The mapping is optional: a missing or unreadable file must not abort the
    # load, but only the expected error types are handled so that real bugs
    # (and KeyboardInterrupt) are no longer swallowed.
    try:
        with open(sentence_mapping_path, 'r', encoding='utf-8') as f:
            sentence_mapping = json.load(f)
    except FileNotFoundError:
        sentence_mapping = None
        print("‚ö†Ô∏è No sentence mapping found")
    except (OSError, ValueError) as err:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError
        sentence_mapping = None
        print(f"Sentence mapping could not be read ({err}); continuing without it")
    else:
        print(f"üìù Sentence mapping loaded: {len(sentence_mapping)} sentences")

    return success_df, sentence_mapping

# Read the metadata of successfully processed videos, then preview a few columns
success_df, sentence_mapping = load_dataset()
preview_columns = ['sentence', 'signer_id', 'left_hand_coverage']
print("\nüìã First 5 entries:")
print(success_df[preview_columns].head())


In [None]:
# --- 3: Load Landmark Sequences ---

def load_landmark_sequences(metadata_df, landmarks_folder='/content/landmarks_data',
                            max_frames=None):
    """Load all landmark sequences into one zero-padded 3-D array.

    Each row of ``metadata_df`` names a ``.npy`` file (column
    ``landmarks_file``) that holds a sequence of per-frame dicts with the keys
    ``left_hand``, ``right_hand``, ``pose`` and ``lip_roi``. A part whose value
    is ``None`` is replaced by zeros of its expected size. Files that fail to
    load or parse are reported and skipped.

    Sequences usually differ in frame count, so they are zero-padded at the
    end (or truncated) to a common length; otherwise they cannot be stacked
    into the 3-D array the rest of the pipeline indexes.

    Args:
        metadata_df: DataFrame with ``landmarks_file``, ``sentence`` and
            ``signer_id`` columns.
        landmarks_folder: Directory containing the ``.npy`` files.
        max_frames: Common sequence length. ``None`` (default) uses the
            longest loaded sequence.

    Returns:
        ``(X, y, signer_ids)`` where ``X`` has shape
        ``(n_sequences, n_frames, n_features)`` and ``y`` / ``signer_ids`` are
        1-D arrays aligned with the first axis of ``X``.

    Raises:
        ValueError: If the loaded files disagree on features per frame.
    """
    # Expected flattened size of each part; used to zero-fill missing parts.
    part_sizes = (
        ('left_hand', 21 * 4),   # 21 points x 4 values
        ('right_hand', 21 * 4),
        ('pose', 25 * 4),        # 25 points x 4 values
        ('lip_roi', 50 * 4),     # ~50 lip points x 4 values
    )

    sequences = []
    labels = []
    signers = []

    print("üîÑ Loading landmark sequences...")

    for _, row in metadata_df.iterrows():
        try:
            file_path = os.path.join(landmarks_folder, row['landmarks_file'])
            # NOTE(security): allow_pickle=True is needed because the files
            # store Python dicts, but unpickling can execute arbitrary code.
            # Only load files from a trusted source.
            landmarks_data = np.load(file_path, allow_pickle=True)

            frames = []
            for frame in landmarks_data:
                parts = []
                for key, size in part_sizes:
                    values = frame[key]
                    if values is None:
                        parts.append(np.zeros(size))
                    else:
                        # ravel() accepts both flat and nested per-point layouts
                        parts.append(np.asarray(values, dtype=float).ravel())
                frames.append(np.concatenate(parts))

            if not frames:
                raise ValueError("file contains no frames")

            # np.stack raises if frames differ in length, skipping a bad file
            sequence = np.stack(frames)
            label = row['sentence']
            signer = row['signer_id']
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole load
            print(f"‚ùå Error loading {row['landmarks_file']}: {e}")
            continue

        sequences.append(sequence)
        labels.append(label)
        signers.append(signer)

    if sequences:
        feature_dims = {seq.shape[1] for seq in sequences}
        if len(feature_dims) != 1:
            raise ValueError(
                f"Inconsistent features per frame across files: {sorted(feature_dims)}"
            )
        n_features = feature_dims.pop()
        longest = max(seq.shape[0] for seq in sequences)
        target_len = longest if max_frames is None else int(max_frames)

        X = np.zeros((len(sequences), target_len, n_features), dtype=float)
        for i, seq in enumerate(sequences):
            keep = min(seq.shape[0], target_len)
            X[i, :keep] = seq[:keep]
    else:
        # Keep the result 3-D so the shape report below still works
        X = np.zeros((0, 0, 0), dtype=float)

    print(f"\n‚úÖ Loaded {len(X)} sequences")
    print(f"   Shape: {X.shape}")
    print(f"   Features per frame: {X.shape[2]}")

    return X, np.array(labels), np.array(signers)

# Load the landmark sequence, sentence label and signer id for every row of
# the filtered metadata
X, y, signer_ids = load_landmark_sequences(success_df)


In [None]:
# --- 4: Analyze Class Distribution ---

def analyze_class_distribution(y_labels):
    """Print and plot how many videos exist per sentence.

    Prints one line per sentence (most frequent first) with its count and
    share of all labels, then draws a horizontal bar chart of the 15 most
    frequent sentences, saves it to ``/content/class_distribution.png`` and
    shows it.

    Args:
        y_labels: Sequence of sentence strings, one per video.

    Returns:
        ``collections.Counter`` mapping each sentence to its video count.
    """
    counter = Counter(y_labels)

    # most_common() sorts by count, descending; ties keep first-seen order
    sorted_items = counter.most_common()
    total = len(y_labels)

    print("\nüìä CLASS DISTRIBUTION")
    print("="*50)

    for sentence, count in sorted_items:
        percentage = (count / total) * 100
        print(f"{sentence[:30]:30} : {count:3} videos ({percentage:5.1f}%)")

    # Plot the most frequent sentences
    top_items = sorted_items[:15]
    # Only add an ellipsis when the label was actually shortened
    sentences = [s if len(s) <= 20 else s[:20] + '...' for s, _ in top_items]
    counts = [c for _, c in top_items]

    plt.figure(figsize=(12, 6))
    plt.barh(range(len(sentences)), counts)
    plt.yticks(range(len(sentences)), sentences)
    plt.xlabel('Number of Videos')
    plt.title('Top 15 Sentences by Video Count')
    plt.tight_layout()
    plt.savefig('/content/class_distribution.png', dpi=150)
    plt.show()

    return counter

# Report and plot the per-sentence video counts; keep the Counter for later use
counter = analyze_class_distribution(y)


In [None]:
# --- 5: Encode Labels ---

def encode_labels(y_labels):
    """Convert sentence strings to integer class labels.

    Fits a ``LabelEncoder`` on ``y_labels`` and persists it together with a
    readable mapping:

    * ``/content/label_encoder.pkl`` holds the pickled fitted encoder.
    * ``/content/label_mapping.json`` holds ``{sentence: class index}``.

    Args:
        y_labels: Sequence of sentence strings, one per sample.

    Returns:
        ``(y_encoded, label_encoder)``: the integer label array aligned with
        ``y_labels`` and the fitted encoder.
    """
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y_labels)

    # transform() returns numpy integers, which json.dump rejects with
    # "Object of type int64 is not JSON serializable"; cast keys and values
    # to built-in types before serializing.
    class_names = label_encoder.classes_
    class_indices = label_encoder.transform(class_names)
    label_mapping = {
        str(sentence): int(idx) for sentence, idx in zip(class_names, class_indices)
    }

    print(f"\nüî¢ Label Encoding:")
    print(f"   Classes: {len(label_mapping)}")
    print(f"   Sample mapping:")
    for sentence, idx in list(label_mapping.items())[:5]:
        print(f"   {sentence[:30]:30} ‚Üí {idx}")

    # Persist the encoder so encoded labels can be decoded later
    with open('/content/label_encoder.pkl', 'wb') as f:
        pickle.dump(label_encoder, f)

    # ensure_ascii=False keeps non-ASCII sentences readable in the JSON file
    with open('/content/label_mapping.json', 'w', encoding='utf-8') as f:
        json.dump(label_mapping, f, indent=2, ensure_ascii=False)

    return y_encoded, label_encoder

# Encode the sentence labels as integers; keep the fitted encoder as well
y_encoded, label_encoder = encode_labels(y)


In [None]:
# --- 6: Create Train/Val/Test Splits (Signer Independent) ---

def create_signer_independent_splits(X, y, signer_ids, test_size=0.2, val_size=0.1,
                                     random_state=42):
    """Split the data so that no signer appears in more than one split.

    Signers (not samples) are shuffled and partitioned into train/val/test
    groups. Every sample then follows its signer, so validation and test
    measure generalization to unseen signers.

    Args:
        X: Array of samples, indexable with a boolean mask on the first axis.
        y: Array of labels aligned with ``X``.
        signer_ids: Array of signer identifiers aligned with ``X``.
        test_size: Fraction of signers assigned to the test split.
        val_size: Fraction of signers assigned to the validation split.
        random_state: Seed for the signer shuffle. The default reproduces the
            permutation of the former ``np.random.seed(42)`` call without
            touching NumPy's global random state.

    Returns:
        ``(X_train, y_train), (X_val, y_val), (X_test, y_test), signer_info``
        where ``signer_info`` is a JSON-serializable dict listing the signers
        of each split.

    Raises:
        ValueError: If there are too few signers to leave at least one signer
            for training.
    """
    unique_signers = np.unique(signer_ids)
    print(f"\nüë• Unique signers: {unique_signers}")

    # Split signers into train/val/test. int() truncates, so with few signers
    # a non-zero fraction could round down to zero signers and silently
    # produce an empty split; guarantee at least one signer per requested split.
    n_signers = len(unique_signers)
    n_test = max(1, int(n_signers * test_size)) if test_size > 0 else 0
    n_val = max(1, int(n_signers * val_size)) if val_size > 0 else 0
    n_train = n_signers - n_test - n_val

    if n_train < 1:
        raise ValueError(
            f"Not enough signers ({n_signers}) for a signer-independent split: "
            f"{n_test} test + {n_val} val signers leave none for training"
        )

    # Local generator: same permutation as seeding the global RNG, but no
    # side effect on other code that uses np.random
    rng = np.random.RandomState(random_state)
    shuffled_signers = rng.permutation(unique_signers)

    train_signers = shuffled_signers[:n_train]
    val_signers = shuffled_signers[n_train:n_train + n_val]
    test_signers = shuffled_signers[n_train + n_val:]

    print(f"\nüìä Split by signer:")
    print(f"   Train signers ({len(train_signers)}): {train_signers}")
    print(f"   Val signers ({len(val_signers)}): {val_signers}")
    print(f"   Test signers ({len(test_signers)}): {test_signers}")

    # Boolean masks selecting the samples of each signer group
    train_mask = np.isin(signer_ids, train_signers)
    val_mask = np.isin(signer_ids, val_signers)
    test_mask = np.isin(signer_ids, test_signers)

    X_train, y_train = X[train_mask], y[train_mask]
    X_val, y_val = X[val_mask], y[val_mask]
    X_test, y_test = X[test_mask], y[test_mask]

    print(f"\nüìä Dataset sizes:")
    print(f"   Train: {len(X_train)} videos")
    print(f"   Val:   {len(X_val)} videos")
    print(f"   Test:  {len(X_test)} videos")

    # A class missing from a split shows up here as a lower class count
    print(f"\nüéØ Classes in each split:")
    print(f"   Train: {len(np.unique(y_train))} classes")
    print(f"   Val:   {len(np.unique(y_val))} classes")
    print(f"   Test:  {len(np.unique(y_test))} classes")

    # tolist() converts numpy scalars to built-in types for JSON
    return (X_train, y_train), (X_val, y_val), (X_test, y_test), {
        'train_signers': train_signers.tolist(),
        'val_signers': val_signers.tolist(),
        'test_signers': test_signers.tolist()
    }

# Partition the data by signer, then unpack each (features, labels) pair
train_split, val_split, test_split, signer_splits = create_signer_independent_splits(
    X, y_encoded, signer_ids
)
X_train, y_train = train_split
X_val, y_val = val_split
X_test, y_test = test_split

# Record which signer went to which split
with open('/content/signer_splits.json', 'w') as splits_file:
    json.dump(signer_splits, splits_file, indent=2)


In [None]:
# --- 7: Normalize Features ---

def normalize_features(X_train, X_val, X_test, save_dir='/content', eps=1e-8):
    """Z-score normalize each feature using training-set statistics.

    Mean and standard deviation are computed per feature over the sample and
    frame axes of ``X_train`` only, then applied to all three splits so no
    validation/test information leaks into the statistics. The statistics are
    saved as ``normalization_mean.npy`` and ``normalization_std.npy`` in
    ``save_dir``.

    Args:
        X_train: Training array of shape ``(samples, frames, features)``.
        X_val: Validation array with the same trailing feature axis.
        X_test: Test array with the same trailing feature axis.
        save_dir: Directory that receives the two ``.npy`` statistics files.
        eps: Features whose standard deviation is below this threshold are
            treated as constant and left unscaled.

    Returns:
        ``(X_train_norm, X_val_norm, X_test_norm)``.

    Raises:
        ValueError: If ``X_train`` contains no data.
    """
    if np.size(X_train) == 0:
        # mean/std of an empty array are NaN and would poison every split
        raise ValueError("X_train is empty; cannot compute normalization statistics")

    # Statistics come from the training data only
    mean = np.mean(X_train, axis=(0, 1), keepdims=True)
    std = np.std(X_train, axis=(0, 1), keepdims=True)
    # A constant feature can have a tiny non-zero std from floating-point
    # rounding; an exact `== 0` test would miss it and amplify the noise.
    std[std < eps] = 1

    X_train_norm = (X_train - mean) / std
    X_val_norm = (X_val - mean) / std
    X_test_norm = (X_test - mean) / std

    print(f"\nüìä Normalization stats:")
    print(f"   Mean shape: {mean.shape}")
    print(f"   Std shape: {std.shape}")
    print(f"   X_train range: [{X_train_norm.min():.2f}, {X_train_norm.max():.2f}]")

    # Save the statistics so the same scaling can be re-applied later
    os.makedirs(save_dir, exist_ok=True)
    np.save(os.path.join(save_dir, 'normalization_mean.npy'), mean)
    np.save(os.path.join(save_dir, 'normalization_std.npy'), std)

    return X_train_norm, X_val_norm, X_test_norm

# Normalize all three splits with statistics computed from the training split
X_train_norm, X_val_norm, X_test_norm = normalize_features(X_train, X_val, X_test)


In [None]:
# --- 8: Save Prepared Data ---

def save_prepared_data(X_train, X_val, X_test, y_train, y_val, y_test,
                       output_dir='/content/prepared_data'):
    """Save the prepared splits and a small JSON summary for training.

    Writes ``X_*.npy`` and ``y_*.npy`` for each split plus
    ``dataset_info.json`` into ``output_dir`` (created if missing).

    Args:
        X_train, X_val, X_test: Feature arrays; ``X_train`` must be 3-D
            because its second and third dimensions are recorded as sequence
            length and features per frame.
        y_train, y_val, y_test: Label arrays aligned with the feature arrays.
        output_dir: Destination directory.

    Returns:
        None.
    """
    os.makedirs(output_dir, exist_ok=True)

    arrays = {
        'X_train': X_train, 'X_val': X_val, 'X_test': X_test,
        'y_train': y_train, 'y_val': y_val, 'y_test': y_test,
    }
    for name, array in arrays.items():
        np.save(os.path.join(output_dir, f'{name}.npy'), array)

    # Count classes over every split. With signer-independent splits a class
    # may be absent from the training set, so counting y_train alone would
    # under-report the number of classes.
    all_labels = np.concatenate([np.ravel(y_train), np.ravel(y_val), np.ravel(y_test)])

    metadata = {
        'train_samples': len(X_train),
        'val_samples': len(X_val),
        'test_samples': len(X_test),
        'sequence_length': int(X_train.shape[1]),
        'features_per_frame': int(X_train.shape[2]),
        'num_classes': int(len(np.unique(all_labels))),
        'normalized': True
    }

    with open(os.path.join(output_dir, 'dataset_info.json'), 'w') as f:
        json.dump(metadata, f, indent=2)

    print(f"\nüíæ Data saved to {output_dir}/")
    print(f"   Train: {X_train.shape}")
    print(f"   Val:   {X_val.shape}")
    print(f"   Test:  {X_test.shape}")

# Persist the normalized features together with the encoded labels
save_prepared_data(X_train_norm, X_val_norm, X_test_norm, 
                  y_train, y_val, y_test)



In [None]:
# --- 9: Quick Data Quality Check ---

def data_quality_check(X_train, y_train):
    """Print basic sanity checks for the training data and plot one sample.

    Reports NaN/Inf presence, the number of near-constant features
    (variance below 1e-6), the smallest and largest class counts, and saves a
    plot of the first feature of one randomly chosen sequence to
    ``/content/sample_sequence.png``.
    """
    print("\nüîç DATA QUALITY CHECK")
    print("="*50)

    # NaN values
    nan_verdict = '‚ùå YES' if np.isnan(X_train).any() else '‚úÖ NO'
    print(f"Contains NaN values: {nan_verdict}")

    # Infinite values
    inf_verdict = '‚ùå YES' if np.isinf(X_train).any() else '‚úÖ NO'
    print(f"Contains Inf values: {inf_verdict}")

    # Per-feature variance over samples and frames
    feature_variance = np.var(X_train, axis=(0, 1))
    n_flat_features = np.sum(feature_variance < 1e-6)
    print(f"Zero-variance features: {n_flat_features}/{X_train.shape[2]}")

    # Smallest vs. largest class
    _, per_class = np.unique(y_train, return_counts=True)
    smallest, largest = per_class.min(), per_class.max()
    print(f"Class balance: min={smallest}, max={largest}, ratio={largest/smallest:.1f}")

    # First feature of a randomly picked sequence, plotted over time
    picked = np.random.randint(0, len(X_train))
    first_feature_track = X_train[picked, :, 0]
    plt.figure(figsize=(15, 4))
    plt.plot(first_feature_track)
    plt.title(f'Sample Sequence - Class {y_train[picked]}')
    plt.xlabel('Frame')
    plt.ylabel('Feature Value')
    plt.savefig('/content/sample_sequence.png', dpi=150)
    plt.show()

# Run the sanity checks on the normalized training split
data_quality_check(X_train_norm, y_train)



In [None]:
# --- 10: Create TensorFlow Dataset ---

import tensorflow as tf

def create_tf_datasets(X_train, y_train, X_val, y_val, X_test, y_test, batch_size=32,
                       num_classes=None):
    """Create batched ``tf.data`` datasets with one-hot labels.

    Args:
        X_train, X_val, X_test: Feature arrays for each split.
        y_train, y_val, y_test: Integer class labels aligned with the
            feature arrays.
        batch_size: Number of samples per batch.
        num_classes: Width of the one-hot encoding. ``None`` (default) uses
            the largest label over all three splits plus one.

    Returns:
        ``(train_dataset, val_dataset, test_dataset, num_classes)``. Only the
        training dataset is shuffled.
    """
    if num_classes is None:
        # Counting only the classes seen in y_train breaks when a class is
        # missing from the training split: to_categorical then receives
        # labels >= num_classes and fails. Size from the largest label of any
        # split instead.
        largest_label = max(
            (int(np.max(labels)) for labels in (y_train, y_val, y_test) if len(labels) > 0),
            default=-1,
        )
        num_classes = largest_label + 1

    # Convert to one-hot
    y_train_cat = tf.keras.utils.to_categorical(y_train, num_classes)
    y_val_cat = tf.keras.utils.to_categorical(y_val, num_classes)
    y_test_cat = tf.keras.utils.to_categorical(y_test, num_classes)

    # Create datasets
    train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train_cat))
    val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val_cat))
    test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test_cat))

    # Batch and prefetch
    train_dataset = train_dataset.shuffle(1000).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    val_dataset = val_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    test_dataset = test_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

    # cardinality() reports the batch count without iterating the dataset,
    # unlike len(list(dataset)), which loads every batch just to count them
    print(f"\nüì¶ TensorFlow datasets created:")
    print(f"   Batch size: {batch_size}")
    print(f"   Train batches: {int(train_dataset.cardinality().numpy())}")
    print(f"   Val batches: {int(val_dataset.cardinality().numpy())}")
    print(f"   Test batches: {int(test_dataset.cardinality().numpy())}")

    return train_dataset, val_dataset, test_dataset, num_classes

# Build the batched datasets from the normalized features and encoded labels
train_ds, val_ds, test_ds, num_classes = create_tf_datasets(
    X_train_norm, y_train, X_val_norm, y_val, X_test_norm, y_test
)



In [None]:
# --- 11: Download Prepared Data ---

from google.colab import files
import shutil

# Bundle the prepared-data folder into a zip next to it; make_archive returns
# the path of the archive it wrote, which is then offered as a browser download
archive_path = shutil.make_archive(
    base_name='/content/prepared_data',
    format='zip',
    root_dir='/content/prepared_data',
)
files.download(archive_path)

print("‚úÖ Notebook 2 complete! Ready for model training.")