In [1]:
import os
import json

import cv2
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoImageProcessor, AutoModelForPreTraining


# Report which compute device PyTorch will run on
if not torch.cuda.is_available():
    print("CUDA is not available. PyTorch is using the CPU.")
else:
    print("CUDA is available. PyTorch can use the GPU.")
    print(f"Device name: {torch.cuda.get_device_name(0)}")


# Directory joined in front of every Excel/JSON file name ('' resolves relative to the working directory)
base_path = ''
# Default `output_size` of the model heads defined further down
FeatureNum = 6


def process_action_data(base_path, action_name, train_ratio=0.7, val_ratio=0.15, random_state=42):
    """
    Load one action's annotation table and pose arrays, attach the poses to the
    table, shuffle it, and split it into train / validation / test DataFrames.

    Three files are read from `base_path`:
    `<action_name>.xlsx`, `front_pose_<action_name>.json` and `lat_pose_<action_name>.json`.
    Entry i of each JSON array is attached to row i of the Excel table, so all three
    must have the same length.

    Args:
        base_path (str): The base directory containing the files.
        action_name (str): The name of the action (e.g., 'squat', 'deadlift', 'lunges').
        train_ratio (float): Fraction of rows placed in the training split.
        val_ratio (float): Fraction of rows placed in the validation split.
            The test split receives whatever rows remain.
        random_state (int): Seed for the shuffle, so the split is reproducible.

    Returns:
        tuple: train_df, val_df, test_df DataFrames, each carrying the extra
        `front_pose` and `lat_pose` columns.

    Raises:
        ValueError: If the ratios are negative or sum to more than 1, or if a pose
            array's length differs from the number of rows in the Excel table.
    """
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1:
        raise ValueError(
            f"Invalid split ratios: train_ratio={train_ratio}, val_ratio={val_ratio} "
            "(both must be >= 0 and sum to at most 1)."
        )

    # Load the Excel file
    df = pd.read_excel(os.path.join(base_path, f"{action_name}.xlsx"))

    def load_json_as_numpy(json_file):
        with open(json_file, 'r') as file:
            return np.array(json.load(file))

    # Load the JSON files for front and lateral poses
    front_pose_array = load_json_as_numpy(os.path.join(base_path, f"front_pose_{action_name}.json"))
    lat_pose_array = load_json_as_numpy(os.path.join(base_path, f"lat_pose_{action_name}.json"))

    # Validate before touching the DataFrame so a mismatch never leaves it half-populated
    if len(front_pose_array) != len(df) or len(lat_pose_array) != len(df):
        raise ValueError(
            "The length of the loaded arrays does not match the DataFrame "
            f"(action={action_name!r}, rows={len(df)}, "
            f"front_pose={len(front_pose_array)}, lat_pose={len(lat_pose_array)})."
        )

    # One array per row; list() splits along the first axis
    df['front_pose'] = list(front_pose_array)
    df['lat_pose'] = list(lat_pose_array)

    # Shuffle the DataFrame
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Calculate the number of samples for each set (truncating toward zero)
    total_samples = len(df)
    train_size = int(total_samples * train_ratio)
    val_size = int(total_samples * val_ratio)

    # Split positionally; the test split absorbs the rounding remainder
    train_df = df.iloc[:train_size]
    val_df = df.iloc[train_size:train_size + val_size]
    test_df = df.iloc[train_size + val_size:]

    return train_df, val_df, test_df



# Define the actions
actions = ['squat', 'deadlift', 'lunges']

# Load and split every action exactly once; each value is a (train_df, val_df, test_df) tuple
splits = {action: process_action_data(base_path, action) for action in actions}

# Per-action splits under their original names (code below still refers to e.g. `train_df_squat`)
train_df_squat, val_df_squat, test_df_squat = splits['squat']
train_df_dead, val_df_dead, test_df_dead = splits['deadlift']
train_df_lunge, val_df_lunge, test_df_lunge = splits['lunges']

# Concatenate and shuffle DataFrames for each split
train_df = pd.concat([splits[action][0] for action in actions]).sample(frac=1, random_state=1).reset_index(drop=True)
val_df = pd.concat([splits[action][1] for action in actions]).sample(frac=1, random_state=1).reset_index(drop=True)
test_df = pd.concat([splits[action][2] for action in actions]).sample(frac=1, random_state=1).reset_index(drop=True)



CUDA is not available. PyTorch is using the CPU.


FileNotFoundError: [Errno 2] No such file or directory: 'squat.xlsx'

# Vision

In [None]:

import torch.nn as nn
import torch
import torchvision.models.video as models


class PretrainedResNet3D(nn.Module):
    """
    Feature extractor built on torchvision's `r3d_18` video ResNet.

    The final fully connected layer is replaced with `nn.Identity`, so the module
    returns backbone features instead of class scores. Every parameter whose name
    contains neither "layer3" nor "layer4" is frozen (`requires_grad = False`);
    only those two stages stay trainable.

    Construction and forward errors propagate to the caller. Earlier versions
    printed them and carried on, which left a half-built module or returned
    `None` and hid the original traceback.

    Args:
        pretrained (bool, optional): Forwarded to `r3d_18` to request pretrained weights (default is True).

    Attributes:
        resnet3d (nn.Module): The ResNet3D model with the final layer replaced with an identity layer.
    """
    def __init__(self, pretrained=True):
        super().__init__()
        # Load the (optionally pretrained) ResNet3D model
        self.resnet3d = models.r3d_18(pretrained=pretrained)
        # Replace the final fully connected layer with an identity layer
        self.resnet3d.fc = nn.Identity()

        # Freeze everything except the two deepest residual stages
        for name, param in self.resnet3d.named_parameters():
            if "layer3" not in name and "layer4" not in name:
                param.requires_grad = False

    def forward(self, x):
        """
        Run the input clip through the backbone.

        Args:
            x (torch.Tensor): Input tensor of shape [batch_size, channels, num_frames, height, width].

        Returns:
            torch.Tensor: Feature tensor produced by the backbone with its classifier removed.
        """
        return self.resnet3d(x)


import torch.nn.functional as F

class ResidualBlock(nn.Module):
    """
    Bottleneck residual block operating along the first spatial axis only.

    Main path: 1x1 conv to `out_channels // 2`, a (3, 1) conv that may stride along
    the first spatial axis, then a 1x1 conv back to `out_channels`, each followed by
    batch norm (plus dropout at the end). The shortcut is a strided 1x1 conv + batch
    norm when the shape changes, and an identity otherwise. The two paths are summed
    and a ReLU is applied after the addition (post-activation).

    Errors during construction or the forward pass propagate to the caller instead of
    being printed and swallowed.

    Args:
        in_channels (int): Channels of the input tensor.
        out_channels (int): Channels of the output tensor.
        stride (int): Stride along the first spatial axis (the second axis is never strided).
    """
    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        bottleneck_channels = out_channels // 2

        # Main path with bottleneck architecture for efficiency
        self.main_path = nn.Sequential(
            # Reduce channels first (bottleneck)
            nn.Conv2d(in_channels, bottleneck_channels, kernel_size=(1, 1), stride=1),
            nn.BatchNorm2d(bottleneck_channels),
            nn.ReLU(),
            # Extract features with larger kernel
            nn.Conv2d(bottleneck_channels, bottleneck_channels, kernel_size=(3, 1), stride=(stride, 1), padding=(1, 0)),
            nn.BatchNorm2d(bottleneck_channels),
            nn.ReLU(),
            # Expand channels back
            nn.Conv2d(bottleneck_channels, out_channels, kernel_size=(1, 1), stride=1),
            nn.BatchNorm2d(out_channels),
            nn.Dropout(0.2)  # Lower dropout for better feature retention
        )

        # Project the shortcut only when the main path changes the tensor shape
        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=(1, 1), stride=(stride, 1)),
                nn.BatchNorm2d(out_channels)
            )
        else:
            self.shortcut = nn.Identity()

    def forward(self, x):
        """
        Sum the main and shortcut paths, then apply ReLU.

        Args:
            x (torch.Tensor): Input of shape [batch_size, in_channels, height, width].

        Returns:
            torch.Tensor: Output with `out_channels` channels; the first spatial axis is
            reduced according to `stride`.
        """
        residual = self.shortcut(x)
        main_output = self.main_path(x)
        return F.relu(main_output + residual)  # Apply ReLU after addition



# Pose

In [None]:

class Pose_Model(nn.Module):
    """
    CNN that turns a sequence of per-frame pose features into one feature vector.

    The per-frame feature axis is treated as the channel axis and the frame axis as
    the (only non-trivial) spatial axis, so all convolutions run along time.

    Errors during construction or the forward pass propagate to the caller instead of
    being printed and swallowed.

    Args:
        input_channels (int): Per-frame feature width. The default 5984 matches the
            width noted in `compute_distances_and_angles_combined`.
        hidden_dim (int): Channels after the initial convolution.
        expansion_factor (int): Multiplier giving the channel count of the first residual stage.
        feature_dim (int): Size of the returned feature vector.
    """
    def __init__(self, input_channels=5984, hidden_dim=256, expansion_factor=2, feature_dim=512):
        super().__init__()
        mid_channels = hidden_dim * expansion_factor

        # Initial dimensionality reduction (also halves the frame axis)
        self.initial_conv = nn.Sequential(
            nn.Conv2d(input_channels, hidden_dim, kernel_size=(3, 1), stride=(2, 1), padding=(1, 0)),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2)
        )

        # Two residual stages: the first strides along the frame axis, the second does not
        self.res_stage1 = ResidualBlock(hidden_dim, mid_channels, stride=2)
        self.res_stage2 = ResidualBlock(mid_channels, feature_dim, stride=1)

        # Global average pool collapses what is left of the frame axis
        self.pool = nn.AdaptiveAvgPool2d((1, 1))

    def forward(self, x):
        """
        Extract one feature vector per sample.

        Args:
            x (torch.Tensor): Input tensor of shape [batch_size, num_frames, channels]

        Returns:
            torch.Tensor: Features of shape [batch_size, feature_dim]

        Raises:
            ValueError: If `x` is not a 3-D tensor.
        """
        if x.dim() != 3:
            raise ValueError(
                f"Pose_Model expects input of shape [batch_size, num_frames, channels], got {tuple(x.shape)}"
            )
        batch_size = x.shape[0]
        x = x.permute(0, 2, 1).unsqueeze(-1)  # [batch_size, channels, num_frames, 1]

        # Multi-stage feature extraction
        x = self.initial_conv(x)
        x = self.res_stage1(x)
        x = self.res_stage2(x)

        # Global pooling and flatten
        x = self.pool(x)
        return x.view(batch_size, -1)  # [batch_size, feature_dim]


class DualInputPose(nn.Module):
    """
    Two-view pose network: one `Pose_Model` per view, features concatenated and fed to a head.

    The head depends on `MergedOrAlone`:
      * `1` - standalone predictor ending in `output_size` logits.
      * anything else - projection back to `feature_dim`, for fusion with another
        stream (`output_size` is ignored in this mode).

    Errors during construction or the forward pass propagate to the caller instead of
    being printed and swallowed, and the per-call debug prints of tensor shapes were removed.

    Args:
        MergedOrAlone (int): Head selector, see above.
        output_size (int): Number of outputs of the standalone head.
        pose_input_channels (int): Per-frame feature width given to each `Pose_Model`.
        feature_dim (int): Feature size produced by each `Pose_Model`.
    """
    def __init__(self, MergedOrAlone=1, output_size=FeatureNum, pose_input_channels=5984, feature_dim=512):
        super().__init__()
        # Pose models for front and lateral views with consistent output dimension
        self.front_model = Pose_Model(input_channels=pose_input_channels, feature_dim=feature_dim)
        self.lat_model = Pose_Model(input_channels=pose_input_channels, feature_dim=feature_dim)

        combined_dim = feature_dim * 2

        if MergedOrAlone == 1:
            # Progressive reduction for standalone predictions
            self.fc = nn.Sequential(
                nn.Linear(combined_dim, combined_dim // 2),
                nn.BatchNorm1d(combined_dim // 2),
                nn.ReLU(),
                nn.Dropout(0.3),
                nn.Linear(combined_dim // 2, combined_dim // 4),
                nn.ReLU(),
                nn.Dropout(0.2),
                nn.Linear(combined_dim // 4, output_size)
            )
        else:
            # Project back to a single stream's width so fusion sees balanced inputs
            self.fc = nn.Sequential(
                nn.Linear(combined_dim, feature_dim),
                nn.BatchNorm1d(feature_dim),
                nn.ReLU(),
                nn.Dropout(0.2)
            )

    def forward(self, front_input, lat_input):
        """
        Encode both views, concatenate the features and apply the head.

        Args:
            front_input (torch.Tensor): Input tensor of shape [batch_size, num_frames, channels] for frontal pose data.
            lat_input (torch.Tensor): Input tensor of shape [batch_size, num_frames, channels] for lateral pose data.

        Returns:
            torch.Tensor: [batch_size, output_size] in standalone mode,
            [batch_size, feature_dim] in fusion mode.
        """
        front_features = self.front_model(front_input)  # [batch_size, feature_dim]
        lat_features = self.lat_model(lat_input)  # [batch_size, feature_dim]

        combined_features = torch.cat((front_features, lat_features), dim=1)  # [batch_size, feature_dim * 2]
        return self.fc(combined_features)


class DualInputResNet3D(nn.Module):
    """
    Two-view video network: one `PretrainedResNet3D` per view, features concatenated and fed to a head.

    The head depends on `MergedOrAlone`:
      * `1` - standalone predictor ending in `output_size` logits.
      * anything else - projection to `hidden_size`, for fusion with another stream
        (`output_size` is ignored in this mode).

    Errors during construction or the forward pass propagate to the caller instead of
    being printed and swallowed, and the per-call debug prints of tensor shapes were removed.

    Args:
        MergedOrAlone (int): Head selector, see above.
        output_size (int): Number of outputs of the standalone head.
        hidden_size (int): Feature width assumed for each backbone stream; the first
            linear layer expects `2 * hidden_size` inputs.
            NOTE(review): must equal the flattened feature size of `PretrainedResNet3D`
            (presumably 512 for r3d_18 - confirm).
        dropout_rate (float): Base dropout probability used in the head.
    """
    def __init__(self, MergedOrAlone, output_size=FeatureNum, hidden_size=512, dropout_rate=0.3):
        super().__init__()
        # One independently weighted backbone per camera view
        self.resnet3d_frontal = PretrainedResNet3D()
        self.resnet3d_lateral = PretrainedResNet3D()

        combined_dim = hidden_size * 2

        if MergedOrAlone == 1:
            # Standalone prediction path
            self.fc_layers = nn.Sequential(
                nn.Linear(combined_dim, combined_dim // 2),
                nn.BatchNorm1d(combined_dim // 2),
                nn.ReLU(),
                nn.Dropout(dropout_rate),
                nn.Linear(combined_dim // 2, combined_dim // 4),
                nn.ReLU(),
                nn.Dropout(dropout_rate * 0.7),  # Reduced dropout
                nn.Linear(combined_dim // 4, output_size)  # Final prediction
            )
        else:
            # Project back to a single stream's width, consistent with the pose stream
            self.fc_layers = nn.Sequential(
                nn.Linear(combined_dim, hidden_size),
                nn.BatchNorm1d(hidden_size),
                nn.ReLU(),
                nn.Dropout(dropout_rate * 0.7)
            )

    def forward(self, frontal, lateral):
        """
        Encode both views, concatenate the features and apply the head.

        Axes 1 and 2 of each input are swapped before it reaches the backbone, which
        takes [batch_size, channels, num_frames, height, width]. Inputs are therefore
        expected frame-major.

        Args:
            frontal (torch.Tensor): Frontal clip, shape [batch_size, num_frames, channels, height, width].
            lateral (torch.Tensor): Lateral clip, shape [batch_size, num_frames, channels, height, width].

        Returns:
            torch.Tensor: [batch_size, output_size] in standalone mode,
            [batch_size, hidden_size] in fusion mode.
        """
        # Permute to [batch_size, channels, num_frames, height, width]
        frontal = frontal.permute(0, 2, 1, 3, 4).contiguous()
        lateral = lateral.permute(0, 2, 1, 3, 4).contiguous()

        # One backbone per view; flatten whatever the backbone returns to [batch_size, features]
        x_f = self.resnet3d_frontal(frontal)
        x_f = x_f.view(x_f.size(0), -1)

        x_l = self.resnet3d_lateral(lateral)
        x_l = x_l.view(x_l.size(0), -1)

        # Concatenate features from frontal and lateral streams
        x = torch.cat((x_f, x_l), dim=1)
        return self.fc_layers(x)


# Multi

In [None]:

class MultiModalModel(nn.Module):
    """
    Multi-modal model that fuses a two-view video stream with a two-view pose stream.

    Both sub-models are built in fusion mode (`MergedOrAlone=0`), so each emits a
    feature vector rather than predictions. The fusion MLP below is hard-wired to
    512 features per stream, matching the sub-models' default widths.

    Errors during construction or the forward pass propagate to the caller instead of
    being printed and swallowed.

    Args:
        output_size (int): Number of logits produced by the classifier head.
    """
    def __init__(self, output_size=FeatureNum):
        super().__init__()
        # output_size is unused by the sub-models in fusion mode; 0 is a placeholder
        self.vision_model = DualInputResNet3D(0, output_size=0)
        self.pose_model = DualInputPose(0, output_size=0)

        # Balanced fusion network with consistent dimensions
        self.fusion_layers = nn.Sequential(
            nn.Linear(512 + 512, 768),  # Combined 1024D -> 768D
            nn.BatchNorm1d(768),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(768, 384),  # 768D -> 384D
            nn.BatchNorm1d(384),
            nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(384, 192),  # 384D -> 192D
            nn.BatchNorm1d(192),
            nn.ReLU(),
            nn.Dropout(0.2)
        )

        # Classification head with progressive reduction
        self.classifier = nn.Sequential(
            nn.Linear(192, 96),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(96, output_size)
        )

    def forward(self, image_frontal, image_lateral, pose_frontal, pose_lateral):
        """
        Forward pass of the MultiModalModel combining image and pose features.

        Args:
            image_frontal (Tensor): The frontal view video data.
            image_lateral (Tensor): The lateral view video data.
            pose_frontal (Tensor): The frontal pose data.
            pose_lateral (Tensor): The lateral pose data.

        Returns:
            Tensor: Logits of shape [batch_size, output_size].
        """
        image_features = self.vision_model(image_frontal, image_lateral)
        pose_features = self.pose_model(pose_frontal, pose_lateral)

        combined_features = torch.cat((image_features, pose_features), dim=1)
        fused_features = self.fusion_layers(combined_features)

        return self.classifier(fused_features)


# Dataloader and utils

In [None]:

def compute_pairwise_distances(points_tensor):
    """
    Euclidean distance between every unordered pair of landmarks, frame by frame.

    Args:
        points_tensor (torch.Tensor): Tensor of shape [num_frames, num_points, 3]
            holding one (x, y, z) point per landmark and frame.

    Returns:
        torch.Tensor: Tensor of shape [num_frames, num_points * (num_points - 1) / 2]
        (528 columns for 33 landmarks), one distance per landmark pair.
    """
    _, landmark_count, _ = points_tensor.size()

    # All index pairs (i, j) with i < j, i.e. the strict upper triangle
    pair_index = torch.combinations(torch.arange(landmark_count), r=2, with_replacement=False)
    first, second = pair_index[:, 0], pair_index[:, 1]

    # Difference vector for each pair: [num_frames, num_pairs, 3]
    offsets = points_tensor[:, first] - points_tensor[:, second]

    # Length of each difference vector: [num_frames, num_pairs]
    return torch.norm(offsets, dim=2)


def compute_distances_and_angles_combined(points_tensor):
    """
    Build a per-frame feature vector from pairwise landmark distances and
    triplet angle cosines.

    For every landmark triplet (a, b, c) the angle is measured at the middle
    landmark b, between the vectors b->a and b->c. The cosine of that angle is
    returned, not the angle itself.

    Args:
        points_tensor (torch.Tensor): Tensor of shape [num_frames, num_points, 3]
            holding one (x, y, z) point per landmark and frame.

    Returns:
        torch.Tensor: Tensor of shape [num_frames, comb(num_points, 2) + comb(num_points, 3)]
        (528 + 5456 = 5984 columns for 33 landmarks): distances first, cosines after.
    """
    _, landmark_count, _ = points_tensor.size()
    landmark_ids = torch.arange(landmark_count)

    # Index sets for all unordered pairs and triplets of landmarks
    pair_index = torch.combinations(landmark_ids, r=2, with_replacement=False)
    triplet_index = torch.combinations(landmark_ids, r=3, with_replacement=False)

    # Pairwise distances: [num_frames, num_pairs]
    pair_offsets = points_tensor[:, pair_index[:, 0]] - points_tensor[:, pair_index[:, 1]]
    distances = torch.norm(pair_offsets, dim=2)

    # Two arms of each triplet, both anchored at the middle landmark: [num_frames, num_triplets, 3]
    arm_a = points_tensor[:, triplet_index[:, 0]] - points_tensor[:, triplet_index[:, 1]]
    arm_b = points_tensor[:, triplet_index[:, 2]] - points_tensor[:, triplet_index[:, 1]]

    # cos(angle) = (a . b) / (|a| |b|); the epsilon guards against coincident landmarks
    arm_dot = torch.sum(arm_a * arm_b, dim=2)
    arm_scale = torch.norm(arm_a, dim=2) * torch.norm(arm_b, dim=2)
    cosines = arm_dot / (arm_scale + 1e-8)

    # Distances followed by cosines along the feature axis
    return torch.cat([distances, cosines], dim=1)




class PreprocessedPoseVideoDataset(Dataset):
    """
    Dataset yielding, per row of `df`, two preprocessed video clips, two pose
    feature sequences, the class label and the binarised ratings.

    Args:
        df (pd.DataFrame): One sample per row. `__getitem__` reads the columns
            'Num Video Frontal', 'Num Video Lateral', 'NumIdx', 'Action', 'class',
            'front_pose', 'lat_pose' and the rating columns.
        num_frames (int): Number of frame files loaded per clip.
        preprocessed_dir (str): Root directory holding `<action>/<video>_idx_<idx>_<i>.npy` frames.
        rating_columns (list[str] | None): Names of the rating columns to binarise.
            When omitted, the legacy behaviour is kept: the columns are taken from
            the module-level `train_df_squat` (every column ending in 'F' or 'L').
            NOTE(review): that fallback ties every dataset to the squat table's
            columns; pass `rating_columns` explicitly to avoid the hidden global.
    """

    def __init__(self, df, num_frames, preprocessed_dir, rating_columns=None):
        self.df = df
        self.num_frames = num_frames
        self.preprocessed_dir = preprocessed_dir
        self.rating_columns = list(rating_columns) if rating_columns is not None else None

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """
        Returns:
            tuple: (image_frontal, image_lateral, pose_frontal, pose_lateral, label_class, ratings).
            The image tensors are [num_frames, channels, height, width] scaled by 1/255.
        """
        row = self.df.iloc[idx]

        # Get frontal and lateral video identifiers
        num_video_frontal = row['Num Video Frontal']
        num_video_lateral = row['Num Video Lateral']
        num_idx = row['NumIdx']
        action = row['Action']

        # Load preprocessed frames
        frontal_frames = self._load_preprocessed_frames(num_video_frontal, action, num_idx)
        lateral_frames = self._load_preprocessed_frames(num_video_lateral, action, num_idx)

        # Channels-last frames -> channels-first, scaled to [0, 1] assuming 8-bit input
        image_frontal = torch.tensor(frontal_frames, dtype=torch.float32).permute(0, 3, 1, 2) / 255.0
        image_lateral = torch.tensor(lateral_frames, dtype=torch.float32).permute(0, 3, 1, 2) / 255.0

        label_class = torch.tensor(row['class'], dtype=torch.long)

        if self.rating_columns is None:
            # Legacy fallback: rating columns come from the global squat table
            ratings = self._process_ratings(train_df_squat, row)
        else:
            ratings = self._threshold_scores(row, self.rating_columns)
        ratings = torch.tensor(ratings, dtype=torch.float32)

        # Turn the stored landmarks into per-frame distance/cosine features
        pose_landmarks_tensor_frontal = torch.tensor(row['front_pose']).float()
        pose_landmarks_tensor_lateral = torch.tensor(row['lat_pose']).float()
        pose_frontal = compute_distances_and_angles_combined(pose_landmarks_tensor_frontal)
        pose_lateral = compute_distances_and_angles_combined(pose_landmarks_tensor_lateral)

        return (image_frontal, image_lateral, pose_frontal, pose_lateral, label_class, ratings)

    def _load_preprocessed_frames(self, num_video, action, num_idx):
        """Load frames 1..num_frames of one clip and stack them along a new leading axis."""
        frames = []
        for i in range(1, self.num_frames + 1):
            # Construct the preprocessed file path
            file_name = f"{num_video}_idx_{num_idx}_{i}.npy"
            file_path = os.path.join(self.preprocessed_dir, action, file_name)

            # Load the preprocessed .npy file
            frames.append(np.load(file_path))
        return np.stack(frames, axis=0)

    def _process_ratings(self, df, row):
        """Binarise the values of `row` in every column of `df` whose name ends in 'F' or 'L'."""
        relevant_columns = [col for col in df.columns if col.endswith(('F', 'L'))]
        return self._threshold_scores(row, relevant_columns)

    @staticmethod
    def _threshold_scores(row, columns):
        """Return 1 for each score >= 0.5 and 0 otherwise (NaN compares false, so it maps to 0)."""
        scores = row[columns].values
        return np.where(scores >= 0.5, 1, 0).tolist()



def create_dataloader(df, num_frames, batch_size=16, shuffle=True, preprocessed_dir=None):
    """
    Wrap `df` in a `PreprocessedPoseVideoDataset` and return a DataLoader over it.

    Args:
        df (pd.DataFrame): Sample table handed to the dataset.
        num_frames (int): Number of frames loaded per clip.
        batch_size (int): Samples per batch.
        shuffle (bool): Whether the loader reshuffles every epoch.
        preprocessed_dir (str | None): Root directory of the preprocessed frames.
            Defaults to 'preprocessed_images' relative to the working directory.

    Returns:
        DataLoader: Loader over the dataset.
    """
    if preprocessed_dir is None:
        preprocessed_dir = os.path.join('', 'preprocessed_images')

    dataset = PreprocessedPoseVideoDataset(df, num_frames, preprocessed_dir=preprocessed_dir)
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        # Pinned memory only helps host-to-GPU copies; requesting it without CUDA just warns
        pin_memory=torch.cuda.is_available()
    )


# Train and eval

In [None]:

def train_combined_model(CustomModel, train_dataloader, eval_dataloader, epochs=1000, lr=1e-4,
                         device='cpu', clip_grad_norm=1.0, patience=10):
    """
    Train a rating model with BCE-with-logits loss, evaluating after every epoch.

    Each batch must unpack into
    (image_frontal, image_lateral, pose_frontal, pose_lateral, label_class, ratings).
    A checkpoint is written to the working directory whenever the evaluation loss
    or the mean Hamming distance reaches a new best. Training stops early after
    `patience` consecutive epochs without either improving.

    NOTE(review): the model is called with the two image tensors only, so this loop
    fits a two-input model. `MultiModalModel.forward` takes four inputs; confirm
    which model this is meant to train.
    NOTE(review): `metrics` returned by `evaluate_combined_model` is indexed here by
    'TP'/'TN'/'FP'/'FN' directly; confirm it is aggregated over features.

    Args:
        CustomModel (nn.Module): Model to train (moved to `device` in place).
        train_dataloader: Iterable of training batches; must support `len()`.
        eval_dataloader: Batches passed to `evaluate_combined_model`.
        epochs (int): Maximum number of epochs.
        lr (float): Initial Adam learning rate (halved by ReduceLROnPlateau on plateaus).
        device (str): Device for model and batches.
        clip_grad_norm (float): Max gradient norm used for clipping.
        patience (int): Early-stopping patience in epochs.

    Returns:
        None
    """
    CustomModel.to(device)

    # Set up optimizer and loss functions
    optimizer = optim.Adam(list(CustomModel.parameters()), lr=lr)
    feature_loss = nn.BCEWithLogitsLoss()  # Loss for binary feature classification

    # Learning rate scheduler
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5, factor=0.5)
    best_eval_loss = float('inf')
    best_hamming_distance = float('inf')
    patience_counter = 0
    print("Training the model...")

    num_batches = len(train_dataloader)

    for epoch in range(epochs):
        CustomModel.train()
        running_loss = 0.0
        print(f'Start of Epoch {epoch+1}')
        for batch_idx, batch in enumerate(train_dataloader):
            # Calculate progress percentage
            progress = (batch_idx + 1) / num_batches * 100
            # Unpack batch data and move to the specified device
            (image_frontal, image_lateral, pose_frontal, pose_lateral, label_class, ratings) = [tensor.to(device) for tensor in batch]
            optimizer.zero_grad()

            # Only the image tensors are fed to the model (see NOTE in the docstring);
            # the output already lives on the model's device.
            ratings_output = CustomModel(image_frontal, image_lateral)

            loss = feature_loss(ratings_output, ratings)

            # Backward pass and optimization
            loss.backward()
            torch.nn.utils.clip_grad_norm_(list(CustomModel.parameters()), clip_grad_norm)
            optimizer.step()

            running_loss += loss.item()

            # Print progress during each epoch
            print(f"Epoch [{epoch + 1}/{epochs}], Progress: {progress:.2f}%, Batch [{batch_idx + 1}/{num_batches}], Loss: {loss.item():.4f}")

        # Guard against an empty training loader
        avg_loss = running_loss / num_batches if num_batches > 0 else 0.0

        # Evaluate model after each epoch
        eval_loss, hamming_distances, metrics = evaluate_combined_model(CustomModel, eval_dataloader, feature_loss, device)
        # The evaluator may hand back a 0-dim tensor; keep plain floats for bookkeeping
        eval_loss = float(eval_loss)
        mean_hamming_distance = sum(hamming_distances.values()) / len(hamming_distances) if len(hamming_distances) > 0 else 0.0

        tp = metrics['TP']
        tn = metrics['TN']
        fp = metrics['FP']
        fn = metrics['FN']

        # Class 1 metrics
        precision_1 = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall_1 = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1_score_1 = 2 * (precision_1 * recall_1) / (precision_1 + recall_1) if (precision_1 + recall_1) > 0 else 0

        # Class 0 metrics
        precision_0 = tn / (tn + fn) if (tn + fn) > 0 else 0
        recall_0 = tn / (tn + fp) if (tn + fp) > 0 else 0
        f1_score_0 = 2 * (precision_0 * recall_0) / (precision_0 + recall_0) if (precision_0 + recall_0) > 0 else 0

        jaccard_index = tp / (tp + fp + fn) if (tp + fp + fn) > 0 else 0.0

        # Report the learning rate actually used this epoch, not the initial one
        current_lr = optimizer.param_groups[0]['lr']

        # Print summary for each epoch
        print(f"Epoch [{epoch + 1}/{epochs}] with lr: {current_lr} Summary: "
              f"Train Loss: {avg_loss:.4f}, Eval Loss: {eval_loss:.4f},"
              f"Precision: {(precision_1 +precision_0)/2:.4f}, Recall: {(recall_1 +recall_0)/2:.4f},"
              f" F1-Score 0: {f1_score_0:.4f}, F1-Score 1: {f1_score_1:.4f}, F1-Score: {(f1_score_1 +f1_score_0)/2:.4f}"
              )
        print(f"Hamming Loss: {mean_hamming_distance:.4f}, jaccard index: {jaccard_index:.4f}", )

        scheduler.step(eval_loss)

        # Track both criteria independently so each "best" reflects every epoch seen so far
        loss_improved = eval_loss < best_eval_loss
        hamming_improved = mean_hamming_distance < best_hamming_distance
        if loss_improved:
            best_eval_loss = eval_loss
        if hamming_improved:
            best_hamming_distance = mean_hamming_distance

        # Early stopping and model saving
        if loss_improved:
            patience_counter = 0
            torch.save(CustomModel.state_dict(), f"best_rating_model_epoch_{epoch + 1}_eval_pose.pt")
            print("Model checkpoint saved.")
        elif hamming_improved:
            patience_counter = 0
            torch.save(CustomModel.state_dict(), f"best_rating_model_epoch_{epoch + 1}_ham_pose.pt")
            print("Model checkpoint saved.")
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered.")
                break

    print("Training complete.")



def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0.0


def evaluate_combined_model(CustomSwinTransformerModel, dataloader, feature_loss, device):
    """
    Evaluate a multi-label rating model and report per-feature and aggregate metrics.

    The model is put in eval mode and run over every batch under ``torch.no_grad()``.
    Logits are turned into binary predictions with ``sigmoid(logit) > 0.5`` and compared
    against the labels for the first ``FeatureNum`` output columns. Per-feature confusion
    counts and Hamming loss are printed, followed by a summary (precision, recall, F1 per
    class, mean Hamming loss, Jaccard index) computed from the counts summed over features.

    Args:
        CustomSwinTransformerModel (torch.nn.Module): The rating prediction model. It is
            called as ``model(image_frontal, image_lateral)`` and must return logits.
        dataloader (Iterable): Yields batches of six tensors in the order
            ``(image_frontal, image_lateral, pose_frontal, pose_lateral, label_class, ratings)``.
            Only the two images and ``ratings`` are used here.
        feature_loss (nn.Module): The loss function applied to ``(logits, ratings)``.
        device (str | torch.device): The device to perform evaluation on ('cpu' or 'cuda').

    Returns:
        float: Loss averaged over batches (0.0 when the dataloader yields no batch).
        dict: Hamming loss per feature index.
        dict: TP, TN, FP, FN counts summed over all features.
    """
    CustomSwinTransformerModel.eval()
    num_features = FeatureNum
    metrics = {
        feature_idx: {'TP': 0, 'TN': 0, 'FP': 0, 'FN': 0} for feature_idx in range(num_features)
    }
    total_samples = {feature_idx: 0 for feature_idx in range(num_features)}
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for batch in dataloader:
            # The batch layout is fixed at six entries; only the tensors that are
            # actually consumed below are moved to the device.
            image_frontal, image_lateral, _pose_frontal, _pose_lateral, _label_class, ratings = batch
            image_frontal = image_frontal.to(device)
            image_lateral = image_lateral.to(device)
            ratings = ratings.to(device)

            # Predict ratings using the model
            ratings_output = CustomSwinTransformerModel(image_frontal, image_lateral).to(device)

            # Accumulate a plain Python float so the returned loss is a float, not a tensor.
            total_loss += feature_loss(ratings_output, ratings).item()
            num_batches += 1

            # Both sides are boolean so that `&` / `~` below are logical operations.
            # (On uint8 labels `~` is a bitwise NOT yielding 254/255, which only
            # happens to give correct counts for labels that are exactly 0 or 1.)
            predicted_ratings = torch.sigmoid(ratings_output) > 0.5
            actual_ratings = ratings > 0.5

            # Compute TP, TN, FP, FN for each feature
            for feature_idx in range(num_features):
                preds = predicted_ratings[:, feature_idx]
                trues = actual_ratings[:, feature_idx]

                counts = metrics[feature_idx]
                counts['TP'] += (preds & trues).sum().item()
                counts['TN'] += (~preds & ~trues).sum().item()
                counts['FP'] += (preds & ~trues).sum().item()
                counts['FN'] += (~preds & trues).sum().item()
                total_samples[feature_idx] += trues.numel()

    # Hamming loss per feature: fraction of samples whose prediction is wrong.
    hamming_distances = {
        feature_idx: _safe_ratio(
            metrics[feature_idx]['FP'] + metrics[feature_idx]['FN'],
            total_samples[feature_idx],
        )
        for feature_idx in range(num_features)
    }

    # An empty dataloader yields a 0.0 loss instead of a ZeroDivisionError.
    avg_loss = _safe_ratio(total_loss, num_batches)

    total_metrics = {'TP': 0, 'TN': 0, 'FP': 0, 'FN': 0}
    # Print TP, TN, FP, FN for each feature and fold them into the totals
    for feature_idx in range(num_features):
        counts = metrics[feature_idx]
        print(f"Feature {feature_idx}:")
        print(f"  TP: {counts['TP']}")
        print(f"  TN: {counts['TN']}")
        print(f"  FP: {counts['FP']}")
        print(f"  FN: {counts['FN']}")
        print(f"  Hamming Loss: {hamming_distances[feature_idx]:.4f}")
        for key in total_metrics:
            total_metrics[key] += counts[key]
    mean_hamming_distance = _safe_ratio(sum(hamming_distances.values()), len(hamming_distances))

    tp = total_metrics['TP']
    tn = total_metrics['TN']
    fp = total_metrics['FP']
    fn = total_metrics['FN']

    # Class 1 metrics
    precision_1 = _safe_ratio(tp, tp + fp)
    recall_1 = _safe_ratio(tp, tp + fn)
    f1_score_1 = _safe_ratio(2 * (precision_1 * recall_1), precision_1 + recall_1)

    # Class 0 metrics (the negative class treated as the class of interest)
    precision_0 = _safe_ratio(tn, tn + fn)
    recall_0 = _safe_ratio(tn, tn + fp)
    f1_score_0 = _safe_ratio(2 * (precision_0 * recall_0), precision_0 + recall_0)

    jaccard_index = _safe_ratio(tp, tp + fp + fn)

    # Print the aggregate summary (precision/recall/F1 are averaged over the two classes)
    print(
          f"Precision: {(precision_1 +precision_0)/2:.4f}, Recall: {(recall_1 +recall_0)/2:.4f},"
          f" F1-Score 0: {f1_score_0:.4f}, F1-Score 1: {f1_score_1:.4f}, F1-Score: {(f1_score_1 +f1_score_0)/2:.4f}"
          )
    print(f"Hamming Loss: {mean_hamming_distance:.4f}, jaccard index: {jaccard_index:.4f}")
    return avg_loss, hamming_distances, total_metrics



# Run Merge


In [None]:

# Build the train / validation / test loaders for the squat split with identical settings.
dataloader, eval_dataloader, test_dataloader = (
    create_dataloader(split_df, 16, batch_size=16)
    for split_df in (train_df_squat, val_df_squat, test_df_squat)
)

In [None]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # Initialize the model
# rating_model = MultiModalModel()
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# print(device)

# # Train the model
# train_combined_model(rating_model, dataloader, eval_dataloader, epochs=1000, lr=4e-4, device=device)

# Test Merge

In [None]:
# # Load the model for testing
# def load_model(model, path, device):
#     model.load_state_dict(torch.load(path, map_location=device))
#     model.to(device)
#     model.eval()
#     return model
# feature_loss = nn.BCEWithLogitsLoss()
# # Load the trained model
# test_model = MultiModalModel().to(device)
# test_model = load_model(test_model, "/kaggle/working/best_rating_model_epoch_32_ham.pt", device)
# evaluate_combined_model(test_model, test_dataloader, feature_loss, device)
# # 

In [None]:
# torch.save(rating_model.state_dict(), "final.pt")

# Vision Run

In [None]:
# Select the compute device once: GPU when CUDA is usable, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Instantiate the model to train.
rating_model = DualInputPose(1)

print(device)

# Launch training on the train loader, validating on the evaluation loader.
train_combined_model(
    rating_model,
    dataloader,
    eval_dataloader,
    epochs=1000,
    lr=4e-4,
    device=device,
)

In [None]:
# Load the model for testing
def load_model(model, path, device):
    """
    Restore saved weights into ``model`` and prepare it for inference.

    Args:
        model (torch.nn.Module): Module whose parameters are overwritten in place.
        path (str): Location of a checkpoint holding a ``state_dict``.
        device (str | torch.device): Target device; also used as ``map_location``
            so the checkpoint tensors are remapped while loading.

    Returns:
        torch.nn.Module: The same ``model`` object, moved to ``device`` and in eval mode.
    """
    # NOTE(review): torch.load unpickles the file; only point this at trusted checkpoints.
    state_dict = torch.load(path, map_location=device)
    model.load_state_dict(state_dict)
    # Both .to() and .eval() return the module itself, so the caller gets `model` back.
    return model.to(device).eval()
# Criterion used to score the test split.
feature_loss = nn.BCEWithLogitsLoss()

# Build the network, restore the saved weights into it, and switch it to inference mode.
# NOTE(review): this restores a DualInputResNet3D checkpoint named "..._ham.pt", whereas the
# training cell above builds DualInputPose and the training loop saves "..._ham_pose.pt" in the
# working directory — confirm the architecture and checkpoint path match the run being evaluated.
test_model = load_model(
    DualInputResNet3D(1).to(device),
    "/kaggle/working/best_rating_model_epoch_32_ham.pt",
    device,
)
evaluate_combined_model(test_model, test_dataloader, feature_loss, device)
