This Notebook contains implementations of neural network models that process video frames and pose landmarks to classify actions and predict ratings.


In [1]:
import pandas as pd
import cv2
import numpy as np
import torch
import torch.nn as nn
from transformers import AutoImageProcessor, AutoModelForPreTraining
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Report which compute device PyTorch will be able to use in this session.
if not torch.cuda.is_available():
    print("CUDA is not available. PyTorch is using the CPU.")
else:
    print("CUDA is available. PyTorch can use the GPU.")
    print(f"Device name: {torch.cuda.get_device_name(0)}")

CUDA is available. PyTorch can use the GPU.
Device name: NVIDIA GeForce GTX 1660 Ti


In [3]:
# Prefix prepended to every spreadsheet, JSON and frame-folder path used below.
# The empty string makes those paths resolve relative to the current working directory.
base_path = ''


## Functions

### `load_and_resize_frames`
- **Description**: Loads and resizes video frames from specified file paths.
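- **Parameters**:
  - `num_video`: Identifier for the video file.
  - `action`: Name of the exercise, used to select the frames folder (`Deadlift`, `Squat`, or `lunges`).
  - `frontORlat`: 1 for frontal frames, 0 for lateral frames.
  - `num_idx`: Index for the specific video segment.
  - `num_frames`: Number of frames to load.
  - `size`: Target size for resizing frames (default: (224, 224)).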
- **Returns**: List of resized frames.

### `extract_pose_landmarks`
- **Description**: Extracts pose landmarks from a list of video frames using MediaPipe.
- **Parameters**:
  - `video_frames`: List of frames (each of size [224, 224, 3]).
- **Returns**: Array of pose landmarks with shape [num_frames, 33, 3].


In [4]:
def load_and_resize_frames(num_video, action, frontORlat, num_idx, num_frames, size=(224, 224)):
    """
    Load the frames of one video segment from disk and resize them.

    Frames are read from ``{base_path}<Action>_Frames/{num_video}_idx_{num_idx}_{i}.jpg``
    for ``i`` in ``1..num_frames``. Frames that cannot be read are reported with a
    warning and skipped, so the returned list may be shorter than ``num_frames``.

    Args:
        num_video (int): Identifier for the video file.
        action (str): The action being performed: 'Deadlift', 'Squat' or 'lunges'.
            Matching ignores case and surrounding whitespace.
        frontORlat (int): 1 for frontal frames, 0 for lateral frames. Both views of an
            action live in the same folder, so this flag does not change the path; it
            is kept so existing callers keep working.
        num_idx (int): Index for the specific video segment.
        num_frames (int): Number of frames to load.
        size (tuple): Target size passed to ``cv2.resize`` (default is (224, 224)).

    Returns:
        list: The resized frames that could be loaded, in frame order.

    Raises:
        ValueError: If ``action`` is not one of the supported actions.
    """
    # Frontal and lateral frames of an action share one folder.
    frame_folders = {
        'deadlift': 'Deadlift_Frames',
        'squat': 'Squat_Frames',
        'lunges': 'Lunges_Frames',
    }

    # Normalise the label so 'Squat', 'squat' and ' SQUAT ' all resolve to the same folder.
    action_key = str(action).strip().lower()
    if action_key not in frame_folders:
        raise ValueError(f"Unknown action: {action}")
    folder = base_path + frame_folders[action_key]

    frames = []
    for i in range(1, num_frames + 1):  # frame files are numbered starting at 1
        path = f"{folder}/{num_video}_idx_{num_idx}_{i}.jpg"
        img = cv2.imread(path)  # returns None when the file is missing or unreadable
        if img is None:
            print(f"Warning: Could not load image at {path}")
            continue
        frames.append(cv2.resize(img, size))

    return frames

def extract_pose_landmarks(video_frames):
    """
    Extracts pose landmarks from a list of video frames.

    Each frame is converted to uint8 and passed to ``pose.process``. ``pose`` is a
    module-level object that is not created in this function (presumably a MediaPipe
    Pose instance -- confirm it is defined before calling).

    Args:
        video_frames (list): Frames of shape (224, 224, 3). Floating-point frames are
            treated as values in [0, 1] and scaled by 255; integer frames are treated
            as already being in [0, 255] and are only clipped.

    Returns:
        np.array: Landmarks with shape [num_processed_frames, 33, 3]. Frames whose shape
        is not (224, 224, 3) are skipped with a message, so the first dimension can be
        smaller than ``len(video_frames)``. Frames with no detected pose contribute a
        (33, 3) block of zeros.
    """
    pose_landmarks = []

    for frame in video_frames:
        # Only frames of the expected size are processed; others are reported and dropped.
        if frame.shape != (224, 224, 3):
            print(f"Unexpected frame shape: {frame.shape}")
            continue

        if np.issubdtype(frame.dtype, np.floating):
            # Float frames in [0, 1] -> uint8 [0, 255]
            frame_uint8 = np.clip(frame * 255, 0, 255).astype(np.uint8)
        else:
            # Integer frames (e.g. uint8 from cv2.imread) are already in [0, 255];
            # multiplying them by 255 would wrap around in uint8 before the clip.
            frame_uint8 = np.clip(frame, 0, 255).astype(np.uint8)

        # NOTE(review): frames read with cv2 are BGR; confirm the channel order `pose` expects.
        results = pose.process(frame_uint8)

        if results.pose_landmarks:
            # One [x, y, z] triple per detected landmark
            landmarks = [[lm.x, lm.y, lm.z] for lm in results.pose_landmarks.landmark]
        else:
            # No detection: keep the frame slot but fill it with zeros
            landmarks = np.zeros((33, 3))

        pose_landmarks.append(landmarks)

    return np.array(pose_landmarks)

In [5]:
import json
import numpy as np
import pandas as pd

# Squat annotations: one row per video segment
df_squat = pd.read_excel(base_path + 'squat_edited.xlsx')


def load_json_as_numpy(json_file):
    """Read a JSON file and return its decoded content as a NumPy array.

    Args:
        json_file (str): Path of the JSON file to read.

    Returns:
        np.ndarray: ``np.array`` built from the decoded JSON value.
    """
    with open(json_file, 'r') as file:
        return np.array(json.load(file))


# Precomputed pose landmarks for the frontal and lateral views
front_pose_array = load_json_as_numpy(base_path + 'front_pose_squat.json')
lat_pose_array = load_json_as_numpy(base_path + 'lat_pose_squat.json')

# Each DataFrame row needs exactly one pose entry per view. Stop here on a mismatch
# instead of continuing with empty pose columns that would only fail later.
if len(front_pose_array) != len(df_squat) or len(lat_pose_array) != len(df_squat):
    raise ValueError(
        "The length of the loaded arrays does not match the DataFrame "
        f"(front={len(front_pose_array)}, lateral={len(lat_pose_array)}, rows={len(df_squat)})."
    )

# list(...) stores one per-segment array in each cell of the column
df_squat['front_pose'] = list(front_pose_array)
df_squat['lat_pose'] = list(lat_pose_array)

# Display the updated DataFrame
print(df_squat[['front_pose', 'lat_pose']])


                                            front_pose  \
0    [[[0.49896547198295593, 0.24871453642845154, -...   
1    [[[0.5140300393104553, 0.22558534145355225, -0...   
2    [[[0.5227701663970947, 0.22155773639678955, -0...   
3    [[[0.5229289531707764, 0.2611807584762573, -0....   
4    [[[0.5273605585098267, 0.22662483155727386, -0...   
..                                                 ...   
290  [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...   
291  [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...   
292  [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...   
293  [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...   
294  [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...   

                                              lat_pose  
0    [[[0.48301321268081665, 0.2528548836708069, 0....  
1    [[[0.5150753259658813, 0.25385501980781555, 0....  
2    [[[0.523187518119812, 0.23705779016017914, 0.1...  
3    [[[0.5000579953193665, 0.24061907827854156, 0....  
4    [[[0.53196150

In [6]:
# Fractions of the squat rows assigned to each split
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

# Row counts for train and validation (truncated to int); the test split takes whatever is left
total_samples = len(df_squat)
train_size = int(train_ratio * total_samples)
val_size = int(val_ratio * total_samples)

# Sequential split in row order (no shuffling): train | validation | test
train_df_squat = df_squat.head(train_size)
val_df_squat = df_squat.iloc[train_size:].head(val_size)
test_df_squat = df_squat.iloc[train_size + val_size:]

In [7]:
# Preview the squat training split produced by the sequential split above.
train_df_squat

Unnamed: 0,Num Video Frontal,Num Video Lateral,NumIdx,Action,Feet Out 30 F,Score: Frontal( - / 1),Whole Feet Flat On the Floor (1) L,Bend Hips and Knees Simultaniously (1) L,Hips backwards (1) L,Lower back neural (1) L,Hips are lower than knees level (1 point) L,Score: Lateral ( - / 5),Total Score - /6,class,rating,front_pose,lat_pose
0,1,2,0,Squat,0,0,1,1,1,1,0,4,4,1,4,"[[[0.49896547198295593, 0.24871453642845154, -...","[[[0.48301321268081665, 0.2528548836708069, 0...."
1,1,2,1,Squat,0,0,0,1,1,1,0,3,3,1,3,"[[[0.5140300393104553, 0.22558534145355225, -0...","[[[0.5150753259658813, 0.25385501980781555, 0...."
2,1,2,2,Squat,0,0,0,1,1,1,0,3,3,1,3,"[[[0.5227701663970947, 0.22155773639678955, -0...","[[[0.523187518119812, 0.23705779016017914, 0.1..."
3,1,2,3,Squat,0,0,0,1,1,1,0,3,3,1,3,"[[[0.5229289531707764, 0.2611807584762573, -0....","[[[0.5000579953193665, 0.24061907827854156, 0...."
4,1,2,4,Squat,0,0,0,1,1,1,0,3,3,1,3,"[[[0.5273605585098267, 0.22662483155727386, -0...","[[[0.5319615006446838, 0.2524169683456421, 0.0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201,85,86,1,Squat,1,1,0,1,0,1,1,3,4,1,4,"[[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...","[[[0.3942027986049652, 0.23040449619293213, -0..."
202,85,86,2,Squat,1,1,0,1,0,1,1,3,4,1,4,"[[[0.4873969852924347, 0.1548309326171875, -0....","[[[0.4314277470111847, 0.22929038107395172, -0..."
203,85,86,3,Squat,1,1,0,1,0,1,1,3,4,1,4,"[[[0.4695884585380554, 0.15741464495658875, -0...","[[[0.39387422800064087, 0.2241220325231552, -0..."
204,85,86,4,Squat,1,1,0,1,0,1,1,3,4,1,4,"[[[0.47849878668785095, 0.20681476593017578, -...","[[[0.41384050250053406, 0.22573530673980713, -..."


In [8]:
# Deadlift annotations: one row per video segment
df_dead = pd.read_excel(base_path + 'deadlift_edited.xlsx')

# Precomputed pose landmarks for the frontal and lateral views.
# `load_json_as_numpy` is defined once, in the squat-loading cell above.
front_pose_array = load_json_as_numpy(base_path + 'front_pose_dead.json')
lat_pose_array = load_json_as_numpy(base_path + 'lat_pose_dead.json')

# Each DataFrame row needs exactly one pose entry per view. Stop here on a mismatch
# instead of continuing with empty pose columns that would only fail later.
if len(front_pose_array) != len(df_dead) or len(lat_pose_array) != len(df_dead):
    raise ValueError(
        "The length of the loaded arrays does not match the DataFrame "
        f"(front={len(front_pose_array)}, lateral={len(lat_pose_array)}, rows={len(df_dead)})."
    )

# list(...) stores one per-segment array in each cell of the column
df_dead['front_pose'] = list(front_pose_array)
df_dead['lat_pose'] = list(lat_pose_array)

# Display the updated DataFrame
print(df_dead[['front_pose', 'lat_pose']])


                                            front_pose  \
0    [[[0.6141521334648132, 0.2384583204984665, -0....   
1    [[[0.7448023557662964, 0.29284247756004333, -0...   
2    [[[0.6720446944236755, 0.26214927434921265, -1...   
3    [[[0.7419819831848145, 0.2703408896923065, -0....   
4    [[[0.7648568749427795, 0.3041684925556183, -1....   
..                                                 ...   
264  [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...   
265  [[[0.6985482573509216, 0.3878808617591858, -0....   
266  [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...   
267  [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...   
268  [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...   

                                              lat_pose  
0    [[[0.5667536854743958, 0.2648758590221405, -0....  
1    [[[0.5568003058433533, 0.25841817259788513, 0....  
2    [[[0.6044440865516663, 0.25434428453445435, -0...  
3    [[[0.614557683467865, 0.25464609265327454, -0....  
4    [[[0.59178572

In [9]:
# Fractions of the deadlift rows assigned to each split
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

# Row counts for train and validation (truncated to int); the test split takes whatever is left
total_samples = len(df_dead)
train_size = int(train_ratio * total_samples)
val_size = int(val_ratio * total_samples)

# Sequential split in row order (no shuffling): train | validation | test
train_df_dead = df_dead.head(train_size)
val_df_dead = df_dead.iloc[train_size:].head(val_size)
test_df_dead = df_dead.iloc[train_size + val_size:]

In [10]:
# Lunges annotations: one row per video segment
df_lunge = pd.read_excel(base_path + 'lunges_edited.xlsx')

# Precomputed pose landmarks for the frontal and lateral views.
# `load_json_as_numpy` is defined once, in the squat-loading cell above.
front_pose_array = load_json_as_numpy(base_path + 'front_pose_lunges.json')
lat_pose_array = load_json_as_numpy(base_path + 'lat_pose_lunges.json')

# Each DataFrame row needs exactly one pose entry per view. Stop here on a mismatch
# instead of continuing with empty pose columns that would only fail later.
if len(front_pose_array) != len(df_lunge) or len(lat_pose_array) != len(df_lunge):
    raise ValueError(
        "The length of the loaded arrays does not match the DataFrame "
        f"(front={len(front_pose_array)}, lateral={len(lat_pose_array)}, rows={len(df_lunge)})."
    )

# list(...) stores one per-segment array in each cell of the column
df_lunge['front_pose'] = list(front_pose_array)
df_lunge['lat_pose'] = list(lat_pose_array)

# Display the updated DataFrame
print(df_lunge[['front_pose', 'lat_pose']])


                                            front_pose  \
0    [[[0.5307167768478394, 0.26830747723579407, -0...   
1    [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...   
2    [[[0.5470829606056213, 0.22924844920635223, -0...   
3    [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...   
4    [[[0.5109826922416687, 0.2688097357749939, -0....   
..                                                 ...   
101  [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...   
102  [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...   
103  [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...   
104  [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...   
105  [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...   

                                              lat_pose  
0    [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...  
1    [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...  
2    [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...  
3    [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...  
4    [[[0.36190509

In [11]:
# Fractions of the lunges rows assigned to each split
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

# Row counts for train and validation (truncated to int); the test split takes whatever is left
total_samples = len(df_lunge)
train_size = int(train_ratio * total_samples)
val_size = int(val_ratio * total_samples)

# Sequential split in row order (no shuffling): train | validation | test
train_df_lunge = df_lunge.head(train_size)
val_df_lunge = df_lunge.iloc[train_size:].head(val_size)
test_df_lunge = df_lunge.iloc[train_size + val_size:]

In [12]:

# Merge the per-exercise splits into one frame per split (all columns are kept), then
# shuffle the rows with a fixed seed and renumber the index from 0.
train_df = (
    pd.concat([train_df_squat, train_df_lunge, train_df_dead])
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)

val_df = (
    pd.concat([val_df_squat, val_df_lunge, val_df_dead])
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)

test_df = (
    pd.concat([test_df_squat, test_df_lunge, test_df_dead])
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)


### `Video_Model`
- **Description**: A model that utilizes a pretrained VideoMAE model for video representation learning.
- **Methods**:
  - `forward(pixel_values, bool_masked_pos)`: Forward pass through the VideoMAE model.


In [13]:
class Video_Model(nn.Module):
    """
    Video feature extractor built on a pretrained VideoMAE checkpoint.

    The second element of the VideoMAE output is mean-pooled over its token axis and
    passed through a three-layer ReLU MLP; the result is a feature vector, not a
    prediction.

    Attributes:
        videomae (AutoModelForPreTraining): Pretrained VideoMAE model.
        pooling (nn.AdaptiveAvgPool1d): Mean pooling over the token axis.
        fc1, fc2, fc3 (nn.Linear): MLP layers (1536 -> hidden_size -> hidden_size -> hidden_size).
        relu1, relu2, relu3 (nn.ReLU): Activations applied after each linear layer.

    Methods:
        forward(pixel_values, bool_masked_pos): Returns features of shape [batch_size, hidden_size].
    """
    def __init__(self, pretrained_model_name="MCG-NJU/videomae-base", hidden_size=2048):
        super().__init__()

        # Pretrained backbone
        self.videomae = AutoModelForPreTraining.from_pretrained(pretrained_model_name)

        # Averages over the last axis of its input: [batch, 1536, tokens] -> [batch, 1536, 1]
        self.pooling = nn.AdaptiveAvgPool1d(1)

        # MLP head; 1536 is the width of the backbone output consumed in forward()
        self.fc1 = nn.Linear(1536, hidden_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.relu3 = nn.ReLU()

    def forward(self, pixel_values, bool_masked_pos=None):
        """Return a [batch_size, hidden_size] feature vector for a batch of clips."""
        backbone_out = self.videomae(pixel_values, bool_masked_pos=bool_masked_pos)

        # Second output element; expected to be [batch_size, tokens, 1536]
        # (the original notes give 782 tokens -- TODO confirm for the masks in use).
        embeddings = backbone_out[1]

        # Move tokens to the last axis, average them away, drop the singleton axis
        x = self.pooling(embeddings.transpose(1, 2)).squeeze(-1)

        # Linear -> ReLU, three times
        mlp = (
            (self.fc1, self.relu1),
            (self.fc2, self.relu2),
            (self.fc3, self.relu3),
        )
        for linear, activation in mlp:
            x = activation(linear(x))

        return x  # Shape: [batch_size, hidden_size]

### `Pose_Model`
- **Description**: A model that processes pose landmarks to output a feature vector.
- **Methods**:
  - `forward(pose_landmarks)`: Forward pass through the Pose model.

In [14]:
class Pose_Model(nn.Module):
    """
    Pose_Model is a PyTorch neural network module designed to process pose landmarks
    and output a feature vector for further analysis.

    Every frame's landmarks are flattened and pushed through a four-layer MLP
    (Linear -> BatchNorm -> activation -> Dropout for the first three layers,
    Linear -> ReLU for the last); the per-frame features are then averaged over
    the frame axis.

    Args:
        input_size (int): Flattened landmark size per frame (default 33 * 3).
        hidden_size1, hidden_size2, hidden_size3 (int): Widths of the hidden layers.
        final_size (int): Size of the returned feature vector.
        dropout_rate (float): Dropout probability used after each hidden layer.
    """

    def __init__(self, input_size=33 * 3, hidden_size1=512, hidden_size2=1024, hidden_size3=256, final_size=2048, dropout_rate=0.5):
        super(Pose_Model, self).__init__()

        # Hidden layer 1: Linear -> BatchNorm -> ReLU -> Dropout
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.bn1 = nn.BatchNorm1d(hidden_size1)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout_rate)

        # Hidden layer 2: Linear -> BatchNorm -> LeakyReLU -> Dropout
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.bn2 = nn.BatchNorm1d(hidden_size2)
        self.relu2 = nn.LeakyReLU(0.2)
        self.dropout2 = nn.Dropout(dropout_rate)

        # Hidden layer 3: Linear -> BatchNorm -> ELU -> Dropout
        self.fc3 = nn.Linear(hidden_size2, hidden_size3)
        self.bn3 = nn.BatchNorm1d(hidden_size3)
        self.relu3 = nn.ELU()
        self.dropout3 = nn.Dropout(dropout_rate)

        # Output layer: Linear -> ReLU
        self.fc4 = nn.Linear(hidden_size3, final_size)
        self.relu4 = nn.ReLU()

    def forward(self, pose_landmarks):
        """
        Forward pass through the Pose Model.

        Args:
            pose_landmarks: Tensor of shape [batch_size, num_frames, num_landmarks, num_coords]
                (33 x 3 with the default ``input_size``).

        Returns:
            Tensor of shape [batch_size, final_size]: per-frame features averaged over frames.

        Raises:
            ValueError: If ``pose_landmarks`` is not 4-dimensional.
            RuntimeError: Propagated from PyTorch, e.g. when the flattened landmark size
                does not match ``input_size``.
        """
        # Errors are allowed to propagate unchanged: catching them here would leave the
        # result undefined and hide the real cause behind an UnboundLocalError.
        batch_size, num_frames, num_landmarks, num_coords = pose_landmarks.shape

        # Treat every frame as an independent sample: [batch_size * num_frames, landmarks * coords].
        # reshape (unlike view) also accepts non-contiguous inputs.
        x = pose_landmarks.reshape(batch_size * num_frames, num_landmarks * num_coords)

        x = self.dropout1(self.relu1(self.bn1(self.fc1(x))))
        x = self.dropout2(self.relu2(self.bn2(self.fc2(x))))
        x = self.dropout3(self.relu3(self.bn3(self.fc3(x))))
        x = self.relu4(self.fc4(x))  # final layer is followed by a ReLU, no BatchNorm/Dropout

        # Restore the frame axis and average over it
        frame_features = x.reshape(batch_size, num_frames, x.size(-1))
        return frame_features.mean(dim=1)  # Shape: [batch_size, final_size]

### `Combined_Video_Pose_Model`
- **Description**: Combines the outputs of `Video_Model` and `Pose_Model` for comprehensive predictions.
- **Methods**:
  - `forward(pixel_values, bool_masked_pos, pose_landmarks)`: Combines features from video and pose models.


In [15]:
class Combined_Video_Pose_Model(nn.Module):
    """
    Fuses the features of a Video_Model and a Pose_Model into one hidden representation.

    Each feature vector is first projected to ``combined_hidden_size`` on its own branch
    (Linear -> ReLU -> Dropout); the two projections are concatenated and passed through
    a two-layer MLP that ends at ``combined_hidden_size`` again.
    """

    def __init__(self, video_hidden_size=2048, pose_hidden_size=2048, combined_hidden_size=1024, hidden_layer_size=2048, dropout_rate=0.5):
        super().__init__()

        # Feature extractors (each returns a 2048-dimensional vector with the defaults)
        self.video_model = Video_Model(hidden_size=video_hidden_size)
        self.pose_model = Pose_Model(final_size=pose_hidden_size)

        # Video branch projection
        self.video_fc1 = nn.Linear(video_hidden_size, combined_hidden_size)
        self.video_relu1 = nn.ReLU()
        self.video_dropout1 = nn.Dropout(dropout_rate)

        # Pose branch projection
        self.pose_fc1 = nn.Linear(pose_hidden_size, combined_hidden_size)
        self.pose_relu1 = nn.ReLU()
        self.pose_dropout1 = nn.Dropout(dropout_rate)

        # Fusion MLP: the concatenated input is 2 * combined_hidden_size wide (2048 by default)
        self.fc1 = nn.Linear(2 * combined_hidden_size, hidden_layer_size)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout_rate)

        # Projects back down to combined_hidden_size
        self.fc2 = nn.Linear(hidden_layer_size, combined_hidden_size)
        self.relu2 = nn.ReLU()

    def forward(self, pixel_values, bool_masked_pos, pose_landmarks):
        """
        Forward pass through the combined model.

        Args:
            pixel_values: Input to the Video Model.
            bool_masked_pos: Masking input to the Video Model.
            pose_landmarks: Input to the Pose Model.

        Returns:
            The hidden representation, shape [batch_size, combined_hidden_size] (1024 by default).

        Raises:
            ValueError: If either sub-model returns None.
        """
        # Video branch: extract, validate, project
        video_features = self.video_model(pixel_values, bool_masked_pos=bool_masked_pos)
        if video_features is None:
            raise ValueError("video_model returned None.")
        video_processed = self.video_dropout1(self.video_relu1(self.video_fc1(video_features)))

        # Pose branch: extract, validate, project
        pose_features = self.pose_model(pose_landmarks)
        if pose_features is None:
            raise ValueError("pose_model returned None.")
        pose_processed = self.pose_dropout1(self.pose_relu1(self.pose_fc1(pose_features)))

        # Fuse: [batch_size, 2 * combined_hidden_size]
        fused = torch.cat([video_processed, pose_processed], dim=1)

        hidden = self.dropout1(self.relu1(self.fc1(fused)))
        return self.relu2(self.fc2(hidden))  # Shape: [batch_size, combined_hidden_size]


## `Dual_Combined_Model`

- **Description**: Combines two instances of `Combined_Video_Pose_Model` (one per camera view) and returns the 3-class classification logits together with the concatenated hidden representation of the two branches.

- **Methods**:
  - `forward(pixel_values_1, bool_masked_pos_1, pose_landmarks_tensor_1, pixel_values_2, bool_masked_pos_2, pose_landmarks_tensor_2)`: Forward pass through the dual model.


In [16]:
class Dual_Combined_Model(nn.Module):
    """
    Runs two Combined_Video_Pose_Model branches and classifies their joint representation.

    The two branch outputs are concatenated; the classification head maps that vector
    to 3 class logits. Both the logits and the concatenated hidden vector are returned.
    """

    def __init__(self):
        super().__init__()

        # One branch per input stream
        self.model_1 = Combined_Video_Pose_Model()
        self.model_2 = Combined_Video_Pose_Model()

        # Classification head: 2048 (= 2 x 1024 branch outputs) -> 512 -> 3
        self.classification_layer = nn.Linear(2048, 512)
        self.classification_relu = nn.ReLU()
        self.classification_dropout = nn.Dropout(0.5)

        self.classification_output = nn.Linear(512, 3)  # logits for 3 classes

    def forward(self, pixel_values_1, bool_masked_pos_1, pose_landmarks_tensor_1,
                pixel_values_2, bool_masked_pos_2, pose_landmarks_tensor_2):
        """
        Args:
            pixel_values_1, bool_masked_pos_1, pose_landmarks_tensor_1: Inputs of the first branch.
            pixel_values_2, bool_masked_pos_2, pose_landmarks_tensor_2: Inputs of the second branch.

        Returns:
            tuple: (classification logits [batch_size, 3],
                    concatenated hidden vector [batch_size, 2048] with the default branch sizes).

        Raises:
            ValueError: If either branch returns None. Any exception is printed and re-raised.
        """
        try:
            branch_outputs = (
                self.model_1(pixel_values_1, bool_masked_pos_1, pose_landmarks_tensor_1),
                self.model_2(pixel_values_2, bool_masked_pos_2, pose_landmarks_tensor_2),
            )

            # Guard against a branch that produced nothing
            if any(hidden is None for hidden in branch_outputs):
                raise ValueError("One of the models returned None.")

            # Joint representation of both branches
            combined_hidden = torch.cat(branch_outputs, dim=1)

            # Classification head: Linear -> ReLU -> Dropout -> Linear
            head_hidden = self.classification_dropout(
                self.classification_relu(self.classification_layer(combined_hidden))
            )
            logits = self.classification_output(head_hidden)  # Shape: [batch_size, 3]

            return logits, combined_hidden

        except Exception as e:
            print("An error occurred during forward pass:")
            print(f"Error: {e}")
            raise  # re-raise so the caller still sees the original exception


## `CriteriaPredictionModel`

- **Description**: A deep neural network for predicting multiple binary ratings (yes/no) using a series of fully connected layers, ReLU activations, and dropout for regularization. The model outputs a probability for each rating.

- **Methods**:
  - `forward(x)`: Forward pass through the network. The input is passed through five fully connected layers, each with ReLU and dropout, and the final output is processed by a sigmoid activation to produce binary classification probabilities.


In [17]:
class CriteriaPredictionModel(nn.Module):
    """
    CriteriaPredictionModel predicts several independent yes/no criteria from one feature vector.

    Five Linear -> ReLU -> Dropout stages narrow the input
    (input_size -> hidden_size1 -> hidden_size2 -> hidden_size2/2 -> /4 -> /8),
    followed by a linear output layer and a sigmoid, so every output is a value in [0, 1].

    Args:
        input_size (int): Size of the input feature vector.
        hidden_size1 (int): Width of the first hidden layer.
        hidden_size2 (int): Width of the second hidden layer; later layers use /2, /4, /8 of it.
        output_size (int): Number of criteria predicted (5 by default).
        dropout_rate (float): Dropout probability after every hidden layer.
    """
    def __init__(self, input_size=4096, hidden_size1=2048, hidden_size2=1024, output_size=5, dropout_rate=0.5):
        super().__init__()

        # Widths of the three narrowing layers
        half = hidden_size2 // 2
        quarter = hidden_size2 // 4
        eighth = hidden_size2 // 8

        # Stage 1
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout_rate)

        # Stage 2
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout_rate)

        # Stage 3
        self.fc3 = nn.Linear(hidden_size2, half)
        self.relu3 = nn.ReLU()
        self.dropout3 = nn.Dropout(dropout_rate)

        # Stage 4
        self.fc4 = nn.Linear(half, quarter)
        self.relu4 = nn.ReLU()
        self.dropout4 = nn.Dropout(dropout_rate)

        # Stage 5
        self.fc5 = nn.Linear(quarter, eighth)
        self.relu5 = nn.ReLU()
        self.dropout5 = nn.Dropout(dropout_rate)

        # One sigmoid-activated output per criterion
        self.output_layer = nn.Linear(eighth, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return a [batch_size, output_size] tensor of per-criterion values in [0, 1]."""
        stages = (
            (self.fc1, self.relu1, self.dropout1),
            (self.fc2, self.relu2, self.dropout2),
            (self.fc3, self.relu3, self.dropout3),
            (self.fc4, self.relu4, self.dropout4),
            (self.fc5, self.relu5, self.dropout5),
        )
        for linear, activation, dropout in stages:
            x = dropout(activation(linear(x)))

        return self.sigmoid(self.output_layer(x))


## `DeepClassificationWithRatingModel`

- **Description**: The `DeepClassificationWithRatingModel` holds one `CriteriaPredictionModel` per exercise (deadlift, squat, and lunges). It operates on the `combined_hidden` representation (such as the one returned by `Dual_Combined_Model`) and routes it to the criteria model that matches the predicted class; for any other class it returns `None`.

- **Methods**:
  - `forward(combined_hidden, predicted_class)`:
    - Takes the `combined_hidden` state and the `predicted_class`.
    - If class `0` is predicted, the `dead_criteria_model` is used to predict ratings.
    - If class `1` is predicted, the `squat_criteria_model` is used.
    - If class `2` is predicted, the `lunges_criteria_model` is used.
    - Returns the predicted ratings based on the class.


In [18]:
class DeepClassificationWithRatingModel(nn.Module):
    """
    Routes a combined hidden representation to the criteria model of the predicted exercise.

    One CriteriaPredictionModel is kept per class:
    class 0 -> deadlift (5 outputs), class 1 -> squat (6 outputs), class 2 -> lunges (7 outputs).
    """
    def __init__(self):
        super().__init__()

        # Per-exercise criteria heads; they differ only in the number of outputs
        self.dead_criteria_model = CriteriaPredictionModel(input_size=2048, hidden_size1=2048, hidden_size2=1024, output_size=5, dropout_rate=0.5)
        self.lunges_criteria_model = CriteriaPredictionModel(input_size=2048, hidden_size1=2048, hidden_size2=1024, output_size=7, dropout_rate=0.5)
        self.squat_criteria_model = CriteriaPredictionModel(input_size=2048, hidden_size1=2048, hidden_size2=1024, output_size=6, dropout_rate=0.5)

    def forward(self,combined_hidden,predicted_class):
        """
        Args:
            combined_hidden: Feature tensor passed to the selected criteria model.
            predicted_class: Class id compared with 0, 1 and 2 (in that order) using ``==``.

        Returns:
            The ratings of the matching criteria model, or None when the class is not 0, 1 or 2.

        Raises:
            Any exception raised during routing or prediction is printed and re-raised.
        """
        try:
            # (class id, criteria head) pairs, checked in order
            routes = (
                (0, self.dead_criteria_model),
                (1, self.squat_criteria_model),
                (2, self.lunges_criteria_model),
            )
            for class_id, criteria_model in routes:
                if predicted_class == class_id:
                    return criteria_model(combined_hidden)

            # No head registered for this class
            return None

        except Exception as e:
            print(f"Error during forward pass in DeepClassificationWithRatingModel: {e}")
            raise

## `PoseVideoDataset`

- **Description**: A custom PyTorch Dataset designed to load video frames and pose landmarks, along with action class labels and ratings. It supports deadlift, squat, and lunge actions with different rating models for each. The dataset processes video frames, extracts pose landmarks, and normalizes ratings based on action class.

- **Methods**:
  - `__len__()`: Returns the number of samples in the dataset.
  - `__getitem__(idx)`: Loads and processes the data at the specified index:
    - Loads video frames for both frontal and lateral views.
    - Processes and normalizes video frames using the processor.
    - Extracts pose landmarks and action class labels.
    - Retrieves and normalizes ratings based on action class (Deadlift, Squat, or Lunge).
  - `_process_ratings(df)`: Processes and normalizes the ratings data from the corresponding DataFrame:
    - Extracts relevant columns (ending in 'F' or 'L').
    - Normalizes the scores and applies a threshold (0 or 1 based on the condition).
    - Returns the mean of the scores for each rating.


In [19]:

class PoseVideoDataset(Dataset):
    """
    Custom PyTorch Dataset to load video frames and pose landmarks, along with class and ratings labels.

    Args:
        df (pd.DataFrame): A DataFrame containing the dataset information, including video paths and labels.
        num_frames (int): Number of frames to load per video.
        processor: A pre-processing function to resize and normalize video frames.
        train_df_dead (pd.DataFrame, optional): DataFrame whose columns name the deadlift ratings.
        train_df_squat (pd.DataFrame, optional): DataFrame whose columns name the squat ratings.
        train_df_lunge (pd.DataFrame, optional): DataFrame whose columns name the lunge ratings.

        When a ratings DataFrame is omitted, the module-level variable of the
        same name is used instead, so `PoseVideoDataset(df, num_frames, processor)`
        keeps working in a notebook that defines those variables.

    Returns:
        A tuple containing the processed video frames, pose landmarks, labels, and ratings.
    """

    # Checkpoint whose configuration defines the patch layout used to size the masks.
    _MASK_MODEL_NAME = "MCG-NJU/videomae-base"
    # Frame count used when sizing the mask sequence (hard-coded, independent of num_frames).
    _MASK_NUM_FRAMES = 16
    # Mask sequence length, computed on first use and then shared by all instances.
    _mask_seq_length = None

    def __init__(self, df, num_frames, processor,
                 train_df_dead=None, train_df_squat=None, train_df_lunge=None):
        self.df = df
        self.num_frames = num_frames
        self.processor = processor
        self.train_df_dead = self._resolve_ratings_df(train_df_dead, "train_df_dead")
        self.train_df_squat = self._resolve_ratings_df(train_df_squat, "train_df_squat")
        self.train_df_lunge = self._resolve_ratings_df(train_df_lunge, "train_df_lunge")

    @staticmethod
    def _resolve_ratings_df(explicit_df, global_name):
        """Return `explicit_df`, or the module-level variable `global_name` when it is None."""
        if explicit_df is not None:
            return explicit_df
        try:
            return globals()[global_name]
        except KeyError:
            # Same exception type the bare global lookup would have produced.
            raise NameError(
                f"name '{global_name}' is not defined; pass it to PoseVideoDataset "
                f"or define it at module level"
            ) from None

    @classmethod
    def _get_mask_seq_length(cls):
        """
        Return the number of mask positions per clip.

        The value only depends on the checkpoint configuration, so the
        checkpoint is loaded once and the result is cached on the class
        instead of being reloaded for every sample.
        """
        if cls._mask_seq_length is None:
            model = AutoModelForPreTraining.from_pretrained(cls._MASK_MODEL_NAME)
            config = model.config
            num_patches_per_frame = (config.image_size // config.patch_size) ** 2
            cls._mask_seq_length = (cls._MASK_NUM_FRAMES // config.tubelet_size) * num_patches_per_frame
            # Only the configuration was needed; drop the weights.
            del model
        return cls._mask_seq_length

    def __len__(self):
        """Return the number of rows (samples) in the dataset."""
        return len(self.df)

    def __getitem__(self, idx):
        """
        Build one sample from row `idx` of the DataFrame.

        Returns:
            tuple: (frontal pixel values, frontal mask, frontal pose landmarks,
            lateral pixel values, lateral mask, lateral pose landmarks,
            class label, ratings). `ratings` is None when the class label is
            not 0, 1 or 2.
        """
        row = self.df.iloc[idx]

        # Get frontal and lateral video identifiers and the segment index
        num_video_frontal = row['Num Video Frontal']
        num_video_lateral = row['Num Video Lateral']
        num_idx = row['NumIdx']
        action = row['Action']

        # Load video frames (1 = frontal view, 0 = lateral view)
        video_frames_frontal = load_and_resize_frames(num_video_frontal, action, 1, num_idx, self.num_frames)
        video_frames_lateral = load_and_resize_frames(num_video_lateral, action, 0, num_idx, self.num_frames)

        # Process the video frames to get pixel values
        pixel_values_frontal = self.processor(list(np.clip(np.array(video_frames_frontal), 0, 255)), return_tensors="pt").pixel_values.squeeze(0)
        pixel_values_lateral = self.processor(list(np.clip(np.array(video_frames_lateral), 0, 255)), return_tensors="pt").pixel_values.squeeze(0)

        # Independent random boolean masks for the two views
        seq_length = self._get_mask_seq_length()
        bool_masked_pos_frontal = torch.randint(0, 2, (seq_length,)).bool()
        bool_masked_pos_lateral = torch.randint(0, 2, (seq_length,)).bool()

        # Pose landmarks are read from the row (already stored per sample)
        pose_landmarks_frontal = row['front_pose']
        pose_landmarks_lateral = row['lat_pose']
        pose_landmarks_tensor_frontal = torch.tensor(pose_landmarks_frontal).float()
        pose_landmarks_tensor_lateral = torch.tensor(pose_landmarks_lateral).float()

        # Get labels (action class)
        label_class = torch.tensor(row['class'], dtype=torch.long)

        # Initialize ratings
        ratings = None

        # Pick the ratings DataFrame that matches the action class
        if label_class.item() == 0:  # Deadlift
            ratings = self._process_ratings(self.train_df_dead, row)
        elif label_class.item() == 1:  # Squat
            ratings = self._process_ratings(self.train_df_squat, row)
        elif label_class.item() == 2:  # Lunge
            ratings = self._process_ratings(self.train_df_lunge, row)

        # Convert ratings to tensor
        ratings = torch.tensor(ratings, dtype=torch.float32) if ratings is not None else None

        return (pixel_values_frontal, bool_masked_pos_frontal, pose_landmarks_tensor_frontal,
                pixel_values_lateral, bool_masked_pos_lateral, pose_landmarks_tensor_lateral,
                label_class, ratings)

    def _process_ratings(self, df, row):
        """
        Build the binary rating targets for one sample.

        Args:
            df (pd.DataFrame): Ratings DataFrame for the sample's action; only its
                column names are used, to find the rating columns.
            row (pd.Series): The dataset row that holds the sample's scores.

        Returns:
            list: One value per rating column, 1 if the score is >= 0.5 and 0 otherwise.
        """
        # Rating columns are the ones whose name ends with 'F' or 'L'
        relevant_columns = [col for col in df.columns if col.endswith('F') or col.endswith('L')]

        # Read this sample's scores for those columns
        scores = row[relevant_columns].values

        # Threshold each score to a binary target
        thresholded_scores = np.where(scores >= 0.5, 1, 0)

        return thresholded_scores.tolist()


In [20]:
def create_dataloader(df, num_frames, processor, batch_size=8, shuffle=True):
    """
    Wrap a PoseVideoDataset built from `df` in a PyTorch DataLoader.

    Args:
        df (pd.DataFrame): Dataset description passed to PoseVideoDataset.
        num_frames (int): Number of frames requested per video.
        processor: Frame pre-processing callable passed to PoseVideoDataset.
        batch_size (int, optional): Samples per batch. Defaults to 8.
        shuffle (bool, optional): Whether the loader shuffles the samples. Defaults to True.

    Returns:
        DataLoader: Loader that iterates over the newly created dataset.
    """
    return DataLoader(
        PoseVideoDataset(df, num_frames, processor),
        batch_size=batch_size,
        shuffle=shuffle,
    )


## `train_combined_model`

- **Description**: This function trains both the classification model (`Dual_Combined_Model`) and the rating prediction model (`DeepClassificationWithRatingModel`). It evaluates them after each epoch and saves the best-performing models based on validation loss. It also includes early stopping if no improvement is observed for a set number of epochs.

- **Args**:
  - `model (torch.nn.Module)`: The classification model (e.g., `Dual_Combined_Model`).
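  - `criteria_model (torch.nn.Module)`: The rating prediction model (e.g., `DeepClassificationWithRatingModel`), which exposes one rating head per action (`dead_criteria_model`, `squat_criteria_model`, `lunges_criteria_model`).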
  - `train_dataloader (DataLoader)`: A DataLoader providing the training data.
  - `eval_dataloader (DataLoader)`: A DataLoader providing the evaluation data.
  - `epochs (int, optional)`: Number of epochs to train. Defaults to 1000.
  - `lr (float, optional)`: Learning rate for the optimizer. Defaults to 1e-4.
  - `device (str, optional)`: The device to train the model on ('cpu' or 'cuda'). Defaults to 'cpu'.
  - `clip_grad_norm (float, optional)`: Maximum norm for gradient clipping. Defaults to 1.0.
  - `patience (int, optional)`: Number of epochs with no improvement after which training will be stopped. Defaults to 5.

- **Returns**:
  - None: The function prints the training progress, validation loss, classification accuracy, and rating accuracy during training.

- **Steps**:
  1. **Model Initialization**: Moves both the classification and rating models to the specified device.
  2. **Optimizer Setup**: Uses the Adam optimizer for both models with the specified learning rate.
  3. **Loss Functions**: Defines loss functions for both classification (`CrossEntropyLoss`) and ratings (`BCEWithLogitsLoss`).
  4. **Training Loop**:
     - Performs a forward pass through the classification model and computes the classification loss.
     - Computes the rating prediction loss only if the predicted class matches the actual class.
     - Combines both the classification and rating losses and performs backpropagation.
  5. **Gradient Clipping**: Clips gradients to avoid exploding gradients during backpropagation.
  6. **Model Evaluation**: Evaluates the models after each epoch and prints the results.
  7. **Early Stopping**: Monitors validation loss and triggers early stopping if no improvement is observed for a specified number of epochs.
  8. **Model Checkpointing**: Saves the best-performing models based on validation loss.


In [21]:
def train_combined_model(model, criteria_model, train_dataloader, eval_dataloader, epochs=1000, lr=1e-4,
                         device='cpu', clip_grad_norm=1.0, patience=5, verbose=True):
    """
    Trains both the classification and rating prediction models, evaluates them after each epoch.

    For every batch the classification loss is combined with the mean of the
    per-sample rating losses, and a single backward pass / optimizer step is
    performed. A sample whose predicted class differs from its label gets a
    constant rating loss of 100; that constant carries no gradient and only
    raises the reported loss.

    Args:
        model (torch.nn.Module): The classification model (Dual_Combined_Model). Called with the six
            view tensors of a batch; must return (class logits, combined hidden features).
        criteria_model (torch.nn.Module): The rating prediction model (DeepClassificationWithRatingModel).
            Must expose `dead_criteria_model`, `squat_criteria_model` and `lunges_criteria_model`.
        train_dataloader (DataLoader): A DataLoader providing the training data.
        eval_dataloader (DataLoader): A DataLoader providing the evaluation data.
        epochs (int, optional): Number of epochs to train. Defaults to 1000.
        lr (float, optional): Learning rate for the optimizer. Defaults to 1e-4.
        device (str, optional): The device to train the model on ('cpu' or 'cuda'). Defaults to 'cpu'.
        clip_grad_norm (float, optional): Maximum norm for gradient clipping. Defaults to 1.0.
        patience (int, optional): Number of epochs with no improvement after which training will be stopped. Defaults to 5.
        verbose (bool, optional): Print per-batch and per-sample debugging output. Defaults to True.

    Returns:
        None: Prints the training progress, validation loss, classification accuracy, and rating accuracy.

    Raises:
        ValueError: If a correctly classified sample has a class index other than 0, 1 or 2.

    Side effects:
        Writes `best_combined_model_epoch_<n>.pt` and `best_rating_model_epoch_<n>.pt`
        to the working directory whenever the validation loss improves.
    """
    model.to(device)
    criteria_model.to(device)

    # Set up optimizer and loss functions
    trainable_params = list(model.parameters()) + list(criteria_model.parameters())
    optimizer = optim.Adam(trainable_params, lr=lr)
    criterion_class = nn.CrossEntropyLoss()  # Loss for classification task
    criterion_ratings = nn.BCEWithLogitsLoss()  # Loss for rating task (binary)

    # Learning rate scheduler
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, factor=0.5)

    # Class index -> attribute of criteria_model holding that action's rating head
    rating_head_names = {
        0: 'dead_criteria_model',    # Deadlift
        1: 'squat_criteria_model',   # Squat
        2: 'lunges_criteria_model',  # Lunges
    }
    # Rating loss reported for a misclassified sample
    mismatch_penalty = 100.0

    best_eval_loss = float('inf')
    patience_counter = 0
    num_batches = len(train_dataloader)

    for epoch in range(epochs):
        model.train()
        criteria_model.train()
        running_loss = 0.0
        step = 0

        for batch in train_dataloader:
            # Unpack batch data and move to the specified device
            (pixel_values_frontal, bool_masked_pos_frontal, pose_landmarks_tensor_frontal,
             pixel_values_lateral, bool_masked_pos_lateral, pose_landmarks_tensor_lateral,
             label_class, ratings) = [tensor.to(device) for tensor in batch]

            optimizer.zero_grad()

            # Forward pass through the classification model
            classification_output, combined_hidden = model(
                pixel_values_frontal, bool_masked_pos_frontal, pose_landmarks_tensor_frontal,
                pixel_values_lateral, bool_masked_pos_lateral, pose_landmarks_tensor_lateral
            )
            _, predicted_class = torch.max(classification_output, 1)

            # Loss for classification
            loss_class = criterion_class(classification_output, label_class)
            if verbose:
                print(f"real class : {label_class} and predicted class : {predicted_class}")
                print(f"loss class : {loss_class}")

            # Rating loss per sample, using the head of the sample's actual class
            sample_rating_losses = []
            for i in range(label_class.size(0)):
                actual_class = label_class[i].item()
                predicted_class_item = predicted_class[i].item()

                if predicted_class_item == actual_class:
                    if actual_class not in rating_head_names:
                        raise ValueError(
                            f"Unsupported action class {actual_class}; "
                            f"expected one of {sorted(rating_head_names)}"
                        )
                    rating_head = getattr(criteria_model, rating_head_names[actual_class])
                    ratings_output = rating_head(combined_hidden[i])
                    if verbose:
                        print(f"ratings : {ratings[i]} and ratings_output : {ratings_output}")
                    loss_ratings = criterion_ratings(ratings_output, ratings[i].float())
                else:
                    # High loss if the predicted class doesn't match
                    loss_ratings = torch.tensor(mismatch_penalty, device=device)

                if verbose:
                    print(f"loss ratings : {loss_ratings}")
                sample_rating_losses.append(loss_ratings)

            # Combine classification and rating losses. The graph is freed by
            # backward(), so it must run exactly once per batch.
            loss = loss_class + torch.stack(sample_rating_losses).mean()

            # Backward pass and optimization
            loss.backward()
            torch.nn.utils.clip_grad_norm_(trainable_params, clip_grad_norm)
            optimizer.step()

            running_loss += loss.item()
            if verbose:
                print(f"{step} and data_size: {num_batches} and epoch: {epoch + 1}")
            step += 1

        avg_loss = running_loss / num_batches if num_batches > 0 else 0.0
        print(f"Epoch [{epoch + 1}/{epochs}], Training Loss: {avg_loss:.4f}")

        # Evaluate both models after each epoch
        eval_loss, eval_accuracy, eval_rating_accuracy = evaluate_combined_model(
            model, criteria_model, eval_dataloader, device
        )
        print(f"Epoch [{epoch + 1}/{epochs}], Validation Loss: {eval_loss:.4f}, "
              f"Classification Accuracy: {eval_accuracy:.4f}, "
              f"Rating Accuracy (Deadlift): {eval_rating_accuracy['deadlift']:.4f}, "
              f"Rating Accuracy (Squat): {eval_rating_accuracy['squat']:.4f}, "
              f"Rating Accuracy (Lunges): {eval_rating_accuracy['lunges']:.4f}")

        # Scheduler step
        scheduler.step(eval_loss)

        # Checkpoint on improvement, otherwise count towards early stopping
        if eval_loss < best_eval_loss:
            best_eval_loss = eval_loss
            patience_counter = 0
            torch.save(model.state_dict(), f"best_combined_model_epoch_{epoch + 1}.pt")
            torch.save(criteria_model.state_dict(), f"best_rating_model_epoch_{epoch + 1}.pt")
            print("Model checkpoint saved.")
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered.")
                break

    print("Training complete.")

## `evaluate_combined_model`

- **Description**: This function evaluates both the classification and rating prediction models on the provided dataset. It computes the average evaluation loss, classification accuracy, and rating accuracies for different actions (deadlift, squat, lunges) using a given `DataLoader` and loss functions.

- **Args**:
  - `classification_model (torch.nn.Module)`: The classification model (e.g., `Dual_Combined_Model`).
  - `criteria_model (torch.nn.Module)`: The rating prediction model (e.g., `DeepClassificationWithRatingModel`).
  - `dataloader (DataLoader)`: A DataLoader providing the evaluation data.
  - `device (str)`: The device to perform evaluation on ('cpu' or 'cuda').
  - Note: the loss functions are not arguments. The classification loss (`CrossEntropyLoss`) is created inside the function, and only the classification loss contributes to the returned evaluation loss.

- **Returns**:
  - `float`: The average classification loss per batch.
  - `float`: The classification accuracy.
  - `dict`: The rating accuracy for each action (`deadlift`, `squat`, `lunges`).

- **Steps**:
  1. **Set Evaluation Mode**: Sets the models (`classification_model` and `criteria_model`) to evaluation mode.
  2. **Initialize Metrics**: Initializes accumulators for loss, classification accuracy, and rating accuracy for each action.
  3. **Evaluation Loop**:
     - Performs a forward pass through the classification model to get predictions and computes classification loss.
     - For each sample in the batch, if the predicted class matches the actual class, it computes the rating accuracy for that action.
     - The rating predictions are compared with the ground truth ratings, and accuracy is calculated for each action (deadlift, squat, lunges).
  4. **Compute Average Metrics**: Computes the average classification loss, overall classification accuracy, and rating accuracy for each action.
  5. **Return Results**: Returns the average loss, classification accuracy, and rating accuracies.


In [40]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_combined_model(classification_model, criteria_model, dataloader, device,
                            verbose=True, results_path='val_numbers.txt'):
    """
    Evaluates both the classification and rating models on the provided dataset.

    Ratings are only scored for samples whose predicted class equals the
    label; the rating head of that class is applied to the sample's hidden
    features and its sigmoid output is thresholded at 0.5.

    Args:
        classification_model (torch.nn.Module): The classification model. Called with the six view
            tensors of a batch; must return (class logits, combined hidden features).
        criteria_model (torch.nn.Module): The rating prediction model. Must expose
            `dead_criteria_model`, `squat_criteria_model` and `lunges_criteria_model`.
        dataloader (DataLoader): A DataLoader providing the evaluation data.
        device (str): The device to perform evaluation on ('cpu' or 'cuda').
        verbose (bool, optional): Print a running batch counter. Defaults to True.
        results_path (str or None, optional): File the summary is appended to;
            None skips writing. Defaults to 'val_numbers.txt'.

    Returns:
        tuple: (avg_loss, accuracy_class, rating_accuracy) where
            avg_loss (float) is the mean classification loss per batch,
            accuracy_class (float) is the classification accuracy, and
            rating_accuracy (dict) maps 'deadlift', 'squat' and 'lunges' to the
            fraction of correctly predicted rating values.
            All values are 0.0 when there is nothing to evaluate.
    """
    classification_model.to(device)
    criteria_model.to(device)
    classification_model.eval()
    criteria_model.eval()
    criterion_class = nn.CrossEntropyLoss()  # Loss for classification task

    # Class index -> (report name, rating head attribute, number of rated features)
    actions = {
        0: ('deadlift', 'dead_criteria_model', 5),
        1: ('squat', 'squat_criteria_model', 6),
        2: ('lunges', 'lunges_criteria_model', 7),
    }
    action_names = [name for name, _, _ in actions.values()]

    total_loss = 0.0
    correct_predictions_class = 0
    total_samples_class = 0

    # Correct / total rating values per action
    rating_accumulators = {name: 0 for name in action_names}
    total_rating_samples = {name: 0 for name in action_names}

    # Per-feature confusion counts per action
    feature_accuracies = {
        name: {key: [0] * num_features for key in ('TP', 'FP', 'FN', 'TN')}
        for name, _, num_features in actions.values()
    }

    batch_counter = 0
    with torch.no_grad():
        for batch in dataloader:
            (pixel_values_frontal, bool_masked_pos_frontal, pose_landmarks_tensor_frontal,
             pixel_values_lateral, bool_masked_pos_lateral, pose_landmarks_tensor_lateral,
             label_class, ratings) = [tensor.to(device) for tensor in batch]

            # Forward pass through the classification model
            classification_output, combined_hidden = classification_model(
                pixel_values_frontal, bool_masked_pos_frontal, pose_landmarks_tensor_frontal,
                pixel_values_lateral, bool_masked_pos_lateral, pose_landmarks_tensor_lateral
            )
            _, predicted_class = torch.max(classification_output, 1)

            # Classification loss and accuracy
            loss_class = criterion_class(classification_output, label_class)
            total_loss += loss_class.item()
            correct_predictions_class += (predicted_class == label_class).sum().item()
            total_samples_class += label_class.size(0)

            batch_counter += 1
            if verbose:
                print(batch_counter)

            # Rating accuracy per feature (only if predicted class matches actual class)
            for i in range(label_class.size(0)):
                actual_class = label_class[i].item()
                if actual_class != predicted_class[i].item() or actual_class not in actions:
                    continue

                action_name, head_name, _ = actions[actual_class]
                ratings_output = getattr(criteria_model, head_name)(combined_hidden[i])
                predicted_ratings = torch.sigmoid(ratings_output) > 0.5
                correct_ratings = (predicted_ratings == ratings[i].byte()).sum().item()
                rating_accumulators[action_name] += correct_ratings
                total_rating_samples[action_name] += ratings[i].numel()

                # Update the confusion counts of every feature of this sample
                counts = feature_accuracies[action_name]
                for feature_idx in range(ratings[i].size(0)):
                    predicted = predicted_ratings[feature_idx].item()
                    actual = ratings[i][feature_idx].item()
                    if actual == 1:
                        counts['TP' if predicted else 'FN'][feature_idx] += 1
                    elif actual == 0:
                        counts['FP' if predicted else 'TN'][feature_idx] += 1

    # Calculate the average loss and classification accuracy
    num_batches = len(dataloader)
    avg_loss = total_loss / num_batches if num_batches > 0 else 0.0
    accuracy_class = correct_predictions_class / total_samples_class if total_samples_class > 0 else 0.0

    # Calculate rating accuracy for each action
    rating_accuracy = {
        name: (rating_accumulators[name] / total_rating_samples[name]
               if total_rating_samples[name] > 0 else 0.0)
        for name in action_names
    }

    # Build the feature-level report once; it is both printed and written to file
    report_lines = []
    for action in action_names:
        counts = feature_accuracies[action]
        for feature_idx in range(len(counts['TP'])):
            TP = counts['TP'][feature_idx]
            FP = counts['FP'][feature_idx]
            FN = counts['FN'][feature_idx]
            TN = counts['TN'][feature_idx]

            precision = TP / (TP + FP) if TP + FP > 0 else 0.0
            recall = TP / (TP + FN) if TP + FN > 0 else 0.0
            f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0.0
            accuracy = (TP + TN) / (TP + FP + FN + TN) if TP + FP + FN + TN > 0 else 0.0
            report_lines.append(f"{action} feature {feature_idx + 1} - TP: {TP}, FP: {FP}, FN: {FN}, TN: {TN}")
            report_lines.append(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}, Accuracy: {accuracy:.4f}")

    for line in report_lines:
        print(line)

    # Append the results to a file
    if results_path is not None:
        with open(results_path, 'a') as f:
            f.write(f"Classification Accuracy: {accuracy_class:.4f}\n")
            f.write(f"Rating Accuracy - Deadlift: {rating_accuracy['deadlift']:.4f}\n")
            f.write(f"Rating Accuracy - Squat: {rating_accuracy['squat']:.4f}\n")
            f.write(f"Rating Accuracy - Lunges: {rating_accuracy['lunges']:.4f}\n")
            f.write("Feature-level Accuracies and Metrics:\n")
            f.writelines(line + "\n" for line in report_lines)

    return avg_loss, accuracy_class, rating_accuracy


In [None]:
# Image processor matching the VideoMAE checkpoint used by the models
processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")

# 16 frames per clip, one clip per batch
dataloader = create_dataloader(train_df, 16, processor, batch_size=1)
# Validation and test data are iterated in a fixed order; shuffling is only useful for training
eval_dataloader = create_dataloader(val_df, 16, processor, batch_size=1, shuffle=False)
test_dataloader = create_dataloader(test_df, 16, processor, batch_size=1, shuffle=False)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = 'cpu'

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


In [41]:
# To train from scratch instead of resuming, use:
# # Initialize the model
# dual_combined_model = Dual_Combined_Model()
# rating_model = DeepClassificationWithRatingModel()

# print(device)

# # Train the model
# train_combined_model(dual_combined_model, rating_model, dataloader, eval_dataloader,
#                      epochs=1000, lr=1e-4, device=device)


# Define the path to the saved model weights
model_path = "best_combined_model_epoch_1.pt"
criteria_model_path = "best_rating_model_epoch_1.pt"

dual_combined_model = Dual_Combined_Model()
rating_model = DeepClassificationWithRatingModel()

# Load the saved model weights. map_location remaps the stored tensors onto the
# active device, so checkpoints saved on a GPU also load on a CPU-only machine.
dual_combined_model.load_state_dict(torch.load(model_path, map_location=device))
rating_model.load_state_dict(torch.load(criteria_model_path, map_location=device))

print(device)

# Continue training the model

train_combined_model(dual_combined_model, rating_model, dataloader, eval_dataloader,
                      epochs=1000, lr=1e-4, device=device)

# Evaluate the model on dataset
evaluate_combined_model(dual_combined_model, rating_model, eval_dataloader, device)

cuda
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
deadlift feature 1 - TP: 37, FP: 3, FN: 0, TN: 0
Precision: 0.9250, Recall: 1.0000, F1-score: 0.9610, Accuracy: 0.9250
deadlift feature 2 - TP: 38, FP: 2, FN: 0, TN: 0
Precision: 0.9500, Recall: 1.0000, F1-score: 0.9744, Accuracy: 0.9500
deadlift feature 3 - TP: 17, FP: 23, FN: 0, TN: 0
Precision: 0.4250, Recall: 1.0000, F1-score: 0.5965, Accuracy: 0.4250
deadlift feature 4 - TP: 0, FP: 0, FN: 18, TN: 22
Precision: 0.0000, Recall: 0.0000, F1-score: 0.0000, Accuracy: 0.5500
deadlift feature 5 - TP: 35, FP: 5, FN: 0, TN: 0
Precision: 0.8750, Recall: 1.0000, F1-score: 0.9333, Accuracy: 0.8750
squat feature 1 - TP: 0, FP: 0, FN: 43, TN: 1
Precision: 0.0000, Recall: 0.0000, F1-score: 0.0000, Accuracy: 0

(0.001911617439202588,
 1.0,
 {'deadlift': 0.745,
  'squat': 0.6590909090909091,
  'lunges': 0.6190476190476191})