In [10]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from collections import deque
import matplotlib.pyplot as plt


from tensorflow import keras
from tensorflow.keras import layers

from tensorflow.keras.layers import *
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.utils import plot_model
# from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelEncoder
from keras.models import Model


import cv2
import mediapipe as mp

os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [11]:
dataset_dir = "../../UCF50"

training_ratio = 0.6
validation_ratio = 0.25
test_ratio = 0.15 

image_shape=(128,128)

classes =['PullUps','Punch','PushUps']
num_classes = len(classes)
label_encoder = LabelEncoder()
label_encoder.fit(classes)

number_of_frames=15

In [12]:
def extract_ucf50_classes(dataset_dir):
    classes = [folder for folder in os.listdir(dataset_dir) if os.path.isdir(os.path.join(dataset_dir, folder))]
    return classes


classes = extract_ucf50_classes(dataset_dir)
num_classes = len(classes)
label_encoder = LabelEncoder()
label_encoder.fit(classes)

In [6]:
classes

['Skijet',
 'JavelinThrow',
 'MilitaryParade',
 'HulaHoop',
 'Lunges',
 'SoccerJuggling',
 'Punch',
 'Swing',
 'BaseballPitch',
 'SkateBoarding',
 'YoYo',
 'Mixing',
 'Diving',
 'RockClimbingIndoor',
 'PommelHorse',
 'TrampolineJumping',
 'VolleyballSpiking',
 'PlayingGuitar',
 'CleanAndJerk',
 'BenchPress',
 'Rowing',
 'Skiing',
 'WalkingWithDog',
 'JugglingBalls',
 'PlayingTabla',
 'JumpingJack',
 'ThrowDiscus',
 'PlayingViolin',
 'HorseRace',
 'BreastStroke',
 'HighJump',
 'Drumming',
 'TaiChi',
 'Billiards',
 'PizzaTossing',
 'SalsaSpin',
 'PoleVault',
 'Basketball',
 'Kayaking',
 'JumpRope',
 'PushUps',
 'Nunchucks',
 'Biking',
 'PullUps',
 'TennisSwing',
 'RopeClimbing',
 'Fencing',
 'GolfSwing',
 'PlayingPiano',
 'HorseRiding']

In [4]:
def get_video_paths_and_categories(root_dir):
    training_video_paths = []
    training_targets = []
    validation_video_paths = []
    validation_targets = []
    test_video_paths = []
    test_targets = []
    

    for category in classes:
        category_path = os.path.join(root_dir, category)
        
        if os.path.isdir(category_path):
            nvideos = len(os.listdir(category_path))
            if nvideos == 0:
                print(f"No videos found for category: {category}")
                continue

            # Split counts based on the ratios
            training_count = int(nvideos * training_ratio)
            validation_count = int(nvideos * validation_ratio)
            test_count = nvideos - (training_count + validation_count)  # Remaining videos for testing
            
            label = label_encoder.transform([category])[0]
            
            # Get video paths
            video_files = os.listdir(category_path)
            for i in range(training_count):
                video_file = video_files[i]
                video_path = os.path.join(category_path, video_file)
                training_video_paths.append(video_path)
                training_targets.append(label)

            for i in range(training_count, training_count + validation_count):
                video_file = video_files[i]
                video_path = os.path.join(category_path, video_file)
                validation_video_paths.append(video_path)
                validation_targets.append(label)

            for i in range(training_count + validation_count, training_count + validation_count + test_count):
                video_file = video_files[i]
                video_path = os.path.join(category_path, video_file)
                test_video_paths.append(video_path)
                test_targets.append(label)

        else:
            print(f"No class found for {category}")

    return (training_video_paths, training_targets, 
            validation_video_paths, validation_targets, 
            test_video_paths, test_targets)

In [5]:
def apply_mediapipe_pose(video_path,number_of_frames=10):
    mp_pose = mp.solutions.pose
    cap = cv2.VideoCapture(video_path)
    
    if not cap.isOpened():
        print("Error: Could not open video.")
        return []

    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    step=total_frames//number_of_frames
    
    pose_landmarks_list = []
    
    index=-1
    frames_count=0
    with mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5, min_tracking_confidence=0.5) as pose:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            index+=1
            
            if index % step == 0 and frames_count<number_of_frames:
                frames_count+=1

                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                rgb_frame.flags.writeable = False

                results = pose.process(rgb_frame)

                rgb_frame.flags.writeable = True
                if results.pose_landmarks:
                    mp.solutions.drawing_utils.draw_landmarks(
                        frame, results.pose_landmarks, mp_pose.POSE_CONNECTIONS)

                    landmarks = results.pose_landmarks.landmark
                    pose_landmarks_list.append([(lm.x, lm.y, lm.z,lm.visibility) for lm in landmarks])


                #gray_frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
                # cv2.imshow('MediaPipe Pose', gray_frame)
                # cv2.imshow('MediaPipe Pose', frame)
                # if cv2.waitKey(10) & 0xFF == ord('q'):
                #     break
           
    
    #padding
            
    if len(pose_landmarks_list) < number_of_frames and len(pose_landmarks_list) > 0:
        x=-1
        while len(pose_landmarks_list) < number_of_frames:
            pose_landmarks_list.append(pose_landmarks_list[x])
            x-=1

    cap.release()
    cv2.destroyAllWindows()

    return np.array(pose_landmarks_list)
    # return pose_landmarks_list

In [5]:
def apply_mediapipe_pose_v2(video_path, number_of_frames=15):
    mp_pose = mp.solutions.pose
    cap = cv2.VideoCapture(video_path)
    
    if not cap.isOpened():
        print("Error: Could not open video.")
        return []

    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    step = total_frames // number_of_frames
    
    pose_landmarks_list = []
    selected_frame_indices=[]
    
    
    with mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5, min_tracking_confidence=0.5) as pose:
        # Calculate the frame range for each area and choose the first frame that has pose landmarks
        for i in range(number_of_frames):
            start_frame = i * step
            end_frame = (i + 1) * step if (i + 1) < number_of_frames else total_frames
            
            # Try to find a frame with pose landmarks within the area
            frame_with_landmarks = False
            
            for frame_index in range(start_frame, end_frame):
                cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
                ret, frame = cap.read()
                if not ret:
                    break

                # frame =cv2.resize(frame,image_shape)

                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                rgb_frame.flags.writeable = False
                results = pose.process(rgb_frame)

                rgb_frame.flags.writeable = True
                if results.pose_landmarks:
                    # Pose landmarks found, add the frame's landmarks to the list
                    landmarks = results.pose_landmarks.landmark
                    pose_landmarks_list.append([(lm.x, lm.y, lm.z, lm.visibility,lm.presence) for lm in landmarks])
                    selected_frame_indices.append(frame_index)
                    frame_with_landmarks = True
                    break  # Once we find the first frame with landmarks, break the loop for this area

            # If no frame with landmarks was found, we add None or some default padding (optional)
            if not frame_with_landmarks:
                return np.array([]),np.array([])

    cap.release()
    cv2.destroyAllWindows()

    # Padding if the number of frames is less than required
    # if len(pose_landmarks_list) < number_of_frames and len(pose_landmarks_list) > 0:
    #     x = -1
    #     while len(pose_landmarks_list) < number_of_frames:
    #         pose_landmarks_list.append(pose_landmarks_list[x])
    #         x -= 1

    return np.array(pose_landmarks_list),np.array(selected_frame_indices)

In [6]:
#get frames from video

def get_frames_from_video(video_path,frame_shape, n_frames=10):
    cap = cv2.VideoCapture(video_path)
        
    if not cap.isOpened():
        print(f"Error: Could not open video at path {video_path}.")
        return None

    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if total_frames == 0:
        print("Error: Video contains no frames.")
        cap.release()
        return None

    step = max(1, total_frames // n_frames)
    frames_count = 0
    result = []

    index = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Select frames based on step
        if index % step == 0 and frames_count < n_frames:
            resized_frame = cv2.resize(frame, frame_shape)
            result.append(resized_frame)
            frames_count += 1
        
        index += 1

    cap.release()
        
    while frames_count < n_frames:
        # print('aici')
        for i in range(1,n_frames,1):
            if frames_count < n_frames:
                frames_count+=1
                result.append(result[-i])
        
    result = np.stack(result, axis=0)
    result = result.astype(np.float32) / 255.0  # Normalize to [0, 1]

    return result
    
    
    

In [7]:
import cv2
import numpy as np

def get_frames_from_video_v2(video_path, frame_shape, frame_indices):
    cap = cv2.VideoCapture(video_path)

    if not cap.isOpened():
        print("Error: Could not open video.")
        return []

    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    result = []

    # Iterate over each index in the frame_indices list
    for frame_index in frame_indices:
        if frame_index >= total_frames:
            print(f"Warning: Frame index {frame_index} exceeds total number of frames in the video.")
            continue
        
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)  # Set the video capture to the specific frame index
        ret, frame = cap.read()
        if not ret:
            print(f"Error: Could not read frame at index {frame_index}.")
            continue
        
        # Resize the frame to the desired shape
        rgb_frame = cv2.resize(frame, frame_shape)
        result.append(rgb_frame)

    cap.release()

    # Convert to numpy array and normalize the pixel values to [0, 1]
    result = np.array(result)
    result = result / 255.0

    return result


In [None]:
# def hip_center_normalize_mediapipe_data(data):
#     # Separate coordinates and visibility
#     coordinates = data[:, :, :, :3]  # x, y, z
#     visibility = data[:, :, :, 3:4]  # visibility score
    
#     # Normalize coordinates (center around hip)
#     hip_center = coordinates[:, :, 23, :]  # Hip center (landmark 23)
#     centered_coords = coordinates - hip_center[:, :, None, :]
    
#     # Scale to similar size
#     max_abs_val = np.max(np.abs(centered_coords), axis=(1, 2, 3), keepdims=True)
#     normalized_coords = centered_coords / (max_abs_val + 1e-10)
    
#     # Reattach visibility (optional: you might want to use visibility as a weight)
#     processed_data = np.concatenate([normalized_coords, visibility], axis=3)
    
#     return processed_data

In [13]:
def hip_center_and_normalize(pl):
   
    data=pl.copy()
    coordinates = data[:, :, :, :3]  
    visibility = data[:, :, :, 3:4]  
    presence = data[:, :, :, 4:5]  

    
    left_hip = coordinates[:, :, 23, :] 
    right_hip = coordinates[:, :, 24, :] 
    hip_center = (left_hip + right_hip) / 2 
    # hip_center_z=hip_center[:,:,2]

    # batch , n_frames, n_landmarks, 5

    centered_coords = coordinates.copy()
    

    hip_center = hip_center[:, :, None, :]
    centered_coords -= hip_center

    
    x_values = centered_coords[:, :, :, 0]  
    y_values = centered_coords[:, :, :, 1]  
    z_values = centered_coords[:, :, :, 2]  
    
    x_min=np.min(x_values,keepdims=True,axis=2)
    x_max=np.max(x_values,keepdims=True,axis=2)
    x_size=np.abs(x_max-x_min)
    x_max_size=np.max(x_size,keepdims=True,axis=1)
    
    y_min=np.min(y_values,keepdims=True,axis=2)
    y_max=np.max(y_values,keepdims=True,axis=2)
    y_size=np.abs(y_max-y_min)
    y_max_size=np.max(y_size,keepdims=True,axis=1)
    
    z_min=np.min(z_values,keepdims=True,axis=2)
    z_max=np.max(z_values,keepdims=True,axis=2)
    z_size=np.abs(z_max-z_min)   
    z_max_size=np.max(z_size,keepdims=True,axis=1)


    epsilon = 1e-8
    
    x_values /= (x_max_size + epsilon)
    y_values /= (y_max_size + epsilon)
    z_values /= (z_max_size + epsilon)
    
    centered_coords[:, :, :, 0] = x_values
    centered_coords[:, :, :, 1] = y_values
    centered_coords[:, :, :, 2] = z_values
    
    processed_data = np.concatenate([centered_coords, visibility, presence], axis=3)

    return processed_data

In [8]:
x_train, y_train, x_val, y_val, x_test, y_test = get_video_paths_and_categories(dataset_dir)

In [None]:
import concurrent.futures

# Initialize start indices and number of items per worker
train_size = len(x_train)
val_size = len(x_val)
test_size = len(x_test)

P = 48
train_startof=[None]*P
val_startof=[None]*P
test_startof=[None]*P

train_nof=[train_size // P] * P
val_nof=[val_size // P] * P
test_nof=[test_size // P] * P

train_startof[0]=0
val_startof[0]=0
test_startof[0]=0



for i in range(P):
    if i < train_size % P:
        train_nof[i] += 1
    if i < val_size % P:
        val_nof[i] += 1
    if i < test_size % P:
        test_nof[i] += 1
    
    if i > 0:
        train_startof[i] = train_startof[i - 1] + train_nof[i - 1]
        val_startof[i] = val_startof[i - 1] + val_nof[i - 1]
        test_startof[i] = test_startof[i - 1] + test_nof[i - 1]


# Worker function to process a chunk of the data
def worker(tid):
    
    # print(f"Worker {tid} started")
    
    train_start=train_startof[tid]
    train_n=train_nof[tid]
    val_start=val_startof[tid]
    val_n=val_nof[tid]
    test_start=test_startof[tid]
    test_n=test_nof[tid]
    
    train_end=train_start+train_n
    val_end=val_start+val_n
    test_end=test_start+test_n
    
    # train_x=[]
    # train_y=[]
    # val_x=[]
    # val_y=[]
    # test_x=[]
    # test_y=[]
    
    x_train_pose = []
    x_val_pose = []
    x_test_pose = []

    y_train_pose = []
    y_val_pose = []
    y_test_pose = []

    x_train_video=[]
    x_val_video=[]
    x_test_video=[]

    y_test_video=[]
    y_val_video=[]
    y_train_video=[]
    
    for i in range(train_start, train_end):
        video_path = x_train[i]
        target = y_train[i]
        pose_landmarks,indices = apply_mediapipe_pose_v2(video_path,number_of_frames)
        if len(pose_landmarks) == number_of_frames:
            x_train_pose.append(pose_landmarks)
            y_train_pose.append(target)
            x_train_video.append(get_frames_from_video_v2(video_path,image_shape,indices))
            y_train_video.append(target)
            
    for i in range(val_start, val_end):
        video_path = x_val[i]
        target = y_val[i]
        pose_landmarks,indices = apply_mediapipe_pose_v2(video_path,number_of_frames)
        if len(pose_landmarks) == number_of_frames:
            x_val_pose.append(pose_landmarks)
            y_val_pose.append(target)
            x_val_video.append(get_frames_from_video_v2(video_path,image_shape,indices))
            y_val_video.append(target)
            
    for i in range(test_start, test_end):
        video_path = x_test[i]
        target = y_test[i]
        pose_landmarks,indices = apply_mediapipe_pose_v2(video_path,number_of_frames)
        if len(pose_landmarks) == number_of_frames:
            x_test_pose.append(pose_landmarks)
            y_test_pose.append(target)
            x_test_video.append(get_frames_from_video_v2(video_path,image_shape,indices))
            y_test_video.append(target)
            
    return tid, x_train_pose, y_train_pose, x_val_pose, y_val_pose, x_test_pose, y_test_pose, x_train_video, y_train_video, x_val_video, y_val_video, x_test_video, y_test_video
            
    
 
    # for idx in range(startof[i], startof[i] + nof[i]):
    #     video_path = video_paths[idx]
    #     target = targets[idx]
    #     pose_landmarks = apply_mediapipe_pose(video_path,number_of_frames)
    #     if len(pose_landmarks) == P:
    #         local_x.append(pose_landmarks)
    #         local_y.append(target)
    # return i, local_x, local_y

with concurrent.futures.ThreadPoolExecutor(max_workers=P) as executor:
    results = list(executor.map(worker, range(P)))
    

I0000 00:00:1742502682.199539 1694344 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1742502682.342347 1694605 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 535.216.03), renderer: NVIDIA A40/PCIe/SSE2
I0000 00:00:1742502682.359797 1694340 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
I0000 00:00:1742502682.408034 1694654 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 535.216.03), renderer: NVIDIA A40/PCIe/SSE2
I0000 00:00:1742502682.453212 1694358 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
W0000 00:00:1742502682.487579 1694557 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
I0000 00:00:1742502682.505986 1694799 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 535.216.03), renderer: NVIDIA A40/PCIe/SSE2
I0000 00:00:174250

In [None]:
# #old version

# import concurrent.futures

# # Initialize start indices and number of items per worker
# train_size = len(x_train)
# val_size = len(x_val)
# test_size = len(x_test)

# P = 48
# train_startof=[None]*P
# val_startof=[None]*P
# test_startof=[None]*P

# train_nof=[train_size // P] * P
# val_nof=[val_size // P] * P
# test_nof=[test_size // P] * P

# train_startof[0]=0
# val_startof[0]=0
# test_startof[0]=0



# for i in range(P):
#     if i < train_size % P:
#         train_nof[i] += 1
#     if i < val_size % P:
#         val_nof[i] += 1
#     if i < test_size % P:
#         test_nof[i] += 1
    
#     if i > 0:
#         train_startof[i] = train_startof[i - 1] + train_nof[i - 1]
#         val_startof[i] = val_startof[i - 1] + val_nof[i - 1]
#         test_startof[i] = test_startof[i - 1] + test_nof[i - 1]


# # Worker function to process a chunk of the data
# def worker(tid):
    
#     # print(f"Worker {tid} started")
    
#     train_start=train_startof[tid]
#     train_n=train_nof[tid]
#     val_start=val_startof[tid]
#     val_n=val_nof[tid]
#     test_start=test_startof[tid]
#     test_n=test_nof[tid]
    
#     train_end=train_start+train_n
#     val_end=val_start+val_n
#     test_end=test_start+test_n
    
#     # train_x=[]
#     # train_y=[]
#     # val_x=[]
#     # val_y=[]
#     # test_x=[]
#     # test_y=[]
    
#     x_train_pose = []
#     x_val_pose = []
#     x_test_pose = []

#     y_train_pose = []
#     y_val_pose = []
#     y_test_pose = []

#     x_train_video=[]
#     x_val_video=[]
#     x_test_video=[]

#     y_test_video=[]
#     y_val_video=[]
#     y_train_video=[]
    
#     for i in range(train_start, train_end):
#         video_path = x_train[i]
#         target = y_train[i]
#         pose_landmarks = apply_mediapipe_pose(video_path,number_of_frames)
#         if len(pose_landmarks) == number_of_frames:
#             x_train_pose.append(pose_landmarks)
#             y_train_pose.append(target)
#             x_train_video.append(get_frames_from_video(video_path,image_shape,number_of_frames))
#             y_train_video.append(target)
            
#     for i in range(val_start, val_end):
#         video_path = x_val[i]
#         target = y_val[i]
#         pose_landmarks = apply_mediapipe_pose(video_path,number_of_frames)
#         if len(pose_landmarks) == number_of_frames:
#             x_val_pose.append(pose_landmarks)
#             y_val_pose.append(target)
#             x_val_video.append(get_frames_from_video(video_path,image_shape,number_of_frames))
#             y_val_video.append(target)
            
#     for i in range(test_start, test_end):
#         video_path = x_test[i]
#         target = y_test[i]
#         pose_landmarks = apply_mediapipe_pose(video_path,number_of_frames)
#         if len(pose_landmarks) == number_of_frames:
#             x_test_pose.append(pose_landmarks)
#             y_test_pose.append(target)
#             x_test_video.append(get_frames_from_video(video_path,image_shape,number_of_frames))
#             y_test_video.append(target)
            
#     return tid, x_train_pose, y_train_pose, x_val_pose, y_val_pose, x_test_pose, y_test_pose, x_train_video, y_train_video, x_val_video, y_val_video, x_test_video, y_test_video
            
    
 
#     # for idx in range(startof[i], startof[i] + nof[i]):
#     #     video_path = video_paths[idx]
#     #     target = targets[idx]
#     #     pose_landmarks = apply_mediapipe_pose(video_path,number_of_frames)
#     #     if len(pose_landmarks) == P:
#     #         local_x.append(pose_landmarks)
#     #         local_y.append(target)
#     # return i, local_x, local_y

# with concurrent.futures.ThreadPoolExecutor(max_workers=P) as executor:
#     results = list(executor.map(worker, range(P)))

I0000 00:00:1740829450.704429 2287606 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1740829450.912881 2288519 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 535.216.03), renderer: NVIDIA A40/PCIe/SSE2
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
I0000 00:00:1740829450.993152 2287630 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
W0000 00:00:1740829451.017356 2287904 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
I0000 00:00:1740829451.051950 2288955 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 535.216.03), renderer: NVIDIA A40/PCIe/SSE2
I0000 00:00:1740829451.085569 2287598 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
W0000 00:00:1740829451.097656 2287917 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling suppo

In [10]:
sorted_results = sorted(results, key=lambda x: x[0])

def get_shape(lst):
    if isinstance(lst, list):
        return [len(lst)] + get_shape(lst[0]) if lst else []
    return []

x_train_pose = []
x_val_pose = []
x_test_pose = []

y_train_pose = []
y_val_pose = []
y_test_pose = []

x_train_video=[]
x_val_video=[]
x_test_video=[]

y_test_video=[]
y_val_video=[]
y_train_video=[]

for result in sorted_results:
    i,worker_train_pose_x, worker_train_pose_y, worker_val_pose_x, worker_val_pose_y, worker_test_pose_x, worker_test_pose_y, worker_train_video_x, worker_train_video_y, worker_val_video_x, worker_val_video_y, worker_test_video_x, worker_test_video_y = result
    # Fill the respective ranges in the corresponding arrays
    # x_train_data[train_startof[i]:train_startof[i] + train_nof[i]] = worker_train_x
    # y_train_data[train_startof[i]:train_startof[i] + train_nof[i]] = worker_train_y
    # x_val_data[val_startof[i]:val_startof[i] + val_nof[i]] = worker_val_x
    # y_val_data[val_startof[i]:val_startof[i] + val_nof[i]] = worker_val_y
    # x_test_data[test_startof[i]:test_startof[i] + test_nof[i]] = worker_test_x
    # y_test_data[test_startof[i]:test_startof[i] + test_nof[i]] = worker_test_y
    
    x_train_pose[train_startof[i]:train_startof[i] + train_nof[i]] = worker_train_pose_x
    y_train_pose[train_startof[i]:train_startof[i] + train_nof[i]] = worker_train_pose_y
    x_val_pose[val_startof[i]:val_startof[i] + val_nof[i]] = worker_val_pose_x
    y_val_pose[val_startof[i]:val_startof[i] + val_nof[i]] = worker_val_pose_y
    x_test_pose[test_startof[i]:test_startof[i] + test_nof[i]] = worker_test_pose_x
    y_test_pose[test_startof[i]:test_startof[i] + test_nof[i]] = worker_test_pose_y 
    
    x_train_video[train_startof[i]:train_startof[i] + train_nof[i]] = worker_train_video_x
    y_train_video[train_startof[i]:train_startof[i] + train_nof[i]] = worker_train_video_y
    x_val_video[val_startof[i]:val_startof[i] + val_nof[i]] = worker_val_video_x
    y_val_video[val_startof[i]:val_startof[i] + val_nof[i]] = worker_val_video_y
    x_test_video[test_startof[i]:test_startof[i] + test_nof[i]] = worker_test_video_x
    y_test_video[test_startof[i]:test_startof[i] + test_nof[i]] = worker_test_video_y   
    
x_train_pose=np.array(x_train_pose)
x_val_pose=np.array(x_val_pose)
x_test_pose=np.array(x_test_pose)

y_train_pose=np.array(y_train_pose)
y_val_pose=np.array(y_val_pose)
y_test_pose=np.array(y_test_pose) 

x_train_video=np.array(x_train_video)
x_val_video=np.array(x_val_video)
x_test_video=np.array(x_test_video)

y_train_video=np.array(y_train_video)
y_val_video=np.array(y_val_video)
y_test_video=np.array(y_test_video)



In [11]:
x_test_pose.shape

(531, 15, 33, 5)

In [12]:
np.save('x_train_pose.npy', x_train_pose)
np.save('x_val_pose.npy', x_val_pose)
np.save('x_test_pose.npy', x_test_pose)

np.save('y_train_pose.npy', y_train_pose)
np.save('y_val_pose.npy', y_val_pose)
np.save('y_test_pose.npy', y_test_pose)

np.save('x_train_video.npy', x_train_video)
np.save('x_val_video.npy', x_val_video)
np.save('x_test_video.npy', x_test_video)

np.save('y_train_video.npy', y_train_video)
np.save('y_val_video.npy', y_val_video)
np.save('y_test_video.npy', y_test_video)

In [4]:
pose_landmarks_channles=x_train_pose.shape[3]

NameError: name 'x_train_pose' is not defined

In [14]:
def hip_center_and_normalize(pl):
   
    data=pl.copy()
    coordinates = data[:, :, :, :3]  
    visibility = data[:, :, :, 3:4]  
    presence = data[:, :, :, 4:5]  

    
    left_hip = coordinates[:, :, 23, :] 
    right_hip = coordinates[:, :, 24, :] 
    hip_center = (left_hip + right_hip) / 2 
    # hip_center_z=hip_center[:,:,2]

    # batch , n_frames, n_landmarks, 5

    centered_coords = coordinates.copy()
    

    hip_center = hip_center[:, :, None, :]
    centered_coords -= hip_center

    
    x_values = centered_coords[:, :, :, 0]  
    y_values = centered_coords[:, :, :, 1]  
    z_values = centered_coords[:, :, :, 2]  
    
    x_min=np.min(x_values,keepdims=True,axis=2)
    x_max=np.max(x_values,keepdims=True,axis=2)
    x_size=np.abs(x_max-x_min)
    x_max_size=np.max(x_size,keepdims=True,axis=1)
    
    y_min=np.min(y_values,keepdims=True,axis=2)
    y_max=np.max(y_values,keepdims=True,axis=2)
    y_size=np.abs(y_max-y_min)
    y_max_size=np.max(y_size,keepdims=True,axis=1)
    
    z_min=np.min(z_values,keepdims=True,axis=2)
    z_max=np.max(z_values,keepdims=True,axis=2)
    z_size=np.abs(z_max-z_min)   
    z_max_size=np.max(z_size,keepdims=True,axis=1)


    epsilon = 1e-8
    
    x_values /= (x_max_size + epsilon)
    y_values /= (y_max_size + epsilon)
    z_values /= (z_max_size + epsilon)
    
    centered_coords[:, :, :, 0] = x_values
    centered_coords[:, :, :, 1] = y_values
    centered_coords[:, :, :, 2] = z_values
    
    processed_data = np.concatenate([centered_coords, visibility, presence], axis=3)

    return processed_data

In [15]:
x_train_pose = np.load('x_train_pose.npy')
x_val_pose = np.load('x_val_pose.npy')
x_test_pose = np.load('x_test_pose.npy')

y_train_pose = np.load('y_train_pose.npy')
y_val_pose = np.load('y_val_pose.npy')
y_test_pose = np.load('y_test_pose.npy')

x_train_video = np.load('x_train_video.npy')
x_val_video = np.load('x_val_video.npy')
x_test_video = np.load('x_test_video.npy')

y_train_video = np.load('y_train_video.npy')
y_val_video = np.load('y_val_video.npy')
y_test_video = np.load('y_test_video.npy')

pose_landmarks_channeles=x_train_pose.shape[3]

In [9]:
x_test_pose = np.load('x_test_pose.npy')
y_test_pose = np.load('y_test_pose.npy')

x_test_video = np.load('x_test_video.npy')
y_test_video = np.load('y_test_video.npy')

In [None]:
# x_train_pose=normalize_video_pose(x_train_pose)
# x_val_pose=normalize_video_pose(x_val_pose)
# x_test_pose=normalize_video_pose(x_test_pose)

In [17]:
x_train_pose=hip_center_and_normalize(x_train_pose)
x_val_pose=hip_center_and_normalize(x_val_pose)
x_test_pose=hip_center_and_normalize(x_test_pose)

In [16]:
x_train_pose.shape , x_val_pose.shape, x_test_pose.shape

((2156, 15, 33, 5), (888, 15, 33, 5), (531, 15, 33, 5))

In [23]:
x_train_pose.shape , x_val_pose.shape, x_test_pose.shape

((3351, 15, 33, 4), (1372, 15, 33, 4), (851, 15, 33, 4))

In [18]:
#reshape mediapipe data
number_of_frames=15
x_train_pose = np.array(x_train_pose).reshape(len(x_train_pose), number_of_frames,33*pose_landmarks_channeles)
x_val_pose = np.array(x_val_pose).reshape(len(x_val_pose), number_of_frames,33*pose_landmarks_channeles)
x_test_pose = np.array(x_test_pose).reshape(len(x_test_pose), number_of_frames,33*pose_landmarks_channeles)

In [None]:
x_test_pose = np.array(x_test_pose).reshape(len(x_test_pose), number_of_frames,33*pose_landmarks_channeles)

In [None]:
#video data

# x_train_video = np.array([get_frames_from_video(video_path, image_shape, number_of_frames) for video_path in x_train])
# x_val_video = np.array([get_frames_from_video(video_path, image_shape, number_of_frames) for video_path in x_val])
# x_test_video = np.array([get_frames_from_video(video_path, image_shape, number_of_frames) for video_path in x_test])

# y_train_video = np.array(y_train)
# y_val_video = np.array(y_val)
# y_test_video = np.array(y_test)




In [None]:
# np.save('x_train_video.npy', x_train_video)
# np.save('x_val_video.npy', x_val_video)
# np.save('x_test_video.npy', x_test_video)

# np.save('y_train_video.npy', y_train_video)
# np.save('y_val_video.npy', y_val_video)
# np.save('y_test_video.npy', y_test_video)



In [19]:
from tensorflow.keras.applications import ResNet152V2,ResNet50V2
from tensorflow.keras.layers import Input, LSTM, TimeDistributed, Dense, Flatten, Concatenate,Dropout
from tensorflow.keras.models import Model

def create_model(input_shape, num_classes, number_of_frames=10):
    
    pretrained_model = ResNet50V2(
        include_top=False,
        weights="imagenet",
        pooling="avg",
        input_shape=input_shape[1:]
    )
    pretrained_model.trainable = False 

    
    video_input = Input(shape=input_shape, name="video_input")
    x = TimeDistributed(pretrained_model)(video_input)
    x = LSTM(256, return_sequences=True)(x)
    # # x = Dense(512, activation='relu')(x)
    x = LSTM(128, return_sequences=False)(x)
    video_output = Dense(256, activation='relu')(x)
    # video_output = Dense(128, activation='relu')(x)

   
    pose_input = Input(shape=(number_of_frames, 33 * pose_landmarks_channeles), name="pose_input")
    y = LSTM(512, return_sequences=True)(pose_input)
    y = Dropout(0.2)(y)
    y = LSTM(256,return_sequences=True)(y)
    y=Dropout(0.2)(y)
    y=LSTM(256)(y)

    pose_output = Dense(256, activation='relu')(y)

    
    merged = Concatenate()([video_output, pose_output])
    final_output = Dense(num_classes, activation="softmax")(merged)

    
    model = Model(inputs=[video_input, pose_input], outputs=final_output)
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    
    return model


In [26]:
print(x_train_video.shape)
print(x_train_pose.shape)


(3351, 15, 128, 128, 3)
(3351, 15, 132)


In [20]:
# x_train_video.shape[2:]

model=create_model(x_train_video.shape[1:], num_classes, number_of_frames)
model.summary()

In [22]:
early_stopping_callback = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
    verbose=1
)

checkpoint_callback = ModelCheckpoint(
    filepath='resnet50_lstm.keras',
    monitor='val_accuracy',
    save_best_only=True,
    save_weights_only=False,
    mode='max',
    verbose=1
)

In [24]:
callbacks = [
    keras.callbacks.ModelCheckpoint(
        "multi_input_resnet50+lstm.keras",
        save_best_only=True,
        monitor="val_accuracy",
        mode="max",
        save_weights_only=False,
        verbose=1
    ),
    keras.callbacks.ReduceLROnPlateau(
        monitor="val_loss", factor=0.5, patience=20, min_lr=0.0001
    ),
    keras.callbacks.EarlyStopping(monitor='val_loss',
    patience=10,
    restore_best_weights=True,
    verbose=1),
]

In [23]:
# train_labels=to_categorical(y_train_pose, num_classes)
# val_labels=to_categorical(y_val_pose, num_classes)

# print(train_labels)

model.fit([x_train_video, x_train_pose], y_train_pose,
          validation_data=([x_val_video, x_val_pose], y_val_pose),
          callbacks=[early_stopping_callback, checkpoint_callback],
          batch_size=32, epochs=500,verbose=1)

# model.fit([x_train_video, x_train_pose], y_train_pose,
#           validation_data=([x_val_video, x_val_pose], y_val_pose),
#           callbacks=callbacks,
#           batch_size=32, epochs=500)

2025-05-23 17:47:48.570813: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 6358302720 exceeds 10% of free system memory.


Epoch 1/500
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.1914 - loss: 3.2476

2025-05-23 17:51:18.552383: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 2618818560 exceeds 10% of free system memory.



Epoch 1: val_accuracy improved from -inf to 0.53716, saving model to resnet50_lstm.keras
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m282s[0m 3s/step - accuracy: 0.1933 - loss: 3.2392 - val_accuracy: 0.5372 - val_loss: 1.7776
Epoch 2/500
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.6072 - loss: 1.4739
Epoch 2: val_accuracy improved from 0.53716 to 0.60023, saving model to resnet50_lstm.keras
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m195s[0m 3s/step - accuracy: 0.6075 - loss: 1.4728 - val_accuracy: 0.6002 - val_loss: 1.4277
Epoch 3/500
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.7441 - loss: 0.9438
Epoch 3: val_accuracy improved from 0.60023 to 0.69369, saving model to resnet50_lstm.keras
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m190s[0m 3s/step - accuracy: 0.7444 - loss: 0.9426 - val_accuracy: 0.6937 - val_loss: 1.0894
Epoch 4/500
[1m68/68[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x7fb814a78b80>

In [24]:
model=keras.models.load_model('resnet50_lstm.keras')
acc,loss=model.evaluate([x_test_video, x_test_pose], y_test_pose)
print(f"Test accuracy: {acc}")
print(f"Test loss: {loss}")

2025-05-23 18:58:04.365771: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1565982720 exceeds 10% of free system memory.


[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 2s/step - accuracy: 0.8819 - loss: 0.4351
Test accuracy: 0.3730296790599823
Test loss: 0.9020715355873108


In [None]:
#best

model=keras.models.load_model('multi_input_resnet50+lstm.keras')
acc,loss=model.evaluate([x_test_video, x_test_pose], y_test_pose)
print(f"Test accuracy: {acc}")
print(f"Test loss: {loss}")


[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 2s/step - accuracy: 0.8538 - loss: 0.6609
Test accuracy: 0.5776236057281494
Test loss: 0.8549906015396118


In [31]:
model.evaluate([x_test_video, x_test_pose], y_test_pose)

[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 2s/step - accuracy: 0.8842 - loss: 0.4217


[0.3507292866706848, 0.9058380126953125]

In [29]:
#not padded

model.evaluate([x_test_video, x_test_pose], y_test_pose)

[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 2s/step - accuracy: 0.8578 - loss: 0.7496


[0.497791588306427, 0.8945385813713074]

In [27]:
import keras

model=keras.models.load_model('resnet50_mediapipe_nonpadded_pose.keras')

In [2]:
#padded

model.evaluate([x_test_video, x_test_pose], y_test_pose)

NameError: name 'model' is not defined

In [3]:
x_test_pose

NameError: name 'x_test_pose' is not defined