In [None]:
import cv2
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from keras.layers import Conv3D, ConvLSTM2D, Input, TimeDistributed, Flatten, Dense
from keras.models import Model
from sklearn.metrics import mean_squared_error
import tensorflow as tfj
import numpy as np
import cv2

In [5]:
# Define paths
dataset_path = "/kaggle/input/ucf101-action-recognition"
train_csv = os.path.join(dataset_path, "train.csv")
test_csv=os.path.join(dataset_path,"test.csv")
val_csv=os.path.join(dataset_path,"val.csv")
selected_classes = ['PullUps', 'PushUps', 'SoccerJuggling', 'SoccerPenalty', 'VolleyballSpiking']
#load into dataframes
train_df = pd.read_csv(train_csv)
test_df=pd.read_csv(test_csv)
val_df=pd.read_csv(val_csv)


selected_actions_df = train_df[train_df['label'].isin(selected_classes)]
total_videos = len(selected_actions_df)
video_paths = selected_actions_df['clip_path'].tolist()


In [None]:
# Path to the dataset
base_path = "/kaggle/input/ucf101-action-recognition"
actions_dir = "train/selected_actions"

# List to store frames for all selected actions
all_video_frames = []

# Loop through each video file in the selected actions directory
for file_name in os.listdir(os.path.join(base_path, actions_dir)):
    video_relative_path = os.path.join(actions_dir, file_name)
    video_full_path = os.path.join(base_path, video_relative_path)

    # Check if the video file exists
    if not os.path.exists(video_full_path):
        print(f"Error: File not found - {video_full_path}")
        continue

    # Load the video using OpenCV
    video_capture = cv2.VideoCapture(video_full_path)

    if video_capture.isOpened():
        frames_sequence = []

        while True:
            ret, frame = video_capture.read()
            if not ret:
                break  # Exit the loop when the video ends

            # Resize the frame to 64x64 and convert to grayscale
            resized_frame = cv2.resize(frame, (64, 64))
            grayscale_frame = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2GRAY)
            frames_sequence.append(grayscale_frame)

        # Add frames of the current video to the main list
        all_video_frames.append(frames_sequence)

    else:
        print(f"Error: Unable to open video - {video_full_path}")


In [7]:
input_seq_len = 10
output_seq_len = 5

def generate_sequences(frame_data, input_len, output_len):
    inputs = []
    targets = []
    
    # Iterate to create input-output pairs
    for idx in range(len(frame_data) - input_len - output_len):
        input_data = frame_data[idx:idx + input_len]
        target_data = frame_data[idx + input_len:idx + input_len + output_len]
        
        inputs.append(input_data)
        targets.append(target_data)
    
    return np.array(inputs), np.array(targets)

# Use frames from the first video in the list
selected_video_frames = all_video_frames[0]
X_data, Y_data = generate_sequences(selected_video_frames, input_seq_len, output_seq_len)

Input shape: (71, 10, 64, 64), Output shape: (71, 5, 64, 64)


In [None]:
# Define STM Cell (modified from PredRNN)
class STMCell(nn.Module):
    def __init__(self, input_channels, hidden_channels, kernel_size):
        super(STMCell, self).__init__()
        self.input_channels = input_channels
        self.hidden_channels = hidden_channels
        padding = kernel_size // 2

        self.conv = nn.Conv2d(
            input_channels + hidden_channels * 2, 
            hidden_channels * 4, 
            kernel_size, 
            padding=padding
        )

    def forward(self, x, h, c, m):
        combined = torch.cat([x, h, m], dim=1)  # Combine input, hidden state, and memory
        conv_out = self.conv(combined)
        cc_i, cc_f, cc_o, cc_g = torch.split(conv_out, self.hidden_channels, dim=1)

        i = torch.sigmoid(cc_i)
        f = torch.sigmoid(cc_f)
        o = torch.sigmoid(cc_o)
        g = torch.tanh(cc_g)

        c_next = f * c + i * g
        h_next = o * torch.tanh(c_next)
        m_next = m + c_next  # Update spatio-temporal memory
        return h_next, c_next, m_next

# Define PredRNN Model with STM Cells
class PredRNN(nn.Module):
    def __init__(self, input_channels, hidden_channels, kernel_size, num_layers, output_channels):
        super(PredRNN, self).__init__()
        self.num_layers = num_layers
        self.hidden_channels = hidden_channels

        self.cells = nn.ModuleList([
            STMCell(
                input_channels if i == 0 else hidden_channels, 
                hidden_channels, 
                kernel_size
            ) for i in range(num_layers)
        ])
        self.conv_out = nn.Conv2d(hidden_channels, output_channels, kernel_size=1)

    def forward(self, x, predict_steps=1):
        batch_size, seq_len, _, height, width = x.size()
        h, c, m = self.init_hidden(batch_size, height, width, x.device)

        # Process input sequence
        for t in range(seq_len):
            x_t = x[:, t, :, :, :]
            for i, cell in enumerate(self.cells):
                h[i], c[i], m[i] = cell(x_t, h[i], c[i], m[i])
                x_t = h[i]

        # Generate future frames
        outputs = []
        x_t = self.conv_out(h[-1])  # First predicted frame based on last hidden state
        outputs.append(x_t)

        for _ in range(predict_steps - 1):
            for i, cell in enumerate(self.cells):
                h[i], c[i], m[i] = cell(x_t, h[i], c[i], m[i])
                x_t = h[i]
            x_t = self.conv_out(h[-1])
            outputs.append(x_t)

        return torch.stack(outputs, dim=1)

    def init_hidden(self, batch_size, height, width, device):
        h = [torch.zeros(batch_size, self.hidden_channels, height, width).to(device) for _ in range(self.num_layers)]
        c = [torch.zeros(batch_size, self.hidden_channels, height, width).to(device) for _ in range(self.num_layers)]
        m = [torch.zeros(batch_size, self.hidden_channels, height, width).to(device) for _ in range(self.num_layers)]
        return h, c, m

In [9]:
# Example: Dividing video frames into input-output segments
def split_video_frames(frame_list, seq_len=10, target_len=5):
    """
    Splits a sequence of video frames into input-output pairs for model training.

    :param frame_list: List containing all video frames (64x64 grayscale).
    :param seq_len: Number of frames in each input sequence.
    :param target_len: Number of frames in each output sequence.
    :return: Tuple (X_data, Y_data) of prepared input and output sequences.
    """
    X_data, Y_data = [], []
    num_frames = len(frame_list)

    # Generate input and target sequences
    for index in range(num_frames - seq_len - target_len + 1):
        # Extract input sequence
        X_data.append(frame_list[index : index + seq_len])

        # Extract corresponding target sequence
        Y_data.append(frame_list[index + seq_len : index + seq_len + target_len])

    # Convert lists to numpy arrays and add channel dimension for grayscale images
    X_data = np.expand_dims(np.array(X_data), axis=-1)  # Shape: (samples, seq_len, 64, 64, 1)
    Y_data = np.expand_dims(np.array(Y_data), axis=-1)  # Shape: (samples, target_len, 64, 64, 1)
    return X_data, Y_data

# Flatten all video frames into a single sequence
flattened_frames = [frame for video_clips in all_video_frames for frame in video_clips]

# Define input and target sequence lengths
seq_len = 10
target_len = 10  # Updated to 10 frames for the target sequence

# Generate input-output sequences
X_data, Y_data = split_video_frames(flattened_frames, seq_len=seq_len, target_len=target_len)

# Display the shapes of the resulting datasets
print(f"X_data shape: {X_data.shape}, Y_data shape: {Y_data.shape}")

X shape: (14351, 10, 64, 64, 1), Y shape: (14351, 10, 64, 64, 1)


In [None]:
# Compile the model
model.compile(
    optimizer="adam",
    loss="mse",  # Mean Squared Error for regression-like predictions
    metrics=["mae"]  # Mean Absolute Error
)

# Train the model
history = model.fit(
    X, Y,
    epochs=20,  # Adjust epochs as needed
    batch_size=8,  # Adjust batch size based on GPU memory
    validation_split=0.2  # Use 20% of data for validation
)
model.save("predrnn_model.h5")


Epoch 1/20


I0000 00:00:1733246909.972445     897 service.cc:145] XLA service 0x7c96e800aae0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1733246909.972519     897 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1733246909.972524     897 service.cc:153]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5


[1m   1/1435[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3:32:01[0m 9s/step - loss: 13235.6953 - mae: 88.7106

I0000 00:00:1733246917.155943     897 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1435/1435[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m289s[0m 196ms/step - loss: 10834.0117 - mae: 78.3530 - val_loss: 11363.1406 - val_mae: 83.2217
Epoch 2/20
[1m1435/1435[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m294s[0m 205ms/step - loss: 10902.2041 - mae: 78.7203 - val_loss: 11363.1406 - val_mae: 83.2217
Epoch 3/20
[1m1435/1435[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m294s[0m 205ms/step - loss: 10862.8564 - mae: 78.5528 - val_loss: 11363.1406 - val_mae: 83.2217
Epoch 4/20
[1m1435/1435[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m294s[0m 205ms/step - loss: 10890.0166 - mae: 78.4851 - val_loss: 11363.1406 - val_mae: 83.2217
Epoch 5/20
[1m1435/1435[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m294s[0m 205ms/step - loss: 10864.3398 - mae: 78.5073 - val_loss: 11363.1406 - val_mae: 83.2217
Epoch 6/20
[1m1435/1435[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m294s[0m 205ms/step - loss: 10893.4570 - mae: 78.5412 - val_loss: 11363.1406 - val_mae: 83.2217
Epo

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, models

# Assuming the function `create_predrnn` is already defined as shown previously

# Example: Preparing the data
# Assuming input_output_pairs_per_class has the input-output pairs as described

# Prepare the training data from input-output pairs
X_train = []
y_train = []

for selected_class in selected_classes:
    inputs = input_output_pairs_per_class[selected_class]['inputs']
    outputs = input_output_pairs_per_class[selected_class]['outputs']
    
    X_train.extend(inputs)
    y_train.extend(outputs)

# Convert lists to numpy arrays
X_train = np.array(X_train)  # Shape: (num_samples, input_sequence_length, 64, 64, 3)
y_train = np.array(y_train)  # Shape: (num_samples, output_sequence_length, 64, 64, 3)

# Split data into training and validation sets (80% training, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


# Create the PredRNN model
input_sequence_length = 10  # Number of frames in the input sequence
output_sequence_length = 5  # Number of frames to predict
input_shape = (64, 64, 3)  # Each frame is 64x64 RGB image

model = create_predrnn(input_sequence_length, output_sequence_length, input_shape)

print("Model loaded from" + os.path(predrnn_model.pth))
# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model with the prepared data
history = model.fit(X_train, y_train, 
                    batch_size=8, 
                    epochs=10, 
                    validation_data=(X_val, y_val))

# Calculate MSE (Mean Squared Error)
mse = np.mean((y_val - val_predictions) ** 2)
    
# Calculate SSIM (Structural Similarity Index)
ssim_scores = []
for i in range(len(val_predictions)):
    ssim_score = tf.image.ssim(y_val[i], val_predictions[i], max_val=1.0)
    ssim_scores.append(ssim_score.numpy())
    
avg_ssim = np.mean(ssim_scores)

# Print the evaluation metrics
print(f"Mean Squared Error (MSE): {mse}")
print(f"Average SSIM: {avg_ssim}")

# Save the model
model.save("predrnn_model.h5")




Model loaded from "C:\Users\chabd\Desktop\deep\deep\models\predrnn_model.pth
Train Results: MSE: 0.0076, SSIM: 0.6183
Test Results: MSE: 0.0171, SSIM: 0.4278
