In [18]:
import os
import cv2
import numpy as np
from tqdm import tqdm
import random
import tensorflow as tf
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.layers import TimeDistributed, LSTM, Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import Callback, ModelCheckpoint

# Multi-GPU setup
strategy = tf.distribute.MirroredStrategy()
print("Number of GPUs:", strategy.num_replicas_in_sync)

# Paths
VIDEO_ROOT = "/kaggle/input/ucf101/UCF101/UCF-101"
SPLIT_ROOT = "/kaggle/input/ucf101/UCF101TrainTestSplits-RecognitionTask/ucfTrainTestlist"
OUTPUT_DIR = "/kaggle/working/ucf101_frames_subset"

# Hyperparameters
IMG_SIZE = 224
SEQUENCE_LENGTH = 16
NUM_CLASSES = 101
BATCH_SIZE = 8 * strategy.num_replicas_in_sync
EPOCHS = 15

os.makedirs(OUTPUT_DIR, exist_ok=True)

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


Number of GPUs: 2


In [2]:
def load_class_indices(path):
    class_map = {}
    with open(path, "r") as f:
        for line in f:
            idx, name = line.strip().split()
            class_map[name] = int(idx) - 1
    return class_map

class_map = load_class_indices(os.path.join(SPLIT_ROOT, "classInd.txt"))


In [3]:
def read_split_file(path):
    samples = []
    with open(path, "r") as f:
        for line in f:
            video_path = line.strip().split()[0]
            class_name = video_path.split("/")[0]
            label = class_map[class_name]
            samples.append((video_path, label))
    return samples

train_samples = read_split_file(os.path.join(SPLIT_ROOT, "trainlist01.txt"))
test_samples = read_split_file(os.path.join(SPLIT_ROOT, "testlist01.txt"))

# Select a representative subset to fit Kaggle storage (approx 3000 videos)
random.seed(42)
subset_train = random.sample(train_samples, 2500)
subset_test = random.sample(test_samples, 500)

print("Subset train videos:", len(subset_train))
print("Subset test videos:", len(subset_test))

Subset train videos: 2500
Subset test videos: 500


In [4]:
def extract_frames(video_path, sequence_length=SEQUENCE_LENGTH):
    cap = cv2.VideoCapture(video_path)
    frames = []
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    step = max(total_frames // sequence_length, 1)

    for i in range(sequence_length):
        cap.set(cv2.CAP_PROP_POS_FRAMES, i * step)
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.resize(frame, (IMG_SIZE, IMG_SIZE))
        frame = frame / 255.0
        frames.append(frame)

    cap.release()

    # Pad if less than SEQUENCE_LENGTH
    while len(frames) < sequence_length:
        frames.append(frames[-1])

    return np.array(frames, dtype=np.float16)  # memory-efficient

In [5]:
# Pre-extract frames only for subset to fit Kaggle storage
for video_rel_path, _ in tqdm(subset_train + subset_test):
    npy_path = os.path.join(OUTPUT_DIR, video_rel_path.replace(".avi", ".npy"))
    if not os.path.exists(npy_path):  # avoid re-processing
        video_full_path = os.path.join(VIDEO_ROOT, video_rel_path)
        frames = extract_frames(video_full_path)
        os.makedirs(os.path.dirname(npy_path), exist_ok=True)
        np.save(npy_path, frames)

100%|██████████| 3000/3000 [07:09<00:00,  6.99it/s]


In [6]:
def npy_generator(samples, batch_size=BATCH_SIZE):
    while True:
        np.random.shuffle(samples)
        for i in range(0, len(samples), batch_size):
            batch_samples = samples[i:i+batch_size]
            X, y = [], []
            for video_rel_path, label in batch_samples:
                npy_path = os.path.join(OUTPUT_DIR, video_rel_path.replace(".avi", ".npy"))
                frames = np.load(npy_path)
                X.append(frames)
                y.append(label)
            yield np.array(X, dtype=np.float16), tf.one_hot(y, NUM_CLASSES)

In [7]:
train_dataset = tf.data.Dataset.from_generator(
    lambda: npy_generator(subset_train, BATCH_SIZE),
    output_signature=(
        tf.TensorSpec(shape=(None, SEQUENCE_LENGTH, IMG_SIZE, IMG_SIZE, 3), dtype=tf.float16),
        tf.TensorSpec(shape=(None, NUM_CLASSES), dtype=tf.float32)
    )
).prefetch(tf.data.AUTOTUNE)

test_dataset = tf.data.Dataset.from_generator(
    lambda: npy_generator(subset_test, BATCH_SIZE),
    output_signature=(
        tf.TensorSpec(shape=(None, SEQUENCE_LENGTH, IMG_SIZE, IMG_SIZE, 3), dtype=tf.float16),
        tf.TensorSpec(shape=(None, NUM_CLASSES), dtype=tf.float32)
    )
).prefetch(tf.data.AUTOTUNE)

In [8]:
with strategy.scope():
    cnn_base = MobileNetV2(weights="imagenet", include_top=False, input_shape=(IMG_SIZE, IMG_SIZE, 3))
    cnn_base.trainable = False

    model = Sequential([
        TimeDistributed(cnn_base, input_shape=(SEQUENCE_LENGTH, IMG_SIZE, IMG_SIZE, 3)),
        TimeDistributed(GlobalAveragePooling2D()),
        LSTM(128),
        Dense(128, activation="relu"),
        Dense(NUM_CLASSES, activation="softmax")
    ])

    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

model.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5
[1m9406464/9406464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


  super().__init__(**kwargs)


In [12]:
# Early stop at target val accuracy
class StopAtValAccuracy(Callback):
    def __init__(self, target=0.85):
        super().__init__()
        self.target = target

    def on_epoch_end(self, epoch, logs=None):
        val_acc = logs.get("val_accuracy")
        if val_acc and val_acc >= self.target:
            print(f"\nReached {val_acc*100:.2f}% val accuracy. Stopping training.")
            self.model.stop_training = True

early_stop = StopAtValAccuracy(target=0.85)

# Checkpoint
checkpoint = ModelCheckpoint(
    "best_ucf101_cnn_lstm_model.h5",
    monitor="val_accuracy",
    save_best_only=True,
    verbose=1
)

In [13]:
with strategy.scope():
    cnn_base = MobileNetV2(
        weights="imagenet",
        include_top=False,
        input_shape=(IMG_SIZE, IMG_SIZE, 3)
    )
    cnn_base.trainable = False

    model = Sequential([
        TimeDistributed(cnn_base, input_shape=(SEQUENCE_LENGTH, IMG_SIZE, IMG_SIZE, 3)),
        TimeDistributed(GlobalAveragePooling2D()),
        LSTM(128),
        Dense(128, activation="relu"),
        Dense(NUM_CLASSES, activation="softmax")
    ])

    model.compile(
        optimizer=tf.keras.optimizers.Adam(1e-4, clipnorm=1.0),
        loss="categorical_crossentropy",
        metrics=["accuracy"]
    )

model.summary()

In [19]:
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=EPOCHS,
    steps_per_epoch=len(subset_train)//BATCH_SIZE,
    validation_steps=len(subset_test)//BATCH_SIZE,
    callbacks=[early_stop, checkpoint]
)

Epoch 1/15
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 313ms/step - accuracy: 0.1328 - loss: 4.1782
Epoch 1: val_accuracy improved from 0.06048 to 0.12500, saving model to best_ucf101_cnn_lstm_model.h5




[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 392ms/step - accuracy: 0.1330 - loss: 4.1775 - val_accuracy: 0.1250 - val_loss: 4.0917
Epoch 2/15
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 285ms/step - accuracy: 0.2997 - loss: 3.6231
Epoch 2: val_accuracy improved from 0.12500 to 0.21774, saving model to best_ucf101_cnn_lstm_model.h5




[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 347ms/step - accuracy: 0.2998 - loss: 3.6225 - val_accuracy: 0.2177 - val_loss: 3.7588
Epoch 3/15
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 282ms/step - accuracy: 0.3958 - loss: 3.1386
Epoch 3: val_accuracy improved from 0.21774 to 0.25000, saving model to best_ucf101_cnn_lstm_model.h5




[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 343ms/step - accuracy: 0.3961 - loss: 3.1377 - val_accuracy: 0.2500 - val_loss: 3.4966
Epoch 4/15
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 284ms/step - accuracy: 0.5389 - loss: 2.6294
Epoch 4: val_accuracy improved from 0.25000 to 0.30040, saving model to best_ucf101_cnn_lstm_model.h5




[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 346ms/step - accuracy: 0.5390 - loss: 2.6290 - val_accuracy: 0.3004 - val_loss: 3.2143
Epoch 5/15
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 284ms/step - accuracy: 0.6485 - loss: 2.2305
Epoch 5: val_accuracy improved from 0.30040 to 0.33468, saving model to best_ucf101_cnn_lstm_model.h5




[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 347ms/step - accuracy: 0.6485 - loss: 2.2301 - val_accuracy: 0.3347 - val_loss: 2.9904
Epoch 6/15
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 282ms/step - accuracy: 0.7271 - loss: 1.8445
Epoch 6: val_accuracy improved from 0.33468 to 0.35484, saving model to best_ucf101_cnn_lstm_model.h5




[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 343ms/step - accuracy: 0.7271 - loss: 1.8443 - val_accuracy: 0.3548 - val_loss: 2.8496
Epoch 7/15
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 284ms/step - accuracy: 0.7825 - loss: 1.5703
Epoch 7: val_accuracy improved from 0.35484 to 0.38105, saving model to best_ucf101_cnn_lstm_model.h5




[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 345ms/step - accuracy: 0.7825 - loss: 1.5699 - val_accuracy: 0.3810 - val_loss: 2.6621
Epoch 8/15
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 282ms/step - accuracy: 0.8281 - loss: 1.2855
Epoch 8: val_accuracy did not improve from 0.38105
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 343ms/step - accuracy: 0.8281 - loss: 1.2854 - val_accuracy: 0.3810 - val_loss: 2.5938
Epoch 9/15
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 282ms/step - accuracy: 0.8700 - loss: 1.0952
Epoch 9: val_accuracy improved from 0.38105 to 0.40121, saving model to best_ucf101_cnn_lstm_model.h5




[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 343ms/step - accuracy: 0.8701 - loss: 1.0949 - val_accuracy: 0.4012 - val_loss: 2.4747
Epoch 10/15
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 282ms/step - accuracy: 0.9129 - loss: 0.9130
Epoch 10: val_accuracy did not improve from 0.40121
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 342ms/step - accuracy: 0.9128 - loss: 0.9128 - val_accuracy: 0.3931 - val_loss: 2.4197
Epoch 11/15
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 284ms/step - accuracy: 0.9301 - loss: 0.7613
Epoch 11: val_accuracy improved from 0.40121 to 0.42339, saving model to best_ucf101_cnn_lstm_model.h5




[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 344ms/step - accuracy: 0.9302 - loss: 0.7610 - val_accuracy: 0.4234 - val_loss: 2.3620
Epoch 12/15
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 284ms/step - accuracy: 0.9444 - loss: 0.6181
Epoch 12: val_accuracy improved from 0.42339 to 0.43145, saving model to best_ucf101_cnn_lstm_model.h5




[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 345ms/step - accuracy: 0.9444 - loss: 0.6180 - val_accuracy: 0.4315 - val_loss: 2.3106
Epoch 13/15
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 282ms/step - accuracy: 0.9547 - loss: 0.5183
Epoch 13: val_accuracy improved from 0.43145 to 0.44153, saving model to best_ucf101_cnn_lstm_model.h5




[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 344ms/step - accuracy: 0.9547 - loss: 0.5181 - val_accuracy: 0.4415 - val_loss: 2.2867
Epoch 14/15
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 283ms/step - accuracy: 0.9767 - loss: 0.4000
Epoch 14: val_accuracy did not improve from 0.44153
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 343ms/step - accuracy: 0.9767 - loss: 0.4000 - val_accuracy: 0.4355 - val_loss: 2.2608
Epoch 15/15
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 284ms/step - accuracy: 0.9840 - loss: 0.3413
Epoch 15: val_accuracy did not improve from 0.44153
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 343ms/step - accuracy: 0.9840 - loss: 0.3412 - val_accuracy: 0.4415 - val_loss: 2.2378


In [20]:
model.save("ucf101_cnn_lstm_final_model.h5")
print("Final model saved successfully.")



Final model saved successfully.
