In [278]:
import tensorflow as tf
from tensorflow.keras import layers, models
# Execute tf.function-wrapped code eagerly (op by op) instead of as traced graphs.
# NOTE(review): eager execution usually slows training noticeably; presumably
# enabled for debugging — confirm it is still required.
tf.config.run_functions_eagerly(True)
from tensorflow.keras.optimizers import Adam
# List the GPUs TensorFlow can see so the output documents the hardware used.
print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [279]:
def augment_video(video):
    """Randomly perturb brightness, contrast and horizontal orientation of one clip.

    A single brightness offset, contrast factor and flip decision are drawn per
    call and applied to every frame, so the clip stays temporally consistent.

    Args:
        video: Rank-4 tensor or array. The code reduces over axes 1-3 for the
            contrast mean and mirrors axis 2, i.e. it assumes a
            (frames, height, width, channels) layout — TODO confirm with callers.
            Values are expected in [0, 1] because results are clipped to that range.

    Returns:
        A float32 tensor with the same shape as `video`, clipped to [0, 1].
    """
    video = tf.cast(video, tf.float32)

    # Brightness: add one scalar offset to the whole clip.
    delta = tf.random.uniform([], -0.2, 0.2, dtype=tf.float32)
    video = tf.clip_by_value(video + delta, 0.0, 1.0)

    # Contrast: scale each frame's deviation from its own mean by one shared factor.
    factor = tf.random.uniform([], 0.8, 1.2, dtype=tf.float32)
    mean = tf.reduce_mean(video, axis=[1, 2, 3], keepdims=True)
    video = tf.clip_by_value((video - mean) * factor + mean, 0.0, 1.0)

    # Horizontal flip with probability 0.5. tf.cond/tf.reverse are used instead of
    # a Python `if` and a negative-stride slice so the function behaves the same
    # in eager mode and when traced into a graph (tf.function, tf.data, map_fn).
    should_flip = tf.random.uniform([], dtype=tf.float32) > 0.5
    augmented = tf.cond(
        should_flip,
        lambda: tf.reverse(video, axis=[2]),
        lambda: video,
    )

    return augmented

In [280]:
import numpy as np
import cv2
from tensorflow.keras.utils import Sequence
import tensorflow as tf

# class VideoDataGenerator(Sequence):
#     def __init__(self, video_paths, labels, batch_size=4, dim=(100, 100), n_frames=64, n_channels=3, n_classes=85, shuffle=True, augment=False):
#         self.video_paths = video_paths
#         self.labels = labels
#         self.batch_size = batch_size
#         self.dim = dim
#         self.n_frames = n_frames
#         self.n_channels = n_channels
#         self.n_classes = n_classes
#         self.shuffle = shuffle
#         self.indexes = np.arange(len(self.video_paths))
#         self.augment = augment
#         self.on_epoch_end()

#     def __len__(self):
#         return int(np.ceil(len(self.video_paths) / self.batch_size))

#     def __getitem__(self, index):
#         batch_indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]

#         batch_video_paths = [self.video_paths[k] for k in batch_indexes]
#         batch_labels = [self.labels[k] for k in batch_indexes]

#         X, y = self.__data_generation(batch_video_paths, batch_labels)
#         if self.augment:
#             X = tf.map_fn(augment_video, X, dtype=tf.float32)
#         if X.shape[0] == 0 or y.shape[0] == 0:
#             return self.__getitem__((index + 1) % len(self))

#         return X, y

#     def __data_generation(self, batch_video_paths, batch_labels):
#         X = np.empty((self.batch_size, self.n_frames, *self.dim, self.n_channels))
#         y = np.empty((self.batch_size), dtype=int)
        
#         for i, video_path in enumerate(batch_video_paths):
#             video = self.load_video(video_path)
#             X[i,] = video
#             y[i] = batch_labels[i]
        
#         return X, tf.keras.utils.to_categorical(y, num_classes=self.n_classes)

#     # def load_video(self, path):
#     #     cap = cv2.VideoCapture(path)
#     #     frames = []
#     #     while cap.isOpened() and len(frames) < self.n_frames:
#     #         ret, frame = cap.read()
#     #         if not ret:
#     #             break
#     #         frame = cv2.resize(frame, self.dim)
#     #         frame = frame / 255.0
#     #         frames.append(frame)
        
#     #     cap.release()
        
#     #     if len(frames) < self.n_frames:
#     #         frames.extend([np.zeros((*self.dim, self.n_channels))] * (self.n_frames - len(frames)))
            
#     #     return np.array(frames)
#     def load_video(self, path):
#         cap = cv2.VideoCapture(path)
#         frames = []
#         while cap.isOpened() and len(frames) < self.n_frames:
#             ret, frame = cap.read()
#             if not ret:
#                 break
#             frame = cv2.resize(frame, self.dim)
#             frame = frame.astype(np.float32) / 255.0  # Convert to float32 and normalize
#             frames.append(frame)
        
#         cap.release()
        
#         if len(frames) < self.n_frames:
#             frames.extend([np.zeros((*self.dim, self.n_channels), dtype=np.float32)] * (self.n_frames - len(frames)))
            
#         return np.array(frames)

#     def on_epoch_end(self):
#         if self.shuffle:
#             np.random.shuffle(self.indexes)

def video_data_generator(video_paths, labels, batch_size=32, dim=(128, 128), n_frames=32, n_channels=3, n_classes=85, shuffle=True, augment=False):
    """Yield `(videos, one_hot_labels)` batches forever.

    Each pass over the data optionally reshuffles the sample order. Every video
    contributes its first `n_frames` decoded frames, resized and scaled to
    [0, 1]; clips that are shorter (or cannot be read) are zero-padded.

    Args:
        video_paths: Sequence of video file paths readable by OpenCV.
        labels: Sequence of integer class ids, aligned with `video_paths`.
        batch_size: Maximum number of videos per batch; the last batch of a
            pass may be smaller.
        dim: Target frame size handed to `cv2.resize`, which interprets it as
            (width, height). Frames therefore have shape (dim[1], dim[0], C).
        n_frames: Number of frames per clip.
        n_channels: Channel count used for zero-padding frames. Decoded frames
            are not converted, so this should match what OpenCV returns
            (presumably 3 — verify for grayscale sources).
        n_classes: Width of the one-hot label encoding.
        shuffle: Draw a new random sample order at the start of every pass.
        augment: Apply `augment_video` to every clip in the batch.

    Yields:
        Tuple `(X, y)`: `X` holds the clips with shape
        (batch, n_frames, dim[1], dim[0], n_channels) as a float32 NumPy array
        (a TensorFlow tensor when `augment` is true); `y` is the one-hot label
        matrix of shape (batch, n_classes).

    Raises:
        ValueError: On first iteration, if there are no videos, if
            `video_paths` and `labels` differ in length, or if `batch_size`
            is not positive. Without these checks the endless loop below would
            spin forever without yielding anything.
    """
    num_samples = len(video_paths)
    if num_samples == 0:
        raise ValueError("video_paths is empty; nothing to generate batches from")
    if len(labels) != num_samples:
        raise ValueError(
            f"video_paths and labels differ in length: {num_samples} vs {len(labels)}"
        )
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # cv2.resize takes (width, height) but returns arrays shaped (height, width, C),
    # so padding frames must use the swapped order to stay stackable for
    # non-square `dim`.
    padding_frame_shape = (dim[1], dim[0], n_channels)

    def load_video(path):
        """Decode up to `n_frames` frames from `path`, zero-padding short clips."""
        cap = cv2.VideoCapture(path)
        frames = []
        try:
            while cap.isOpened() and len(frames) < n_frames:
                ret, frame = cap.read()
                if not ret:
                    break
                frame = cv2.resize(frame, dim)
                frames.append(frame.astype(np.float32) / 255.0)  # scale to [0, 1]
        finally:
            # Always free the decoder handle, even if resizing raised.
            cap.release()

        missing = n_frames - len(frames)
        if missing > 0:
            frames.extend([np.zeros(padding_frame_shape, dtype=np.float32)] * missing)

        return np.array(frames)

    while True:
        if shuffle:
            indices = np.random.permutation(num_samples)
        else:
            indices = np.arange(num_samples)

        for start in range(0, num_samples, batch_size):
            batch_indices = indices[start:start + batch_size]

            X = np.array([load_video(video_paths[i]) for i in batch_indices])
            y = np.array([labels[i] for i in batch_indices])

            if augment:
                # `fn_output_signature` replaces the deprecated `dtype` argument.
                X = tf.map_fn(augment_video, X, fn_output_signature=tf.float32)

            yield X, tf.keras.utils.to_categorical(y, num_classes=n_classes)


In [281]:
import os
import glob

def get_video_paths_and_labels(base_dir):
    """Collect `.avi` files from a dataset laid out as one sub-directory per class.

    Class sub-directories are visited in sorted order and numbered 0, 1, 2, ...
    in that order. A sub-directory without any `.avi` file still receives an
    index. Videos inside each class are also listed in sorted order so the
    result does not depend on the platform's directory enumeration order.

    Args:
        base_dir: Path of the directory that contains the class sub-directories.

    Returns:
        Tuple `(video_paths, labels, label_map)`:
            video_paths: list of paths to the `.avi` files found.
            labels: list of integer class ids, aligned with `video_paths`.
            label_map: dict mapping class directory name to its integer id.
        If `base_dir` is not an existing directory an error is printed and
        `([], [], {})` is returned.
    """
    video_paths = []
    labels = []
    label_map = {}

    # isdir (rather than exists) also rejects a plain file, which would
    # otherwise make os.listdir raise below.
    if not os.path.isdir(base_dir):
        print(f"Error: Directory {base_dir} does not exist!")
        return [], [], {}

    for label_name in sorted(os.listdir(base_dir)):
        label_dir = os.path.join(base_dir, label_name)

        if not os.path.isdir(label_dir):
            print(f"Warning: {label_dir} is not a directory")
            continue

        # The next free index equals the number of classes registered so far.
        label_index = len(label_map)
        label_map[label_name] = label_index

        for video_name in sorted(os.listdir(label_dir)):
            if video_name.endswith('.avi'):
                video_paths.append(os.path.join(label_dir, video_name))
                labels.append(label_index)

    return video_paths, labels, label_map

In [282]:
# Raw strings keep every backslash literal. The previous literals relied on
# '\s' and '\D' being unrecognised escape sequences, which newer Python
# versions warn about; the resulting path values are unchanged.
train_dir = r'D:\studystuff\DNNProjects\datasets\UCF-101\train'
validation_dir = r'D:\studystuff\DNNProjects\datasets\UCF-101\validation'

train_video_paths, train_labels, train_label_map = get_video_paths_and_labels(train_dir)
val_video_paths, val_labels, val_label_map = get_video_paths_and_labels(validation_dir)

# Class ids are assigned independently for each directory, so the two splits
# only share a label space when they contain the same class folders.
if train_label_map != val_label_map:
    print("Warning: training and validation label maps differ; class indices may not line up.")

print(f"Training classes: {train_label_map}")
print(f"Validation classes: {val_label_map}")

Training classes: {'ApplyEyeMakeup': 0, 'ApplyLipstick': 1, 'Archery': 2, 'BabyCrawling': 3, 'BalanceBeam': 4, 'BandMarching': 5, 'BaseballPitch': 6, 'Basketball': 7, 'BasketballDunk': 8, 'BenchPress': 9, 'Biking': 10, 'Billiards': 11, 'BlowDryHair': 12, 'BlowingCandles': 13, 'BodyWeightSquats': 14, 'Bowling': 15, 'BoxingPunchingBag': 16, 'BoxingSpeedBag': 17, 'BreastStroke': 18, 'BrushingTeeth': 19, 'CleanAndJerk': 20, 'CliffDiving': 21, 'CricketBowling': 22, 'CricketShot': 23, 'CuttingInKitchen': 24, 'Diving': 25, 'Fencing': 26, 'FieldHockeyPenalty': 27, 'FloorGymnastics': 28, 'FrontCrawl': 29, 'GolfSwing': 30, 'Haircut': 31, 'HammerThrow': 32, 'HandstandPushups': 33, 'HandstandWalking': 34, 'HighJump': 35, 'HorseRace': 36, 'HorseRiding': 37, 'IceDancing': 38, 'JavelinThrow': 39, 'JugglingBalls': 40, 'JumpRope': 41, 'Kayaking': 42, 'Knitting': 43, 'Lunges': 44, 'MilitaryParade': 45, 'Mixing': 46, 'Nunchucks': 47, 'ParallelBars': 48, 'PizzaTossing': 49, 'PlayingDaf': 50, 'PlayingDhol'

In [283]:
# Sanity check: how many training clips were discovered on disk.
print(f"Number of training videos: {len(train_video_paths)}")

Number of training videos: 8810


In [284]:
# The one-hot width must cover every label id that can occur. Ids are handed
# out per class directory, so count the directories rather than the distinct
# labels actually seen: a class folder without clips would otherwise leave the
# encoding too narrow for the highest id.
n_classes = len(train_label_map)
n_classes

85

In [285]:
# Settings shared by the training and validation generators; only the data
# source and the augmentation flag differ between the two.
# NOTE(review): VideoDataGenerator appears in this notebook only as
# commented-out code — confirm the class is defined in the kernel before
# running this cell.
generator_kwargs = dict(
    batch_size=1,
    dim=(64, 64),
    n_frames=64,
    n_channels=3,
    n_classes=n_classes,
    shuffle=False,
)

# Training clips are augmented; validation clips are left untouched.
train_data_gen = VideoDataGenerator(
    video_paths=train_video_paths,
    labels=train_labels,
    augment=True,
    **generator_kwargs,
)

val_data_gen = VideoDataGenerator(
    video_paths=val_video_paths,
    labels=val_labels,
    **generator_kwargs,
)

In [286]:
import tensorflow as tf
from tensorflow.keras import layers, models

def build_i3d_inception(input_shape=(64, 64, 64, 3), n_classes=85):
    """Assemble a compact Inception-style 3D CNN classifier.

    Layout: a strided convolutional stem with pooling, two Inception blocks,
    a 2x2x2 max-pool, two wider Inception blocks, another 2x2x2 max-pool,
    global average pooling and a two-layer dense head ending in a softmax.

    Args:
        input_shape: Shape of one input sample without the batch axis.
        n_classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled `keras.Model` mapping the input to class probabilities.
    """
    def conv_bn(tensor, n_filters, kernel_size, strides=(1, 1, 1)):
        # ReLU convolution with 'same' padding, then batch normalisation.
        tensor = layers.Conv3D(
            n_filters, kernel_size, strides=strides, padding='same', activation='relu'
        )(tensor)
        return layers.BatchNormalization()(tensor)

    def inception_block(input_tensor, filters):
        # Three parallel branches (1x1x1 conv, 3x3x3 conv, pooled 1x1x1 conv)
        # concatenated along the channel axis.
        pointwise = conv_bn(input_tensor, filters[0], (1, 1, 1))
        volumetric = conv_bn(input_tensor, filters[1], (3, 3, 3))
        pooled = layers.MaxPooling3D((1, 3, 3), strides=(1, 1, 1), padding='same')(input_tensor)
        pooled = conv_bn(pooled, filters[2], (1, 1, 1))
        return layers.Concatenate(axis=-1)([pointwise, volumetric, pooled])

    inputs = layers.Input(shape=input_shape)

    # Stem: strides of (1, 2, 2) reduce axes 2 and 3 only.
    net = conv_bn(inputs, 32, (3, 7, 7), strides=(1, 2, 2))
    net = layers.MaxPooling3D((1, 3, 3), strides=(1, 2, 2), padding='same')(net)

    # Two stages of two Inception blocks, each stage closed by a 2x2x2 max-pool.
    stages = (
        ([32, 64, 32], [64, 128, 64]),
        ([128, 256, 128], [256, 512, 256]),
    )
    for stage in stages:
        for branch_filters in stage:
            net = inception_block(net, branch_filters)
        net = layers.MaxPooling3D((2, 2, 2), strides=(2, 2, 2), padding='same')(net)

    # Classification head.
    net = layers.GlobalAveragePooling3D()(net)
    for units in (512, 256):
        net = layers.Dense(units, activation='relu')(net)
        net = layers.BatchNormalization()(net)
        net = layers.Dropout(0.5)(net)
    outputs = layers.Dense(n_classes, activation='softmax')(net)

    return models.Model(inputs, outputs)

def build_simplified_i3d(input_shape=(64, 64, 64, 3), n_classes=85):
    """Assemble a plain stacked-Conv3D classifier (no Inception branches).

    Layout: a strided convolutional stem with pooling, two conv/batch-norm
    stages each followed by a 2x2x2 max-pool, one final conv/batch-norm stage,
    global average pooling, and a small dense head ending in a softmax.

    Args:
        input_shape: Shape of one input sample without the batch axis.
        n_classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled `keras.Model` mapping the input to class probabilities.
    """
    def conv_bn(tensor, n_filters, kernel_size, strides=(1, 1, 1)):
        # ReLU convolution with 'same' padding, then batch normalisation.
        tensor = layers.Conv3D(
            n_filters, kernel_size, strides=strides, padding='same', activation='relu'
        )(tensor)
        return layers.BatchNormalization()(tensor)

    inputs = layers.Input(shape=input_shape)

    # Stem: strides of (1, 2, 2) reduce axes 2 and 3 only.
    net = conv_bn(inputs, 32, (3, 7, 7), strides=(1, 2, 2))
    net = layers.MaxPooling3D((1, 3, 3), strides=(1, 2, 2), padding='same')(net)

    # Two conv stages, each halving all three feature-map axes afterwards.
    for n_filters in (64, 128):
        net = conv_bn(net, n_filters, (3, 3, 3))
        net = layers.MaxPooling3D((2, 2, 2), strides=(2, 2, 2), padding='same')(net)

    # Last conv stage feeds global pooling directly.
    net = conv_bn(net, 256, (3, 3, 3))
    net = layers.GlobalAveragePooling3D()(net)

    # Classification head.
    net = layers.Dense(128, activation='relu')(net)
    net = layers.Dropout(0.5)(net)
    outputs = layers.Dense(n_classes, activation='softmax')(net)

    return models.Model(inputs, outputs)


In [287]:
# Instantiate the Inception-style network with one output unit per class.
i3d_model = build_i3d_inception(
    input_shape=(64, 64, 64, 3),
    n_classes=n_classes,
)

In [288]:
# Categorical cross-entropy matches the one-hot labels and softmax output.
i3d_model.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
i3d_model.summary()

Model: "model_16"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_24 (InputLayer)          [(None, 64, 64, 64,  0           []                               
                                 3)]                                                              
                                                                                                  
 conv3d_278 (Conv3D)            (None, 64, 32, 32,   14144       ['input_24[0][0]']               
                                32)                                                               
                                                                                                  
 batch_normalization_210 (Batch  (None, 64, 32, 32,   128        ['conv3d_278[0][0]']             
 Normalization)                 32)                                                        

In [289]:
# Sanity check: the number of distinct training labels should equal the width
# of the model's output layer.
print(f"Number of unique labels: {len(np.unique(train_labels))}")
print(f"Number of classes in model: {n_classes}")

Number of unique labels: 85
Number of classes in model: 85


In [290]:
# Step counts are measured in batches, so both are taken from the generators'
# lengths. The previous validation_steps used the number of validation videos,
# which only equals the number of batches when batch_size is 1.
history = i3d_model.fit(
    train_data_gen,
    epochs=10,
    validation_data=val_data_gen,
    steps_per_epoch=len(train_data_gen),
    validation_steps=len(val_data_gen),
    verbose=1
)

Epoch 1/10
 823/8810 [=>............................] - ETA: 13:30 - loss: 4.4417 - accuracy: 0.0049

InvalidArgumentError: {{function_node __wrapped__Slice_device_/job:localhost/replica:0/task:0/device:GPU:0}} Expected begin[0] in [0, 2], but got -1 [Op:Slice]

In [59]:
# Evaluate the trained model over one full pass of the validation generator.
# NOTE(review): this cell's execution count (59) is lower than the training
# cell's (290), so the recorded output presumably comes from an earlier
# session — re-run after training to confirm the reported accuracy.
print("Calculating final validation accuracy...")
val_loss, val_accuracy = i3d_model.evaluate(val_data_gen, steps=len(val_data_gen), verbose=1)
print(f"Final validation accuracy: {val_accuracy:.4f}")

Calculating final validation accuracy...
Final validation accuracy: 0.0357
