In [1]:
import os, glob, math, json, random, time
import numpy as np
import cv2

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import (Input, Conv2D, Dense, Dropout, GlobalAveragePooling2D, BatchNormalization, Activation)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, balanced_accuracy_score

print("TensorFlow:", tf.__version__)
gpus = tf.config.list_physical_devices('GPU')
print("GPUs:", gpus)
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except:
    pass


TensorFlow: 2.10.1
GPUs: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
# %%
class CFG:
    # ===== ĐƯỜNG DẪN =====
    DATA_ROOT = r"E:\Pycharm\Advanced-Reading-on-Computer-Vision\Datasets\UCF50\UCF50"   # <-- đổi path của bạn
    OUTPUT_DIR = "./runs_2d_fusion"
    SEED = 1337

    # ===== SAMPLING (TSN) =====
    NUM_SEGMENTS = 4        # số đoạn TSN
    CLIP_LEN = 4            # khung/đoạn
    NUM_FRAMES = NUM_SEGMENTS * CLIP_LEN

    IMG_SIZE = 224          # dùng backbone pretrain -> 224
    CENTER_CROP = False

    # ===== TRAIN =====
    VAL_RATIO = 0.2
    BATCH_SIZE = 8
    EPOCHS = 10
    LR = 1e-4
    LABEL_SMOOTHING = 0.0
    DROPOUT = 0.2

    # ===== AUGMENT =====
    HFLIP_PROB = 0.5
    BRIGHTNESS_DELTA = 20  # 0..255

    # ===== FUSION =====
    FUSION_MODE = "two_stream_late"  # one of: 'rgb_only' | 'flow_only' | 'two_stream_late' | 'early_fusion'
    LATE_FUSION_ALPHA = 0.5          # weight for RGB logits; (1-alpha) for Flow

    # ===== BACKBONE =====
    BACKBONE = "efficientnetb0"      # 'efficientnetb0' | 'mobilenetv2' | 'resnet50'
    TRAIN_BACKBONE = True            # fine-tune head + backbone

# seed + io
random.seed(CFG.SEED); np.random.seed(CFG.SEED); tf.random.set_seed(CFG.SEED)
os.makedirs(CFG.OUTPUT_DIR, exist_ok=True)


In [3]:
# %%
def tsn_sample_indices(total, num_segments, clip_len):
    if total <= 0:
        return np.array([], dtype=int)
    seg_len = max(total // num_segments, 1)
    idxs = []
    for s in range(num_segments):
        start = s * seg_len
        end = (s + 1) * seg_len
        center = (start + end) // 2
        for i in range(clip_len):
            idxs.append(min(max(0, center + i - clip_len // 2), total - 1))
    return np.array(sorted(set(idxs)), dtype=int)

def _center_crop(img):
    h, w = img.shape[:2]
    side = min(h, w)
    y0 = (h - side) // 2
    x0 = (w - side) // 2
    return img[y0:y0+side, x0:x0+side]

def read_rgb_frames(path, idxs, img_size=224, center_crop=False):
    cap = cv2.VideoCapture(path)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) or 0
    if total <= 0:
        cap.release();
        return None
    want = set(idxs.tolist())
    frames = []
    i = 0
    while True:
        ret, frame = cap.read()
        if not ret: break
        if i in want:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            if center_crop:
                frame = _center_crop(frame)
            frame = cv2.resize(frame, (img_size, img_size), interpolation=cv2.INTER_AREA)
            frames.append(frame)
            if len(frames) == len(idxs): break
        i += 1
    cap.release()
    if len(frames) != len(idxs):
        fill = frames[-1].copy() if frames else np.zeros((img_size,img_size,3), np.uint8)
        while len(frames) < len(idxs):
            frames.append(fill.copy())
    return np.stack(frames, axis=0)  # (T,H,W,3)


In [4]:
# %%
def flow_tvl1(rgb_clip_uint8):
    """
    rgb_clip_uint8: (T,H,W,3) uint8 (khung đã resize)
    Trả về: list ảnh flow dạng 3-kênh (HxWx3) uint8 (dx, dy, magnitude) đã scale 0..255
    """
    T, H, W, _ = rgb_clip_uint8.shape
    gray = [cv2.cvtColor(rgb_clip_uint8[t], cv2.COLOR_RGB2GRAY) for t in range(T)]

    # Check if DualTVL1OpticalFlow is available, otherwise use Farneback
    try:
        of = cv2.optflow.DualTVL1OpticalFlow_create()
        use_tvl1 = True
    except (AttributeError, ImportError):
        # Silently fall back to Farneback optical flow
        use_tvl1 = False

    flow_imgs = []
    for t in range(T-1):
        if use_tvl1:
            f = of.calc(gray[t], gray[t+1], None)  # (H,W,2) float32
        else:
            # Use Farneback optical flow as fallback
            f = cv2.calcOpticalFlowFarneback(gray[t], gray[t+1], None,
                                           0.5, 3, 15, 3, 5, 1.2, 0)

        dx, dy = f[...,0], f[...,1]
        mag = np.sqrt(dx*dx + dy*dy)

        # scale -> 0..255 ổn định
        def to_u8(x):
            x = np.tanh(x/10.0)  # squash outliers
            x = (x - x.min()) / (x.max() - x.min() + 1e-6)
            return (x * 255.0).astype(np.uint8)
        dx_u8 = to_u8(dx)
        dy_u8 = to_u8(dy)
        mag_u8 = to_u8(mag)

        flow_rgb = np.stack([dx_u8, dy_u8, mag_u8], axis=-1)  # (H,W,3)
        flow_imgs.append(flow_rgb)

    # số ảnh flow = T-1, pad 1 ảnh cuối để khớp T
    flow_imgs.append(flow_imgs[-1].copy())
    return np.stack(flow_imgs, axis=0)  # (T,H,W,3)


In [5]:
# %%
def scan_dataset(root):
    classes = sorted([d for d in os.listdir(root) if os.path.isdir(os.path.join(root, d))])
    class_to_idx = {c:i for i,c in enumerate(classes)}
    samples = []
    for c in classes:
        vids = []
        for ext in ("*.avi","*.mp4","*.mkv","*.mov"):
            vids.extend(glob.glob(os.path.join(root, c, ext)))
        for v in vids:
            samples.append({"path": v, "label": class_to_idx[c], "cls": c})
    return samples, classes

samples, classes = scan_dataset(CFG.DATA_ROOT)
num_classes = len(classes)
print(f"Tổng video: {len(samples)} | Số lớp: {num_classes}")

train_idx, val_idx = train_test_split(
    np.arange(len(samples)),
    test_size=CFG.VAL_RATIO,
    random_state=CFG.SEED,
    stratify=[s["label"] for s in samples]
)
train_samples = [samples[i] for i in train_idx]
val_samples   = [samples[i] for i in val_idx]
print(f"Train: {len(train_samples)} | Val: {len(val_samples)}")


Tổng video: 6681 | Số lớp: 50
Train: 5344 | Val: 1337


In [6]:
# %%
def random_horizontal_flip(x, p=0.5):
    if random.random() < p:
        return np.flip(x, axis=2).copy()
    return x

def random_brightness(x, delta=20):
    if delta <= 0: return x
    shift = random.randint(-delta, delta)
    y = np.clip(x.astype(np.int32) + shift, 0, 255).astype(np.uint8)
    return y

def load_clip_paths_and_targets(item):
    path = item["path"]; label = item["label"]
    cap = cv2.VideoCapture(path)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) or 0
    cap.release()
    idxs = tsn_sample_indices(total, CFG.NUM_SEGMENTS, CFG.CLIP_LEN)
    rgb_clip = read_rgb_frames(path, idxs, CFG.IMG_SIZE, CFG.CENTER_CROP)  # (T,H,W,3) uint8
    return rgb_clip, label

def preprocess_sample(item, training=True, mode="rgb_only"):
    rgb_clip, label = load_clip_paths_and_targets(item)
    if rgb_clip is None:
        rgb_clip = np.zeros((CFG.NUM_FRAMES, CFG.IMG_SIZE, CFG.IMG_SIZE, 3), np.uint8)

    if training:
        rgb_clip = random_horizontal_flip(rgb_clip, CFG.HFLIP_PROB)
        rgb_clip = random_brightness(rgb_clip, CFG.BRIGHTNESS_DELTA)

    if mode in ["rgb_only", "two_stream_late"]:
        rgb_f32 = rgb_clip.astype(np.float32) / 255.0  # (T,H,W,3)
    else:
        rgb_f32 = None

    if mode in ["flow_only", "two_stream_late", "early_fusion"]:
        flow_clip = flow_tvl1(rgb_clip)                # (T,H,W,3) uint8
        if training:
            flow_clip = random_horizontal_flip(flow_clip, CFG.HFLIP_PROB)
        flow_f32 = flow_clip.astype(np.float32) / 255.0
    else:
        flow_f32 = None

    if mode == "early_fusion":
        # concat kênh -> (T,H,W,6)
        fused = np.concatenate([rgb_clip, flow_clip], axis=-1).astype(np.float32) / 255.0
        x = fused
    elif mode == "rgb_only":
        x = rgb_f32
    elif mode == "flow_only":
        x = flow_f32
    elif mode == "two_stream_late":
        x = (rgb_f32, flow_f32)
    else:
        raise ValueError("Unsupported mode")

    y = tf.keras.utils.to_categorical(label, num_classes)
    return x, y

def make_dataset(sample_list, batch_size, training=True, mode="rgb_only"):
    def gen():
        for it in sample_list:
            yield preprocess_sample(it, training=training, mode=mode)

    if mode == "two_stream_late":
        out_sig = (
            (
                tf.TensorSpec((CFG.NUM_FRAMES, CFG.IMG_SIZE, CFG.IMG_SIZE, 3), tf.float32),
                tf.TensorSpec((CFG.NUM_FRAMES, CFG.IMG_SIZE, CFG.IMG_SIZE, 3), tf.float32),
            ),
            tf.TensorSpec((num_classes,), tf.float32),
        )
    elif mode == "early_fusion":
        out_sig = (
            tf.TensorSpec((CFG.NUM_FRAMES, CFG.IMG_SIZE, CFG.IMG_SIZE, 6), tf.float32),
            tf.TensorSpec((num_classes,), tf.float32),
        )
    else:
        out_sig = (
            tf.TensorSpec((CFG.NUM_FRAMES, CFG.IMG_SIZE, CFG.IMG_SIZE, 3), tf.float32),
            tf.TensorSpec((num_classes,), tf.float32),
        )

    ds = tf.data.Dataset.from_generator(gen, output_signature=out_sig)
    if training:
        ds = ds.shuffle(256, seed=CFG.SEED, reshuffle_each_iteration=True)
    ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return ds


In [7]:
# %%
def build_backbone(input_shape=(224,224,3), weights='imagenet', trainable=True):
    if CFG.BACKBONE.lower() == "efficientnetb0":
        base = keras.applications.EfficientNetB0(include_top=False, weights=weights, input_shape=input_shape, pooling='avg')
    elif CFG.BACKBONE.lower() == "mobilenetv2":
        base = keras.applications.MobileNetV2(include_top=False, weights=weights, input_shape=input_shape, pooling='avg')
    elif CFG.BACKBONE.lower() == "resnet50":
        base = keras.applications.ResNet50(include_top=False, weights=weights, input_shape=input_shape, pooling='avg')
    else:
        raise ValueError("Unknown backbone")
    base.trainable = trainable
    return base

def build_rgb_stream(num_classes):
    # Input: (T,H,W,3) -> TimeDistributed(backbone) -> logits per frame -> mean over time
    inp = Input(shape=(CFG.NUM_FRAMES, CFG.IMG_SIZE, CFG.IMG_SIZE, 3))
    td = layers.TimeDistributed(build_backbone((CFG.IMG_SIZE, CFG.IMG_SIZE, 3), weights='imagenet', trainable=CFG.TRAIN_BACKBONE))(inp)
    # td: (B,T,feat)
    td = layers.TimeDistributed(Dropout(CFG.DROPOUT))(td)
    logits_t = layers.TimeDistributed(Dense(num_classes))(td)  # (B,T,C)
    logits = layers.Lambda(lambda x: tf.reduce_mean(x, axis=1))(logits_t)  # temporal mean
    out = layers.Activation('softmax')(logits)
    return Model(inp, out, name="RGB_Stream"), Model(inp, logits, name="RGB_Logits")

def build_flow_stream(num_classes):
    inp = Input(shape=(CFG.NUM_FRAMES, CFG.IMG_SIZE, CFG.IMG_SIZE, 3))
    td = layers.TimeDistributed(build_backbone((CFG.IMG_SIZE, CFG.IMG_SIZE, 3), weights=None, trainable=CFG.TRAIN_BACKBONE))(inp)
    td = layers.TimeDistributed(Dropout(CFG.DROPOUT))(td)
    logits_t = layers.TimeDistributed(Dense(num_classes))(td)
    logits = layers.Lambda(lambda x: tf.reduce_mean(x, axis=1))(logits_t)
    out = layers.Activation('softmax')(logits)
    return Model(inp, out, name="Flow_Stream"), Model(inp, logits, name="Flow_Logits")

def build_early_fusion(num_classes):
    """
    Input (T,H,W,6). Stem 1x1 Conv (->3 kênh) rồi backbone.
    """
    inp = Input(shape=(CFG.NUM_FRAMES, CFG.IMG_SIZE, CFG.IMG_SIZE, 6))
    # stem đưa về 3 kênh từng frame
    stem = layers.TimeDistributed(Conv2D(3, kernel_size=1, padding='same', use_bias=False))(inp)
    stem = layers.TimeDistributed(BatchNormalization())(stem)
    stem = layers.TimeDistributed(Activation('relu'))(stem)

    td = layers.TimeDistributed(build_backbone((CFG.IMG_SIZE, CFG.IMG_SIZE, 3), weights='imagenet', trainable=CFG.TRAIN_BACKBONE))(stem)
    td = layers.TimeDistributed(Dropout(CFG.DROPOUT))(td)
    logits_t = layers.TimeDistributed(Dense(num_classes))(td)
    logits = layers.Lambda(lambda x: tf.reduce_mean(x, axis=1))(logits_t)
    out = layers.Activation('softmax')(logits)
    return Model(inp, out, name="EarlyFusion_Stream")


In [8]:
# %%
def build_model(num_classes, mode):
    if mode == "rgb_only":
        model, _ = build_rgb_stream(num_classes)
        return model
    if mode == "flow_only":
        model, _ = build_flow_stream(num_classes)
        return model
    if mode == "two_stream_late":
        # build logits models
        _, rgb_logits = build_rgb_stream(num_classes)
        _, flow_logits = build_flow_stream(num_classes)
        rgb_inp  = rgb_logits.input
        flow_inp = flow_logits.input
        logits = CFG.LATE_FUSION_ALPHA * rgb_logits.output + (1.0 - CFG.LATE_FUSION_ALPHA) * flow_logits.output
        out = layers.Activation('softmax')(logits)
        return Model([rgb_inp, flow_inp], out, name="TwoStream_LateFusion")
    if mode == "early_fusion":
        return build_early_fusion(num_classes)
    raise ValueError("Unknown mode")

model = build_model(num_classes, CFG.FUSION_MODE)
model.compile(
    optimizer=Adam(learning_rate=CFG.LR),
    loss=CategoricalCrossentropy(label_smoothing=CFG.LABEL_SMOOTHING),
    metrics=['accuracy']
)
model.summary(line_length=120, expand_nested=True)


Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
Model: "TwoStream_LateFusion"
________________________________________________________________________________________________________________________
 Layer (type)                          Output Shape               Param #       Connected to                            
 input_1 (InputLayer)                  [(None, 16, 224, 224, 3)]  0             []                                      
                                                                                                                        
 input_3 (InputLayer)                  [(None, 16, 224, 224, 3)]  0             []                                      
                                                                                                                        
 time_distributed (TimeDistributed)    (None, 16, 1280)           4049571       ['input_1[0][0]']                       
                           

In [9]:
# %%
train_ds = make_dataset(train_samples, CFG.BATCH_SIZE, training=True, mode=CFG.FUSION_MODE)
val_ds   = make_dataset(val_samples,   CFG.BATCH_SIZE, training=False, mode=CFG.FUSION_MODE)

for xb, yb in train_ds.take(1):
    if CFG.FUSION_MODE == "two_stream_late":
        xrgb, xflow = xb
        print("RGB batch:", xrgb.shape, "Flow batch:", xflow.shape, "Labels:", yb.shape)
    else:
        print("X batch:", xb.shape, "Labels:", yb.shape)


UnknownError: {{function_node __wrapped__IteratorGetNext_output_types_3_device_/job:localhost/replica:0/task:0/device:CPU:0}} AttributeError: module 'cv2' has no attribute 'optflow'
Traceback (most recent call last):

  File "C:\Users\buian\AppData\Roaming\Python\Python310\site-packages\tensorflow\python\ops\script_ops.py", line 271, in __call__
    ret = func(*args)

  File "C:\Users\buian\AppData\Roaming\Python\Python310\site-packages\tensorflow\python\autograph\impl\api.py", line 642, in wrapper
    return func(*args, **kwargs)

  File "C:\Users\buian\AppData\Roaming\Python\Python310\site-packages\tensorflow\python\data\ops\dataset_ops.py", line 1035, in generator_py_func
    values = next(generator_state.get_iterator(iterator_id))

  File "C:\Users\buian\AppData\Local\Temp\ipykernel_17180\1718138285.py", line 63, in gen
    yield preprocess_sample(it, training=training, mode=mode)

  File "C:\Users\buian\AppData\Local\Temp\ipykernel_17180\1718138285.py", line 37, in preprocess_sample
    flow_clip = flow_tvl1(rgb_clip)                # (T,H,W,3) uint8

  File "C:\Users\buian\AppData\Local\Temp\ipykernel_17180\1326869266.py", line 9, in flow_tvl1
    of = cv2.optflow.DualTVL1OpticalFlow_create()

AttributeError: module 'cv2' has no attribute 'optflow'


	 [[{{node PyFunc}}]] [Op:IteratorGetNext]

In [None]:
# %%
ckpt_path = os.path.join(CFG.OUTPUT_DIR, f"{model.name}_best.keras")
callbacks = [
    keras.callbacks.ModelCheckpoint(ckpt_path, monitor="val_accuracy", mode="max", save_best_only=True, verbose=1),
    keras.callbacks.EarlyStopping(monitor="val_accuracy", mode="max", patience=5, restore_best_weights=True, verbose=1),
    keras.callbacks.ReduceLROnPlateau(monitor="val_loss", mode="min", factor=0.5, patience=2, verbose=1),
]

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=CFG.EPOCHS,
    callbacks=callbacks,
    verbose=1
)

with open(os.path.join(CFG.OUTPUT_DIR, f"{model.name}_history.json"), "w") as f:
    json.dump(history.history, f, indent=2)
print("Best ckpt:", ckpt_path)


In [None]:
# %%
# Gom val vào RAM để predict một lượt
val_X, val_y = [], []
for it in val_samples:
    x, y = preprocess_sample(it, training=False, mode=CFG.FUSION_MODE)
    val_X.append(x); val_y.append(np.argmax(y))

# tổ chức đầu vào theo mode
if CFG.FUSION_MODE == "two_stream_late":
    X_rgb = np.stack([xx[0] for xx in val_X], axis=0)
    X_flow= np.stack([xx[1] for xx in val_X], axis=0)
    inputs = [X_rgb, X_flow]
elif CFG.FUSION_MODE == "early_fusion":
    inputs = np.stack(val_X, axis=0)   # (N,T,H,W,6)
else:
    inputs = np.stack(val_X, axis=0)   # (N,T,H,W,3)

probs = model.predict(inputs, batch_size=CFG.BATCH_SIZE, verbose=1)
preds = np.argmax(probs, axis=1)
val_y = np.array(val_y)

acc = (preds == val_y).mean()
bal_acc = balanced_accuracy_score(val_y, preds)
print(f"[{model.name}] Val Accuracy: {acc:.4f} | Balanced Acc: {bal_acc:.4f}\n")

print("Classification Report:")
print(classification_report(val_y, preds, target_names=classes, digits=4))

cm = confusion_matrix(val_y, preds)
print("Confusion Matrix (top-left 10x10):")
size = min(10, cm.shape[0])
print(cm[:size,:size])


In [None]:
# %%
import matplotlib.pyplot as plt
import itertools

def plot_history(h):
    plt.figure()
    plt.plot(h['loss'], label='train_loss'); plt.plot(h['val_loss'], label='val_loss')
    plt.title('Loss'); plt.legend(); plt.show()

    plt.figure()
    plt.plot(h['accuracy'], label='train_acc'); plt.plot(h['val_accuracy'], label='val_acc')
    plt.title('Accuracy'); plt.legend(); plt.show()

def plot_cm(cm, classes, normalize=False, title='Confusion matrix'):
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1, keepdims=True)
    plt.figure(figsize=(8,6)); plt.imshow(cm, interpolation='nearest'); plt.title(title); plt.colorbar()
    ticks = np.arange(len(classes)); plt.xticks(ticks, classes, rotation=90); plt.yticks(ticks, classes)
    fmt = '.2f' if normalize else 'd'; thr = cm.max()/2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), ha='center', va='center', color="white" if cm[i, j] > thr else "black")
    plt.ylabel('True'); plt.xlabel('Pred'); plt.tight_layout()

plot_history(history.history)
plot_cm(cm, classes, False, f'CM - {model.name}')
plot_cm(cm, classes, True,  f'CM (norm) - {model.name}')
