# Test stratified k-fold cross validation
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html

In [1]:
import numpy as np
from sklearn.model_selection import StratifiedKFold

X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([0, 0, 1, 1])
skf = StratifiedKFold(n_splits=2)
skf.get_n_splits(X, y)

print(skf)
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")


StratifiedKFold(n_splits=2, random_state=None, shuffle=False)
Fold 0:
  Train: index=[1 3]
  Test:  index=[0 2]
Fold 1:
  Train: index=[0 2]
  Test:  index=[1 3]


## Classify ImageNet classes with ResNet50
https://keras.io/api/applications/#usage-examples-for-image-classification-models

In [2]:
import keras
from keras.applications.resnet50 import ResNet50
from keras.applications.resnet50 import preprocess_input, decode_predictions
import numpy as np

model = ResNet50(weights='imagenet')

img_path = 'elephant.jpg'
img = keras.utils.load_img(img_path, target_size=(224, 224))
x = keras.utils.img_to_array(img)
x
x = np.expand_dims(x, axis=0)
x = preprocess_input(x)

# preds = model.predict(x)
# # decode the results into a list of tuples (class, description, probability)
# # (one such list for each sample in the batch)
# print('Predicted:', decode_predictions(preds, top=3)[0])

## Try predicing from video frames

In [3]:
import cv2
def video_to_frames(video_path, img_size=(64, 64), sequence_length=30):
    cap = cv2.VideoCapture(video_path)
    frames = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        # frame = cv2.resize(frame, img_size)
        img = keras.utils.load_img(frame, target_size=(224, 224))
        x = keras.utils.img_to_array(img)
        x = np.expand_dims(x, axis=0)
        x = preprocess_input(x)
        frames.append(x)
        if len(frames) == sequence_length:
            break
    cap.release()

    if len(frames) < sequence_length:
        return None  # Ignore short videos

    return np.array(frames)

https://keras.io/examples/vision/video_classification/

## Define hyperparameters

In [4]:
IMG_SIZE = 224
BATCH_SIZE = 64
EPOCHS = 10

MAX_SEQ_LENGTH = 20
NUM_FEATURES = 2048

In [15]:
def crop_center_square(frame):
    y, x = frame.shape[0:2]
    min_dim = min(y, x)
    start_x = (x // 2) - (min_dim // 2)
    start_y = (y // 2) - (min_dim // 2)
    return frame[start_y : start_y + min_dim, start_x : start_x + min_dim]


def load_video(path, max_frames=0, resize=(IMG_SIZE, IMG_SIZE)):
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = crop_center_square(frame)
            frame = cv2.resize(frame, resize)
            frame = frame[:, :, [2, 1, 0]]
            frames.append(frame)

            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return np.array(frames)

### Make a CSV with video filenames and classes

In [6]:
import os
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# List files and ignore .DS_Store if on a Mac
def list_files(directory):
    visible_files = []
    for file in os.listdir(directory):
        if not file.startswith('.'):
            visible_files.append(file)

    return visible_files

def catalog_videos(folder_path):
    # classes = os.listdir(folder_path)
    classes = list_files(folder_path)
    videos, labels, encoded_labels, paths = [], [], [], []

    for label, activity in enumerate(classes):
        activity_folder = os.path.join(folder_path, activity)
        # for video_file in os.listdir(activity_folder):
        for video_file in list_files(activity_folder):
            video_path = os.path.join(activity_folder, video_file)
            videos.append(video_file)
            labels.append(activity)
            encoded_labels.append(label)
            paths.append(video_path)

    # Encode labels
    le = LabelEncoder()
    le.fit(labels)
    le.transform(labels)

    return videos, labels, encoded_labels, paths

In [7]:
def build_feature_extractor():
    feature_extractor = keras.applications.InceptionV3(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )
    preprocess_input = keras.applications.inception_v3.preprocess_input

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)

    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="feature_extractor")


feature_extractor = build_feature_extractor()


In [8]:
import pandas as pd

def make_catalog_csv(path, name):
    # Create catalog of dataset details
    catalog = catalog_videos(path)

    # Make dataframe
    df = pd.DataFrame({'video': catalog[0], 'label': catalog[1],
                            'encoded_label': catalog[2], 'path': catalog[3]})
    # Export CSV
    filename = f'{name}.csv'
    df.to_csv(filename, index=False)

    print(f'Saved to {filename}')

    return df


In [9]:
# Make CSV for training/testing sets
train_path = "../../downloads/old_clips/full_res/train"
test_path = "../../downloads/old_clips/full_res/test"

train_df = make_catalog_csv(train_path, "train_df")
test_df = make_catalog_csv(test_path, "test_df")

Saved to train_df.csv
Saved to test_df.csv


In [16]:
test_df

Unnamed: 0,video,label,encoded_label,path
0,7394377634358283409.mp4,EVS Visit,0,../../downloads/old_clips/full_res/test/EVS Vi...
1,7394376363047963793.mp4,EVS Visit,0,../../downloads/old_clips/full_res/test/EVS Vi...
2,7394375134687317137.mp4,EVS Visit,0,../../downloads/old_clips/full_res/test/EVS Vi...
3,7394377110372273297.mp4,EVS Visit,0,../../downloads/old_clips/full_res/test/EVS Vi...
4,7394376070990187665.mp4,EVS Visit,0,../../downloads/old_clips/full_res/test/EVS Vi...
...,...,...,...,...
441,7395538125931728017.mp4,Nurse Visit,10,../../downloads/old_clips/full_res/test/Nurse ...
442,7395888535133539473.mp4,Nurse Visit,10,../../downloads/old_clips/full_res/test/Nurse ...
443,7394438386670685329.mp4,Nurse Visit,10,../../downloads/old_clips/full_res/test/Nurse ...
444,7393435193979489425.mp4,Transfer To Bed,11,../../downloads/old_clips/full_res/test/Transf...


In [10]:
label_processor = keras.layers.StringLookup(
    num_oov_indices=0, vocabulary=np.unique(train_df["label"])
)
print(label_processor.get_vocabulary())


['Asleep-Trying to sleep', 'Doctor Visit', 'EVS Visit', 'Eating', 'Family', 'Lying In Bed', 'Nurse Visit', 'Sitting In Wheelchair', 'Talking on the Phone', 'Therapy', 'Transfer To Bed', 'Watching TV']


In [None]:
def prepare_all_videos(df, root_dir):
    num_samples = len(df)
    video_paths = df["path"].values.tolist()
    labels = df["label"].values
    labels = keras.ops.convert_to_numpy(label_processor(labels[..., None]))

    # `frame_masks` and `frame_features` are what we will feed to our sequence model.
    # `frame_masks` will contain a bunch of booleans denoting if a timestep is
    # masked with padding or not.
    frame_masks = np.zeros(shape=(num_samples, MAX_SEQ_LENGTH), dtype="bool")
    frame_features = np.zeros(
        shape=(num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
    )

    # For each video.
    for idx, path in enumerate(video_paths):
        # Gather all its frames and add a batch dimension.
        # frames = load_video(os.path.join(root_dir, path))
        frames = load_video(path)
        frames = frames[None, ...]

        # Initialize placeholders to store the masks and features of the current video.
        temp_frame_mask = np.zeros(
            shape=(
                1,
                MAX_SEQ_LENGTH,
            ),
            dtype="bool",
        )
        temp_frame_features = np.zeros(
            shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
        )

        # Extract features from the frames of the current video.
        for i, batch in enumerate(frames):
            video_length = batch.shape[0]
            length = min(MAX_SEQ_LENGTH, video_length)
            for j in range(length):
                temp_frame_features[i, j, :] = feature_extractor.predict(
                    batch[None, j, :], verbose=0,
                )
            temp_frame_mask[i, :length] = 1  # 1 = not masked, 0 = masked

        frame_features[idx,] = temp_frame_features.squeeze()
        frame_masks[idx,] = temp_frame_mask.squeeze()

    return (frame_features, frame_masks), labels

# Use paths declared earlier
train_path = "../../downloads/old_clips/full_res/train"
test_path = "../../downloads/old_clips/full_res/test"

train_data, train_labels = prepare_all_videos(train_df, train_path)
test_data, test_labels = prepare_all_videos(test_df, test_path)

print(f"Frame features in train set: {train_data[0].shape}")
print(f"Frame masks in train set: {train_data[1].shape}")



In [160]:
# Utility for our sequence model.
def get_sequence_model():
    class_vocab = label_processor.get_vocabulary()

    frame_features_input = keras.Input((MAX_SEQ_LENGTH, NUM_FEATURES))
    mask_input = keras.Input((MAX_SEQ_LENGTH,), dtype="bool")

    # Refer to the following tutorial to understand the significance of using `mask`:
    # https://keras.io/api/layers/recurrent_layers/gru/
    x = keras.layers.GRU(16, return_sequences=True)(
        frame_features_input, mask=mask_input
    )
    x = keras.layers.GRU(8)(x)
    x = keras.layers.Dropout(0.5)(x)
    # x = keras.layers.Dense(8, activation="softmax")(x)
    x = keras.layers.Dense(1, activation='sigmoid')(x)
    
    output = keras.layers.Dense(len(class_vocab), activation="sigmoid")(x)

    rnn_model = keras.Model([frame_features_input, mask_input], output)
    
    # opt = keras.optimizers.Adam(learning_rate=0.01)
    rnn_model.compile(
        loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
        # loss="categorical_crossentropy", optimizer='adam', metrics=["accuracy"]
    )
    return rnn_model


# Utility for running experiments.
def run_experiment():
    filepath = "/tmp/video_classifier/ckpt.weights.h5"
    checkpoint = keras.callbacks.ModelCheckpoint(
        filepath, save_weights_only=True, save_best_only=True, verbose=1
    )

    seq_model = get_sequence_model()
    history = seq_model.fit(
        [train_data[0], train_data[1]],
        train_labels,
        validation_split=0.3,
        epochs=EPOCHS,
        callbacks=[checkpoint],
        batch_size=BATCH_SIZE
    )

    seq_model.load_weights(filepath)
    _, accuracy = seq_model.evaluate([test_data[0], test_data[1]], test_labels)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")

    return history, seq_model


In [161]:
_, sequence_model = run_experiment()


Epoch 1/10
[1m1/5[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m5s[0m 1s/step - accuracy: 0.0000e+00 - loss: 2.5246
Epoch 1: val_loss improved from inf to 2.50697, saving model to /tmp/video_classifier/ckpt.weights.h5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 65ms/step - accuracy: 0.0050 - loss: 2.5089 - val_accuracy: 0.0000e+00 - val_loss: 2.5070
Epoch 2/10
[1m1/5[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m0s[0m 24ms/step - accuracy: 0.0469 - loss: 2.4935
Epoch 2: val_loss improved from 2.50697 to 2.50667, saving model to /tmp/video_classifier/ckpt.weights.h5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.0341 - loss: 2.4870 - val_accuracy: 0.0000e+00 - val_loss: 2.5067
Epoch 3/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.0700 - loss: 2.4810
Epoch 3: val_loss did not improve from 2.50667
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.07

In [171]:
def prepare_single_video(frames):
    frames = frames[None, ...]
    frame_mask = np.zeros(
        shape=(
            1,
            MAX_SEQ_LENGTH,
        ),
        dtype="bool",
    )
    frame_features = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")

    for i, batch in enumerate(frames):
        video_length = batch.shape[0]
        length = min(MAX_SEQ_LENGTH, video_length)
        for j in range(length):
            frame_features[i, j, :] = feature_extractor.predict(batch[None, j, :])
        frame_mask[i, :length] = 1  # 1 = not masked, 0 = masked

    return frame_features, frame_mask


def sequence_prediction(path):
    class_vocab = label_processor.get_vocabulary()

    frames = load_video(os.path.join("test", path))
    frame_features, frame_mask = prepare_single_video(frames)
    probabilities = sequence_model.predict([frame_features, frame_mask])[0]

    for i in np.argsort(probabilities)[::-1]:
        print(f"  {class_vocab[i]}: {probabilities[i] * 100:5.2f}%")
    return frames


# This utility is for visualization.
# Referenced from:
# https://www.tensorflow.org/hub/tutorials/action_recognition_with_tf_hub
def to_gif(images):
    converted_images = images.astype(np.uint8)
    imageio.mimsave("animation.gif", converted_images, duration=100)
    return Image("animation.gif")


# train_path = f"../../downloads/train"
# test_path = f"../../downloads/test"

# train_data, train_labels = prepare_all_videos(train_df, train_path)
# test_data, test_labels = prepare_all_videos(test_df, test_path)

test_video = np.random.choice(test_df["path"].values.tolist())
print(f"Test video path: {test_video}")
test_frames = sequence_prediction(test_video)
# to_gif(test_frames[:MAX_SEQ_LENGTH])


Test video path: ../../downloads/test/Eating/7395553781087521937.mp4
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
  Doctor Visit: 56.92%
  Sitting In Wheelchair: 53.53%
  Transfer To Bed: 50.68%
  Talking on the Phone: 49.06%
  EVS Visit: 47.47%
  Asleep-Trying to sleep: 47.40%
  Nurse Visit: 46.25%
  Watching TV: 43.84%
  Family: 43.70%
  Lying In Bed: 41.84%
  Eating: 41.83%
  Therapy: 41.81%



(<unknown>:65618): GStreamer-CRITICAL **: 15:40:02.276: gst_element_make_from_uri: assertion 'gst_uri_is_valid (uri)' failed
OpenCV: Couldn't read video stream from file "test/../../downloads/test/Eating/7395553781087521937.mp4"
[ WARN:0@36291.280] global cap.cpp:166 open VIDEOIO(CV_IMAGES): raised OpenCV exception:

OpenCV(4.10.0) /private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_49s_p64pd6/croot/opencv-suite_1722029132360/work/modules/videoio/src/cap_images.cpp:274: error: (-215:Assertion failed) number < max_number in function 'icvExtractPattern'


