In [1]:
import cv2
import os

In [96]:
import cv2
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.preprocessing import image
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import MultiHeadAttention, Input, Dense, LSTM, LayerNormalization, Dropout, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

## Data Preprocessing

In [8]:
# Extract frames from video file
def extract_frames(video_path, frame_directory):
    os.makedirs(frame_directory, exist_ok=True)

    # Read video
    cap = cv2.VideoCapture(video_path)
    print(f"Processing {video_path}... Total Frames: {int(cap.get(cv2.CAP_PROP_FRAME_COUNT))}")

    # Check if video opened successfully
    if not cap.isOpened():
        print(f"Error: Could not open video {video_path}.")
        return

    frame_count = 0
    while True:
        # Read a frame
        success, frame = cap.read()
        
        # If read was successful, save the frame
        if success:
            frame_filename = os.path.join(frame_directory, f"frame_{frame_count:04d}.jpg")
            cv2.imwrite(frame_filename, frame)
            frame_count += 1
        else:
            # No more frames to be read
            break

    # Release the video capture object
    cap.release()
    print(f"Frames extracted for {video_path}: {frame_count}")

# Adjust this path to where your videos are stored
## video_folder_path = '/Users/lyricli/Documents/Visualization for Machine Learning/final project/Normal'
## video_folder_path = '/Users/lyricli/Documents/Visualization for Machine Learning/final project/Crash'
# video_folder_path = 'data/raw/Normal'
video_folder_path = 'data/raw/Crash'


# Adjust this path to where you want to save the frames
# frames_save_path = 'data/frames/Normal'
frames_save_path = 'data/frames/Crash'

# Get a list of all video files
video_files = [f for f in os.listdir(video_folder_path) if f.endswith('.mp4')]

# Process each video file to save them as frames
for video_file in video_files:
    video_path = os.path.join(video_folder_path, video_file)
    video_name = video_file.split('.')[0]  # Assuming the file name is the video name
    frame_directory = os.path.join(frames_save_path, video_name)
    extract_frames(video_path, frame_directory)

## Feature Extraction (CNN Feature Map Creation for Each Frame)

### Using TensorFlow

In [5]:
# Loads and returns frames as np.array in RGB or Grayscale format
# :param frame_directory: String path to find all frames of one video
# :return np.array(frames): A numpy.array of tensor frame-arrays of pixel values

def load_frames(frame_directory):
    
    frames = []
    
    for frame in sorted(os.listdir(frame_directory)):
        img = cv2.imread(os.path.join(frame_directory, frame))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # load RGB img
        # img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # load Grayscale img
        frames.append(img)
        
    return np.array(frames)

In [6]:
# Create a dataset of video frames for the CNN model
# :param video_folder_path: path for storing "Normal" and "Crash" video frames
# :param video_number: number of videos one desires to create a dataset
# :return np.array(dataset): datasets of videos
# :return np.array(labels): labels corresponding to each of the video in videos

def create_dataset(video_folder_path, video_number):
    
    categories = ["Normal", "Crash"]
    
    dataset = [] # stores loaded video frames, i.e., [[frame1, frame2, ...], [frame1, frame2, ...], ...]
    labels = []  # 0 - Normal, 1 - Crash
    
    for category in categories:
        
        path = os.path.join(video_folder_path, category) # e.g. "data/frames/Normal"
        video_directories = os.listdir(path)[:video_number] # e.g. [000023, ..., 000022] not in order
        
        for video_dir in video_directories:
            full_video_path = os.path.join(path, video_dir) # e.g. "data/frames/Normal/000023"
            frames = load_frames(full_video_path) # see func `load_frames` above
            dataset.append(frames)
            labels.append(1 if category == "Crash" else 0)
    
    # dataset shape: (video_number * 2, frame number 50, frame height 720, frame width 1280, color channel 3)
    # labels shape:  (video_number * 2, )
    return np.array(dataset), np.array(labels)

#### Load dataset

**video_number** = How many videos per ["Normal", "Crash"] folder do you want to extract features from

if video_number = 3, then you will extract features of 3 normal and 3 crash videos

In [18]:
## create_dataset() inputs: video_folder_path, video_number

# Path name for storing the "Normal" and "Crash" video frames
video_folder_path = 'data/frames'

# Number of processed video-frames for each category of "Normal" and "Crash"
video_number = 10


## create_dataset() outputs: Output data and labels

data, labels = create_dataset(video_folder_path, video_number)

####  Load a pre-trained model

**Pre-trained Model?** We used pre-trained model for the baseline performance and for efficiency

**Is it good?** Not sure, since ResNet50 is trained on general images, but let's try, it saves time

**Why ResNet50?**: ResNet50 was pre-trained on general image classification task and it is fast to retrieve

In [19]:
# Load a pre-trained ResNet50 model

model = ResNet50(include_top=False, weights='imagenet')

# from tensorflow.keras.models import Model
# model = Model(inputs=resnet_model.input, outputs=resnet_model.output)

In [20]:
# Feature extraction
# :param: data
# :return: np.array(features)

def extract_features(data):
    features = []  # List to hold all feature vectors
    
    for video_frames in data:
        video_features = []  # List to hold feature vectors for one video
        
        # Process each frame in the video
        for frame in video_frames:
            img_tensor = image.img_to_array(frame)  # Convert frame to a numpy array
            img_tensor = np.expand_dims(img_tensor, axis=0)  # Add batch dimension
            img_tensor = tf.keras.applications.resnet.preprocess_input(img_tensor)  # Preprocess the frame
            
            feature = model.predict(img_tensor)  # Extract features using ResNet50
            video_features.append(feature)
            
        features.append(np.array(video_features))
        
    return np.array(features)

features = extract_features(data)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 476ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 472ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 470ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 469ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 472ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 462ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 465ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 469ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 498ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 465ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 512ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 516ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

In [21]:
print(features.shape) # (video_number * 2, frame number 50, layer depth 1, height 23, width 40, depth 2048)

(20, 50, 1, 23, 40, 2048)


### Train-Test Split

In [27]:
#因为chatgpt说一般放进transformer里的只要这三个dimension就可以了，剩下三个可以不要

# Assuming 'features' is your NumPy array with the shape (20, 50, 1, 23, 40, 2048)
# Apply global average pooling over the spatial dimensions (23, 40)
pooled_features = np.mean(features, axis=(2, 3, 4))  # This will pool over the 1, 23, and 40 dimensions

print(pooled_features.shape)  

(20, 50, 2048)


data分成了train，validation，和test

validation用来tune hyperparameter，test用来最后检测

比例分别为75% for training, 15% for validation, 15% for test (对于这里试验的20个video，就是13, 4,3)

In [28]:
X_train_val, X_test, y_train_val, y_test = train_test_split(
    pooled_features, labels, test_size=0.15, random_state=42, stratify=labels
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.1765, random_state=42, stratify=y_train_val
)

In [57]:
print('number for training videos:',len(y_train))
print('number for validation videos:',len(y_val))
print('number for test videos:',len(y_test))

number for training videos: 13
number for validation videos: 4
number for test videos: 3


### Transformer

没有用transformer,结果出奇的好

In [101]:
feature_dimension = 2048  # Adjust based on your actual feature extraction output
sequence_length = 50  # One feature vector per frame, 50 frames per video

inputs = Input(shape=(sequence_length, feature_dimension))

# Example Transformer encoder layer setup
def transformer_encoder(inputs):
    # This is a simplified version; normally you'd have MultiHeadAttention, etc.
    x = Dense(feature_dimension, activation='relu')(inputs)
    x = LayerNormalization(epsilon=1e-6)(x)
    x = Dropout(0.1)(x)
    x = GlobalAveragePooling1D()(x)  # This will average the features over the time dimension
    return x

x = transformer_encoder(inputs)
outputs = Dense(1, activation='sigmoid')(x)  # Now outputs will have the shape (None, 1)
basic_without_transformer_model = Model(inputs=inputs, outputs=outputs)
basic_without_transformer_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


chatgpt给的代码，用的一个simplified transformer encoder，accuracy只有1/3

In [104]:
def transformer_block(inputs, embed_dim, num_heads, ff_dim, rate=0.1):
    # Multi-head attention
    attention_output = MultiHeadAttention(
        num_heads=num_heads, key_dim=embed_dim, dropout=rate
    )(inputs, inputs)
    
    # Skip connection and layer normalization
    attention_output = Dropout(rate)(attention_output)
    proj_input = LayerNormalization(epsilon=1e-6)(inputs + attention_output)
    
    # Feed-forward part of the transformer
    ffn_output = Dense(ff_dim, activation="relu")(proj_input)
    ffn_output = Dense(embed_dim)(ffn_output)
    
    # Second skip connection and layer normalization
    ffn_output = Dropout(rate)(ffn_output)
    return LayerNormalization(epsilon=1e-6)(proj_input + ffn_output)

# Assuming each feature vector has 2048 dimensions after feature extraction
embed_dim = 2048  # Embedding size for each token
num_heads = 8  # Number of attention heads
ff_dim = 256  # Hidden layer size in feed forward network inside transformer

inputs = Input(shape=(sequence_length, embed_dim))
transformer_block_output = transformer_block(inputs, embed_dim, num_heads, ff_dim)
pooled_output = GlobalAveragePooling1D()(transformer_block_output)
outputs = Dense(1, activation='sigmoid')(pooled_output)

simple_transformer_model = Model(inputs=inputs, outputs=outputs)

simple_transformer_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


# Assuming you have X_train, y_train, X_val, y_val prepared
simple_transformer_model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=32)

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - accuracy: 0.5385 - loss: 0.7794 - val_accuracy: 0.5000 - val_loss: 23.9800
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - accuracy: 0.4615 - loss: 24.5476 - val_accuracy: 0.5000 - val_loss: 2.5420
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - accuracy: 0.5385 - loss: 2.0952 - val_accuracy: 0.5000 - val_loss: 5.8665
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - accuracy: 0.4615 - loss: 6.0123 - val_accuracy: 0.5000 - val_loss: 4.2315
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - accuracy: 0.4615 - loss: 4.3146 - val_accuracy: 0.5000 - val_loss: 0.7576
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - accuracy: 0.4615 - loss: 0.7143 - val_accuracy: 0.5000 - val_loss: 2.6514
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

<keras.src.callbacks.history.History at 0x4214e3710>

#### VIVT

In [None]:
#网上copy的vivt代码，还没开改

import av
import numpy as np
import torch

from transformers import VivitImageProcessor, VivitForVideoClassification
from huggingface_hub import hf_hub_download

np.random.seed(0)


def read_video_pyav(container, indices):
    '''
    Decode the video with PyAV decoder.
    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): List of frame indices to decode.
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    '''
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])


def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    '''
    Sample a given number of frame indices from the video.
    Args:
        clip_len (`int`): Total number of frames to sample.
        frame_sample_rate (`int`): Sample every n-th frame.
        seg_len (`int`): Maximum allowed index of sample's last frame.
    Returns:
        indices (`List[int]`): List of sampled frame indices
    '''
    converted_len = int(clip_len * frame_sample_rate)
    end_idx = np.random.randint(converted_len, seg_len)
    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices


# video clip consists of 300 frames (10 seconds at 30 FPS)
file_path = hf_hub_download(
    repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
)
container = av.open(file_path)

# sample 32 frames
indices = sample_frame_indices(clip_len=32, frame_sample_rate=4, seg_len=container.streams.video[0].frames)
video = read_video_pyav(container=container, indices=indices)

image_processor = VivitImageProcessor.from_pretrained("google/vivit-b-16x2-kinetics400")
model = VivitForVideoClassification.from_pretrained("google/vivit-b-16x2-kinetics400")

inputs = image_processor(list(video), return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# model predicts one of the 400 Kinetics-400 classes
predicted_label = logits.argmax(-1).item()
print(model.config.id2label[predicted_label])

### Validation

这个结果是最那个基础的model，竟然accuracy和各个指标都是1

In [102]:

# Evaluate the model on the test set
loss, accuracy = basic_without_transformer_model.evaluate(X_test, y_test, verbose=0)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

# Generate predictions
predictions = basic_without_transformer_model.predict(X_test)

# If your final layer is a sigmoid, you will need to convert these predictions to binary labels based on a threshold (default is 0.5)
predicted_labels = (predictions > 0.5).astype(int)

# Calculate additional metrics
precision = precision_score(y_test, predicted_labels)
recall = recall_score(y_test, predicted_labels)
f1 = f1_score(y_test, predicted_labels)
roc_auc = roc_auc_score(y_test, predictions)  # Use raw predictions here, not the binary labels

# Print out the metrics
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"AUC-ROC: {roc_auc}")


Test Loss: 0.38606521487236023
Test Accuracy: 1.0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
AUC-ROC: 1.0


In [105]:
# Evaluate the model on the test set
loss, accuracy = simple_transformer_model.evaluate(X_test, y_test, verbose=0)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

# Generate predictions
predictions = simple_transformer_model.predict(X_test)

# If your final layer is a sigmoid, you will need to convert these predictions to binary labels based on a threshold (default is 0.5)
predicted_labels = (predictions > 0.5).astype(int)

# Calculate additional metrics
precision = precision_score(y_test, predicted_labels)
recall = recall_score(y_test, predicted_labels)
f1 = f1_score(y_test, predicted_labels)
roc_auc = roc_auc_score(y_test, predictions)  # Use raw predictions here, not the binary labels

# Print out the metrics
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"AUC-ROC: {roc_auc}")


Test Loss: 0.5862208008766174
Test Accuracy: 0.3333333432674408
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 327ms/step
Precision: 0.3333333333333333
Recall: 1.0
F1 Score: 0.5
AUC-ROC: 1.0


### 废物屯放处

In [93]:
# Process each frame of each video through the ViViT model
num_videos, sequence_length, _  = pooled_features.shape

# Reshape pooled_features for processing by ViViT
flattened_features = tf.reshape(pooled_features, (-1, sequence_length, 1))

# Extract features for each frame using the feature extractor
input_ids = feature_extractor(flattened_features)['pixel_values']

# Pass the input_ids through the ViViT model
outputs = model(input_ids)

# Assuming you want to aggregate the logits for each video
# Reshape the logits to be consistent with the original video format
video_logits = tf.reshape(outputs.logits, (num_videos, sequence_length, -1))

# Now you can further process the logits, for example, by averaging across frames
video_logits_avg = tf.reduce_mean(video_logits, axis=1)

# Define the inputs to your model, which match the shape of the aggregated logits
inputs = Input(shape=(feature_dimension,))  # Adjust shape based on the aggregation method used
x = LSTM(units=256, return_sequences=False)(inputs)
outputs = Dense(1, activation='sigmoid')(x)

# Create your model
model = tf.keras.Model(inputs=inputs, outputs=outputs)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Assuming you have labels prepared
# Fit the model
model.fit(video_logits_avg, labels, validation_split=0.2, epochs=10, batch_size=32)

NameError: name 'feature_extractor' is not defined