In [1]:
import cv2
import os
import json
import ast

import torch
from torch.utils.data import Dataset

import torch
import torch.nn as nn
import torchvision.models as models
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import models


In [2]:
!ls ../data

[34m__pycache__[m[m        vig_annotation.pkl vig_dl.lst         [34mvig_train[m[m
utils.py           vig_class_map.pkl  [34mvig_test[m[m


In [3]:
!ls

HosseinzadehKhaligh_Ehsan_finalproject_v2.ipynb
VIG_EK.ipynb
class_annotation.txt
vig_annotation.txt
vig_dl.lst


In [4]:
!pip install -U scikit-learn
!pip install opencv-python-headless torch torchvision



#### Get list of files for test & train:

In [5]:
# Directory paths for test and train videos
test_video_dir = '../data/vig_test/'
train_video_dir = '../data/vig_train/'

# Function to read and process a video
def read_video(video_path):
    cap = cv2.VideoCapture(video_path)
    
    if not cap.isOpened():
        print(f"Error: Could not open video file '{video_path}'")
        return
    
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        
        # Process each frame here
        # You can perform operations like displaying, saving frames, or analyzing them
        
        # Display the frame (for demonstration purposes)
        cv2.imshow('Video Frame', frame)
        
        # Press 'q' to exit the video playback
        if cv2.waitKey(25) & 0xFF == ord('q'):
            break
    
    cap.release()
    cv2.destroyAllWindows()

# Read and process test videos
test_videos = [os.path.join(test_video_dir, filename) for filename in os.listdir(test_video_dir) if filename.endswith('.mp4')]
print(test_videos)

# Read and process train videos
train_videos = [os.path.join(train_video_dir, filename) for filename in os.listdir(train_video_dir) if filename.endswith('.mp4')]
print(train_videos)

['../data/vig_test/_lgEm3J3cj4.mp4', '../data/vig_test/_VdCrHztInA.mp4']
['../data/vig_train/_4NEAgqguN0.mp4', '../data/vig_train/_3O_oBEzWAE.mp4', '../data/vig_train/_-ZSRcrbRGI.mp4']


#### Save annotations in Dict:

In [6]:
# File paths
class_annotation_file = 'class_annotation.txt'
vig_annotation_file = 'vig_annotation.txt'

def read_annotation(file_path):

    # Read the contents of the text file
    with open(file_path, 'r') as file:
        file_contents = file.read()
    
    # Safely evaluate the contents as a Python dictionary
    try:
        data_dict = ast.literal_eval(file_contents)
    except (SyntaxError, ValueError):
        print("Error: The file does not contain a valid Python dictionary.")
        data_dict = {}

    return data_dict

class_annotation_dict = read_annotation(class_annotation_file)
vig_annotation_dict = read_annotation(vig_annotation_file)

print(class_annotation_dict)
print()
print(vig_annotation_dict)
print(vig_annotation_dict[3058])

{'Bleat': 2, 'Fireworks': 12, 'Splash, splatter': 13, 'Fire alarm': 9, 'Chicken, rooster': 3, 'Thunderstorm': 5, 'Gunshot, gunfire': 11, 'Rail transport': 7, 'Spray': 14, 'Race car, auto racing': 6, 'Church bell': 4, 'Helicopter': 8, 'Bark': 0, 'Hammer': 10, 'Cattle, bovinae': 1}

{327681: {'class_id': 1, 'start_time': 24.0, 'end_time': 34.0}, 425986: {'class_id': 11, 'start_time': 300.0, 'end_time': 310.0}, 13748: {'class_id': 11, 'start_time': 30.0, 'end_time': 40.0}, 955830: {'class_id': 13, 'start_time': 0.0, 'end_time': 10.0}, 65546: {'class_id': 0, 'start_time': 30.0, 'end_time': 40.0}, 11: {'class_id': 12, 'start_time': 370.0, 'end_time': 380.0}, 65549: {'class_id': 6, 'start_time': 20.0, 'end_time': 30.0}, 131086: {'class_id': 7, 'start_time': 40.0, 'end_time': 50.0}, 65555: {'class_id': 8, 'start_time': 90.0, 'end_time': 100.0}, 297879: {'class_id': 0, 'start_time': 60.0, 'end_time': 70.0}, 518830: {'class_id': 8, 'start_time': 30.0, 'end_time': 40.0}, 22: {'class_id': 10, 'st

#### Save vig_dl.lst into Dict:

In [7]:
vig_dl_dict = {}

# Open the file for reading
with open('vig_dl.lst', 'r') as file:
    # Read each line from the file
    for line in file:
        # Split the line by comma (,) to get the entries
        entries = line.strip().split(',')
        video_youtube_id = entries[0]
        video_int_id = int(entries[1])

        # Print the entries
        #print("video_youtube_id:", video_youtube_id)
        #print("video_int_id:", video_int_id)

        if video_youtube_id not in vig_dl_dict:
            vig_dl_dict[video_youtube_id] = video_int_id

print(vig_dl_dict)

{'kFZhc_d-2V4': 11, 'IcmAVLJQAXs': 22, 'njodYtK0Hqg': 36, 'LC4vDpnkDtM': 56, 'zNuT6b5vAnU': 66, '1XwOM3Hf_Bs': 68, 'OazWqV22fh4': 95, 'nvBPPOzcW-A': 99, 'qPsoKSYzuyo': 115, 'xvanhCQ9jho': 119, 'EOKUV6hgMhM': 132, 'QP_ZCssCySw': 147, 'BLEthZXy35s': 176, '9wMeb3Q-IiQ': 179, 'V2OjPsg7ydY': 190, 'VFjxWdPPY1Q': 195, 'M3tAFphkQow': 200, 'fVfkPyZl6o0': 220, 'kTyq_6AnoSw': 228, 'DIxTruxPbdk': 252, 'ITYq5xoDGK0': 297, 'd-do1XZ8f_E': 305, 'VWyrns4rBwc': 312, 'wuz0curb_hg': 315, 'XO1wG3ne_Cw': 354, 'qnUax_XrBC8': 359, 'Dks0c0Ii48o': 361, 'tz0avWZoqjg': 373, '7C_jcD_FOQI': 377, 'ldvGteIeoMQ': 399, 'POuhGn61ZR8': 428, 'lo3yvn1av2E': 437, 'DHkK8sw3dd0': 452, 'OfefVMM5L44': 459, 'ZnEEAfLC9Sg': 481, 'Amd-OLa3tMI': 500, 'ZKocFGL8LIE': 532, 'GO_1gBjIR1c': 550, 'tfMC-N8vX54': 580, 'aCX6vJhHO2c': 585, 'mTUsHkNjSWo': 594, '3EE5WKQMBek': 642, 'DpLi9YH5idY': 646, 'ItIo1RvO0PE': 659, 'v2i3nOEwupw': 668, '3hEgl8N2FBA': 685, 'qC5M7BAsKOA': 687, '0fsntL6RKww': 798, 'ijgwCwnKZUM': 812, 'fbJ0yNuwmpg': 829, 'vi6czb

#### Get train & test download assigned IDs:

#### train

In [8]:
# Directory path for train videos
train_video_dir = '../data/vig_train/'

# Get a list of all files in the train directory
file_list = os.listdir(train_video_dir)

# Remove the ".mp4" extension from each file name
file_list_without_extension = [os.path.splitext(file)[0] for file in file_list if file.endswith('.mp4')]

# Now, 'file_list_without_extension' contains the list of file names without the ".mp4" extension
print(file_list_without_extension[:4])

['_4NEAgqguN0', '_3O_oBEzWAE', '_-ZSRcrbRGI']


In [9]:
# Define the output file
output_file = "train_labels.txt"

# Open the file in write mode
with open(output_file, "w") as file:
    
    for youtube_video_id in file_list_without_extension:
        video_id_value = vig_dl_dict.get(youtube_video_id, None)
        if video_id_value is not None:
            class_id = vig_annotation_dict.get(video_id_value, {}).get("class_id", 0)
            
            file.write(youtube_video_id + ".mp4," + str(class_id) + "\n")
            
            print("youtube_video_id:", youtube_video_id)
            print("video_id_value:", video_id_value)
            print("class_id:", class_id)
            print("-----------")


youtube_video_id: _4NEAgqguN0
video_id_value: 7866
class_id: 4
-----------
youtube_video_id: _3O_oBEzWAE
video_id_value: 2082
class_id: 11
-----------
youtube_video_id: _-ZSRcrbRGI
video_id_value: 8036
class_id: 7
-----------


In [10]:
# Directory path for train videos
test_video_dir = '../data/vig_test/'

# Get a list of all files in the train directory
file_list = os.listdir(test_video_dir)

# Remove the ".mp4" extension from each file name
file_list_without_extension = [os.path.splitext(file)[0] for file in file_list if file.endswith('.mp4')]

# Now, 'file_list_without_extension' contains the list of file names without the ".mp4" extension
print(file_list_without_extension[:4])

['_lgEm3J3cj4', '_VdCrHztInA']


#### test

In [11]:
# Define the output file
output_file = "test_labels.txt"

# Open the file in write mode
with open(output_file, "w") as file:
    
    for youtube_video_id in file_list_without_extension:
        video_id_value = vig_dl_dict.get(youtube_video_id, None)
        if video_id_value is not None:
            class_id = vig_annotation_dict.get(video_id_value, {}).get("class_id", 0)
            
            file.write(youtube_video_id + ".mp4," + str(class_id) + "\n")
            
            print("youtube_video_id:", youtube_video_id)
            print("video_id_value:", video_id_value)
            print("class_id:", class_id)
            print("-----------")

youtube_video_id: _lgEm3J3cj4
video_id_value: 19623
class_id: 4
-----------
youtube_video_id: _VdCrHztInA
video_id_value: 19811
class_id: 7
-----------


#### model training 

In [8]:
import cv2
import os
import numpy as np


# Define paths
labels_file = 'train_labels.txt'
video_directory = '../data/vig_train/'

# Load and preprocess data
labels = []
video_data = []

num_frames = 15

with open(labels_file, 'r') as file:
    lines = file.readlines()
    for line in lines:
        video_name, label = line.strip().split(',')
        video_data.append((video_name, int(label)))

        print(video_name+","+label)
        labels.append(int(label))
        
        # Read the video file using OpenCV
        video_path = video_directory+video_name

        if '.DS_Store' in video_path:
            continue
        else:
            print(video_path)
        cap = cv2.VideoCapture(video_path)

        # Initialize an empty list to store the frames
        frames = []

        # Loop through L equally spaced frames in the video
        for j in range(num_frames):

            # Set the frame index to read
            frame_idx = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) * j / num_frames)

            # Read the frame and resize it to (224, 224) for the model
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.resize(frame, (224, 224))

            # Normalize the pixel values to [0, 1]
            frame = frame / 255.0

            # Add the frame to the list
            frames.append(frame)

        # If the video has less than L frames, pad the list with the last frame
        while len(frames) < num_frames:
            frames.append(frames[-1])

        # Convert the list of frames to a NumPy array
        frames = np.array(frames)

        #print(frames)

#TODO implement CNN Alexnet here to train with videos converted into frames


_4NEAgqguN0.mp4,4
../data/vig_train/_4NEAgqguN0.mp4
[[[[0.15686275 0.15294118 0.17254902]
   [0.15294118 0.14901961 0.16862745]
   [0.15686275 0.15294118 0.17254902]
   ...
   [0.56862745 0.69019608 0.74509804]
   [0.57647059 0.69803922 0.75294118]
   [0.57254902 0.69803922 0.75294118]]

  [[0.15686275 0.15294118 0.17254902]
   [0.16078431 0.15686275 0.17647059]
   [0.16078431 0.15686275 0.17647059]
   ...
   [0.56078431 0.68235294 0.7372549 ]
   [0.56470588 0.68627451 0.74117647]
   [0.56470588 0.68627451 0.74117647]]

  [[0.16078431 0.15686275 0.17647059]
   [0.16470588 0.16078431 0.18039216]
   [0.16470588 0.16078431 0.18039216]
   ...
   [0.56470588 0.68627451 0.74117647]
   [0.56862745 0.69019608 0.74509804]
   [0.56862745 0.69019608 0.74509804]]

  ...

  [[0.19607843 0.20392157 0.2       ]
   [0.17254902 0.18431373 0.18039216]
   [0.18039216 0.16862745 0.16862745]
   ...
   [0.23529412 0.21176471 0.23137255]
   [0.23137255 0.20784314 0.22745098]
   [0.22745098 0.20392157 0.21960

In [14]:
import cv2
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torchvision import models, transforms

# Define paths
labels_file = 'train_labels.txt'
video_directory = '../data/vig_train/'

# Load and preprocess data
labels = []
video_data = []

num_frames = 1

frames_list = []  # Store all frames as a list
labels_list = []  # Store corresponding labels

with open(labels_file, 'r') as file:
    lines = file.readlines()
    for line in lines:
        video_name, label = line.strip().split(',')
        labels_list.append(int(label))
        video_path = os.path.join(video_directory, video_name)

        if '.DS_Store' in video_path:
            continue

        cap = cv2.VideoCapture(video_path)

        frames = []

        for j in range(num_frames):
            frame_idx = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) * j / num_frames)
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.resize(frame, (224, 224))
            frame = frame / 255.0
            frames.append(frame)

        while len(frames) < num_frames:
            frames.append(frames[-1])

        frames_list.append(frames)

# Convert the lists to NumPy arrays
frames_array = np.array(frames_list)
labels_array = np.array(labels_list)

# Define a function to apply transformations to frames (required for AlexNet)
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
])

# Define the AlexNet model
alexnet_model = models.alexnet(pretrained=False, num_classes=5)

# Modify the classifier part of AlexNet to match your number of output classes
num_classes = 5  # Change this to your desired number of classes
alexnet_model.classifier[6] = nn.Linear(4096, num_classes)

# Create an instance of the AlexNet model
alexnet_model = alexnet_model

print("Frames array shape:", frames_array.shape)
print("Labels array shape:", labels_array.shape)

print(frames_array[:1])
print(labels_array[:1])

# Reshape 'frames_array' to have the shape (num_samples, num_channels, height, width)
# In this case, it seems that you have RGB images (3 channels).
frames_array = frames_array.reshape((3, 3, 224, 224))

# Convert data to PyTorch tensors
frames_tensor = torch.tensor(frames_array, dtype=torch.float32)
labels_tensor = torch.tensor(labels_array, dtype=torch.long)

# Create a TensorDataset
data = TensorDataset(frames_tensor, labels_tensor)

# Create a DataLoader
batch_size = 32  # You can adjust the batch size as needed
data_loader = DataLoader(data, batch_size=batch_size, shuffle=True)

# Define a loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(alexnet_model.parameters(), lr=0.001, momentum=0.9)

# Training loop
num_epochs = 5  # Adjust the number of epochs as needed
for epoch in range(num_epochs):
    for frames, labels in data_loader:
        optimizer.zero_grad()
        outputs = alexnet_model(frames)
        print(outputs)
        print(labels)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()


Frames array shape: (3, 1, 224, 224, 3)
Labels array shape: (3,)
[[[[[0.15686275 0.15294118 0.17254902]
    [0.15294118 0.14901961 0.16862745]
    [0.15686275 0.15294118 0.17254902]
    ...
    [0.56862745 0.69019608 0.74509804]
    [0.57647059 0.69803922 0.75294118]
    [0.57254902 0.69803922 0.75294118]]

   [[0.15686275 0.15294118 0.17254902]
    [0.16078431 0.15686275 0.17647059]
    [0.16078431 0.15686275 0.17647059]
    ...
    [0.56078431 0.68235294 0.7372549 ]
    [0.56470588 0.68627451 0.74117647]
    [0.56470588 0.68627451 0.74117647]]

   [[0.16078431 0.15686275 0.17647059]
    [0.16470588 0.16078431 0.18039216]
    [0.16470588 0.16078431 0.18039216]
    ...
    [0.56470588 0.68627451 0.74117647]
    [0.56862745 0.69019608 0.74509804]
    [0.56862745 0.69019608 0.74509804]]

   ...

   [[0.19607843 0.20392157 0.2       ]
    [0.17254902 0.18431373 0.18039216]
    [0.18039216 0.16862745 0.16862745]
    ...
    [0.23529412 0.21176471 0.23137255]
    [0.23137255 0.20784314 0.22

IndexError: Target 11 is out of bounds.

Reading:

https://towardsdatascience.com/the-w3h-of-alexnet-vggnet-resnet-and-inception-7baaaecccc96

https://blog.paperspace.com/alexnet-pytorch/

https://theaisummer.com/cnn-architectures/

https://pytorch.org/hub/pytorch_vision_alexnet/