# IOC Video Offset

Find the offset in the IOC video files. This is an exploratory notebook that is the basis for extracting the sequences at the right timestamp in the video files.

In [6]:
import pandas as pd
import os

In [None]:
# Import metadata to get all the candidate files
metadata = '/home/andre/rts/data/metadata.hdf5'
# Open read-only: the default HDFStore mode ("a") would create an empty store
# when the path is wrong, and this cell only ever reads from the file.
with pd.HDFStore(metadata, mode='r') as store:
    keys = store.keys()
    print(keys)
    # Fail with a clear message instead of a bare IndexError on keys[0]
    if not keys:
        raise ValueError(f"No datasets found in {metadata}")
    # Load the object stored under the first key of the store
    df = store.get(keys[0])

In [8]:
def find_mp4_files(root_folder):
    """Recursively collect the paths of all ``.mp4`` files under a folder.

    Parameters
    ----------
    root_folder : str
        Directory to walk. A path that does not exist yields an empty list,
        because ``os.walk`` ignores listing errors by default.

    Returns
    -------
    list of str
        One entry per file whose name ends with ``.mp4`` (case-sensitive),
        built as ``os.path.join(<containing folder>, <file name>)`` and listed
        in the order ``os.walk`` visits them.
    """
    mp4_files = []

    for foldername, _, filenames in os.walk(root_folder):
        for filename in filenames:
            if filename.endswith('.mp4'):
                mp4_files.append(os.path.join(foldername, filename))
    return mp4_files

In [9]:
# Walk the IOC video directory and gather the path of every .mp4 file in it
mp4_files = find_mp4_files(root_folder="/media/data/ioc/videos/")

In [21]:
# Map each video's ID (the file name up to its first '.') to its full path.
# os.path.basename is used rather than splitting on '/' so the lookup stays
# consistent with the os.path.join-built paths on any platform separator.
# NOTE: if two files share the same ID, the path seen last overwrites the earlier one.
file_map = {os.path.basename(mp4).split('.')[0]: mp4 for mp4 in mp4_files}

guids_df = set(df.guid.tolist())
# Keep only the videos whose ID appears in the dataframe's guid column
file_map = {k: v for k, v in file_map.items() if k in guids_df}

guids = list(file_map.keys())
len(guids)

3228

In [22]:
# Number of video files on disk whose ID matched a guid in the dataframe
len(file_map)

3228

In [26]:
# Spot check: path of the first matched video
file_map[guids[0]]

'/media/data/ioc/videos/MP4_PROXY_HR/MDA43417188.LR.mp4'

In [23]:
# Sanity check for the number of videos and sequences
df_files = df[df.guid.isin(set(file_map.keys()))]
print(f"Number of sequences: {len(df_files)}, Number of videos: {len(df_files[df_files.guid == df_files.seq_id])}")

Number of sequences: 677607, Number of videos: 3225


In [30]:
# Path of the video that the frame-differencing cell below opens
file_map[guids[0]]

'/media/data/ioc/videos/MP4_PROXY_HR/MDA43417188.LR.mp4'

In [34]:
import cv2
import numpy as np

# TODO: Compare this against pyscenedetect library

# Parameters
# Mean absolute grayscale difference between consecutive frames above which a
# sequence change is reported.
THRESHOLD = 20  # adjust this value based on your needs

# Load the video
video_path = file_map[guids[0]]
cap = cv2.VideoCapture(video_path)

# try/finally guarantees the capture handle is released even when the cell is
# interrupted (e.g. KeyboardInterrupt) or a frame fails to decode.
try:
    if not cap.isOpened():
        raise IOError(f"Could not open video: {video_path}")

    fps = cap.get(cv2.CAP_PROP_FPS)
    # Timestamps are frame_count / fps, so a missing frame rate cannot be used
    if fps <= 0:
        raise ValueError(f"Invalid FPS ({fps}) reported for video: {video_path}")

    # Read the first frame
    ret, prev_frame = cap.read()
    if not ret:
        raise ValueError(f"No frames could be read from video: {video_path}")

    # Convert the first frame to grayscale
    prev_frame = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)

    frame_count = 1  # to keep track of the current frame

    while True:
        # Read the next frame
        ret, frame = cap.read()

        # read() signals failure (typically end of stream) through ret
        if not ret:
            break

        # Convert the frame to grayscale
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

        # Compute the absolute difference between the current and previous frame
        diff = cv2.absdiff(prev_frame, gray)

        # Compute the mean difference
        mean_diff = np.mean(diff)

        if mean_diff > THRESHOLD:
            timestamp = frame_count / fps
            print(f'Possible sequence change detected at {timestamp} seconds')

        # Update the previous frame
        prev_frame = gray
        frame_count += 1
finally:
    cap.release()

Possible sequence change detected at 20.754066666666667 seconds
Possible sequence change detected at 38.60523333333333 seconds
Possible sequence change detected at 42.842800000000004 seconds
Possible sequence change detected at 53.31993333333333 seconds
Possible sequence change detected at 61.49476666666667 seconds
Possible sequence change detected at 69.5695 seconds
Possible sequence change detected at 78.8788 seconds


KeyboardInterrupt: 

In [28]:
# Inspect the grayscale frame currently held in prev_frame by the
# frame-differencing cell.
# NOTE(review): this cell's execution count (28) is lower than that cell's (34),
# so the output shown here presumably comes from an earlier run - confirm.
prev_frame

array([[101, 101, 101, ...,  72,  72,  72],
       [101, 101, 101, ...,  71,  71,  71],
       [101, 101, 101, ...,  71,  71,  71],
       ...,
       [ 93,  93,  92, ..., 102, 102, 102],
       [ 93,  93,  92, ..., 102, 102, 102],
       [ 93,  93,  92, ..., 102, 102, 102]], dtype=uint8)