In [6]:
import csv
import json, os, sys
import subprocess
from pathlib import Path

import pandas as pd

from tqdm import tqdm

# Project root, taken as the parent of the directory the notebook is run from.
WORK_DIR = Path.cwd().parent
print(f"Work directory: {WORK_DIR}")

# Make modules under <project root>/src importable. The membership check keeps
# the cell idempotent: re-running it no longer appends duplicate sys.path entries.
if str(WORK_DIR / 'src') not in sys.path:
    sys.path.append(str(WORK_DIR / 'src'))


Work directory: /home/lusha/star_code


In [7]:
# Location of the video files (Charades_v1_480 folder of the action-genome dataset).
video_dir = WORK_DIR.joinpath("data/datasets/action-genome/Charades_v1_480")


In [8]:
# Load the STAR validation annotations (a JSON list of question objects).
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default text encoding.
with open(WORK_DIR / "data/datasets/STAR/STAR_annotations/STAR_val.json", encoding="utf-8") as in_file:
    star_data = json.load(in_file)

print(len(star_data))


7098


The data is stored in JSON format as a list of objects. Each object is centered on a question (i.e., the question is the fundamental element and serves as the ID), and all other information relates to that question. In other words, each object contains the question together with its associated information, such as the grounding video, the scene graph, etc.

In [9]:
# Pick one arbitrary entry and list the fields its annotation object carries.
test_id = 101
test_sample = star_data[test_id]
list(test_sample)


['question_id',
 'question',
 'video_id',
 'start',
 'end',
 'answer',
 'question_program',
 'choices',
 'situations']

## Video data

In [10]:
# Tabular view of the annotations: one row per question object, one column per key.
star_data_df = pd.DataFrame(star_data)
star_data_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7098 entries, 0 to 7097
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   question_id       7098 non-null   object 
 1   question          7098 non-null   object 
 2   video_id          7098 non-null   object 
 3   start             7098 non-null   float64
 4   end               7098 non-null   float64
 5   answer            7098 non-null   object 
 6   question_program  7098 non-null   object 
 7   choices           7098 non-null   object 
 8   situations        7098 non-null   object 
dtypes: float64(2), object(7)
memory usage: 499.2+ KB


In [11]:
# Number of distinct videos referenced by the questions (several questions can share a video).
star_data_df['video_id'].nunique()


914

In [12]:
# Number of distinct (video_id, start, end) combinations -- presumably the
# distinct video segments that questions are asked about.
star_data_df.groupby(['video_id', 'start', 'end']).ngroups


3373

In [13]:
# Write the distinct video ids to a one-column CSV (one id per row).
# The file is only created when it is missing; an existing file is left untouched.
outfile = WORK_DIR / "data/datasets/STAR/STAR_annotations/video_ids_val.csv"
video_ids = star_data_df['video_id'].unique()

if not outfile.exists():
    # newline='' lets the csv module control the line endings itself.
    with open(outfile, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerows([video_id] for video_id in video_ids)


In [14]:
def get_video_duration(video_path):
    """Get video duration in seconds using ffprobe.

    Args:
        video_path: Path of the video file to probe.

    Returns:
        The duration reported by ffprobe (``format=duration``) as a float, or
        ``None`` when ffprobe exits with a non-zero status or its output cannot
        be parsed as a number. A message is printed in both failure cases.

    Note:
        Errors raised while launching ffprobe itself (e.g. the executable is
        not installed) are not caught here and propagate to the caller.
    """
    cmd = [
        'ffprobe',
        '-v', 'error',
        '-show_entries', 'format=duration',
        '-of', 'default=noprint_wrappers=1:nokey=1',
        video_path
    ]
    try:
        # Keep stderr out of stdout: ffprobe may log non-fatal diagnostics even
        # on success, and mixing them into the output would make a perfectly
        # valid duration unparseable.
        output = subprocess.check_output(cmd, stderr=subprocess.PIPE)
        return float(output.decode('utf-8', errors='replace').strip())
    except subprocess.CalledProcessError as e:
        # Prefer ffprobe's own error text; decode leniently so that odd bytes
        # in the message cannot raise a second exception.
        details = (e.stderr or e.output or b'').decode('utf-8', errors='replace')
        print(f"\nError getting duration for {video_path}: {details}")
        return None
    except ValueError:
        # Empty output or a non-numeric value such as "N/A".
        print(f"\nCould not parse duration for {video_path}")
        return None

def process_videos_directory(directory):
    """Process all video files in directory and return duration info.

    Only regular files located directly in ``directory`` whose name ends with
    ``.mp4`` (case-insensitive) are considered; sub-directories are not
    traversed. Files are handled in sorted name order so that the result, and
    anything written from it, is reproducible across runs and machines.

    Args:
        directory: Directory containing the video files.

    Returns:
        A list of ``{'video_id': <file name without extension>, 'length': <duration>}``
        dicts, one per video for which ``get_video_duration`` returned a value.
        Videos whose duration could not be determined are omitted.
    """
    # Collect the candidate files up front so the progress bar knows the total.
    # The cheap extension test runs before the file-system check.
    video_files = sorted(
        filename for filename in os.listdir(directory)
        if filename.lower().endswith('.mp4')
        and os.path.isfile(os.path.join(directory, filename))
    )

    video_info = []
    for filename in tqdm(video_files, desc="Processing videos", unit="video"):
        duration = get_video_duration(os.path.join(directory, filename))
        if duration is None:
            # Failure has already been reported by get_video_duration.
            continue
        video_info.append({
            'video_id': os.path.splitext(filename)[0],
            'length': duration
        })

    return video_info



In [19]:
outfile = WORK_DIR / "data/datasets/STAR/STAR_annotations/video_metadata.jsonl"

# Video durations are cached as JSON Lines (one {"video_id": ..., "length": ...}
# object per line) so the videos only have to be probed when no cache exists.
if outfile.exists():
    with open(outfile, 'r') as in_f:
        # Iterate the file lazily and skip blank lines (e.g. a trailing empty
        # line) rather than letting json.loads fail on them.
        video_len_data = [json.loads(line) for line in in_f if line.strip()]
else:
    video_len_data = process_videos_directory(video_dir)

    # Do not persist an empty result: an empty cache file would be picked up
    # on every later run and hide the fact that no durations were obtained.
    if video_len_data:
        with open(outfile, 'w') as out_f:
            out_f.writelines(json.dumps(entry) + '\n' for entry in video_len_data)
    


In [20]:
# Durations as a Series indexed by video_id, followed by its summary statistics.
video_len_data_series = (
    pd.DataFrame(video_len_data)
    .set_index('video_id')
    .loc[:, 'length']
)
video_len_data_series.describe()


count    9848.000000
mean       29.764684
std         9.182348
min         2.392000
25%        26.912000
50%        30.674000
75%        32.323000
max       194.421000
Name: length, dtype: float64