In [17]:
from pathlib import Path
import ffmpeg
import yaml
import hashlib
import tqdm

import pandas as pd
import numpy as np

from simple_file_checksum import get_checksum
from joblib import Parallel, delayed

In [22]:
def video_info_from_filename(f):
    """Derive a ``(creator, video_id)`` pair from a video file name.

    Parameters
    ----------
    f : str
        Bare file name (no directory part).

    Returns
    -------
    tuple of (str, str)
        * Name starts with ``'pexels'``: the first 7 characters are dropped,
          the rest is cut at the first space and the first dot, underscores
          are turned into hyphens, and the result is split on hyphens. The
          last token is the video id; the preceding tokens, joined by single
          spaces, form the creator.
        * Name starts with ``'production'``: the first 14 characters are
          dropped and the rest is cut at the first space and the first dot to
          give the video id; the creator is reported as ``'pexels_unknown'``.
        * Anything else: ``('unknown', 'unknown')``.

    Notes
    -----
    NOTE(review): the fixed offsets (7 and 14) presumably match names such as
    ``pexels-<creator>-<id>.mp4`` and ``production ID_<id>.mp4`` -- confirm
    against the actual download naming scheme.
    """
    if f.startswith('pexels'):
        # Keep only the text before the first space / first dot.
        stem = f[7:].partition(' ')[0].partition('.')[0].replace('_', '-')
        *creator_tokens, video_id = stem.split('-')
        return ' '.join(creator_tokens), video_id

    if f.startswith('production'):
        video_id = f[14:].partition(' ')[0].partition('.')[0]
        return 'pexels_unknown', video_id

    return 'unknown', 'unknown'

In [19]:
# Load the notebook settings. The context manager closes the file handle as
# soon as parsing is done instead of leaving it open for the kernel's lifetime.
# NOTE(review): yaml.FullLoader can construct more than plain data types; if
# settings.yaml only holds scalars/lists/dicts, consider yaml.safe_load.
with open('settings.yaml') as settings_file:
    settings = yaml.load(settings_file, Loader=yaml.FullLoader)

# Sort the listing so the v_index numbering assigned below does not depend on
# the order in which the filesystem happens to return directory entries.
fp_f_names = sorted(Path(settings['raw_video_directory']).glob('*.mp4'))
f_names = [p.name for p in fp_f_names]
video_info = [video_info_from_filename(f) for f in f_names]

# One MD5 checksum per video file, computed across parallel jobs.
hashes = Parallel(n_jobs=settings['preferred_n_jobs'])(
    delayed(get_checksum)(f, algorithm="MD5") for f in tqdm.tqdm(fp_f_names)
)

# The database is only created when it does not exist yet; an existing
# raw_db.tsv is left untouched.
if not Path('raw_db.tsv').exists():
    n_videos = len(f_names)
    if n_videos == 0:
        # Fail with a clear message rather than an opaque tuple-unpacking
        # error, and avoid writing an empty database that would block later
        # regeneration.
        raise ValueError(
            "no .mp4 files found in raw_video_directory: "
            + str(settings['raw_video_directory'])
        )

    db_df = pd.DataFrame({
        # 1-based index, zero-padded to at least four digits.
        'v_index': [str(n + 1).zfill(4) for n in range(n_videos)],
        'file_name': f_names,
        'MD5_hash': hashes,
        # Cut points start out as NaN placeholders.
        'cut_start': np.full(n_videos, np.nan),
        'cut_end': np.full(n_videos, np.nan),
        'creator': [creator for creator, _ in video_info],
        'pexels_video_id': [video_id for _, video_id in video_info],
    })

    db_df.to_csv('raw_db.tsv', sep='\t', index=False)


  0%|          | 0/2090 [00:00<?, ?it/s]

100%|██████████| 2090/2090 [00:22<00:00, 94.62it/s] 
