In [21]:
import multiprocessing
import os
import shutil
import subprocess
from collections import namedtuple, defaultdict

import helpers

from PIL import Image
from tqdm import tqdm_notebook

In [23]:
# Index of the train/test split to preprocess; it selects the
# '<train|test>listNN.txt' file and the 'split_NN' output directory.
SPLIT_ID = 1

# Root of the raw dataset and the directory that holds the videos themselves.
DATASET_PATH = '/media/d/vsd/data/ucf101'
DATA_PATH = os.path.join(DATASET_PATH, 'UCF-101')

# Output root; everything produced by this notebook goes under the per-split subdirectory.
PREPROCESSED_PATH = '/media/e/vsd/data/ucf101_preprocessed'
PREPROCESSED_SPLIT_PATH = os.path.join(PREPROCESSED_PATH, 'split_{0:02d}'.format(SPLIT_ID))

# Frame-extraction parameters handed to helpers.extract_rgb_frames with every task.
FPS = 25
WIDTH = 320
HEIGHT = 240

# When True, outputs that already exist are kept and skipped.
# When False, the whole split directory is wiped below and regenerated from scratch.
SKIP_EXIST = False

# Writable handle to the null device. It is not used by the code visible in this
# notebook and is never closed — NOTE(review): presumably meant for silencing
# subprocess output; confirm before removing.
FNULL = open(os.devnull, 'w')

# WARNING: destructive — removes all previously preprocessed data for this split.
if not SKIP_EXIST:
    shutil.rmtree(PREPROCESSED_SPLIT_PATH, ignore_errors=True)

helpers.ensure_path_exists(PREPROCESSED_SPLIT_PATH)

In [25]:
class Video(namedtuple('VideoPath', ['class_', 'name', 'extension'])):
    """A video referenced by a split list, stored as (class directory, base name, extension).

    The extension keeps its leading dot, exactly as returned by ``os.path.splitext``.
    """

    @classmethod
    def parse_split_line(cls, line):
        """Build a ``Video`` from a relative path such as ``'Class/v_clip.avi'``.

        Leading and trailing whitespace of ``line`` is stripped before the path
        is split into its three parts.
        """
        stem, ext = os.path.splitext(line.strip())
        directory, base = os.path.split(stem)
        return cls(directory, base, ext)

    @property
    def path(self):
        """Relative path of the video file: ``class_/name`` followed by the extension."""
        return '{}/{}{}'.format(self.class_, self.name, self.extension)

    @property
    def path_no_ext(self):
        """Relative path without the extension: ``class_/name``."""
        return '{}/{}'.format(self.class_, self.name)
    
    
def get_split_videos(train_or_test):
    """Read the train or test list of the configured split and return its videos.

    The list file is ``<DATASET_PATH>/ucfTrainTestlist/<train|test>listNN.txt``,
    where ``NN`` is the zero-padded ``SPLIT_ID``. On every line only the first
    whitespace-separated field (the relative video path) is used; any further
    fields are ignored.

    Args:
        train_or_test: either ``'train'`` or ``'test'``.

    Returns:
        A list of ``Video`` objects in file order.

    Raises:
        AssertionError: if ``train_or_test`` is neither ``'train'`` nor ``'test'``.
        OSError: if the list file cannot be opened.
    """
    assert train_or_test in {'train', 'test'}
    list_name = '{0}list{1:02d}.txt'.format(train_or_test, SPLIT_ID)
    path = os.path.join(DATASET_PATH, 'ucfTrainTestlist', list_name)

    videos = []
    with open(path) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank or whitespace-only line (e.g. a trailing newline at the
                # end of the file): there is no path to parse, so skip it
                # instead of failing with IndexError.
                continue
            videos.append(Video.parse_split_line(fields[0]))
    return videos

### Для разбиения split_01 генерируем RGB-кадры с помощью FFmpeg

In [26]:
def generate_rgb(train_or_test, n_jobs=8):
    """Extract RGB frames for every video of the given partition of the split.

    Frames of a video ``Class/name.ext`` are written to
    ``<PREPROCESSED_SPLIT_PATH>/<train_or_test>/rgb/Class/name/%04d.jpg``.
    The extraction itself is delegated to ``helpers.extract_rgb_frames``, which
    is called with one ``(src_video, dst_frame_template, FPS, WIDTH, HEIGHT)``
    tuple and is expected to return ``(is_ok, src_video)``.

    When ``SKIP_EXIST`` is set, videos whose output directory already exists and
    is non-empty are not processed again.

    Args:
        train_or_test: ``'train'`` or ``'test'``.
        n_jobs: number of worker processes; with ``n_jobs <= 1`` the work is
            done sequentially in the current process.

    Returns:
        List of source video paths for which extraction reported failure.
    """
    rgb_path = os.path.join(PREPROCESSED_SPLIT_PATH, train_or_test, 'rgb')
    videos = get_split_videos(train_or_test)

    # Create one output directory per class up front.
    for class_ in {video.class_ for video in videos}:
        helpers.ensure_path_exists(os.path.join(rgb_path, class_))

    # Build the task list eagerly so the progress bar total matches the amount
    # of work actually performed (skipped videos are not counted).
    tasks = []
    for video in videos:
        dst_video_path = os.path.join(rgb_path, video.path_no_ext)

        if SKIP_EXIST and os.path.isdir(dst_video_path) and os.listdir(dst_video_path):
            continue

        helpers.ensure_path_exists(dst_video_path)
        tasks.append((
            os.path.join(DATA_PATH, video.path),
            os.path.join(dst_video_path, '%04d.jpg'),
            FPS, WIDTH, HEIGHT,
        ))

    errors = []

    def do_work(pool):
        # `pool` is any callable with the signature of `map`: it applies the
        # helper to each task and yields the results (in any order).
        with tqdm_notebook(desc='[{}] RGB Generation'.format(train_or_test), total=len(tasks)) as pbar:
            for is_ok, src_video_path in pool(helpers.extract_rgb_frames, tasks):
                pbar.update(1)

                if not is_ok:
                    errors.append(src_video_path)

    if n_jobs > 1:
        # All results are consumed inside do_work before the pool is torn down.
        with multiprocessing.Pool(n_jobs) as pool:
            do_work(pool.imap_unordered)
    else:
        do_work(map)

    return errors

In [27]:
# Extract frames for both partitions (train first, then test) with 6 worker
# processes each; every result is the list of videos that failed to convert.
train_errors, test_errors = [generate_rgb(part, 6) for part in ('train', 'test')]







### [уже не нужно] Генерация оптического потока с помощью FlowNet 2.0 из контейнера

In [21]:
def generate_flow_tasks(train_or_test):
    """Write the three list files that describe optical-flow jobs for a partition.

    For every pair of consecutive frames (in sorted file-name order) of every
    video under ``<PREPROCESSED_SPLIT_PATH>/<train_or_test>/rgb`` one line is
    added to each of ``1st.txt`` (first frame), ``2nd.txt`` (second frame) and
    ``flow.txt`` (output ``.flo`` file, named after the first frame). The files
    are written to ``<PREPROCESSED_SPLIT_PATH>/flow_tasks/<train_or_test>`` and
    all paths in them are relative to ``PREPROCESSED_SPLIT_PATH``.

    The matching ``flow/<class>/<video>`` output directories are created along
    the way. When ``SKIP_EXIST`` is set, pairs whose ``.flo`` file already
    exists are left out and counted as skipped.

    Args:
        train_or_test: name of the partition directory, e.g. ``'train'``.

    Returns:
        None. Prints the number of skipped frame pairs.
    """
    _1st_images, _2nd_images, flow_images = [], [], []

    n_skipped = 0

    rgb_path = os.path.join(PREPROCESSED_SPLIT_PATH, train_or_test, 'rgb')
    flow_path = os.path.join(PREPROCESSED_SPLIT_PATH, train_or_test, 'flow')
    helpers.ensure_path_exists(flow_path)

    for class_ in tqdm_notebook(os.listdir(rgb_path), desc='[{}] Flow Task Generation'.format(train_or_test)):
        src_class_path = os.path.join(rgb_path, class_)
        dst_class_path = os.path.join(flow_path, class_)
        helpers.ensure_path_exists(dst_class_path)

        for video_name in os.listdir(src_class_path):
            src_video_path = os.path.join(src_class_path, video_name)
            dst_video_path = os.path.join(dst_class_path, video_name)
            helpers.ensure_path_exists(dst_video_path)

            # Paths in the list files are relative to the split root. Computing
            # the relative directory once per video avoids slicing by string
            # length, which breaks if the root carries a trailing separator.
            rel_src_video_path = os.path.relpath(src_video_path, PREPROCESSED_SPLIT_PATH)
            rel_dst_video_path = os.path.relpath(dst_video_path, PREPROCESSED_SPLIT_PATH)

            frame_names = sorted(os.listdir(src_video_path))

            for _1st_frame, _2nd_frame in zip(frame_names, frame_names[1:]):
                flow_frame = _1st_frame.replace('.jpg', '.flo')

                if SKIP_EXIST and os.path.exists(os.path.join(dst_video_path, flow_frame)):
                    n_skipped += 1
                    continue

                _1st_images.append(os.path.join(rel_src_video_path, _1st_frame))
                _2nd_images.append(os.path.join(rel_src_video_path, _2nd_frame))
                flow_images.append(os.path.join(rel_dst_video_path, flow_frame))

    tasks_dir = os.path.join(PREPROCESSED_SPLIT_PATH, 'flow_tasks', train_or_test)
    helpers.ensure_path_exists(tasks_dir)

    path_list_name = [
        (_1st_images, '1st'),
        (_2nd_images, '2nd'),
        (flow_images, 'flow'),
    ]

    for path_list, name in path_list_name:
        with open(os.path.join(tasks_dir, '{}.txt'.format(name)), 'w') as f:
            for path in path_list:
                f.write(path + '\n')
            # NOTE(review): the last entry is written a second time without a
            # trailing newline, as in the original code — presumably a
            # workaround for how the consumer reads the list; confirm.
            # Guarded so that an empty list (nothing to do) yields an empty
            # file instead of an IndexError.
            if path_list:
                f.write(path_list[-1])

    print(train_or_test, 'skipped', n_skipped, 'frames')

In [None]:
# Build the flow task lists for the 'train' partition only; the call for the
# 'test' partition is left disabled.
# generate_flow_tasks('test')
generate_flow_tasks('train')

# Сгенерируем .sh-скрипт для запуска генерации потока

In [None]:
def gen_sh(train_or_test):
    """Write ``gen_flow.sh`` for a partition into its ``flow_tasks`` directory.

    The script consists of a single line that invokes ``run-network.sh`` with
    the ``FlowNet2-s`` network and the three list files ``1st.txt``,
    ``2nd.txt`` and ``flow.txt``. The list files are referenced by paths
    relative to ``PREPROCESSED_SPLIT_PATH`` (``flow_tasks/<train_or_test>/...``).

    Args:
        train_or_test: name of the partition, e.g. ``'train'`` or ``'test'``.

    Returns:
        None.
    """
    run_file_path = 'run-network.sh'

    tasks_dir = os.path.join(PREPROCESSED_SPLIT_PATH, 'flow_tasks', train_or_test)
    # The directory is otherwise only created by generate_flow_tasks; make sure
    # it exists so this function also works when that step was not run for the
    # partition.
    helpers.ensure_path_exists(tasks_dir)

    img1_list_path, img2_list_path, flow_list_path = [
        os.path.join('flow_tasks', train_or_test, list_name)
        for list_name in ('1st.txt', '2nd.txt', 'flow.txt')
    ]

    command = 'sh {run_file} -n FlowNet2-s {img1} {img2} {flow}'.format(
        run_file=run_file_path,
        img1=img1_list_path,
        img2=img2_list_path,
        flow=flow_list_path,
    )

    with open(os.path.join(tasks_dir, 'gen_flow.sh'), 'w') as f:
        f.write(command + '\n')
            
# Write gen_flow.sh for both partitions.
gen_sh('test')
gen_sh('train')

## Построение списка файлов

In [30]:
import pandas as pd

def create_class_index(split_id):
    """Map every class name of the training list to a consecutive integer label.

    Labels are assigned in sorted class-name order, starting at 0.

    NOTE(review): ``split_id`` is accepted but not used — the training list is
    read through ``get_split_videos('train')``, which relies on the global
    ``SPLIT_ID``.
    """
    ordered_names = sorted(set(video.class_ for video in get_split_videos('train')))
    return dict(zip(ordered_names, range(len(ordered_names))))

def generate_file_list(train_or_test, flow_or_rgb, class_index):
    """Write a space-separated list of videos with their frame counts and labels.

    Every ``<class>/<video>`` directory under
    ``<PREPROCESSED_SPLIT_PATH>/<train_or_test>/<flow_or_rgb>`` produces one
    line ``<video directory path> <number of entries in it> <class label>``.
    The result is saved without header to
    ``<PREPROCESSED_SPLIT_PATH>/file_lists/<train_or_test>_<flow_or_rgb>.txt``.
    Classes and videos are visited in sorted order so the output is
    reproducible between runs.

    Args:
        train_or_test: partition directory name, e.g. ``'train'``.
        flow_or_rgb: modality directory name, e.g. ``'rgb'``.
        class_index: mapping from class directory name to integer label.

    Returns:
        None.

    Raises:
        KeyError: if a class directory is not present in ``class_index``.
    """
    data_path = os.path.join(PREPROCESSED_SPLIT_PATH, train_or_test, flow_or_rgb)

    rows = []

    for class_ in tqdm_notebook(sorted(os.listdir(data_path)), desc=train_or_test):
        class_path = os.path.join(data_path, class_)

        for video in sorted(os.listdir(class_path)):
            video_path = os.path.join(class_path, video)

            rows.append({
                'class': class_index[class_],
                'path': video_path,
                'n_frames': len(os.listdir(video_path))
            })

    file_lists_dir = os.path.join(PREPROCESSED_SPLIT_PATH, 'file_lists')
    helpers.ensure_path_exists(file_lists_dir)

    file_list_path = os.path.join(file_lists_dir, '{}_{}.txt'.format(train_or_test, flow_or_rgb))
    # Passing `columns` fixes the column order and also keeps an empty result
    # valid: selecting the columns afterwards raises KeyError when `rows` is empty.
    file_list = pd.DataFrame(rows, columns=['path', 'n_frames', 'class'])
    file_list.to_csv(file_list_path, index=False, header=False, sep=' ')
    
# Class-name -> integer-label mapping built from the training list.
class_index = create_class_index(SPLIT_ID)

print('n_classes:', len(class_index))

# Write file_lists/train_rgb.txt and file_lists/test_rgb.txt under the
# preprocessed split directory, using the same label mapping for both.
generate_file_list('train', 'rgb', class_index)
generate_file_list('test', 'rgb', class_index)

n_classes: 101






