In [1]:
import multiprocessing
import os
import glob
import shutil
import subprocess
from collections import namedtuple, defaultdict
import skimage.io
import skimage.transform

import helpers

import numpy as np
from PIL import Image
from tqdm import tqdm_notebook

In [4]:
SPLIT_ID = 1

DATASET_PATH = '/data/data/ucf101/'
DATA_PATH = os.path.join(DATASET_PATH, 'UCF-101')

PREPROCESSED_PATH = '/data/data/ucf101_preprocessed'
PREPROCESSED_SPLIT_PATH = os.path.join(PREPROCESSED_PATH, 'split_{0:02d}'.format(SPLIT_ID))

FPS = 5
WIDTH = 320
HEIGHT = 240

SKIP_EXIST = True

FNULL = open(os.devnull, 'w')

if not SKIP_EXIST:
    shutil.rmtree(PREPROCESSED_SPLIT_PATH, ignore_errors=True)
    
helpers.ensure_path_exists(PREPROCESSED_SPLIT_PATH)

In [3]:
class Video(namedtuple('VideoPath', ['class_', 'name', 'extension'])):
    @classmethod
    def parse_split_line(cls, line):
        path, extension = os.path.splitext(line.strip())
        class_, name = os.path.split(path)
        
        return cls(class_=class_, name=name, extension=extension)
    
    @property
    def path(self):
        return '{0.class_}/{0.name}{0.extension}'.format(self)
    
    @property
    def path_no_ext(self):
        return '{0.class_}/{0.name}'.format(self)   
    
    
def get_split_videos(train_or_test):
    assert train_or_test in {'train', 'test'}
    
    path = os.path.join(DATASET_PATH, 'ucfTrainTestlist', '{0}list{1:02d}.txt'.format(train_or_test, SPLIT_ID))

    with open(path) as f:
        return [Video.parse_split_line(l.strip().split()[0]) for l in f.readlines()]

### Для разбиения sp1 генерим RGB с помощью FFMPEG

In [4]:
def generate_rgb(train_or_test, n_jobs=8):
    dst_path = os.path.join(PREPROCESSED_SPLIT_PATH, train_or_test)
    rgb_path = os.path.join(dst_path, 'rgb')
    
    videos = get_split_videos(train_or_test)
    
    # prepare dirs
    classes = {video.class_ for video in videos}
    [helpers.ensure_path_exists(os.path.join(rgb_path, class_)) for class_ in classes]
    
    def prepare_tasks():
        for video in videos:
            src_video_path = os.path.join(DATA_PATH, video.path)
            dst_video_path = os.path.join(rgb_path, video.path_no_ext)
        
            if SKIP_EXIST and os.path.exists(dst_video_path)\
                and os.path.isdir(dst_video_path) and os.listdir(dst_video_path):
                    continue
                    
            helpers.ensure_path_exists(dst_video_path)
            dst_frames_template_path = os.path.join(dst_video_path, '%04d.jpg')
            
            yield src_video_path, dst_frames_template_path, FPS, WIDTH, HEIGHT
    
    errors = []
    
    def do_work(pool):
        with tqdm_notebook(desc='[{}] RGB Generation'.format(train_or_test), total=len(videos)) as pbar:
            for is_ok, src_video_path in pool(helpers.extract_rgb_frames, prepare_tasks()):
                pbar.update(1)
                
                if not is_ok:
                    errors.append(src_video_path)
                    
    def dummy_pool(func, tasks):
        for task in tasks:
            yield func(task)

    
    if n_jobs > 1:
        with multiprocessing.Pool(n_jobs) as pool:
            do_work(pool.imap_unordered)
    else:
        do_work(dummy_pool)

    return errors

In [5]:
train_errors = generate_rgb('train', 4)
test_errors = generate_rgb('test', 4)







### Будем генерить оптический поток с помощью flownet 2.0 из контейнера

In [6]:
def generate_flow_tasks(train_or_test):
    _1st_images, _2nd_images, flow_images = [], [], []
    
    n_skipped = 0
    
    rgb_path = os.path.join(PREPROCESSED_SPLIT_PATH, train_or_test, 'rgb')
    flow_path = os.path.join(PREPROCESSED_SPLIT_PATH, train_or_test, 'flow')
    helpers.ensure_path_exists(flow_path)
    
    path_prefix_len = len(PREPROCESSED_SPLIT_PATH) + 1
    
    for class_ in tqdm_notebook(os.listdir(rgb_path), desc='[{}] Flow Task Generation'.format(train_or_test)):
        src_class_path = os.path.join(rgb_path, class_)
        dst_class_path = os.path.join(flow_path, class_)
        helpers.ensure_path_exists(dst_class_path)
        
        for video_name in os.listdir(src_class_path):
            src_video_path = os.path.join(src_class_path, video_name)
            dst_video_path = os.path.join(dst_class_path, video_name)
            helpers.ensure_path_exists(dst_video_path)            
            
            frame_names = list(sorted(os.listdir(src_video_path)))
            
            for _1st_frame, _2nd_frame in zip(frame_names, frame_names[1:]):
                _1st_frame_path = os.path.join(src_video_path, _1st_frame)
                _2nd_frame_path = os.path.join(src_video_path, _2nd_frame)
                
                flow_frame_path = os.path.join(dst_video_path, _1st_frame.replace('.jpg', '.flo'))
                
                if SKIP_EXIST and os.path.exists(flow_frame_path):
                    n_skipped += 1
                    continue
                    
                _1st_frame_path = _1st_frame_path[path_prefix_len:]
                _2nd_frame_path = _2nd_frame_path[path_prefix_len:]
                flow_frame_path = flow_frame_path[path_prefix_len:]
                
                _1st_images.append(_1st_frame_path)
                _2nd_images.append(_2nd_frame_path)
                flow_images.append(flow_frame_path)               
    
    tasks_dir = os.path.join(PREPROCESSED_SPLIT_PATH, 'flow_tasks', train_or_test)
    helpers.ensure_path_exists(tasks_dir)
    
    path_list_name = [
        [_1st_images, '1st'],
        [_2nd_images, '2nd'],
        [flow_images, 'flow']
    ]
        
    for path_list, name in path_list_name:
        with open(os.path.join(tasks_dir, '{}.txt'.format(name)), 'w+') as f:
            for path in path_list:
                f.write(path + '\n')
            f.write(path_list[-1])
                
    print(train_or_test, 'skipped', n_skipped, 'frames')

In [7]:
generate_flow_tasks('test')
generate_flow_tasks('train')


test skipped 0 frames



train skipped 0 frames


# сгенерим .sh для запуска генерации потока

In [8]:
def gen_sh(train_or_test):
    run_file_path = 'run-network.sh'

    tasks_dir = os.path.join(PREPROCESSED_SPLIT_PATH, 'flow_tasks', train_or_test)

    base_command = 'sh {run_file} -n FlowNet2-s {{img1}} {{img2}} {{flow}}'.format(run_file=run_file_path)
п
    with open(os.path.join(tasks_dir, 'gen_flow.sh'), 'w+') as f:
        img1_list_path = os.path.join('flow_tasks', train_or_test, '1st.txt')
        img2_list_path = os.path.join('flow_tasks', train_or_test, '2nd.txt')
        flow_list_path = os.path.join('flow_tasks', train_or_test, 'flow.txt')

        f.write(base_command.format(img1=img1_list_path, img2=img2_list_path, flow=flow_list_path) + '\n')
            
gen_sh('test')
gen_sh('train')

## Посчитаем mean/std по rgb и потокам

In [115]:
def clip_img_by_channel(img):
    def get_real_max(x):
        """assume all x are positive"""
        max_diff = np.median(x) * 6
        x = np.sort(x)[::-1]

        diffs = x[:-1] - x[1:]

        max_diff_i = None

        for i, diff in enumerate(diffs[:-1]):
            if diff >= max_diff:
                max_diff_i = i
                break

        if max_diff_i is None:
            return

        return x[i + 2]
    
    img = img.copy()

    for channel_i in range(img.shape[2]):
        channel = img[:,:,channel_i]
        
        neg_abs, pos = np.abs(channel[channel < 0].ravel()), channel[channel > 0].ravel()
        
        if len(neg_abs):
            min_val = -(get_real_max(neg_abs) or neg_abs.max())
        else:
            min_val = channel.min()
            
        if len(pos):
            max_val = get_real_max(pos) or pos.max()
        else:
            max_val = channel.max()
        
        img[:,:,channel_i] = np.clip(channel, min_val, max_val)
        
    return img

In [121]:
def calc_train_mean_std(img_type):
    def get_file_count(path):
        return int(subprocess.getoutput(r'find ' + path + ' -type f | wc -l'))
    
    def load_flo(path):
        img = np.fromfile(path, np.float32, HEIGHT * WIDTH * 2)
        img = img.reshape(HEIGHT, WIDTH, 2)
        img[0, 0, :] = img.reshape(HEIGHT * WIDTH, 2)[1:].mean(axis=0)
        img -= np.min(img, axis=(0, 1))
        img = img / np.max(img, axis=(0, 1))
        return img
    
    def load_rgb(path):
        img = skimage.io.imread(path)
        img = img / 255.
        return img
    
    if img_type == 'rgb':
        extension = 'jpg'
        load_img_fn = load_rgb
        n_channels = 3
    elif img_type == 'flow':
        extension = 'flo'
        load_img_fn = load_flo
        n_channels = 2
    else:
        raise Exception('Invalid type')
    
    train_path = os.path.join(PREPROCESSED_SPLIT_PATH, 'train', img_type)

    file_gen = glob.iglob('{}/**/*.{}'.format(train_path, extension), recursive=True)
    file_count = get_file_count(train_path)

    mean, std = np.zeros((n_channels,)), np.zeros((n_channels,))
    count = 0
    
    for img_path in tqdm_notebook(file_gen, total=file_count):
        img = load_img_fn(img_path)
        
        count += 1
        mean += img.mean(axis=(0, 1))
        std += img.std(axis=(0, 1))
        
    return {
        'img_type': img_type,
        'mean': mean / count,
        'std': std / count
    }

In [125]:
rgb_stat = calc_train_mean_std('rgb')
flow_stat = calc_train_mean_std('flow')








In [127]:
rgb_stat

{'img_type': 'rgb',
 'mean': array([ 0.39674431,  0.38155164,  0.35283176]),
 'std': array([ 0.24261944,  0.23637967,  0.23292163])}

In [128]:
flow_stat

{'img_type': 'flow',
 'mean': array([ 0.49372829,  0.54430225]),
 'std': array([ 0.13754837,  0.14878529])}

## Починим потоки

In [131]:
def fix_flows(train_or_test):
    def get_file_count(path):
        return int(subprocess.getoutput(r'find ' + path + ' -type f | wc -l'))
    
    def load_flo(path):
        img = np.fromfile(path, np.float32, HEIGHT * WIDTH * 2)
        img = img.reshape(HEIGHT, WIDTH, 2)
        img[0, 0, :] = img.reshape(HEIGHT * WIDTH, 2)[1:].mean(axis=0)
        img -= np.min(img, axis=(0, 1))
        img = img / np.max(img, axis=(0, 1))
        return img
    
    extension = 'flo'
    load_img_fn = load_flo
    n_channels = 2
    
    train_path = os.path.join(PREPROCESSED_SPLIT_PATH, train_or_test, 'flow')

    file_gen = glob.iglob('{}/**/*.{}'.format(train_path, extension), recursive=True)
    file_count = get_file_count(train_path)

    
    for img_path in tqdm_notebook(file_gen, total=file_count):
        img = load_flo(img_path)
        img.tofile(img_path)

In [None]:
# fix_flows('train')
fix_flows('test')

In [None]:
img.tofile()

In [126]:
with open('stat.txt', 'w+') as f:
    f.write(str((rgb_stat, flow_stat)))

In [32]:
import os
import time
import subprocess

In [38]:
def get_count1():
    return int(subprocess.getoutput('find /media/d/vsd/data/ucf101_preprocessed/split_01/test/rgb/ -type f | wc -l'))

def get_count2():
    return int(subprocess.getoutput('find /media/e/vsd/data/ucf101_preprocessed/split_01/train/rgb/ -type f | wc -l'))

In [12]:
get_count2()

NameError: name 'get_count2' is not defined

In [19]:
get_count()

18366

In [17]:
import time 

def get_count():
    return int(subprocess.getoutput('find /data/data/ucf101_preprocessed/split_01/train/flow/ -type f | wc -l'))

while False:
    t1 = time.time()
    c1 = get_count()
    time.sleep(30)
    c2 = get_count()
    t2 = time.time()
    
    dc = c2 - c1
    dt = t2 - t1
    
    fps = dc / dt
    print(fps, 'f/s')
    time.sleep(60)

In [37]:
get_count()

693433

In [14]:
os.system('find /media/d/vsd/data/ucf101_preprocessed/split_01/test/flow/ -type f | wc -l')

0

In [53]:
# cd PREPROCESSED_SPLIT_PATH
# cp run-notwork.sh to PREPROCESSED_SPLIT_PATH
# run gen_flow.sh

## повалидируем

## TODO: clip flow

In [9]:
def validate_generated_flow(train_or_test):
    rgb_path = os.path.join(PREPROCESSED_SPLIT_PATH, train_or_test, 'rgb')
    flow_path = os.path.join(PREPROCESSED_SPLIT_PATH, train_or_test, 'flow')
    helpers.ensure_path_exists(flow_path)
    
    path_prefix_len = len(PREPROCESSED_SPLIT_PATH) + 1
    
    errors = []
    
    for class_ in tqdm_notebook(os.listdir(rgb_path), desc='[{}] Flow Validation'.format(train_or_test)):
        src_class_path = os.path.join(rgb_path, class_)
        dst_class_path = os.path.join(flow_path, class_)
        helpers.ensure_path_exists(dst_class_path)
        
        for video_name in os.listdir(src_class_path):
            src_video_path = os.path.join(src_class_path, video_name)
            dst_video_path = os.path.join(dst_class_path, video_name)
            
            if len(os.listdir(src_video_path)) - len(os.listdir(dst_video_path)) != 0:
                errors.append(video_name)
                
    return errors