In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [16]:
import os
import cv2
import numpy as np
from warnings import warn
from time import sleep
from multiprocessing import Pool
from multiprocessing import TimeoutError as MP_TimeoutError


In [27]:
# Tags passed as the `comment` argument of log2str() / log_print().
# Progress tags: log_print() only echoes these when pid is a multiple of 500.
START, FINISH = "START", "FINISH"
# Problem tags: log_print() always echoes these and appends them to the log file.
WARNING, FAIL = "WARNING", "FAIL"

def log2str(pid, comment, logs):
    """Format log messages as ``# JOB <pid> : --<comment>-- <message>`` lines.

    Args:
        pid: Integer job number (rendered with ``%d``).
        comment: Tag placed between the dashes, e.g. ``START`` or ``WARNING``.
        logs: A single message string, or an iterable of messages.

    Returns:
        All formatted lines concatenated into one string, each terminated by
        a newline. An empty iterable yields an empty string.
    """
    # A bare string is treated as a one-element list of messages.
    entries = [logs] if isinstance(logs, str) else logs
    return ''.join(
        "# JOB %d : --%s-- %s\n" % (pid, comment, entry) for entry in entries
    )

def log_print(pid, comment, logs, log_path='./pretreatment.log'):
    """Print a formatted log entry and persist warnings/failures to a file.

    Args:
        pid: Integer job number.
        comment: One of the module-level tags (START, FINISH, WARNING, FAIL).
        logs: A message string or an iterable of message strings.
        log_path: File that WARNING and FAIL entries are appended to.

    START and FINISH entries are printed only when ``pid`` is a multiple of
    500; every other entry is always printed.
    """
    text = log2str(pid, comment, logs)

    # Only problems are kept on disk; progress messages are console-only.
    if comment in (WARNING, FAIL):
        with open(log_path, 'a') as log_file:
            log_file.write(text)

    # Throttle progress chatter to one job in every 500.
    throttled = comment in (START, FINISH) and pid % 500 != 0
    if not throttled:
        print(text, end='')

def cut_img(img, seq_info, frame_name, pid):
    """Crop a 2-D grayscale frame to a 64x64 window around its intensity centre.

    Steps:
      1. Reject the frame if the sum of all pixels is <= 10000.
      2. Keep only the rows between the first and last row with a non-zero sum.
      3. Resize to a height of 64, scaling the width by the same ratio.
      4. Take as the horizontal centre the first column at which the
         cumulative column sum exceeds half of the total pixel sum.
      5. Cut a 64-column window around that centre, zero-padding 32 columns
         on each side when the window reaches the image border.

    Args:
        img: Frame as a 2-D array (the caller passes a grayscale frame).
        seq_info: Sequence of strings identifying the clip; joined with '-'
            in log messages.
        frame_name: Frame label used in log messages.
        pid: Integer job number used in log messages.

    Returns:
        The cropped window as a ``uint8`` array, or ``None`` (after emitting
        a warning and a WARNING log entry) when the frame is rejected or no
        centre column is found.
    """
    target_size = 64
    half_width = target_size // 2

    total = img.sum()
    if total <= 10000:
        message = 'seq:%s, frame:%s, no data, %d.' % ('-'.join(seq_info), frame_name, total)
        warn(message)
        log_print(pid, WARNING, message)
        return None

    # Vertical crop: first and last rows that contain any non-zero pixel.
    occupied_rows = np.flatnonzero(img.sum(axis=1) != 0)
    img = img[occupied_rows[0]:occupied_rows[-1] + 1, :]

    # Scale to the target height; width follows the cropped aspect ratio.
    aspect = img.shape[1] / img.shape[0]
    img = cv2.resize(img, (int(target_size * aspect), target_size),
                     interpolation=cv2.INTER_CUBIC)

    # Horizontal centre: first column whose cumulative sum passes half the total.
    cumulative_columns = img.sum(axis=0).cumsum()
    past_half = cumulative_columns > img.sum() / 2
    if not past_half.any():
        message = 'seq:%s, frame:%s, no center.' % ('-'.join(seq_info), frame_name)
        warn(message)
        log_print(pid, WARNING, message)
        return None
    x_center = int(past_half.argmax())

    left = x_center - half_width
    right = x_center + half_width
    if left <= 0 or right >= img.shape[1]:
        # Window touches a border: pad both sides with zeros and shift the
        # window by the padding width so it stays centred on x_center.
        margin = np.zeros((img.shape[0], half_width))
        img = np.concatenate([margin, img, margin], axis=1)
        left += half_width
        right += half_width

    return img[:, left:right].astype('uint8')

def process_video(seq_info, pid):
    """Decode one video, crop every usable frame and save it as a PNG.

    Args:
        seq_info: ``[subject_dir, video_file]`` path components, joined onto
            the module-level ``INPUT_PATH`` to locate the video.
        pid: Integer job number, used only for logging.

    Side effects:
        Creates ``OUTPUT_PATH/<video file name without extension>/`` and
        writes ``frame_<idx>.png`` there for each frame accepted by
        ``cut_img``. Emits START/FINISH log entries, a FAIL entry when the
        video cannot be opened, and WARNING entries when a frame cannot be
        written or fewer than 5 frames were saved.

    Returns:
        None.
    """
    seq_name = '-'.join(seq_info)
    log_print(pid, START, seq_name)

    video_path = os.path.join(INPUT_PATH, *seq_info)
    # NOTE(review): the directory is keyed on the video file name only, so two
    # subjects with identically named videos would share it - confirm that
    # file names are unique across subjects.
    out_dir = os.path.join(OUTPUT_PATH, seq_info[1].rsplit('.', 1)[0])  # Create directory with video name
    os.makedirs(out_dir, exist_ok=True)

    cap = cv2.VideoCapture(video_path)
    count_frame = 0
    try:
        if not cap.isOpened():
            # Make an unreadable video visible instead of silently reporting
            # it as a clip with zero valid frames.
            log_print(pid, FAIL, 'seq:%s, cannot open video %s.' % (seq_name, video_path))

        frame_idx = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            frame_name = f'frame_{frame_idx}'
            gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            img = cut_img(gray_frame, seq_info, frame_name, pid)

            if img is not None:
                save_path = os.path.join(out_dir, f'{frame_name}.png')
                # Count a frame only when imwrite reports success.
                if cv2.imwrite(save_path, img):
                    count_frame += 1
                else:
                    message = 'seq:%s, frame:%s, failed to write %s.' % (seq_name, frame_name, save_path)
                    warn(message)
                    log_print(pid, WARNING, message)

            frame_idx += 1
    finally:
        # Release the capture even if decoding or cropping raises.
        cap.release()

    if count_frame < 5:
        message = 'seq:%s, less than 5 valid data.' % ('-'.join(seq_info))
        warn(message)
        log_print(pid, WARNING, message)

    log_print(pid, FINISH, 'Contain %d valid frames. Saved to %s.' % (count_frame, out_dir))

# --- Configuration ---------------------------------------------------------
# process_video() reads INPUT_PATH and OUTPUT_PATH as globals, so they are
# defined before the worker pool is created.
# NOTE(review): this presumably relies on workers inheriting module state at
# pool creation (fork start method) - confirm on the target platform.
INPUT_PATH = '/kaggle/input/data-silhouette/data_silhouette'
OUTPUT_PATH = '/kaggle/working/data_silhouette_processed2'
LOG_PATH = './pretreatment.log'
WORKERS = 4

pool = Pool(WORKERS)
results = []
pid = 0

print('Pretreatment Start.\nInput path: %s\nOutput path: %s\nLog file: %s\nWorker num: %d' % (INPUT_PATH, OUTPUT_PATH, LOG_PATH, WORKERS))

try:
    # --- Submit one job per video, in sorted subject/file order -------------
    id_list = sorted(os.listdir(INPUT_PATH))
    for _id in id_list:
        seq_type = sorted(os.listdir(os.path.join(INPUT_PATH, _id)))
        for video_file in seq_type:
            seq_info = [_id, video_file]
            results.append(pool.apply_async(process_video, args=(seq_info, pid)))
            # Short pause between submissions, kept as in the original code.
            sleep(0.02)
            pid += 1

    pool.close()

    # --- Poll until every job has finished ----------------------------------
    # A timeout means "still running"; any other exception comes from the job
    # itself and aborts the whole run.
    unfinish = 1
    while unfinish > 0:
        unfinish = 0
        for i, res in enumerate(results):
            try:
                res.get(timeout=0.1)
            except MP_TimeoutError:
                unfinish += 1
            except Exception as e:
                print('\n\n\nERROR OCCUR: PID ##%d##, ERRORTYPE: %s\n\n\n' % (i, type(e)))
                raise
except BaseException:
    # Stop the remaining workers so a failed or interrupted run does not
    # leave processes alive in the notebook kernel.
    pool.terminate()
    raise
finally:
    # join() is valid here: the pool has been either closed or terminated.
    pool.join()


# JOB 0 : --START-- 001-output-k1.mp4
Pretreatment Start.
Input path: /kaggle/input/data-silhouette/data_silhouette
Output path: /kaggle/working/data_silhouette_processed2
Log file: ./pretreatment.log
Worker num: 4
# JOB 0 : --FINISH-- Contain 337 valid frames. Saved to /kaggle/working/data_silhouette_processed2/output-k1.


In [28]:
import shutil

# Pack the processed frames into a zip archive next to the output directory.
# make_archive appends the '.zip' extension to base_name itself.
output_dir = '/kaggle/working/data_silhouette_processed2'
shutil.make_archive(base_name=output_dir, format='zip', root_dir=output_dir)
print("Zipping complete.")


Zipping complete.
