# Data Process Pipeline

### Data Process Commands

In [1]:
import os

OPEN_SORA_HOME = "/home/zhaowangbo/zangwei/opensora/"


def convert_dataset_cmd(input_dir, output_file, datatype="video"):
    """Build the shell command that converts a data directory into a meta file.

    Args:
        input_dir: Directory handed to ``tools.datasets.convert`` as its input.
        output_file: Path passed to ``--output``. Its parent directory, if it
            has one, is created before the conversion runs.
        datatype: Dataset type argument for the converter (default ``"video"``).

    Returns:
        A ``(command, output_file)`` tuple; ``command`` is the individual steps
        joined with ``&&``.
    """
    commands = [f'echo "Converting {input_dir} to {output_file}"']

    # A bare file name has no directory part. `mkdir -p` with no operand fails
    # and would abort the whole `&&` chain, so only emit it when needed.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        commands.append(f"mkdir -p {output_dir}")

    commands.append(f"cd {OPEN_SORA_HOME}")
    commands.append(f"python -m tools.datasets.convert {datatype} {input_dir} --output {output_file}")
    return " && ".join(commands), output_file


def get_video_info(input_file):
    """Build the command that runs ``datautil`` with ``--info --fmin 1``.

    The result is written next to the input as ``<base>_info<ext>``; the
    ``--format`` value is the input's extension without the dot.

    Returns:
        A ``(command, output_file)`` tuple.
    """
    stem, suffix = os.path.splitext(input_file)
    output_file = f"{stem}_info{suffix}"
    fmt = suffix[1:]  # drop the leading "."

    steps = [
        f'echo "Getting info of {input_file} to {output_file}"',
        f"cd {OPEN_SORA_HOME}",
        f"python -m tools.datasets.datautil {input_file} --output {output_file} --format {fmt} --info --fmin 1",
    ]
    return " && ".join(steps), output_file


def get_video_info_torchvision(input_file):
    """Build the command that runs ``datautil`` with ``--video-info --fmin 1``.

    Same shape as ``get_video_info`` but passes ``--video-info`` instead of
    ``--info``. The result goes to ``<base>_info<ext>``.

    Returns:
        A ``(command, output_file)`` tuple.
    """
    stem, suffix = os.path.splitext(input_file)
    output_file = f"{stem}_info{suffix}"
    fmt = suffix[1:]  # drop the leading "."

    steps = [
        f'echo "Getting info of {input_file} to {output_file}"',
        f"cd {OPEN_SORA_HOME}",
        f"python -m tools.datasets.datautil {input_file} --output {output_file} --format {fmt} --video-info --fmin 1",
    ]
    return " && ".join(steps), output_file


def get_caption_llava7b_video(input_file):
    """Build the command that captions videos with LLaVA and post-processes them.

    Steps, joined with ``&&``:
      1. ``cd`` into ``OPEN_SORA_HOME``.
      2. Activate the ``llava2`` conda env and run ``tools.caption.caption_llava``
         under ``torchrun`` with 8 processes (``--dp-size 8 --tp-size 1``).
      3. Switch back to the ``opensora`` env and feed the
         ``<base>_caption_part*<ext>`` files to ``datautil``, which writes
         ``<base>_caption<ext>``.

    Returns:
        A ``(command, output_file)`` tuple.
    """
    commands = []
    base, ext = os.path.splitext(input_file)
    output_file = f"{base}_caption{ext}"
    output_format = ext[1:]

    # This step produces captions, not video info; keep the log line in step
    # with get_caption_load so log files identify the running stage correctly.
    commands.append(f'echo "Getting caption of {input_file} to {output_file}"')
    commands.append(f"cd {OPEN_SORA_HOME}")
    commands.append("conda activate llava2")
    commands.append(
        f"torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava {input_file} --dp-size 8 --tp-size 1 --model-path liuhaotian/llava-v1.6-mistral-7b --prompt video"
    )
    commands.append("conda activate opensora")
    commands.append(
        f"python -m tools.datasets.datautil {base}_caption_part*{ext} --output {output_file} --format {output_format} --intersection {input_file} --clean-caption --refine-llm-caption --remove-empty-caption"
    )
    return " && ".join(commands), output_file


def get_caption_load(input_file):
    """Build the command that runs ``datautil`` with ``--load-caption json``.

    The command also passes ``--remove-empty-caption --clean-caption`` and
    writes its result to ``<base>_caption<ext>``.

    Returns:
        A ``(command, output_file)`` tuple.
    """
    stem, suffix = os.path.splitext(input_file)
    output_file = f"{stem}_caption{suffix}"
    fmt = suffix[1:]  # drop the leading "."

    steps = [
        f'echo "Getting caption of {input_file} to {output_file}"',
        f"cd {OPEN_SORA_HOME}",
        f"python -m tools.datasets.datautil {input_file} --output {output_file} --format {fmt} --load-caption json --remove-empty-caption --clean-caption",
    ]
    return " && ".join(steps), output_file


def get_aesthetic_score(input_file):
    """Build the command that scores aesthetics and sorts the result.

    Runs ``tools.scoring.aesthetic.inference`` under ``torchrun`` with 8
    processes, then passes the ``<base>_aes_part*<ext>`` files to ``datautil``
    with ``--sort aes``, writing ``<base>_aes<ext>``.

    Returns:
        A ``(command, output_file)`` tuple.
    """
    stem, suffix = os.path.splitext(input_file)
    output_file = f"{stem}_aes{suffix}"
    fmt = suffix[1:]  # drop the leading "."

    steps = [
        f'echo "Getting aesthetic score of {input_file} to {output_file}"',
        f"cd {OPEN_SORA_HOME}",
        f"torchrun --standalone --nproc_per_node 8 -m tools.scoring.aesthetic.inference {input_file}",
        f"python -m tools.datasets.datautil {stem}_aes_part*{suffix} --output {output_file} --format {fmt} --sort aes",
    ]
    return " && ".join(steps), output_file


def get_flow_score(input_file):
    """Build the command that runs optical-flow scoring on ``input_file``.

    Runs ``tools.scoring.optical_flow.inference`` under ``torchrun`` with 8
    processes. The returned ``<base>_flow<ext>`` path is not passed to the
    tool; presumably the script writes it itself -- verify.

    Returns:
        A ``(command, output_file)`` tuple.
    """
    stem, suffix = os.path.splitext(input_file)
    output_file = f"{stem}_flow{suffix}"

    steps = [
        f'echo "Getting flow score of {input_file} to {output_file}"',
        f"cd {OPEN_SORA_HOME}",
        f"torchrun --standalone --nproc_per_node 8 -m tools.scoring.optical_flow.inference {input_file}",
    ]
    return " && ".join(steps), output_file


def get_match_score(input_file):
    """Build the command that runs matching scoring on ``input_file``.

    Runs ``tools.scoring.matching.inference`` under ``torchrun`` with 8
    processes. The returned ``<base>_match<ext>`` path is not passed to the
    tool; presumably the script writes it itself -- verify.

    Returns:
        A ``(command, output_file)`` tuple.
    """
    stem, suffix = os.path.splitext(input_file)
    output_file = f"{stem}_match{suffix}"

    steps = [
        f'echo "Getting match score of {input_file} to {output_file}"',
        f"cd {OPEN_SORA_HOME}",
        f"torchrun --standalone --nproc_per_node 8 -m tools.scoring.matching.inference {input_file}",
    ]
    return " && ".join(steps), output_file


def get_cmotion_score(input_file):
    """Build the command that runs camera-motion detection on ``input_file``.

    Runs ``tools.caption.camera_motion_detect`` as a plain ``python -m`` call.
    The returned ``<base>_cmotion<ext>`` path is not passed to the tool;
    presumably the script writes it itself -- verify.

    Returns:
        A ``(command, output_file)`` tuple.
    """
    stem, suffix = os.path.splitext(input_file)
    output_file = f"{stem}_cmotion{suffix}"

    steps = [
        f'echo "Getting cmotion score of {input_file} to {output_file}"',
        f"cd {OPEN_SORA_HOME}",
        f"python -m tools.caption.camera_motion_detect {input_file}",
    ]
    return " && ".join(steps), output_file


def get_commands(job_list):
    """Chain several command builders into one ``&&``-joined shell command.

    Args:
        job_list: Iterable of dicts. Each dict holds a builder function under
            ``"cmd"``; the remaining items are passed to it as keyword
            arguments. Once a builder has returned an output file, that path is
            passed to the next builder as ``input_file`` (replacing any
            ``input_file`` given in the job). The dicts are not modified.

    Returns:
        A ``(command, output_file)`` tuple: every builder's command followed by
        ``echo "All Done!"``, and the last builder's output file (``None`` when
        ``job_list`` is empty).
    """
    commands = []
    output_file = None
    for job in job_list:
        # Work on a copy so the caller's job list stays intact and can be
        # passed again (popping "cmd" in place made a second call fail).
        kwargs = dict(job)
        cmd = kwargs.pop("cmd")
        if output_file is not None:
            kwargs["input_file"] = output_file
        command, output_file = cmd(**kwargs)
        commands.append(command)
    commands.append('echo "All Done!"')
    return " && ".join(commands), output_file

### Remote Launch via Paramiko

First, add the hosts to your `~/.ssh/config` file.

In [2]:
import paramiko

# Host aliases used by the helpers below; they are resolved through ~/.ssh/config.
HOSTS = [
    "h800-80",
    "h800-81",
    "h800-82",
    "h800-83",
    "h800-84",
    "h800-85",
    "h800-86",
    "h800-170",
    "h800-171",
]

# Parse the user's ssh config once; if the file is missing, ssh_config stays empty.
ssh_config = paramiko.SSHConfig()
user_config_file = os.path.expanduser("~/.ssh/config")
if os.path.exists(user_config_file):
    with open(user_config_file) as config_fh:
        ssh_config.parse(config_fh)


def get_ssh_config(hostname):
    """Turn the ssh config entry for ``hostname`` into ``SSHClient.connect`` kwargs.

    Args:
        hostname: Host alias to look up in the parsed ``~/.ssh/config``.

    Returns:
        Dict with ``hostname``, ``username``, ``port`` and ``key_filename``.
        Options the entry does not set fall back to port ``22`` and ``None``
        for user / identity file instead of raising ``KeyError``.
    """
    # get the configuration for the host
    user_config = ssh_config.lookup(hostname)
    cfg = {
        "hostname": user_config["hostname"],
        # Only options present in the config file appear in the lookup result,
        # so use .get() for everything that is optional in an ssh config.
        "username": user_config.get("user"),
        "port": int(user_config.get("port", 22)),
        "key_filename": user_config.get("identityfile"),
    }
    return cfg


def connect(hostname):
    """Open an SSH connection to ``hostname`` using its ssh config entry.

    Unknown host keys are accepted automatically (``AutoAddPolicy``).

    Returns:
        A connected ``paramiko.SSHClient``; the caller must close it.

    Raises:
        Whatever ``SSHClient.connect`` raises; the client is closed first.
    """
    cfg = get_ssh_config(hostname)
    # connect
    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    try:
        client.connect(**cfg)
    except Exception:
        # Do not leave a half-open client behind when connecting fails.
        client.close()
        raise
    return client


def run_command(command, hostname, nohup=False, log_file=None, sleep=None):
    """Run a shell command on a remote host over SSH and print its output.

    Args:
        command: Shell command line, run remotely through ``bash -ic``.
        hostname: Host alias to connect to.
        nohup: If true, wrap the call in ``nohup ... &`` so it runs in the
            background.
        log_file: If given, append the command's stdout and stderr to this
            remote path.
        sleep: If given, prefix the command with ``sleep <sleep>;``.

    The exact remote command line is printed before it is executed; remote
    stdout / stderr are printed afterwards when non-empty.
    """
    import shlex  # local import keeps this helper self-contained

    client = connect(hostname)
    try:
        print("HOST:", hostname)
        if sleep:
            command = f"sleep {sleep}; {command}"
        # Quote instead of wrapping in raw '...': a single quote inside the
        # command would otherwise end the bash -c argument early.
        command = f"bash -ic {shlex.quote(command)}"
        if log_file:
            command = f"{command} >> {log_file} 2>&1"
        if nohup:
            command = f"nohup {command} &"
        print("COMMAND:", command)
        stdin, stdout, stderr = client.exec_command(command, get_pty=False)

        stdout_str = stdout.read().decode()
        stderr_str = stderr.read().decode()
        if stdout_str:
            print("==== STDOUT ====\n", stdout_str)
        if stderr_str:
            print("==== STDERR ====\n", stderr_str)
    finally:
        # Release the connection even when exec/read raises.
        client.close()


def run_command_all_hosts(command, hosts=HOSTS):
    """Run ``command`` on every host in ``hosts``, one after another."""
    for target in hosts:
        run_command(command, hostname=target)

Here are tools to examine the machines' status.

In [3]:
def nvidia_smi(host=None):
    """Run ``nvidia-smi`` on ``host``, or on every host in ``HOSTS`` if omitted.

    ``host`` defaults to ``None`` to match ``nvitop`` and ``ps``, so the
    all-hosts branch can be reached with a plain ``nvidia_smi()`` call.
    """
    if host:
        run_command("nvidia-smi", host)
    else:
        run_command_all_hosts("nvidia-smi")


def nvitop(host=None):
    """Run ``nvitop -1`` on ``host``, or on every default host when ``host`` is falsy."""
    nvitop_cmd = "/home/zhaowangbo/.local/bin/nvitop -1"
    if not host:
        run_command_all_hosts(nvitop_cmd)
        return
    run_command(nvitop_cmd, host)


def ps(host=None, interest="python|sleep|torchrun|colossal", all=True):
    """List remote processes, optionally filtered by a regular expression.

    Args:
        host: Host alias to query; when falsy, every default host is queried.
        interest: Extended regex passed to ``grep -E``; ``None`` disables
            filtering.
        all: Use ``ps aux`` when true, ``ps ux`` otherwise.
    """
    listing = "ps aux" if all else "ps ux"
    pipeline = f"{listing} | cat"
    if interest is not None:
        pipeline = f'{pipeline} | grep --color=never -E "{interest}"'

    if host:
        run_command(pipeline, host)
    else:
        run_command_all_hosts(pipeline)


def kill(pid, host):
    """Run ``kill -KILL <pid>`` on ``host``."""
    kill_cmd = f"kill -KILL {pid}"
    run_command(kill_cmd, host)


def pkill(interest, host):
    """Run ``pkill -9 -f "<interest>"`` on ``host`` (matches the full command line)."""
    pkill_cmd = f'pkill -9 -f "{interest}"'
    run_command(pkill_cmd, host)

### Examples

The following is the pipeline for panda.

In [49]:
# panda
# Pipeline template: uncomment the stages to run. Each stage after the first
# receives the previous stage's output file as its input_file.
# NOTE(review): with every stage commented out, get_commands([]) returns only
# the `echo "All Done!"` command and output_file becomes None.
host = "h800-83"
split = 16
input_dir = f"/mnt/disk1/data-panda/{split}"
log_file = os.path.join(OPEN_SORA_HOME, f"logs/data-panda-{split}-split.log")
output_file = f"/mnt/hdd/data/panda70m_by/raw/meta/split-{split}/meta.csv"
cmd, output_file = get_commands(
    [
        # {
        #     "cmd": convert_dataset_cmd,
        #     "input_dir": input_dir,
        #     "output_file": output_file,
        # },
        # {
        #     "cmd": get_caption_load,
        # },
        # {
        #     "cmd": get_video_info_torchvision,
        # },
        # {
        #     "cmd": get_aesthetic_score,
        # },
        # {
        #     "cmd": get_flow_score,
        # },
        # {
        #     "cmd": get_match_score,
        # },
        # {
        #     "cmd": get_cmotion_score,
        # },
    ]
)
print(cmd)
print(output_file)

echo "Getting aesthetic score of /mnt/hdd/data/panda70m_by/raw/meta/split-16/meta_info_caption_info.csv to /mnt/hdd/data/panda70m_by/raw/meta/split-16/meta_info_caption_info_aes.csv" && cd /home/zhaowangbo/zangwei/opensora/ && torchrun --standalone --nproc_per_node 8 -m tools.scoring.aesthetic.inference /mnt/hdd/data/panda70m_by/raw/meta/split-16/meta_info_caption_info.csv && python -m tools.datasets.datautil /mnt/hdd/data/panda70m_by/raw/meta/split-16/meta_info_caption_info_part*.csv --output /mnt/hdd/data/panda70m_by/raw/meta/split-16/meta_info_caption_info_aes.csv --format csv --sort aes && echo "All Done!"
/mnt/hdd/data/panda70m_by/raw/meta/split-16/meta_info_caption_info_aes.csv


In [45]:
# panda: run only the torchvision video-info stage on one split
host = "h800-82"
split = 7
log_file = os.path.join(OPEN_SORA_HOME, f"logs/data-panda-{split}-split.log")
cmd, output_file = get_commands(
    [
        # Derive the meta path from `split` so it stays in sync with the log name.
        {
            "input_file": f"/mnt/hdd/data/panda70m_by/raw/meta/split-{split}/meta_loadjson_noempty_clean.csv",
            "cmd": get_video_info_torchvision,
        },
    ]
)
print(cmd)
print(output_file)

echo "Getting info of /mnt/hdd/data/panda70m_by/raw/meta/split-7/meta_loadjson_noempty_clean.csv to /mnt/hdd/data/panda70m_by/raw/meta/split-7/meta_loadjson_noempty_clean_info.csv" && cd /home/zhaowangbo/zangwei/opensora/ && python -m tools.datasets.datautil /mnt/hdd/data/panda70m_by/raw/meta/split-7/meta_loadjson_noempty_clean.csv --output /mnt/hdd/data/panda70m_by/raw/meta/split-7/meta_loadjson_noempty_clean_info.csv --format csv --video-info --fmin 1 && echo "All Done!"
/mnt/hdd/data/panda70m_by/raw/meta/split-7/meta_loadjson_noempty_clean_info.csv


In [83]:
# v2text: caption one split with LLaVA
host = "h800-86"
log_file = os.path.join(OPEN_SORA_HOME, "logs/data-v2text-18.log")
input_file = "/home/zhaowangbo/data/v2text/raw/meta/split-18/meta_remove_corrupted.csv"
cmd, output_file = get_commands(
    [
        {"cmd": get_caption_llava7b_video, "input_file": input_file},
    ]
)
print(cmd)
print(output_file)

echo "Getting info of /home/zhaowangbo/data/v2text/raw/meta/split-18/meta_remove_corrupted.csv to /home/zhaowangbo/data/v2text/raw/meta/split-18/meta_remove_corrupted_info.csv" && cd /home/zhaowangbo/zangwei/opensora/ && conda activate llava2 && torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava /home/zhaowangbo/data/v2text/raw/meta/split-18/meta_remove_corrupted.csv --dp-size 8 --tp-size 1 --model-path liuhaotian/llava-v1.6-mistral-7b --prompt video && conda activate opensora && python -m tools.datasets.datautil /home/zhaowangbo/data/v2text/raw/meta/split-18/meta_remove_corrupted_caption_part*.csv --output /home/zhaowangbo/data/v2text/raw/meta/split-18/meta_remove_corrupted_info.csv --format csv --intersection /home/zhaowangbo/data/v2text/raw/meta/split-18/meta_remove_corrupted.csv --clean-caption --refine-llm-caption --remove-empty-caption && echo "All Done!"
/home/zhaowangbo/data/v2text/raw/meta/split-18/meta_remove_corrupted_info.csv


Remote launch via paramiko.

In [67]:
# Launch the command built above in the background on `host`, appending its
# output to `log_file`, then list the matching processes on that host.
# `sleep` is an optional delay passed to the remote `sleep` command; None = start now.
sleep = None
run_command(cmd, host, log_file=log_file, nohup=True, sleep=sleep)
ps(host)

HOST: h800-80
COMMAND: nohup bash -ic 'echo "Getting info of /home/zhaowangbo/data/v2text/raw/meta/split-12/meta_remove_corrupted.csv to /home/zhaowangbo/data/v2text/raw/meta/split-12/meta_remove_corrupted_info.csv" && cd /home/zhaowangbo/zangwei/opensora/ && conda activate llava2 && torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava /home/zhaowangbo/data/v2text/raw/meta/split-12/meta_remove_corrupted.csv --dp-size 8 --tp-size 1 --model-path liuhaotian/llava-v1.6-mistral-7b --prompt video && conda activate opensora && python -m tools.datasets.datautil /home/zhaowangbo/data/v2text/raw/meta/split-12/meta_remove_corrupted_part*.csv --output /home/zhaowangbo/data/v2text/raw/meta/split-12/meta_remove_corrupted_info.csv --format csv --intersection /home/zhaowangbo/data/v2text/raw/meta/split-12/meta_remove_corrupted.csv --clean-caption --refine-llm-caption --remove-empty-caption && echo "All Done!"' >> /home/zhaowangbo/zangwei/opensora/logs/data-v2text-12.log 2>&1 &
HOST:

In [81]:
# Inspect one host; uncomment the pkill line to kill processes whose command
# line matches the given pattern on that host.
# pkill("split-6", "h800-84")
ps("h800-84")

HOST: h800-84
COMMAND: bash -ic 'ps ux | cat | grep --color=never -E "python|sleep|torchrun|colossal"'
==== STDOUT ====
 zhaowan+  697488  0.8  0.0 21302928 982860 ?     Sl   07:54   3:19 python -m tools.datasets.datautil /mnt/hdd/data/panda70m_by/raw/meta/split-4/meta_loadjson_noempty_clean_info.csv --output /mnt/hdd/data/panda70m_by/raw/meta/split-4/meta_loadjson_noempty_clean_info_info.csv --format csv --video-info --fmin 1
zhaowan+  756910  2.3  0.0 28226540 982328 ?     Sl   07:55   9:04 python -m tools.datasets.datautil /mnt/hdd/data/panda70m_by/raw/meta/split-4/meta_loadjson_noempty_clean_info.csv --output /mnt/hdd/data/panda70m_by/raw/meta/split-4/meta_loadjson_noempty_clean_info_info.csv --format csv --video-info --fmin 1
zhaowan+  757066 22.0  0.1 56222740 4023872 ?    Il   07:55  86:56 python -m tools.datasets.datautil /mnt/hdd/data/panda70m_by/raw/meta/split-4/meta_loadjson_noempty_clean_info.csv --output /mnt/hdd/data/panda70m_by/raw/meta/split-4/meta_loadjson_noempty_clea

Use the following commands to monitor the status of the jobs.

In [4]:
# No host given: the process listing is run on every host in HOSTS.
ps()

HOST: h800-80
COMMAND: bash -ic 'ps aux | cat | grep --color=never -E "python|sleep|torchrun|colossal"'
==== STDOUT ====
 root        4838  0.0  0.0  29820 18308 ?        Ss   Apr08   0:01 /usr/bin/python3 /usr/bin/networkd-dispatcher --run-startup-triggers
lisheng+ 1551124  0.0  0.0 3090356 185552 pts/9  Sl   11:48   0:02 /home/lishenggui/.conda/envs/opensora/bin/python /home/lishenggui/.conda/envs/opensora/bin/torchrun --master_addr 10.20.1.80 --master_port 29550 --nproc_per_node 8 --nnodes 8 --node_rank 0 /home/lishenggui/projects/sora/Open-Sora-dev/scripts/train.py configs/opensora-v1-1/train/video.py --data-path /home/zhaowangbo/data/csv/video_image_test_2.csv --wandb True --load /mnt/hdd/zangwei/opensora/outputs/789-STDiT2-XL-2/epoch1-global_step6500
lisheng+ 1565730  101  0.1 71931860 4155240 ?    Ssl  11:48 377:44 /home/lishenggui/.conda/envs/opensora/bin/python -u /home/lishenggui/projects/sora/Open-Sora-dev/scripts/train.py configs/opensora-v1-1/train/video.py --data-path /ho

In [None]:
# Show a one-shot nvitop snapshot for the currently selected `host`.
nvitop(host)

In [None]:
# Fill in the PID reported by ps() before running this cell. The original
# `kill(, host)` placeholder was a syntax error and made the cell unrunnable.
pid = None
if pid is not None:
    kill(pid, host)

# Training

In [23]:
def colossal_run(data_path, load_path=None):
    """Build the ``colossalai run`` training command.

    Args:
        data_path: Value passed to ``--data-path``.
        load_path: Optional value for ``--load-path``; the flag is omitted when
            this is falsy.

    Returns:
        The shell command: ``cd`` into ``OPEN_SORA_HOME``, then the launch line.
    """
    launch = f"colossalai run --nproc_per_node 8 --hostfile hostfile scripts/train.py configs/opensora-v1-1/train/video.py --wandb True --data-path {data_path}"
    if load_path:
        launch = f"{launch} --load-path {load_path}"
    return " && ".join([f"cd {OPEN_SORA_HOME}", launch])


def kill_all():
    """Build the command that runs ``pkill -9 python`` on every host in ``hostfile``.

    The command reads ``hostfile`` from ``OPEN_SORA_HOME`` and sshes into each
    listed host in turn.
    """
    steps = (
        f"cd {OPEN_SORA_HOME}",
        'cat hostfile  | xargs -I "{}" ssh "{}" pkill -9 python',
    )
    return " && ".join(steps)

### Examples

In [24]:
# Build the training command that loads the given checkpoint path.
host = "h800-80"
log_file = os.path.join(OPEN_SORA_HOME, "logs/train_02.log")
data_path = "/home/zhaowangbo/data/csv/video_image_test_2.csv"
ckpt_path = "outputs/764-STDiT2-XL-2/epoch1-global_step6000"
cmd = colossal_run(data_path, load_path=ckpt_path)
print(cmd)

cd /home/zhaowangbo/zangwei/opensora/ && colossalai run --nproc_per_node 8 --hostfile hostfile scripts/train.py configs/opensora-v1-1/train/video.py --wandb True --data-path /home/zhaowangbo/data/csv/video_image_test_2.csv --load-path outputs/764-STDiT2-XL-2/epoch1-global_step6000


In [None]:
# Start training in the background on `host`; output is appended to `log_file`.
run_command(cmd, host, log_file=log_file, nohup=True)

In [27]:
# Runs `pkill -9 python` on every host listed in hostfile, via `host`.
# NOTE(review): the pattern is not limited to this job, so other python
# processes on those hosts are targeted as well.
cmd = kill_all()
run_command(cmd, host)

HOST: h800-80
COMMAND: bash -ic 'cd /home/zhaowangbo/zangwei/opensora/ && cat hostfile  | xargs -I "{}" ssh "{}" pkill -9 python'
==== STDERR ====
 bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
pkill: killing pid 382879 failed: Operation not permitted

