In [1]:
import os
import shutil
from mmengine.config import Config
import json

In [2]:
# Base directory that the path helpers in this cell resolve against.
# It is captured once, when this cell runs, from the kernel's working directory.
current_dir = os.getcwd()

def listdir(*rel_paths):
    """List the entries of the directory at ``rel_paths`` below ``current_dir``.

    With no components the contents of ``current_dir`` itself are listed.
    """
    target = os.path.join(current_dir, *rel_paths)
    return os.listdir(target)

def isdir(*rel_paths):
    """Tell whether the path ``rel_paths`` below ``current_dir`` is a directory."""
    candidate = os.path.join(current_dir, *rel_paths)
    return os.path.isdir(candidate)

def join(*rel_paths):
    """Build a path by appending ``rel_paths`` to ``current_dir``."""
    components = (current_dir,) + rel_paths
    return os.path.join(*components)

def isfile(*rel_paths):
    """Tell whether the path ``rel_paths`` below ``current_dir`` is a regular file."""
    candidate = join(*rel_paths)
    return os.path.isfile(candidate)

def rmdir(*rel_paths):
    """Recursively delete the directory at ``rel_paths`` below ``current_dir``.

    Deletion is best-effort: errors raised while removing the tree are
    ignored (``ignore_errors=True``), so a missing directory is not an error.

    Args:
        *rel_paths: Path components, relative to ``current_dir``, of the
            directory to delete. At least one component must be non-empty.

    Returns:
        None (the result of ``shutil.rmtree``).

    Raises:
        ValueError: If no non-empty component is given. Without this guard
            the joined path would be ``current_dir`` itself and the whole
            working directory would be wiped.
    """
    if not any(rel_paths):
        raise ValueError(
            'rmdir() needs at least one non-empty path component; '
            'refusing to delete current_dir itself'
        )
    return shutil.rmtree(join(*rel_paths), ignore_errors=True)

def get_log_runs():
    """Collect every ``(work_dir, run_dir)`` pair found below ``current_dir``.

    A work dir is a visible directory directly inside ``current_dir``; a log
    run is a visible directory directly inside a work dir. The number of
    work dirs and log runs found is printed.

    Hidden directories (names starting with ``.``, such as ``.git`` or
    ``.ipynb_checkpoints``) are skipped at both levels. The result of this
    function is fed to ``remove_runs``, so treating their subfolders as log
    runs would get them deleted.

    Returns:
        list[tuple[str, str]]: ``(work_dir, run_dir)`` name pairs, relative
        to ``current_dir``.
    """
    # List current_dir through the helper instead of the process cwd, so the
    # names are resolved against the same base directory that isdir() uses.
    work_dirs = [
        d for d in listdir()
        if not d.startswith('.') and isdir(d)
    ]
    print(f'work dirs: {len(work_dirs)}')
    log_runs = [
        (work_dir, d)
        for work_dir in work_dirs
        for d in listdir(work_dir)
        if not d.startswith('.') and isdir(work_dir, d)
    ]
    print(f'log runs: {len(log_runs)}')
    return log_runs

def started_train_run(work_dir, d):
    """Check whether run ``d`` of ``work_dir`` contains ``vis_data/<d>.json``.

    This notebook uses the presence of that file as the sign that a
    training run got started.
    """
    log_name = f'{d}.json'
    return isfile(work_dir, d, 'vis_data', log_name)

def started_test_run(work_dir, d):
    """Check whether run ``d`` of ``work_dir`` contains ``<d>.json`` at its top level.

    This notebook uses the presence of that file as the sign that a test
    run got started.
    """
    log_name = f'{d}.json'
    return isfile(work_dir, d, log_name)

def remove_runs(runs):
    """Delete the directory of every run in ``runs``.

    Each element is unpacked into path components and passed to ``rmdir``.
    """
    for path_parts in runs:
        rmdir(*path_parts)
    

In [3]:
# Find the runs that have neither a train log nor a test log, then remove them.
log_runs = get_log_runs()
failed_to_start = [
    run for run in log_runs
    if not (started_train_run(*run) or started_test_run(*run))
]
print(f'failed to start: {len(failed_to_start)}')
remove_runs(failed_to_start)

work dirs: 4
log runs: 10
failed to start: 0


In [4]:
# Scan the directory tree again so this cell works on a fresh list of runs.
log_runs = get_log_runs()

def load_training_data(work_dir, d):
    """Load the JSON-lines log ``vis_data/<d>.json`` of a run.

    Each non-blank line of the file is parsed as one JSON document.

    Args:
        work_dir: Name of the work directory, relative to ``current_dir``.
        d: Name of the run directory inside ``work_dir``.

    Returns:
        list: The parsed records, in file order.

    Raises:
        OSError: If the log file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(join(work_dir, d, 'vis_data', f'{d}.json')) as f:
        # Stream the file line by line and skip blank lines: json.loads
        # would raise on an empty line, e.g. a stray trailing newline.
        return [json.loads(line) for line in f if line.strip()]

def completed_training(work_dir, d):
    """Decide whether the training run ``d`` of ``work_dir`` ran to the end.

    The ``iter`` value of the second-to-last log record is compared with
    ``train_cfg.max_iters`` from the run's ``vis_data/config.py``.

    Args:
        work_dir: Name of the work directory, relative to ``current_dir``.
        d: Name of the run directory inside ``work_dir``.

    Returns:
        bool: True only if that record's ``iter`` equals ``max_iters``.
        False if the log has fewer than two records or the record has no
        ``iter`` key.
    """
    data = load_training_data(work_dir, d)
    # A log with fewer than two records has no second-to-last entry. Report
    # the run as incomplete instead of letting data[-2] raise IndexError and
    # abort the whole cleanup.
    if len(data) < 2:
        return False
    # NOTE(review): presumably the last record is a final validation/summary
    # entry and the one before it is the last training step -- confirm.
    iteration = data[-2].get('iter')
    if iteration is None:
        return False
    config = Config.fromfile(join(work_dir, d, 'vis_data', 'config.py'))
    return iteration == config.train_cfg.max_iters

# Keep only the runs that started training.
train_runs = [
    run for run in log_runs
    if started_train_run(*run)
]

# Find the runs that did not reach the configured final iteration, then remove them.
failed_to_complete = [
    run for run in train_runs
    if not completed_training(*run)
]
print(f'failed to complete: {len(failed_to_complete)}')
remove_runs(failed_to_complete)

work dirs: 4
log runs: 10
failed to complete: 0
