In [1]:
import os
import shutil
from mmengine.config import Config
import json

In [2]:
# Base directory for every path helper below: the working directory at the
# moment this cell is executed.
current_dir = os.getcwd()

def listdir(*rel_paths):
    """Return the entry names of the directory ``current_dir/<rel_paths...>``.

    Called without arguments, it lists ``current_dir`` itself.
    """
    target = os.path.join(current_dir, *rel_paths)
    return os.listdir(target)

def isdir(*rel_paths):
    """Return True when ``current_dir/<rel_paths...>`` is an existing directory."""
    candidate = os.path.join(current_dir, *rel_paths)
    return os.path.isdir(candidate)

def join(*rel_paths):
    """Build a path by appending the given components to ``current_dir``."""
    components = (current_dir,) + rel_paths
    return os.path.join(*components)

def isfile(*rel_paths):
    """Return True when ``current_dir/<rel_paths...>`` is an existing regular file."""
    full_path = join(*rel_paths)
    return os.path.isfile(full_path)

def rmdir(*rel_paths):
    """Recursively delete the directory ``current_dir/<rel_paths...>``.

    Deletion is best-effort: ``ignore_errors=True`` means a missing directory
    or a failure part-way through is not reported.

    Args:
        *rel_paths: path components, relative to ``current_dir``, of the
            directory to remove. At least one non-empty component is required.

    Returns:
        None (the return value of ``shutil.rmtree``).

    Raises:
        ValueError: if no non-empty component is given. Without this guard the
            target would resolve to ``current_dir`` itself and the whole
            working tree would be removed silently.
    """
    if not any(rel_paths):
        raise ValueError(
            'rmdir() needs at least one non-empty path component; '
            'refusing to delete the base directory'
        )
    return shutil.rmtree(join(*rel_paths), ignore_errors=True)

def get_log_runs():
    """Collect every ``(work_dir, run)`` pair found under ``current_dir``.

    A work dir is any directory directly inside ``current_dir``; a run is any
    directory directly inside a work dir. Both counts are printed.

    Returns:
        list of ``(work_dir, run)`` tuples of directory names.
    """
    # List through the listdir() helper so that the listing and the isdir()
    # checks are both anchored at current_dir, even if the process working
    # directory has changed since current_dir was captured.
    work_dirs = [entry for entry in listdir() if isdir(entry)]
    print(f'work dirs: {len(work_dirs)}')
    log_runs = [
        (work_dir, run)
        for work_dir in work_dirs
        for run in listdir(work_dir)
        if isdir(work_dir, run)
    ]
    print(f'log runs: {len(log_runs)}')
    return log_runs

def is_train_run(work_dir, d):
    """Return True when the run directory contains ``vis_data/<d>.json``."""
    log_name = f'{d}.json'
    return isfile(work_dir, d, 'vis_data', log_name)

def is_test_run(work_dir, d):
    """Return True when the run directory contains ``<d>.json`` at its top level."""
    log_name = f'{d}.json'
    return isfile(work_dir, d, log_name)
    

In [3]:
# Scan current_dir for (work_dir, run) directory pairs; the counts are printed.
log_runs = get_log_runs()

work dirs: 4
log runs: 14


In [4]:
# A run "failed to start" when it has neither a training log
# (vis_data/<run>.json) nor a test log (<run>.json). Use the shared
# is_train_run / is_test_run predicates rather than re-building the paths here.
failed_to_start = [
    (work_dir, d)
    for work_dir, d in log_runs
    if not (is_train_run(work_dir, d) or is_test_run(work_dir, d))
]
print(f'failed to start: {len(failed_to_start)}')
# Destructive: each such run directory is removed from disk.
for work_dir, d in failed_to_start:
    rmdir(work_dir, d)

failed to start: 0


In [10]:
log_runs = get_log_runs()
# Some of the remaining runs may have started but never finished. They are
# identified from each run's JSON log: the 'iter' value on the second-to-last
# line is compared with max_iters from the run's config.


work dirs: 4
log runs: 10


In [11]:
# Training runs are the ones whose directory holds vis_data/<run>.json.
train_runs = [run for run in log_runs if is_train_run(*run)]

def load_json(work_dir, d):
    """Load the line-delimited JSON training log of a run.

    Reads ``<work_dir>/<d>/vis_data/<d>.json`` (relative to ``current_dir``)
    and parses every non-blank line as its own JSON document.

    Args:
        work_dir: name of the work directory containing the run.
        d: name of the run directory; also the stem of the log file.

    Returns:
        list of the parsed records, in file order.

    Raises:
        OSError: if the log file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    log_path = join(work_dir, d, 'vis_data', f'{d}.json')
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # text encoding.
    with open(log_path, encoding='utf-8') as f:
        # Blank lines (e.g. a trailing empty line) would make json.loads raise,
        # so they are skipped. Iterating the file avoids a second full copy.
        return [json.loads(line) for line in f if line.strip()]

def completed(work_dir, d):
    """Tell whether a training run reached its configured ``max_iters``.

    The ``'iter'`` field of the second-to-last log record is compared with
    ``train_cfg.max_iters`` from the run's ``vis_data/config.py``.

    Args:
        work_dir: name of the work directory containing the run.
        d: name of the run directory.

    Returns:
        True only when that record has an ``'iter'`` equal to ``max_iters``.
        False when the log holds fewer than two records or the record has no
        ``'iter'`` key.
    """
    data = load_json(work_dir, d)
    # A log with fewer than two records has no second-to-last entry; treat the
    # run as incomplete instead of failing with IndexError.
    if len(data) < 2:
        return False
    iteration = data[-2].get('iter')
    if iteration is None:
        return False
    config = Config.fromfile(join(work_dir, d, 'vis_data', 'config.py'))
    return iteration == config.train_cfg.max_iters

# Keep only the runs that still have a training log on disk.
train_runs = [run for run in train_runs if is_train_run(*run)]

# Every run for which completed() is False is counted and then removed.
failed_to_complete = [run for run in train_runs if not completed(*run)]
print(f'failed to complete: {len(failed_to_complete)}')
for failed_train_run in failed_to_complete:
    # Destructive: deletes the run directory from disk.
    rmdir(*failed_train_run)

failed to complete: 0
