In [1]:
import os

import pandas as pd

In [2]:
# Folder whose `ls` listing is inspected below to compare expected vs. current output size.
DATASET_FOLDER = "/mnt/datasets/DeepCT/dataset_data/Biox_et_al/complete_clique1_with0/samples"
# Sub-folder holding the `<chrom>_writer_log.txt` files read for the progress estimate.
LOG_FOLDER = DATASET_FOLDER + "/logs"
# Tab-separated samples file; note it is opened relative to the current working directory.
SAMPLES_FILE = "complete_clique1_with0_samples_200.bed"
# Chromosome whose writer log drives the progress / ETA estimate.
LARGEST_CHROM = "chr2"
# Multiplier applied to the per-chromosome sample count to get the expected number of writes.
N_TRACKS = 3026

# PID passed to `ps` to obtain the elapsed run time of the job being monitored.
MAIN_PROCESS_PID = 27573

In [3]:
# `ls -lah` prints a "total" summary line first and then, because of `-a`, the
# "." and ".." entries; splitting on '\n' also leaves one empty string after the
# final newline. Slicing [3:-1] therefore keeps only the real directory entries.
ls_dir = os.popen(f'ls -lah {DATASET_FOLDER}').read().split('\n')
chrom_lines = ls_dir[3:-1]

# Factors converting the unit suffixes printed by `ls -h` into gibibytes.
unit_to_gib = {'K': 1 / 1024 ** 2, 'M': 1 / 1024, 'G': 1.0, 'T': 1024.0}
# Long-format fields are: mode, link count, owner, group, size, date..., name.
size_field = 4

sizes = []
for line in chrom_lines:
    fields = line.split()
    if len(fields) <= size_field:
        # Not a regular long-format entry; nothing to parse.
        continue
    # Read only the size column: scanning every token would also pick up owner,
    # group or file names that happen to end in a unit letter.
    size = fields[size_field]
    if size[-1] in unit_to_gib:
        sizes.append(float(size[:-1]) * unit_to_gib[size[-1]])
    else:
        # No suffix: `ls -h` printed a plain byte count.
        sizes.append(float(size) / 1024 ** 3)
print(f"Total expected output size: {sum(sizes):.2f}G")
# The last token of the "total" line is the space currently allocated on disk.
print(f"Current total size: {ls_dir[0].split()[-1]}")

Total expected output size: 26.67G
Current total size: 1.7G


In [4]:
# The samples file is tab-separated and has no header row, so the column
# labels are attached explicitly after loading.
sample_columns = ['chrom', 'start', 'end', 'index']
samples = pd.read_csv(SAMPLES_FILE, sep='\t', header=None)
samples.columns = sample_columns

In [5]:
# Expected number of writes for the largest chromosome: its sample count
# multiplied by the number of tracks.
samples_per_chrom = samples.groupby('chrom')['start'].count()
total_largest_chrom_writes = samples_per_chrom[LARGEST_CHROM] * N_TRACKS

In [6]:
def time_str_to_secs(time):
    """Convert an elapsed-time string into a whole number of seconds.

    The string consists of colon-separated integer fields, optionally preceded
    by a day count and a dash (for example ``"1-02:03:04"``). The right-most
    field is seconds and every field to its left is worth 60 times more.

    Args:
        time: Elapsed-time string such as ``"07:15"``, ``"03:08:13"`` or
            ``"2-00:00:01"``.

    Returns:
        The duration in seconds as an ``int``.

    Raises:
        ValueError: If a field is not an integer or the string contains more
            than one ``'-'``.
    """
    if '-' in time:
        day_part, clock_part = time.split('-')
    else:
        day_part, clock_part = 0, time

    day_secs = int(day_part) * 24 * 60 * 60

    # Accumulate the clock fields left to right in base 60.
    clock_secs = 0
    for field in clock_part.split(':'):
        clock_secs = clock_secs * 60 + int(field)

    return day_secs + clock_secs

In [7]:
# Load the writer log of the largest chromosome (tab-separated, no header row).
log = pd.read_csv(os.path.join(LOG_FOLDER, f'{LARGEST_CHROM}_writer_log.txt'), sep='\t', header=None)
log.columns = ['mode', 'chrom', 'sample_idx', 'track_idx', 'time']
# Rows whose mode is 'w' are counted as completed writes.
n_written = log[log['mode'] == 'w'].sample_idx.count()
share_written = n_written / total_largest_chrom_writes

# `ps -o etime` prints a header line followed by the elapsed time. When the
# process does not exist only the header comes back, so fail with a clear
# message instead of trying to parse the header as a duration.
proc_time = os.popen(f'ps -p {MAIN_PROCESS_PID} -o etime').read()
ps_fields = proc_time.split()
if len(ps_fields) < 2:
    raise RuntimeError(
        f"Process {MAIN_PROCESS_PID} not found: `ps` returned no elapsed time"
    )
time = ps_fields[-1]
secs_passed = time_str_to_secs(time)
# Without any completed write there is nothing to extrapolate from; report an
# infinite estimate rather than dividing by zero.
total_secs = secs_passed / share_written if n_written > 0 else float('inf')

print(f"Progress: {share_written * 100:2.2f}% ({n_written}/{total_largest_chrom_writes} tracks processed)")
print(f"Time passed: {secs_passed / 60 / 60:.3f}h, estimated total: {total_secs / 60 / 60:.3f}h")

Progress: 3.21% (18480000/575678344 tracks processed)
Time passed: 3.137h, estimated total: 97.712h


In [20]:
# Drop the reference to the writer-log DataFrame so its memory can be reclaimed.
del log

In [11]:
# Display the sample intervals located on chr16.
samples.loc[samples['chrom'] == 'chr16']

Unnamed: 0,chrom,start,end
13400,chr16,65473,196545
13401,chr16,130946,262018
13402,chr16,196419,327491
13403,chr16,261892,392964
13404,chr16,327365,458437
...,...,...,...
14589,chr16,89894429,90025501
14590,chr16,89959902,90090974
14591,chr16,90025375,90156447
14592,chr16,90090848,90221920
