# CreateControls
### Creates the control datasets
### These datasets are generated by randomly sampling from the human genome.
### Three constraints can be applied: length must be similar to that of enhancers (L), GC-content must be similar to that of enhancers (G), and repeats may be either kept (R) or excluded (NR).
### Four datasets are generated: LR (length-keep_repeats), LNR (length-no_repeats), LGR (length-GC-keep_repeats), and LGNR (length-GC-no_repeats).

## Imports

In [None]:
from RandomHuman import RandomHuman
import os

## Files

In [None]:
# Input files
# Every input lives under data_dir; the paths are relative, so they resolve
# against the working directory the notebook is run from.
data_dir = "../Data"
gnm_path = f"{data_dir}/HG38/HG38_main.fa"
fantom_path = f"{data_dir}/FANTOM/F5.hg38.enhancers.expression.usage.matrix"
exc_dir = f"{data_dir}/RED_HG38/"

# Fail fast if any input is missing, before the generator is constructed.
# An explicit raise is used rather than `assert` so the check is not stripped
# when Python runs with -O.
for path in [data_dir, gnm_path, fantom_path, exc_dir]:
    if not os.path.exists(path):
        raise FileNotFoundError(path)

In [None]:
# Output files
# One FASTA output per control dataset, each in its own sub-directory of
# dataset_dir.
dataset_dir = f"{data_dir}/Datasets"
out_path_lr = f"{dataset_dir}/LR/lr.fa"
out_path_lnr = f"{dataset_dir}/LNR/lnr.fa"
out_path_lgr = f"{dataset_dir}/LGR/lgr.fa"
out_path_lgnr = f"{dataset_dir}/LGNR/lgnr.fa"

# The directory that should contain each path must already exist; nothing is
# created here. For dataset_dir this checks its parent (data_dir); dataset_dir
# itself is covered implicitly by the checks on the output sub-directories.
# An explicit raise is used rather than `assert` so the check is not stripped
# when Python runs with -O.
for path in [dataset_dir, out_path_lr, out_path_lnr, out_path_lgr, out_path_lgnr]:
    parent_dir = os.path.dirname(path)
    if not os.path.exists(parent_dir):
        raise FileNotFoundError(parent_dir)

## Generate controls

In [None]:
rh = RandomHuman(gnm_path, fantom_path, exc_dir)

# One row per control dataset: (exclude_repeats, control_gc, out_file).
# The rows are processed top to bottom, i.e. LR, LNR, LGR, LGNR.
control_specs = (
    (False, False, out_path_lr),    # LR:   repeats kept, GC not controlled
    (True, False, out_path_lnr),    # LNR:  repeats excluded, GC not controlled
    (False, True, out_path_lgr),    # LGR:  repeats kept, GC controlled
    (True, True, out_path_lgnr),    # LGNR: repeats excluded, GC controlled
)

for skip_repeats, match_gc, fasta_path in control_specs:
    rh.make_random_data(
        exclude_repeats=skip_repeats,
        control_gc=match_gc,
        out_file=fasta_path,
    )