# SplitControlsToSets
### Splits each control dataset into training, validation, and test sets

## Imports

In [1]:
from Bio import SeqIO
import os

## Files

In [None]:
# Root directory that holds one sub-directory per control dataset.
data_dir = "../Data/Datasets/"

# One directory per control dataset. os.path.join is used instead of string
# formatting because data_dir already ends in a separator (formatting would
# yield ".../Datasets//LR/"); the trailing "" component keeps the trailing
# separator on each directory path.
lr_path = os.path.join(data_dir, "LR", "")
lnr_path = os.path.join(data_dir, "LNR", "")
lgr_path = os.path.join(data_dir, "LGR", "")
lgnr_path = os.path.join(data_dir, "LGNR", "")

# Parallel lists: file_list[i] is the FASTA file expected inside path_list[i].
path_list = [lr_path, lnr_path, lgr_path, lgnr_path]
file_list = ["lr.fa", "lnr.fa", "lgr.fa", "lgnr.fa"]

# Fail early if a dataset directory is missing. An explicit raise is used
# rather than assert so the check still runs when Python is started with -O.
for path in path_list:
    if not os.path.exists(path):
        raise FileNotFoundError(f"Path {path} does not exist")

## Parameters
### MAKE SURE THESE ARE THE SAME AS THE ONES IN FantomData.ipynb

In [3]:
# Fractions of each (length-filtered) dataset assigned to the train,
# validation and test splits, in that order.
train_ratio, valid_ratio, test_ratio = 0.7, 0.15, 0.15

In [None]:
# Maximum sequence length (in residues/bases) a record may have to be kept;
# longer records are dropped before the datasets are split.
max_len: int = 600

## Split the control datasets

In [None]:
# For every control dataset: drop over-length sequences, then split the
# remaining records (in file order, without shuffling) into train, valid and
# test FASTA files written into the same directory as the input file.
for path, file_name in zip(path_list, file_list):

    # Input FASTA and the three outputs share one directory. os.path.join
    # avoids doubling the separator when `path` already ends with one.
    fasta_path = os.path.join(path, file_name)
    train_path = os.path.join(path, f"train_{file_name}")
    valid_path = os.path.join(path, f"valid_{file_name}")
    test_path = os.path.join(path, f"test_{file_name}")

    # Filter while parsing so the unfiltered record list is never held in
    # memory; len(rec.seq) measures the sequence without copying it to a str.
    keep_list = [
        rec
        for rec in SeqIO.parse(fasta_path, 'fasta')
        if len(rec.seq) <= max_len
    ]

    # Slice boundaries as record counts (int() truncates toward zero).
    # The test split takes whatever remains after the validation cutoff.
    n_kept = len(keep_list)
    train_cutoff = int(n_kept * train_ratio)
    valid_cutoff = int(n_kept * (train_ratio + valid_ratio))

    # Cut the kept records into three contiguous, non-overlapping slices.
    train_list = keep_list[:train_cutoff]
    valid_list = keep_list[train_cutoff:valid_cutoff]
    test_list = keep_list[valid_cutoff:]

    # Write each split as FASTA (existing files are overwritten).
    SeqIO.write(train_list, train_path, 'fasta')
    SeqIO.write(valid_list, valid_path, 'fasta')
    SeqIO.write(test_list, test_path, 'fasta')