# The purpose of this notebook is to generate fasta files for use with Red
### We first remove the alternate loci. Then we create two directories containing one FASTA file each for the main chromosomes and for the scaffolds. Then we create a single multi-FASTA file for just the main chromosomes (no mitochondrion).

## Imports

In [1]:
import copy
import os

from Bio import SeqIO

## Files

In [2]:
# Input: path of the genome FASTA file that is parsed further down.
gnm_file = "/home/transposons/Data/CRM/HG38/HG38.fa"

In [6]:
# Output locations; every path is rooted at data_dir.
data_dir = "../Data/HG38"

chromosome_path = f"{data_dir}/Chromosomes"  # directory: one FASTA file per main chromosome
scaffold_path = f"{data_dir}/Scaffolds"  # directory: one FASTA file per scaffold
main_path = f"{data_dir}/HG38_main.fa"  # file: all main chromosomes in one multi-FASTA

In [7]:
# Only the source genome has to exist before the notebook runs.
# main_path, scaffold_path and chromosome_path are outputs written by later
# cells, so requiring them to exist already would make a first run fail;
# the directories are created here instead (a no-op when already present).
if not os.path.exists(gnm_file):
    raise FileNotFoundError(f'{gnm_file} doesn\'t exist')

# `or "."` covers a main_path without a directory component.
os.makedirs(os.path.dirname(main_path) or ".", exist_ok=True)
os.makedirs(scaffold_path, exist_ok=True)
os.makedirs(chromosome_path, exist_ok=True)

## Loading Genome Multi-Fasta File

In [8]:
# Parse the genome FASTA and keep every record in memory as a list,
# because the records are iterated again in the cells below.
rec_list = [*SeqIO.parse(gnm_file, "fasta")]

## Parsing Genome

In [9]:
# Drop alternate loci and give every remaining record a short id.
#
# The id is derived from the 5th whitespace-separated token of the FASTA
# description line:
#   "<label>,"  -> chr<label>
#   "genomic"   -> genomic_<n>       (n counts these records, starting at 0)
#   "<label>"   -> chr<label>_<n>    (label is a key of chr_dict; n counts per label)
#   "complete"  -> chrM              (presumably the mitochondrial genome -- confirm)
# Any other header raises RuntimeError so it cannot be mis-labelled silently.
#
# The records in rec_list are left untouched: each kept record is a shallow
# copy (the sequence itself is shared, not duplicated). This keeps the cell
# re-runnable -- blanking the description of the loaded records in place
# would leave nothing to parse on a second run.
chr_dict = {str(x): 0 for x in list(range(1, 23)) + ['X', 'Y']}  # key -> value = chr -> counter
unplaced_counter = 0
final_list = []
for rec in rec_list:
    token_list = rec.description.split()

    if 'alternate' in token_list:
        continue

    # Too-short headers get the same explicit error as any other
    # unrecognised pattern instead of an IndexError.
    if len(token_list) <= 4:
        raise RuntimeError(f"Unknown pattern: {token_list}")

    label = token_list[4]
    if label.endswith(','):
        # Only a trailing comma is stripped; that is the only case in which
        # removing the last character is meaningful.
        new_id = f"chr{label[:-1]}"
    elif label == 'genomic':
        new_id = f"genomic_{unplaced_counter}"
        unplaced_counter += 1
    elif label in chr_dict:
        new_id = f"chr{label}_{chr_dict[label]}"
        chr_dict[label] += 1
    elif label == "complete":
        new_id = "chrM"
    else:
        raise RuntimeError(f"Unknown pattern: {token_list}")

    renamed = copy.copy(rec)
    renamed.id = new_id
    renamed.description = ''  # so the FASTA header written later is just the id
    final_list.append(renamed)

## Saving

In [None]:
# Partition the kept records by id: ids without an underscore go to
# main_list, ids with an underscore go to scaffold_list, and "chrM" is
# left out of both.
main_list, scaffold_list = [], []
for rec in final_list:
    if rec.id == "chrM":
        continue
    (scaffold_list if "_" in rec.id else main_list).append(rec)

### Write main chromosomes to one file

In [11]:
# Write all records of main_list into the single file at main_path;
# the value returned by SeqIO.write is shown as the cell output.
SeqIO.write(sequences=main_list, handle=main_path, format='fasta')

### Writing scaffolds to separate files

In [10]:
# One FASTA file per scaffold record, named after the record id.
for rec in scaffold_list:
    chr_file_name = f"{scaffold_path}/{rec.id}.fa"
    SeqIO.write(sequences=rec, handle=chr_file_name, format='fasta')

### Write main chromosomes to separate files

In [None]:
# One FASTA file per main-chromosome record, named after the record id.
# SeqIO.write requires the output format as its third argument; without it
# the call raises TypeError before anything is written.
for rec in main_list:
    SeqIO.write(rec, f"{chromosome_path}/{rec.id}.fa", 'fasta')