In [4]:
import os 
import shutil

## CM file storage
format: `<Subunit><aes_number>.cm`
description: P denotes RNase P, L denotes LSU (23S) and S denotes SSU (16S)

In [5]:
# Flat destination directory that collects every covariance-model (.cm) file.
cm_files_location = "/home/sumon/repos/aes_db/data/all_cms"
# Create it together with any missing parents; an already existing directory is fine.
os.makedirs(name=cm_files_location, exist_ok=True)

In [6]:
# Copy each unit's covariance models into the flat CM directory, renaming them to
# "<subunit letter><text after the last underscore>" (a name such as "aes_0.cm"
# under 16S would become "S0.cm").
unit_mapping = {"16S": "S", "23S": "L", "bactRNAseP": "P"}
base_path = "/home/sumon/repos/aes_db/cm_builder_storage/results"
for unit in os.listdir(base_path):
    cm_dir = os.path.join(base_path, unit, "covariance_models")
    for aes in os.listdir(cm_dir):
        # Only covariance-model files are copied; everything else is ignored.
        if aes.endswith(".cm"):
            cm_name = unit_mapping[unit] + aes.rsplit("_", 1)[-1]
            shutil.copyfile(
                os.path.join(cm_dir, aes),
                os.path.join(cm_files_location, cm_name),
            )
    

## FASTA file storage
Format: <br>
`> <Species>|<Subunit><aes_number>`<br>
`<sequence>`<br>
Not aligned 

In [91]:
# Output directory for the per-sequence FASTA files.
# NOTE: this rebinds `cm_files_location`; the export cells further down pass this
# variable as their destination directory, so the name is kept as-is.
cm_files_location = "/home/sumon/repos/aes_db/data/all_fasta"
os.makedirs(name=cm_files_location, exist_ok=True)

In [129]:
def process_msa_line(line, unit, aes_number, dst="./all_fasta", mapping=None):
    """Write one alignment row to its own single-record FASTA file.

    The row name is optionally translated through ``mapping``, cut at the first
    "|", stripped of the "Bacteria_" marker and suffixed with
    ``"|" + unit + aes_number``. The resulting header is used both as the FASTA
    header (written as ``"> <header>"``) and as the file name (``<header>.fas``).

    Args:
        line: One alignment row: a name and a sequence separated by whitespace.
        unit: Subunit code appended to the header.
        aes_number: Identifier (string) appended directly after ``unit``.
        dst: Existing directory that receives the output file.
        mapping: Optional dict translating the raw row name into a readable
            name; spaces and "/" in the mapped name are replaced by "_".

    Raises:
        ValueError: If ``line`` does not consist of exactly a name and a sequence.
        KeyError: If ``mapping`` is given but has no entry for the row name.
    """
    # Split on any run of whitespace: alignment rows are commonly padded with
    # several spaces (or tabs) between name and sequence, which a literal
    # single-space split cannot unpack into two fields.
    header, sequence = line.split()

    if mapping:
        header = mapping[header].replace(" ", "_").replace("/", "_")

    header = header.split("|")[0].replace("Bacteria_", "") + "|" + unit + aes_number

    content = f"> {header}\n{sequence}"
    filename = f"{header}.fas"

    with open(os.path.join(dst, filename), "w") as file:
        # `content` is one string, so write() is the right call; writelines()
        # would iterate over it character by character.
        file.write(content)


In [130]:
def stockholm_to_sequence(path, unit, aes_number, dst, mapping=None):
    """Export every sequence row of a Stockholm-style alignment file as FASTA.

    Lines starting with "#", "//" or "Structure" are treated as markup and
    skipped, as are blank lines. Every remaining line is stripped and handed to
    ``process_msa_line``, which writes one file per row into ``dst``.

    Args:
        path: Path of the alignment file to read.
        unit: Subunit code forwarded to ``process_msa_line``.
        aes_number: Identifier forwarded to ``process_msa_line``.
        dst: Output directory forwarded to ``process_msa_line``.
        mapping: Optional name-translation dict forwarded to ``process_msa_line``.
    """
    skipped_prefixes = ("#", "//", "Structure")
    with open(path, "r") as f:
        # Iterate lazily instead of loading the whole file with readlines().
        for line in f:
            row = line.strip()
            # Blank lines carry no row and would break the name/sequence
            # unpacking in process_msa_line, so drop them together with markup.
            if not row or line.startswith(skipped_prefixes):
                continue
            process_msa_line(row, unit, aes_number, dst, mapping)
        

In [94]:
# Export the alignment rows of every unit except RNase P as FASTA files.
base_path = "/home/sumon/repos/aes_db/cm_builder_storage/results"
for unit in os.listdir(base_path):
    st_dir = os.path.join(base_path, unit, "stockholms")

    # RNase P is exported by a separate cell that also passes a name mapping.
    if unit != "bactRNAseP":
        for aes in os.listdir(st_dir):
            # Text between the last "_" and the first "." of the file name.
            aes_number = aes.partition(".")[0].rsplit("_", 1)[-1]
            stockholm_to_sequence(
                os.path.join(st_dir, aes),
                unit_mapping[unit],
                aes_number,
                cm_files_location,
            )

In [78]:
# header analysis: collect the second "_"-separated field of every row name
path = "/home/sumon/repos/aes_db/cm_builder_storage/results/16S/stockholms/aes_0.stockholm"

header_list = []
with open(path, "r") as f:
    for line in f:
        # Markup lines carry no row name.
        if line.startswith(("#", "//", "Structure")):
            continue
        header, sequence = line.split(" ")
        header_list.append(header.split("_")[1])

# Trailing ";" suppresses the cell output.
sorted(header_list);

In [120]:
# mapper: first token of each FASTA header line -> cleaned-up description
file_path = "/home/sumon/repos/aes_db/RF00010.fa"
mapping = {}

with open(file_path, "r") as f:
    for i in f:
        # Only header lines contribute to the mapping.
        if i.startswith(">"):
            number = i.split(" ")[0].replace(">", "")
            # Description = everything before the first comma.
            rest = i.split(",")[0]
            # Substrings removed from the description: boilerplate words, the
            # ">"-prefixed first token, and each "/"-separated part of that token.
            replace_list = [
                "chromosome",
                ">" + number,
                "assembly",
                "genome",
                "isolate",
                "genomic",
                "DNA",
                "sequence",
                "complete",
            ] + number.split("/")

            for j in replace_list:
                rest = rest.replace(j, "")

            mapping[number] = rest.strip()

In [113]:
# NOTE(review): scratch expression; it relies on `rest` and `j` being left over from
# the mapper cell's loops, and the output shown below appears to come from an earlier
# run of that cell (execution count 113 precedes 120). Confirm before relying on it.
rest.replace(j, "")

'>AE008692.2/ Zymomonas mobilis subsp. mobilis ZM4'

In [131]:
# Export the RNase P alignment rows; row names are translated through `mapping`.
unit = "bactRNAseP"
st_dir = "/home/sumon/repos/aes_db/cm_builder_storage/results/bactRNAseP/stockholms"

for aes in os.listdir(st_dir):
    # Text between the last "_" and the first "." of the file name.
    aes_number = aes.partition(".")[0].rsplit("_", 1)[-1]
    stockholm_to_sequence(
        os.path.join(st_dir, aes),
        unit_mapping[unit],
        aes_number,
        cm_files_location,
        mapping,
    )