In [2]:
import os 
import shutil
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## CM file storage
format: `<Subunit><aes_number>.cm`
description: P denotes RnasP, L denotes LSU and S denotes SSU

In [16]:
unit_mapping = {"16S": "S", "23S": "L", "bactRNAseP": "P"}

In [15]:
cm_files_location = "./data/all_cms"
os.makedirs(cm_files_location, exist_ok=True)

In [16]:
cm_version = "covariance_models"


base_path = "./cm_builder_storage/data"
for unit in tqdm(os.listdir(base_path)):
    cm_dir = os.path.join(base_path, unit, cm_version)
    for aes in tqdm(os.listdir(cm_dir)):
        if not aes.endswith(".cm"):
            continue
        cm_name = unit_mapping[unit] + aes.replace("aes_", "")
        src_cm = os.path.join(cm_dir, aes)
        
        dst_cm = os.path.join(cm_files_location, cm_name)
        shutil.copyfile(src_cm, dst_cm)
    

100%|██████████| 32/32 [00:00<00:00, 12769.26it/s]
100%|██████████| 129/129 [00:00<00:00, 13904.13it/s]
100%|██████████| 68/68 [00:00<00:00, 13572.51it/s]
100%|██████████| 3/3 [00:00<00:00, 136.61it/s]


In [13]:
aes

'aes_10_1.cm'

## Fasta file Storage
Format: <br>
`> <Species>_<Subunit>_<aes_number>`<br>
`<sequence>`<br>
Not aligned 

In [42]:
fasta_files_location = "./data/all_fasta"
os.makedirs(fasta_files_location, exist_ok=True)

In [43]:
def process_msa_line(line, unit, aes_number, dst, mapping=None):
    
    header, sequence = line.split(" ")
    # _, sp1, _, sp2 = header.split("_")[:4]
    # strain = header.split("_")[-1]
    
    if mapping:
        header = mapping[header].replace(" ", "_").replace("/", "_")
        
    header = header.split("|")[0].replace("Bacteria_", "") + "|" + unit + aes_number
    # remove gaps
    sequence = sequence.replace("-", "")
    content = f"> {header}\n{sequence}"
    filename = f"{header}.fas"
    with open(os.path.join(dst, filename), "w") as file:
        file.writelines(content)

def stockholm_to_sequence(path, unit, aes_number, dst="./all_fasta", mapping=None):
    with open(path, "r") as f:
        for line in f.readlines(): 
            if line.startswith("#") or line.startswith("//") or line.startswith("Structure"):
                continue
            process_msa_line(line.strip(), unit, aes_number, dst, mapping)

In [44]:
base_path = "./cm_builder_storage/data"
for unit in os.listdir(base_path):
    st_dir = os.path.join(base_path, unit, "stockholms")
    
    if unit == "bactRNAseP":
        continue
    
    for aes in os.listdir(st_dir):
        aes_number = aes.split(".")[0].replace("aes_", "")
        st_path = os.path.join(st_dir, aes)
        stockholm_to_sequence(st_path, unit_mapping[unit], aes_number, cm_files_location)

In [45]:
# # header analysis
# path = "/home/sumon/repos/aes_db/cm_builder_storage/results/16S/stockholms/aes_0.stockholm"

# header_list = []
# with open(path, "r") as f:
#         for line in f.readlines(): 
#             if line.startswith("#") or line.startswith("//") or line.startswith("Structure"):
#                 continue
#             header, sequence = line.split(" ")
#             header_list.append(header.split("_")[1])
            
# sorted(header_list);

In [46]:
# mapper 
file_path = "./RF00010.fa"
mapping = {}

with open(file_path, "r") as f:
    for i in f.readlines():
        if not i.startswith(">"):
            continue
        number = i.split(" ")[0].replace(">", "")
        rest = i.split(",")[0]
        replace_list = ["chromosome", ">"+number, "assembly", "genome", "isolate", "genomic", "DNA", "sequence", "complete"]
        replace_list.extend(number.split("/"))
        
        for j in replace_list:
            rest = rest.replace(j, "")
            
        mapping[number] = rest.strip()

In [48]:
mapping

{'CP073754.1/3621246-3620895': 'Methylomonas paludis strain S2AM',
 'CP014476.1/2129425-2129774': 'Methylomonas denitrificans strain FJG1',
 'CP002738.1/5042412-5042062': 'Methylomonas methanica MC09',
 'CP064653.1/3384204-3384553': 'Methylomonas sp. LL1',
 'FO082060.1/4170233-4169869': 'Methylomicrobium alcaliphilum str. 20Z',
 'CP014360.1/4640302-4640652': 'Methylomonas sp. DH-1',
 'FNQP01000011.1/72224-72578': 'Thiothrix caldifontis strain DSM 21228',
 'LUUI01000142.1/50078-49727': 'Methylomonas lenta strain R-45370 contig_73_(1..65710)',
 'CM001475.1/3915427-3915034': 'Methylomicrobium album BG8',
 'CP072793.1/423762-424110': 'Thiothrix unzii strain A1',
 'QFXE01000005.1/641168-640817': 'MAG: endosymbiont of Escarpia spicata  A1462 scaffold1543_len646362_cov108_read99_cluster10',
 'LNYK01000010.1/140655-140293': 'Legionella londiniensis strain ATCC 49505 Llon_ctg010',
 'CP011797.1/2913782-2913435': 'Reinekea forsetii strain Hel1_31_D35',
 'JAQSDF010000005.1/90601-90250': 'MAG: Cand

In [47]:
unit = "bactRNAseP"
st_dir = "./cm_builder_storage/data/bactRNAseP/stockholms"

for aes in os.listdir(st_dir):
    aes_number = aes.split(".")[0].replace("aes_", "")
    st_path = os.path.join(st_dir, aes)
    stockholm_to_sequence(st_path, unit_mapping[unit], aes_number, fasta_files_location, mapping)

In [33]:
st_path

'./cm_builder_storage/data/bactRNAseP/stockholms/aes_1_0.stockholm'

In [18]:
# concat all cms
cm_path = "/home/sumon/repos/aes_db/cm_builder_storage/data/all_cms"
cm_dest_file = "/home/sumon/repos/aes_db/cm_builder_storage/data/all.cm"

with open(cm_dest_file, 'w') as dst_file:
    for cm_file in os.listdir(cm_path):
        cm_full_path = os.path.join(cm_path, cm_file)
        with open(cm_full_path, 'r') as src_file:
            for line in src_file.readlines():
                dst_file.writelines(line)


In [23]:
# concat all fasta
fasta_path = "/home/sumon/workspace/git_repos/aes_db/data/all_fasta"
fasta_dest_file = "/home/sumon/workspace/git_repos/aes_db/data/bac_ribosome.fasta"

with open(fasta_dest_file, 'w') as dst_file:
    for fasta_file in os.listdir(fasta_path):
        fasta_full_path = os.path.join(fasta_path, fasta_file)
        with open(fasta_full_path, 'r') as fasta_file:
            for line in fasta_file.readlines():
                dst_file.writelines(line)
        
        dst_file.writelines("\n\n")


## Augment AESes

In [2]:
import pandas as pd
data_path = "/home/sumon/repos/aes_db/data/aes_defs/TM_RnasP_AES_Defs.csv" 

df = pd.read_csv(data_path, header=None)
df.columns = ["aes", "em", "range"]
df.head()

Unnamed: 0,aes,em,range
0,1,RP_B_TM_RNA,1-11;46-65;217-221;324-346
1,2,RP_B_TM_RNA,222-240;259-273;298-323
2,3,RP_B_TM_RNA,12-45
3,4,RP_B_TM_RNA,66-72;102-111;119-121;202-216
4,5,RP_B_TM_RNA,73-101


In [1]:
ss_cons = "{{{{.{{.{.{.{.{.{..{{{{{{{,<<<<.<<..<.<<__........................................................................................____.....>>.>.>.>.>>>>,,,,AAA,AAAAA[[[.[...---..BBBB-.[.[[.[[................<<<<<............_____................>>>>><<<<<.......................____......>>>>->((.--...-((.(((,,,,,,,..,,,,.,<<.<<<....<<...<..<<<.........................................................................................................................____>>>.>....>..>...>>>-->>,,,,,,,....<<<...<<.<.._.._......................_____.>>>>>>..<<<<.<.<.<.<<.____..>>.>.>.>.>>>-.>.,,................)))--)).))]]]]].]]]],,,<<<<.----..............................--<<<..<.<<----....................................................-<<<<<_bbbb....>>>>>........->>>...>>.>.....----...-.....>...>.>>,,.....................,..,,,.<<<<.<.<.<<____>>>.>..>.>.>.>.,..,,,,,,,,,.}}}}}}}-.................--------------............................................-------aaaaaaaa-..}.-}}}}}}.}}}}::::"

In [12]:

aes_csv_path = "/home/sumon/repos/aes_db/data/aes_defs/TM_RnasP_AES_Defs.csv"
aes_df = pd.read_csv(aes_csv_path, header=None, names=["aes", "middle", "ranges"])

masked_ranges = aes_df[aes_df["aes"] == "mask"]["ranges"].apply(lambda x: list(map(lambda y: list(map(int, y.split("-"))), x.split(";")))).tolist()[0]
masked_ranges


[[43, 130],
 [192, 207],
 [213, 224],
 [230, 245],
 [256, 278],
 [337, 467],
 [471, 479],
 [521, 542],
 [595, 610],
 [643, 672],
 [681, 739],
 [760, 766],
 [807, 827],
 [886, 909],
 [912, 960]]

In [13]:
ss_cons_list = list(ss_cons)
for a, b in masked_ranges[::-1]:
    del ss_cons_list[a:b+1]
    

In [15]:
print("".join(ss_cons_list))

{{{{.{{.{.{.{.{.{..{{{{{{{,<<<<.<<..<.<<__.___.....>>.>.>.>.>>>>,,,,AAA,AAAAA[[[.[...---..BBBB-.[.[[.[[.<<<<.____.>>>><<<<<.___......>>>>->((.--...-((.(((,,,,,,,..,,,,.,<<.<<<....<<.__>..>...>>>-->>,,,,,,,....<<<...<<.<.._.._.____.>>>>>>..<<<<.<.<.<.<<.____..>>.>.>.>.>>>-.>.,,.))--)).))]]]]].]]]],,,<<<<.----.-<<<..<.<<<<<_bbbb....>>>>>.->>>...>>.>.....----...-.....>...>.>>,,...,,,.<<<<.<.<.<<____>>>.>..>.>.>.>.,..,,,,,,,,,.}}}}}}}-.--------aaaaaaaa-..}.-}}}}}}.}}}}::::


In [1]:
st = '''
UACCUGGUUGAUCCUGCCAGUAGCAU(A2M)UGCUUGUCUCAAAGAUUAAGCCAUGCAUGUCUAAGUACGCACGGCCGGU
ACAGUGAAACUGCGAAUGGCUCAUUAAAUCAGUUAUGGU(OMU)CC(PSU)U(OMU)GGUCGCUCGCUCCUCUCCCACUU
GGAUAACUGUGGUA(A2M)UUCUAG(A2M)GCUAAUA(OMC)AUGCCGACGGGCGCUGACCCCCUUCGCGGGGGGGAUGC
GUGCAUUUAUCAGUGGUGACUCUAGAUAACCUCGGGCCGAUCGCACGCCCCCCGUGGCGGCGACGACCCAUUCGAACGUC
UGCCCUAUCAACUUUCGAUGGUAGUCGCCGUGCCUACCAUGGUGACCACGGGUGACGGGGAAUCAGGGUUCGAUUCCGGA
GAGGGAGCCUGAGAAACGGCUACCACAUCCAAGGAAGGCAGCAGGCGCGC(A2M)AAUUACCCACUCCCGACCCGGGGA
(OMG)GUAGUGA(OMC)GAAAAAUAACAAUACAGGACUCUUUCGAGGCCCUGUAAUUGGAAUGAGUCCACUUUAAAUCCU
UUAACGAGGAUCCAUUGGAGGGCAAGUCUGG(PSU)GCCAGCAGCCGCGGUAAUUCCAGCUCCAAUA(OMG)CGUAUAUU
AAAGUUGCUGCAGUU(A2M)AAAAGCUCGUAGUU(OMG)GAUCUUGGGAGCGGGCGUCCCCGCCCUCUCGGCCGGGGCCC
GAAGCGUUUACUUUGAAAAAA(5MU)UAGAGUG(PSU)(PSU)CAAAGCAGGCCCGAGCCGCCUGGAUACCGCAGCUAGG
AAUAAUGGAAUAGGACCGCGGUUCUAUUUUGUUGGUUUUCGGAACUGAGGCCAUGAUUAAGAGGGACGGCCGGGGGCAUU
CGUAUUGCGCCGCUAGAGGUGAAAUUCUUGGACCGGCGCAAGACGGACCAGAGCGAAAGCAUUUGCCAAGAAUGUUUUCA
UUAAUCAAGA(A2M)CGAAAGUCGGAGGUUCGAAGACGAUCAGAUACCGUCGUAGUUCCGACCA(PSU)AAACGAUGCCG
ACCGGCGAUGCGGCGGCGUUAUUCCCAUGACCCGCCGGGCAGCUUCCGGGAAACCAAAGUCUUUGGGUUCCGGGGGGAGU
AUGGUUGCAAAGCUGAAACUUAAAGGAAUUGACGGAAGGGCACCAC(JMH)AGGAGUGGAGCCUGCGGCUUAAU(PSU)U
GAC(B8N)CAACACGGGAAACCUCACCCGGCCCGGACACGGACAGGAUUGACAGAUUGAUAGCUCUUUCUCGAUUCCGUG
GGUGGUGGUGCAUGGC(4AC)GUUCUUAGUUGGUGGAGCGAUUUGUCUGGUUAAUUC(5MC)GAUAACGAACGAGACUCU
GGCAUGCUAACUAGUUACGCGACCCCCGAGCGGUCGGCGUCCCCCAACUUCUUAGAGGGACAAGUGGCGUUCAGCCACCC
GAGAUUGAGCAAUAACAGGUCUGUGAUGCCCUUAGAUGUCCGGGGCUGCACGCGCGCUACACUGACUGGCUCAGCGUGUG
CCUACCCUACGCCGGCAGGCGCGGGUAACCCGUUGAACCCCAUUCGUGAUGGGGAUCGGGGAUUGCAAUUAUUCCCCAUG
AACGAGGAAUUCCCAGUAAGUGCGGGUCAUAAGCUUGCGUUGAUU(A2M)AGUCCCUGCCCUUUGUACACACCG(OMC)C
CGUCG(OMC)UACUACCGAUUGGAUGGUUUAGUGAGGCCCUCGGAUCGGCCCCGCCGGGGUCGGCCCACGGCCCUGGCGG
AGCGCUGAGAAGACGGUCGAACUUG(M7A)CUAUCUAGAGGAAGUAAAAGUCG(UR3)A(6MZ)CAAGGUUUC(4AC)GU
AGGUG(MA6)(MA6)CCUGCGGAAGGAUCAUUA
'''.strip().replace("\n", "")

In [8]:
import re
new_st = ""
for i in st.split("("):
    new_st += re.sub("(.+\))", "M", i)

In [10]:
print(new_st)

UACCUGGUUGAUCCUGCCAGUAGCAUMUGCUUGUCUCAAAGAUUAAGCCAUGCAUGUCUAAGUACGCACGGCCGGUACAGUGAAACUGCGAAUGGCUCAUUAAAUCAGUUAUGGUMCCMUMGGUCGCUCGCUCCUCUCCCACUUGGAUAACUGUGGUAMUUCUAGMGCUAAUAMAUGCCGACGGGCGCUGACCCCCUUCGCGGGGGGGAUGCGUGCAUUUAUCAGUGGUGACUCUAGAUAACCUCGGGCCGAUCGCACGCCCCCCGUGGCGGCGACGACCCAUUCGAACGUCUGCCCUAUCAACUUUCGAUGGUAGUCGCCGUGCCUACCAUGGUGACCACGGGUGACGGGGAAUCAGGGUUCGAUUCCGGAGAGGGAGCCUGAGAAACGGCUACCACAUCCAAGGAAGGCAGCAGGCGCGCMAAUUACCCACUCCCGACCCGGGGAMGUAGUGAMGAAAAAUAACAAUACAGGACUCUUUCGAGGCCCUGUAAUUGGAAUGAGUCCACUUUAAAUCCUUUAACGAGGAUCCAUUGGAGGGCAAGUCUGGMGCCAGCAGCCGCGGUAAUUCCAGCUCCAAUAMCGUAUAUUAAAGUUGCUGCAGUUMAAAAGCUCGUAGUUMGAUCUUGGGAGCGGGCGUCCCCGCCCUCUCGGCCGGGGCCCGAAGCGUUUACUUUGAAAAAAMUAGAGUGMMCAAAGCAGGCCCGAGCCGCCUGGAUACCGCAGCUAGGAAUAAUGGAAUAGGACCGCGGUUCUAUUUUGUUGGUUUUCGGAACUGAGGCCAUGAUUAAGAGGGACGGCCGGGGGCAUUCGUAUUGCGCCGCUAGAGGUGAAAUUCUUGGACCGGCGCAAGACGGACCAGAGCGAAAGCAUUUGCCAAGAAUGUUUUCAUUAAUCAAGAMCGAAAGUCGGAGGUUCGAAGACGAUCAGAUACCGUCGUAGUUCCGACCAMAAACGAUGCCGACCGGCGAUGCGGCGGCGUUAUUCCCAUGACCCGCCG

In [11]:
len(new_st)

1740

In [12]:
len('UACCUGGUUGAUCCUGCCAGUAGCAUAUGCUUGUCUCAAAGAUUAAGCCAUGCAUGUCUAAGUACGCACGGCCGGUACAGUGAAACUGCGAAUGGCUCAUUAAAUCAGUUAUGGUUCCUUUGGUCGCUCGCUCCUCUCCCACUUGGAUAACUGUGGUAAUUCUAGAGCUAAUACAUGCCGACGGGCGCUGACCCCCUUCGCGGGGGGGAUGCGUGCAUUUAUCAGUGGUGACUCUAGAUAACCUCGGGCCGAUCGCACGCCCCCCGUGGCGGCGACGACCCAUUCGAACGUCUGCCCUAUCAACUUUCGAUGGUAGUCGCCGUGCCUACCAUGGUGACCACGGGUGACGGGGAAUCAGGGUUCGAUUCCGGAGAGGGAGCCUGAGAAACGGCUACCACAUCCAAGGAAGGCAGCAGGCGCGCAAAUUACCCACUCCCGACCCGGGGAGGUAGUGACGAAAAAUAACAAUACAGGACUCUUUCGAGGCCCUGUAAUUGGAAUGAGUCCACUUUAAAUCCUUUAACGAGGAUCCAUUGGAGGGCAAGUCUGGUGCCAGCAGCCGCGGUAAUUCCAGCUCCAAUAGCGUAUAUUAAAGUUGCUGCAGUUAAAAAGCUCGUAGUUGGAUCUUGGGAGCGGGCGUCCCCGCCCUCUCGGCCGGGGCCCGAAGCGUUUACUUUGAAAAAAUUAGAGUGUUCAAAGCAGGCCCGAGCCGCCUGGAUACCGCAGCUAGGAAUAAUGGAAUAGGACCGCGGUUCUAUUUUGUUGGUUUUCGGAACUGAGGCCAUGAUUAAGAGGGACGGCCGGGGGCAUUCGUAUUGCGCCGCUAGAGGUGAAAUUCUUGGACCGGCGCAAGACGGACCAGAGCGAAAGCAUUUGCCAAGAAUGUUUUCAUUAAUCAAGAACGAAAGUCGGAGGUUCGAAGACGAUCAGAUACCGUCGUAGUUCCGACCAUAAACGAUGCCGACCGGCGAUGCGGCGGCGUUAUUCCCAUGACCCGCCGGGCAGCUUCCGGGAAACCAAAGUCUUUGGGUUCCGGGGGGAGUAUGGUUGCAAAGCUGAAACUUAAAGGAAUUGACGGAAGGGCACCACCAGGAGUGGAGCCUGCGGCUUAAUUUGACXCAACACGGGAAACCUCACCCGGCCCGGACACGGACAGGAUUGACAGAUUGAUAGCUCUUUCUCGAUUCCGUGGGUGGUGGUGCAUGGCNGUUCUUAGUUGGUGGAGCGAUUUGUCUGGUUAAUUCCGAUAACGAACGAGACUCUGGCAUGCUAACUAGUUACGCGACCCCCGAGCGGUCGGCGUCCCCCAACUUCUUAGAGGGACAAGUGGCGUUCAGCCACCCGAGAUUGAGCAAUAACAGGUCUGUGAUGCCCUUAGAUGUCCGGGGCUGCACGCGCGCUACACUGACUGGCUCAGCGUGUGCCUACCCUACGCCGGCAGGCGCGGGUAACCCGUUGAACCCCAUUCGUGAUGGGGAUCGGGGAUUGCAAUUAUUCCCCAUGAACGAGGAAUUCCCAGUAAGUGCGGGUCAUAAGCUUGCGUUGAUUAAGUCCCUGCCCUUUGUACACACCGCCCGUCGCUACUACCGAUUGGAUGGUUUAGUGAGGCCCUCGGAUCGGCCCCGCCGGGGUCGGCCCACGGCCCUGGCGGAGCGCUGAGAAGACGGUCGAACUUGACUAUCUAGAGGAAGUAAAAGUCGUAACAAGGUUUCNGUAGGUGAACCUGCGGAAGGAUCAUUA')

1740

In [None]:
from Bio import AlignIO
import os
base = "/home/sumon/workspace/git_repos/aes_db/cm_builder_storage/data/23S/stockholms"
align1 = AlignIO.read(os.path.join(base, "aes_1_0.stockholm"), "stockholm")
align2 = AlignIO.read(os.path.join(base, "aes_39_0.stockholm"), "stockholm")