In [None]:
%load_ext autoreload
%autoreload 2

DATA PROCESSING 

In [None]:
from Bio import SeqIO
from Bio.PDB.PDBParser import PDBParser
from pathlib import Path
import numpy as np
import pandas as pd
import torch

In [None]:
# Load the 7K00 23S rRNA reference data and derive: the parsed PDB structure,
# the reference sequence, a boolean mask over its match columns, and the
# distance map restricted to the masked positions (also saved to disk).
basedir = Path('.').resolve()

# Derived artifacts are written here; create the directory up front so the
# np.save call below does not fail when it does not exist yet.
processed_dir = basedir / 'data/rna/processed_for_ml'
processed_dir.mkdir(parents=True, exist_ok=True)

pdb_file_path = (basedir / 'data/rna/7K00_23S.pdb').as_posix()
parser = PDBParser(PERMISSIVE=1)
struc = parser.get_structure('23S', pdb_file_path)

wt_seq = str(SeqIO.read(basedir / 'data/rna/7K00_sequence.fasta', 'fasta').seq)

# Keep every character that upper() leaves unchanged: uppercase letters and
# case-less symbols such as '-'. Lowercase characters are dropped
# (presumably alignment insert states -- TODO confirm the file convention).
wt_seq_match = np.asarray([nt for nt in wt_seq if nt == nt.upper()])
# Positions of the kept characters within the full reference sequence.
res_idx_match = np.asarray([i for i, nt in enumerate(wt_seq) if nt == nt.upper()])
# mask is for Ecoli match states (i.e. match states that are not deleted in the E-coli sequence)
# dtype=bool keeps the mask usable as a boolean index even when it is empty.
mask = np.asarray([(x != x.lower()) and (x != '-') for x in wt_seq_match], dtype=bool)

dist_map_full = pd.read_csv(basedir / 'data/rna/7K00_distance_map.csv', header=None).values
# Boolean indexing needs one mask entry per row and per column; fail with a
# descriptive message instead of an opaque indexing error.
if dist_map_full.shape != (mask.size, mask.size):
    raise ValueError(
        f'distance map has shape {dist_map_full.shape}, expected '
        f'({mask.size}, {mask.size}) to match the reference match columns'
    )
dist_map_masked = dist_map_full[mask, :][:, mask]
dist_map = torch.tensor(dist_map_masked)
# Save the NumPy array directly; no need to round-trip through the tensor.
np.save(processed_dir / 'distance_map.npy', dist_map_masked)

In [None]:
# Restrict every aligned 23S sequence to the masked reference columns, replace
# any symbol outside the allowed alphabet with 'X', and write one sequence per
# line to the processed alignment file.
allowed_chars = {'A', 'U', 'C', 'G', '-'}

MSA = []
for record in SeqIO.parse(basedir / 'data/rna/23S_rRNA_aligned_7K00.fa', 'fasta'):
    # Boolean indexing keeps only the columns selected by `mask`.
    kept_columns = np.asarray(list(str(record.seq)))[mask].tolist()
    cleaned = [nt if nt in allowed_chars else 'X' for nt in kept_columns]
    MSA.append(''.join(cleaned))

with open(basedir / 'data/rna/processed_for_ml' / '23S_alignment.txt', 'w') as alignment_out:
    alignment_out.writelines(f'{seq}\n' for seq in MSA)

In [None]:
# Read the processed alignment back from disk, integer-encode it, and draw the
# row permutation used for the train/validation split.
basedir = Path('.').resolve()
processed_dir = basedir / 'data/rna/processed_for_ml'

# Fixed seed so the permutation (and therefore the split) is identical on
# every Restart & Run All.
SPLIT_SEED = 0

# Index <-> symbol tables; 'X' is the placeholder for any symbol outside
# A/U/C/G/'-'.
i_to_nt = ['A', 'U', 'C', 'G', '-', 'X']
nt_to_i = {nt: i for i, nt in enumerate(i_to_nt)}

# Use a context manager so the file handle is closed deterministically.
with open(processed_dir / '23S_alignment.txt') as alignment_file:
    MSA = [line.strip() for line in alignment_file]

# One row per sequence, one integer code per alignment column.
MSA_enc = torch.tensor(
    [[nt_to_i[nt] for nt in s] for s in MSA],
    dtype=torch.long,
    device='cpu',
)

split_rng = torch.Generator(device='cpu').manual_seed(SPLIT_SEED)
idxs = torch.randperm(len(MSA_enc), generator=split_rng, device='cpu')

In [None]:
# Split the permuted row indices: the first 80% (rounded down) become the
# training set, the remainder the validation set. Each subset of the encoded
# alignment is then saved as its own tensor file.
num_train = int(0.8 * len(MSA_enc))
train_idxs, val_idxs = idxs[:num_train], idxs[num_train:]

for file_name, row_idxs in (('train.pt', train_idxs), ('val.pt', val_idxs)):
    torch.save(MSA_enc[row_idxs], processed_dir / file_name)