In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from tensorflow import keras

In [2]:
import matplotlib.pyplot as plt

%matplotlib inline
plt.rcParams['figure.figsize'] = (16, 8)

In [3]:
# NOTE(review): allow_pickle=True lets NumPy unpickle object arrays, which can
# execute arbitrary code -- only load this archive from a trusted source.
dataset_path = 'data/ncbi_03/dataset.npz'
with np.load(dataset_path, allow_pickle = True) as data:
    # Sequence tensors; their shapes are printed in the next cell.
    encoder_input_data, decoder_input_data = (
        data['encoder_input_data'],
        data['decoder_input_data'],
    )
    # .tolist() unwraps the stored object into a plain mapping (token -> index);
    # presumably this entry was saved as a 0-d object array -- confirm.
    token_index = data['token_index'].tolist()
    parent_acc, child_acc = data['parent_acc'], data['child_acc']
    # Raw (key, value) pairs; a later cell turns them into a dict of lists.
    acc_mapping = data['mapping']

In [4]:
# Sanity-check the loaded tensors and the token vocabulary.
print(encoder_input_data.shape, decoder_input_data.shape, sep = '\n')
print(token_index)

# Inverse vocabulary: integer index -> token character.
reverse_char_index = {index: char for char, index in token_index.items()}


def decode_seq(seq):
    """Convert a batch of per-position score vectors back into strings.

    Takes the argmax of ``seq`` along axis 2 and maps every resulting index
    through ``reverse_char_index``.

    Returns a list with one joined string per row of the argmax result.
    """
    return [
        ''.join(reverse_char_index[i] for i in row)
        for row in np.argmax(seq, axis = 2)
    ]

(25865, 3813, 4)
(25865, 3813, 4)
{'A': 0, 'C': 1, 'G': 2, 'T': 3}


In [12]:
# Turn the raw (key, "acc1|acc2|...") pairs into {key: [acc1, acc2, ...]}.
# The name is rebound in place, so guard on its type: re-running this cell
# (or running it out of order) must not try to re-parse the dict built earlier,
# which would iterate its keys and fail on unpacking.
if not isinstance(acc_mapping, dict):
    acc_mapping = { k: v.split("|") for k, v in acc_mapping }
len(acc_mapping)

26506

In [5]:
# Aliases for the encoder/decoder arrays used from here on. Append a slice
# such as [:1000] to either right-hand value to iterate on a small subset.
X_data, Y_data = encoder_input_data, decoder_input_data
print("X_data", X_data.shape)
print("Y_data", Y_data.shape)

X_data (25865, 3813, 4)
Y_data (25865, 3813, 4)


# Metadata

In [6]:
# Load the metadata table, then show its columns and first rows for reference.
metadata_csv = "data/ncbi_03/ncbi_good.csv"
metadata = pd.read_csv(metadata_csv)
print(metadata.columns)
metadata.head()

Index(['accession', 'protein_accession', 'collection_date', 'sgene_begin',
       'sgene_end', 'location', 'region', 'genome', 'genome_desc',
       'sgene_protein', 'sgene_protein_desc', 'sgene_nucleotide'],
      dtype='object')


Unnamed: 0,accession,protein_accession,collection_date,sgene_begin,sgene_end,location,region,genome,genome_desc,sgene_protein,sgene_protein_desc,sgene_nucleotide
0,BS000784.1,BCX25042.1,2020-11-04,21509,25330,Japan:Kanto,Asia,AGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTC...,Severe acute respiratory syndrome coronavirus ...,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,1-1273 surface glycoprotein [organism=Severe a...,ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTG...
1,BS000783.1,BCX25030.1,2020-11-05,21509,25330,Japan:Kanto,Asia,AGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTC...,Severe acute respiratory syndrome coronavirus ...,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,1-1273 surface glycoprotein [organism=Severe a...,ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTG...
2,BS000782.1,BCX25018.1,2020-10-27,21509,25330,Japan:Kanto,Asia,AGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTC...,Severe acute respiratory syndrome coronavirus ...,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,1-1273 surface glycoprotein [organism=Severe a...,ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTG...
3,BS000781.1,BCX25006.1,2020-10-19,21509,25330,Japan:Kanto,Asia,AGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTC...,Severe acute respiratory syndrome coronavirus ...,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,1-1273 surface glycoprotein [organism=Severe a...,ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTG...
4,BS000780.1,BCX24994.1,2020-10-12,21509,25330,Japan:Kanto,Asia,AGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTC...,Severe acute respiratory syndrome coronavirus ...,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,1-1273 surface glycoprotein [organism=Severe a...,ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTG...


# Classification

In [24]:
# Accession of the reference record; it is looked up in `metadata` and in
# `acc_mapping` by the cells below.
# NOTE(review): presumably the NCBI RefSeq SARS-CoV-2 reference genome -- confirm.
refid = "NC_045512.2"

In [25]:
# Select the metadata row(s) for the reference accession. to_dict() yields
# {column: {row_label: value}}, which is why later cells read a field through
# .values() instead of indexing it directly.
is_reference = metadata['accession'] == refid
ref = metadata[is_reference].to_dict()
list(ref)

['accession',
 'protein_accession',
 'collection_date',
 'sgene_begin',
 'sgene_end',
 'location',
 'region',
 'genome',
 'genome_desc',
 'sgene_protein',
 'sgene_protein_desc',
 'sgene_nucleotide']

In [26]:
# S-gene nucleotide sequence of the first row matched for the reference accession.
sgene_nucleotide_by_row = ref['sgene_nucleotide']
list(sgene_nucleotide_by_row.values())[0]

'ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTGTTAATCTTACAACCAGAACTCAATTACCCCCTGCATACACTAATTCTTTCACACGTGGTGTTTATTACCCTGACAAAGTTTTCAGATCCTCAGTTTTACATTCAACTCAGGACTTGTTCTTACCTTTCTTTTCCAATGTTACTTGGTTCCATGCTATACATGTCTCTGGGACCAATGGTACTAAGAGGTTTGATAACCCTGTCCTACCATTTAATGATGGTGTTTATTTTGCTTCCACTGAGAAGTCTAACATAATAAGAGGCTGGATTTTTGGTACTACTTTAGATTCGAAGACCCAGTCCCTACTTATTGTTAATAACGCTACTAATGTTGTTATTAAAGTCTGTGAATTTCAATTTTGTAATGATCCATTTTTGGGTGTTTATTACCACAAAAACAACAAAAGTTGGATGGAAAGTGAGTTCAGAGTTTATTCTAGTGCGAATAATTGCACTTTTGAATATGTCTCTCAGCCTTTTCTTATGGACCTTGAAGGAAAACAGGGTAATTTCAAAAATCTTAGGGAATTTGTGTTTAAGAATATTGATGGTTATTTTAAAATATATTCTAAGCACACGCCTATTAATTTAGTGCGTGATCTCCCTCAGGGTTTTTCGGCTTTAGAACCATTGGTAGATTTGCCAATAGGTATTAACATCACTAGGTTTCAAACTTTACTTGCTTTACATAGAAGTTATTTGACTCCTGGTGATTCTTCTTCAGGTTGGACAGCTGGTGCTGCAGCTTATTATGTGGGTTATCTTCAACCTAGGACTTTTCTATTAAAATATAATGAAAATGGAACCATTACAGATGCTGTAGACTGTGCACTTGACCCTCTCTCAGAAACAAAGTGTACGTTGAAATCCTTCACTGTAGAAAAAGGAATCTATCAAACTTCTAACTTTAGAGTCCAACCAACAGAATCTATTGTTAGATTTCCTAATATTACA

In [35]:
# Find the acc_mapping group whose accession list contains the reference id.
# `refpid` (and `v`) deliberately stay bound after the loop: the following
# cells use `refpid` as the search result.
for refpid, v in tqdm(acc_mapping.items()):
    if refid in v:
        print(refpid, len(v))
        break
else:
    # Without this branch a failed search would leave `refpid` pointing at the
    # last key of acc_mapping, and later cells would silently use the wrong group.
    raise LookupError(f"reference accession {refid!r} not found in any acc_mapping entry")

  0%|          | 0/26506 [00:00<?, ?it/s]

2538_6593532f926e48cc68421ef20a33018c 2538


In [37]:
# Positions in child_acc whose entry equals the reference group id.
is_reference_child = child_acc == refpid
np.where(is_reference_child)

(array([], dtype=int64),)

```
202 + 6 gaps
422 + 3 gaps
```

In [41]:
# S-gene protein sequence of the first row matched for the reference accession.
sgene_protein_by_row = ref['sgene_protein']
list(sgene_protein_by_row.values())[0]

'MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQDVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSPRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITG