## Character-level recurrent sequence-to-sequence model

Reference example (Keras docs): https://keras.io/examples/nlp/lstm_seq2seq/  
Older Keras blog post on the same topic: https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html

In [1]:
import numpy as np
import pandas as pd
from itertools import chain
from tqdm.notebook import tqdm
import keras

In [2]:
# pairs: one row per parent/child pair; the "parent" and "child" columns hold
# accessions that are matched against sgenes["accession"] further down.
pairs = pd.read_csv("data/ncbi_03/pairs.csv")
# sgenes: provides the "sgene_nucleotide" sequence for each "accession".
sgenes = pd.read_csv("data/ncbi_03/ncbi_sgene_good_unique_aligned_cleaned.csv")
# mapping: its "accession" and "accessions" columns are written to the saved
# dataset at the end of the notebook.
mapping = pd.read_csv("data/ncbi_03/ncbi_sgene_good_unique.csv")

## Data prep

In [3]:
# Row count of `pairs`, i.e. how many parent/child pairs were loaded.
num_pairs = pairs.shape[0]
num_pairs

25865

In [4]:
# Start with empty containers: two lists for the parent/child sequences and
# two sets for the characters that occur in them.
input_texts, target_texts = [], []
input_characters, target_characters = set(), set()

In [5]:
# Collect the parent sequence (input) and child sequence (target) for every
# row of `pairs`, then derive the set of characters used on each side.

# Index `sgenes` once. Filtering the whole frame with a boolean mask for each
# pair costs two full scans per row; a dict lookup is constant time.
sgene_by_accession = {}
for accession, nucleotide in zip(sgenes["accession"], sgenes["sgene_nucleotide"]):
    # setdefault keeps the first row for a repeated accession, which is the
    # row the original `.values[0]` lookup returned.
    sgene_by_accession.setdefault(accession, nucleotide)

# Re-create the lists here so that re-running this cell replaces the data
# instead of appending a second copy of every pair.
input_texts = []
target_texts = []

for parent, child in tqdm(zip(pairs["parent"], pairs["child"]), total=len(pairs)):
    # Resolve both sides before appending so the two lists stay the same
    # length even if one accession is missing (KeyError names the accession).
    parent_sgene = sgene_by_accession[parent]
    child_sgene = sgene_by_accession[child]

    input_texts.append(parent_sgene)
    target_texts.append(child_sgene)

input_characters = set(chain.from_iterable(input_texts))
target_characters = set(chain.from_iterable(target_texts))

  0%|          | 0/25865 [00:00<?, ?it/s]

In [6]:
# Report how many sequences were collected on each side ...
for label, texts in (("input_texts ", input_texts), ("target_texts", target_texts)):
    print(label, len(texts))

# ... and which characters occur in them.
for label, charset in (
    ("input_characters", input_characters),
    ("target_characters", target_characters),
):
    print(label, len(charset), charset)

input_texts  25865
target_texts 25865
input_characters 4 {'A', 'G', 'C', 'T'}
target_characters 4 {'A', 'G', 'C', 'T'}


In [7]:
# Fix a deterministic character order and verify that parents and children
# share the same alphabet and the same maximum length, so a single vocabulary
# and a single sequence length can be used for both sides.
input_characters = sorted(input_characters)
target_characters = sorted(target_characters)

# ValueError is used because `Error` is not a defined name in Python: the
# original `raise Error(...)` would have failed with NameError and hidden
# the intended message.
if input_characters != target_characters:
    raise ValueError("input_characters does not match target_characters")

characters = input_characters
num_tokens = len(characters)

max_encoder_seq_length = max(len(txt) for txt in input_texts)
max_decoder_seq_length = max(len(txt) for txt in target_texts)

if max_encoder_seq_length != max_decoder_seq_length:
    raise ValueError("input_seq_length does not match target_seq_length")

max_seq_length = max_encoder_seq_length

print("Number of samples:", len(input_texts))
print("Number of unique tokens:", num_tokens)
print("Max sequence length:", max_seq_length)

Number of samples: 25865
Number of unique tokens: 4
Max sequence length: 3813


In [8]:
# Map each character to its position in the sorted character list. The
# mapping is built for both sides and checked for equality so that one
# shared `token_index` can encode parents and children alike.
input_token_index = {char: i for i, char in enumerate(input_characters)}
target_token_index = {char: i for i, char in enumerate(target_characters)}

# ValueError replaces the undefined name `Error`, which would have raised
# NameError instead of reporting the mismatch.
if input_token_index != target_token_index:
    raise ValueError("input_token_index does not match target_token_index")

token_index = input_token_index

print("token_index", len(token_index), token_index)

token_index 4 {'A': 0, 'C': 1, 'G': 2, 'T': 3}


In [9]:
# Both one-hot tensors share one layout: (sample, position, token).
one_hot_shape = (len(input_texts), max_seq_length, num_tokens)

encoder_input_data = np.zeros(one_hot_shape, dtype="float32")
decoder_input_data = np.zeros(one_hot_shape, dtype="float32")

print("encoder_input_data.shape", encoder_input_data.shape)
print("decoder_input_data.shape", decoder_input_data.shape)

encoder_input_data.shape (25865, 3813, 4)
decoder_input_data.shape (25865, 3813, 4)


In [10]:
# One-hot encode every parent sequence into encoder_input_data and every
# child sequence into decoder_input_data. Only positions covered by a
# sequence are written; all other entries keep their existing value.
#
# `total` is passed explicitly because enumerate(zip(...)) has no length, so
# tqdm could otherwise show neither a percentage nor an ETA.
for i, (input_text, target_text) in tqdm(
    enumerate(zip(input_texts, target_texts)), total=len(input_texts)
):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, token_index[char]] = 1.0

    # Both arrays are written at the same timestep t; this cell does not
    # build a shifted decoder target array.
    for t, char in enumerate(target_text):
        decoder_input_data[i, t, token_index[char]] = 1.0

0it [00:00, ?it/s]

In [11]:
# Display the first parent sequence in full (the whole string is printed).
input_texts[0]

'ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTGTTAATCTTACAACCAGAACTCAATTACCCCCTGCATACACTAATTCTTTCACACGTGGTGTTTATTACCCTGACAAAGTTTTCAGATCCTCAGTTTTACATTCAACTCAGGACTTGTTCTTACCTTTCTTTTCCAATGTTACTTGGTTCCATGCTATCTCTGGGACCAATGGTACTAAGAGGTTTGATAACCCTGTCCTACCATTTAATGATGGTGTTTATTTTGCTTCCACTGAGAAGTCTAACATAATAAGAGGCTGGATTTTTGGTACTACTTTAGATTCGAAGACCCAGTCCCTACTTATTGTTAATAACGCTACTAATGTTGTTATTAAAGTCTGTGAATTTCAATTTTGTAATGATCCATTTTTGGGTGTTTACCACAAAAACAACAAAAGTTGGATGGAAAGTGAGTTCAGAGTTTATTCTAGTGCGAATAATTGCACTTTTGAATATGTCTCTCAGCCTTTTCTTATGGACCTTGAAGGAAAACAGGGTAATTTCAAAAATCTTAGGGAATTTGTGTTTAAGAATATTGATGGTTATTTTAAAATATATTCTAAGCACACGCCTATTAATTTAGTGCGTGATCTCCCTCAGGGTTTTTCGGCTTTAGAACCATTGGTAGATTTGCCAATAGGTATTAACATCACTAGGTTTCAAACTTTACTTGCTTTACATAGAAGTTATTTGACTCCTGGTGATTCTTCTTCAGGTTGGACAGCTGGTGCTGCAGCTTATTATGTGGGTTATCTTCAACCTAGGACTTTTCTATTAAAATATAATGAAAATGGAACCATTACAGATGCTGTAGACTGTGCACTTGACCCTCTCTCAGAAACAAAGTGTACGTTGAAATCCTTCACTGTAGAAAAAGGAATCTATCAAACTTCTAACTTTAGAGTCCAACCAACAGAATCTATTGTTAGATTTCCTAATATTACAAACTTGTGC

In [12]:
# One-hot encoding of the first parent sequence, used as the reference
# argument of the cross-entropy check.
y_true = encoder_input_data[0]
y_true

array([[1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       ...,
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.]], dtype=float32)

In [13]:
# One-hot encoding of the first child sequence, used as the "prediction"
# argument of the cross-entropy check.
y_pred = decoder_input_data[0]
y_pred

array([[1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       ...,
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.]], dtype=float32)

In [14]:
# Categorical cross-entropy between the one-hot parent (y_true) and one-hot
# child (y_pred) encodings of the first pair, shown as a plain number.
cross_entropy = keras.losses.CategoricalCrossentropy()
cross_entropy(y_true, y_pred).numpy()

2022-04-01 18:38:57.951133: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_SYSTEM_DRIVER_MISMATCH: system has unsupported display driver / cuda driver combination
2022-04-01 18:38:57.951262: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: mif325-gpu2
2022-04-01 18:38:57.951293: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: mif325-gpu2
2022-04-01 18:38:57.951482: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 510.54.0
2022-04-01 18:38:57.951566: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 510.47.3
2022-04-01 18:38:57.951591: E tensorflow/stream_executor/cuda/cuda_diagnostics.cc:313] kernel version 510.47.3 does not match DSO version 510.54.0 -- cannot find working devices in this configuration
2022-04-01 18:38:57.963888: I tensorflow/core/platform/cpu_feature_guard.cc:151] This Tens

0.004227259

In [18]:
# Persist the encoded tensors together with the metadata needed to interpret
# them: the character -> index mapping, the parent/child accession of each
# pair, and the accession/accessions columns of `mapping`.
#
# NOTE(review): token_index is a plain dict, so NumPy will presumably store it
# as a pickled 0-d object array; loading it would then need
# np.load(..., allow_pickle=True) and `.item()` — confirm against the loader.
# NOTE(review): the accession arrays are likely object dtype as well (strings
# read by pandas) — verify; if so they have the same allow_pickle requirement.
# NOTE(review): np.savez writes uncompressed; np.savez_compressed may be worth
# considering for these mostly-zero one-hot tensors — TODO evaluate.
np.savez('data/ncbi_03/dataset.npz', 
         encoder_input_data = encoder_input_data, 
         decoder_input_data = decoder_input_data,
         token_index = token_index,
         parent_acc = pairs.parent.values,
         child_acc = pairs.child.values,
         mapping = mapping[['accession','accessions']].values
        )