In [8]:
%pip install keras
%pip install tensorflow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 7799301877687930097
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 14343274496
locality {
  bus_id: 1
  links {
  }
}
incarnation: 6663409341436498890
physical_device_desc: "device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5"
xla_global_id: 416903419
]


In [5]:
import collections
import numpy as np
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, LSTM, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy


In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
import os


def load_data(path):
    """
    Load dataset
    """
    input_file = os.path.join(path)
    with open(input_file, "r") as f:
        data = f.read()

    return data.split('\n')

In [8]:
# Load English data
english_sentences = load_data('/content/drive/My Drive/capstone/train-data/bible.en')
# Load kinya data
kinya_sentences = load_data('/content/drive/My Drive/capstone/train-data/bible.kn')

print('Dataset Loaded')

Dataset Loaded


In [9]:

for sample_i in range(2):
    print('small_vocab_en Line {}:  {}'.format(sample_i + 1, english_sentences[sample_i]))
    print('small_vocab_fr Line {}:  {}'.format(sample_i + 1, kinya_sentences[sample_i]))

small_vocab_en Line 1:  In the beginning God created the heavens and the earth.
small_vocab_fr Line 1:  Mbere na mbere Imana yaremye ijuru n'isi. 
small_vocab_en Line 2:  And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters.
small_vocab_fr Line 2:  Isi yari itagira ishusho, yariho ubusa busa, umwijima wari hejuru y'imuhengeri, maze Umwuka w'Imana yagendagendaga hejuru y'amazi. 


#Vocabulary 

In [10]:
english_words_counter = collections.Counter([word for sentence in english_sentences for word in sentence.split()])
kinya_words_counter = collections.Counter([word for sentence in kinya_sentences for word in sentence.split()])

print('{} English words.'.format(len([word for sentence in english_sentences for word in sentence.split()])))
print('{} unique English words.'.format(len(english_words_counter)))
print('10 Most common words in the English dataset:')
print('"' + '" "'.join(list(zip(*english_words_counter.most_common(10)))[0]) + '"')
print()
print('{} Kinya words.'.format(len([word for sentence in kinya_sentences for word in sentence.split()])))
print('{} unique Kinya words.'.format(len(kinya_words_counter)))
print('10 Most common words in the Kinya dataset:')
print('"' + '" "'.join(list(zip(*kinya_words_counter.most_common(10)))[0]) + '"')

481996 English words.
22806 unique English words.
10 Most common words in the English dataset:
"the" "and" "of" "to" "And" "that" "in" "shall" "he" "unto"

308124 Kinya words.
60522 unique Kinya words.
10 Most common words in the Kinya dataset:
"mu" "na" "ku" "ni" "kandi" "Uwiteka" "ari" "ati" "ngo" "kuko"


#Preprocess

In [11]:
def tokenize(x):
    """
    Tokenize x
    :param x: List of sentences/strings to be tokenized
    :return: Tuple of (tokenized x data, tokenizer used to tokenize x)
    """
    # TODO: Implement
    tokenizer = Tokenizer(split=' ', char_level=False)
    tokenizer.fit_on_texts(x)
    return tokenizer.texts_to_sequences(x), tokenizer           # texts_to_sequences_generator(), Yields individual sequences.



# Tokenize Example output
text_sentences = [
    'The quick brown fox jumps over the lazy dog .',
    'By Jove , my quick study of lexicography won a prize .',
    'This is a short sentence .']
text_tokenized, text_tokenizer = tokenize(text_sentences)
print(text_tokenizer.word_index)
print()
for sample_i, (sent, token_sent) in enumerate(zip(text_sentences, text_tokenized)):
    print('Sequence {} in x'.format(sample_i + 1))
    print('  Input:  {}'.format(sent))
    print('  Output: {}'.format(token_sent))

{'the': 1, 'quick': 2, 'a': 3, 'brown': 4, 'fox': 5, 'jumps': 6, 'over': 7, 'lazy': 8, 'dog': 9, 'by': 10, 'jove': 11, 'my': 12, 'study': 13, 'of': 14, 'lexicography': 15, 'won': 16, 'prize': 17, 'this': 18, 'is': 19, 'short': 20, 'sentence': 21}

Sequence 1 in x
  Input:  The quick brown fox jumps over the lazy dog .
  Output: [1, 2, 4, 5, 6, 7, 1, 8, 9]
Sequence 2 in x
  Input:  By Jove , my quick study of lexicography won a prize .
  Output: [10, 11, 12, 2, 13, 14, 15, 16, 3, 17]
Sequence 3 in x
  Input:  This is a short sentence .
  Output: [18, 19, 3, 20, 21]


In [12]:
text_tokenized

[[1, 2, 4, 5, 6, 7, 1, 8, 9],
 [10, 11, 12, 2, 13, 14, 15, 16, 3, 17],
 [18, 19, 3, 20, 21]]

#Padding

In [13]:
def pad(x, length=None):
    """
    Pad x
    :param x: List of sequences.
    :param length: Length to pad the sequence to.  If None, use length of longest sequence in x.
    :return: Padded numpy array of sequences
    """
    # TODO: Implement
    if length is None:
        length = max([len(sentence) for sentence in x])
    
    return pad_sequences(x, maxlen=length, padding='post', truncating='post')



# Pad Tokenized output
test_pad = pad(text_tokenized)
for sample_i, (token_sent, pad_sent) in enumerate(zip(text_tokenized, test_pad)):
    print('Sequence {} in x'.format(sample_i + 1))
    print('  Input:  {}'.format(np.array(token_sent)))
    print('  Output: {}'.format(pad_sent))

Sequence 1 in x
  Input:  [1 2 4 5 6 7 1 8 9]
  Output: [1 2 4 5 6 7 1 8 9 0]
Sequence 2 in x
  Input:  [10 11 12  2 13 14 15 16  3 17]
  Output: [10 11 12  2 13 14 15 16  3 17]
Sequence 3 in x
  Input:  [18 19  3 20 21]
  Output: [18 19  3 20 21  0  0  0  0  0]


#Preprocess


In [14]:
def preprocess(x, y):
    """
    Preprocess x and y
    :param x: Feature List of sentences
    :param y: Label List of sentences
    :return: Tuple of (Preprocessed x, Preprocessed y, x tokenizer, y tokenizer)
    """
    preprocess_x, x_tk = tokenize(x)
    preprocess_y, y_tk = tokenize(y)

    preprocess_x = pad(preprocess_x)
    preprocess_y = pad(preprocess_y)

    # Keras's sparse_categorical_crossentropy function requires the labels to be in 3 dimensions
    preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)

    return preprocess_x, preprocess_y, x_tk, y_tk

preproc_english_sentences, preproc_kinya_sentences, english_tokenizer, kinya_tokenizer =\
    preprocess(english_sentences, kinya_sentences)
    
max_english_sequence_length = preproc_english_sentences.shape[1]
max_kinya_sequence_length = preproc_kinya_sentences.shape[1]
english_vocab_size = len(english_tokenizer.word_index)
kinya_vocab_size = len(kinya_tokenizer.word_index)

print('Data Preprocessed')
print("Max English sentence length:", max_english_sequence_length)
print("Max Kinyarwanda sentence length:", max_kinya_sequence_length)
print("English vocabulary size:", english_vocab_size)
print("Kinyarwanda vocabulary size:", kinya_vocab_size)

Data Preprocessed
Max English sentence length: 90
Max Kinyarwanda sentence length: 68
English vocabulary size: 10822
Kinyarwanda vocabulary size: 47090


In [15]:
print(type(preproc_english_sentences))
print(preproc_english_sentences.shape)

<class 'numpy.ndarray'>
(18189, 90)


In [None]:
# import tensorflow.compat.v1 as tf
# tf.disable_v2_behavior()


#Model

#Ids Back to Text
The neural network will be translating the input to words ids, which isn't the final form we want. We want the Kinya translation. The function logits_to_text will bridge the gab between the logits from the neural network to the Kinyarwanda  translation. You'll be using this function to better understand the output of the neural network.

In [16]:
def logits_to_text(logits, tokenizer):
    """
    Turn logits from a neural network into text using the tokenizer
    :param logits: Logits from a neural network
    :param tokenizer: Keras Tokenizer fit on the labels
    :return: String that represents the text of the logits
    """
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'

    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1) if index_to_words[prediction]!='<PAD>'] )

print('`logits_to_text` function loaded.')

`logits_to_text` function loaded.


#RNN

In [17]:
def token_to_words(sequence, tokenizer):
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'

    return [index_to_words[token] for token in sequence if index_to_words[token]!='<PAD>']

In [None]:
import tensorflow as tf
tf.compat.v1.enable_eager_execution


In [None]:
def simple_model(input_shape, output_sequence_length, english_vocab_size, kinya_vocab_size):
    """
    Build and train a basic RNN on x and y
    :param input_shape: Tuple of input shape
    :param output_sequence_length: Length of output sequence
    :param english_vocab_size: Number of unique English words in the dataset
    :param kinya_vocab_size: Number of unique Kinyarwanda  words in the dataset
    :return: Keras model built, but not trained
    """
    # TODO: Build the layers
    learning_rate = 0.001
    english_input = Input(shape=input_shape[1:], name="input_layer")    # the shape is (input length x 1) as batchsize excluded
    
    # LSTM takes as input (batchsize,input_length,1) and outputs (batchsize, input_length, 64) because return-seq=True
    x = LSTM(64, return_sequences=True, activation="tanh", name="LSTM_layer")(english_input)
    preds = TimeDistributed(Dense(kinya_vocab_size, activation="softmax"), name="Dense_layer")(x)
    model = Model(inputs=english_input, outputs=preds, name='simple_LSTM')
    
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    
    return model



# Reshaping the input to work with a basic RNN
tmp_x = pad(preproc_english_sentences, max_kinya_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_kinya_sentences.shape[-2], 1))  # reshape as 3D (batchsize, timesteps, 1) for LSTM input

# Train the neural network
simple_rnn_model = simple_model(
    tmp_x.shape,
    max_kinya_sequence_length,
    english_vocab_size,
    kinya_vocab_size)

simple_rnn_model.summary()

simple_rnn_model.fit(tmp_x, preproc_kinya_sentences, batch_size=1024, epochs=20, validation_split=0.2)

Model: "simple_LSTM"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_layer (InputLayer)    [(None, 68, 1)]           0         
                                                                 
 LSTM_layer (LSTM)           (None, 68, 64)            16896     
                                                                 
 Dense_layer (TimeDistribute  (None, 68, 47090)        3060850   
 d)                                                              
                                                                 
Total params: 3,077,746
Trainable params: 3,077,746
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20


In [None]:
def emb_bd_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """
    Build and train a bidirectional RNN model on x and y
    :param input_shape: Tuple of input shape
    :param output_sequence_length: Length of output sequence
    :param english_vocab_size: Number of unique English words in the dataset
    :param french_vocab_size: Number of unique French words in the dataset
    :return: Keras model built, but not trained
    """
    # TODO: Implement
    
    learning_rate = 0.001
    embedding_size = 256
    
    english_input = Input(shape=input_shape[1:], name="input_layer")  # Embedding input (batch, seq_length)
    
    embeddings = Embedding(input_dim = english_vocab_size, output_dim = embedding_size, 
                           input_length= output_sequence_length, name="Embedding_layer")(english_input)
    
    # input shape to LSTM (batchsize, seq_length, embedding_dim) output shape: (batchsize, seq_length, units=64x2)
    x = Bidirectional(LSTM(64, return_sequences=True, activation="tanh"), name="Bidir_LSTM_layer")(embeddings)
    
    preds = TimeDistributed(Dense(french_vocab_size, activation="softmax"), name="Dense_layer")(x)
    
    model = Model(inputs=english_input, outputs=preds, name='Embedding_Bidir_LSTM')
    
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
   
    return model

#tests.test_bd_model(bd_model)

# TODO: Train and Print prediction(s)

# TODO: Reshape the input
tmp_x = pad(preproc_english_sentences, max_kinya_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_kinya_sentences.shape[-2]))    # reshapped for Embedding input (batch, seq_length)

# Train the neural network
emb_bd_rnn_model = emb_bd_model(
                        tmp_x.shape,
                        max_kinya_sequence_length,
                        english_vocab_size,
                        kinya_vocab_size)

emb_bd_rnn_model.summary()

emb_bd_rnn_model.fit(tmp_x, preproc_kinya_sentences, batch_size=1024, epochs=20, validation_split=0.2)

Model: "Embedding_Bidir_LSTM"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_layer (InputLayer)    [(None, 68)]              0         
                                                                 
 Embedding_layer (Embedding)  (None, 68, 256)          2770432   
                                                                 
 Bidir_LSTM_layer (Bidirecti  (None, 68, 128)          164352    
 onal)                                                           
                                                                 
 Dense_layer (TimeDistribute  (None, 68, 47090)        6074610   
 d)                                                              
                                                                 
Total params: 9,009,394
Trainable params: 9,009,394
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
