In [55]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np

In [56]:
# Load the LJSpeech metadata table: pipe-separated, no header row, three columns.
# Forward slashes keep this relative path usable on Windows as well as on
# Linux/macOS; the backslash-separated form only resolves on Windows.
# NOTE(review): pandas' default quote handling is left unchanged here. If
# transcripts that begin with a double quote come out merged or stripped,
# try passing quoting=csv.QUOTE_NONE -- confirm against the raw file.
df = pd.read_csv(
    "../data/LJSpeech-1.1/metadata.csv",
    sep="|",
    header=None,
    names=["ID", "Text1", "Text2"],
)
# Keep the "Text1" column as a plain Python list.
texts = df["Text1"].tolist()
# Special marker tokens that bracket the character vocabulary.
START_TOKEN = "<START>"
PADDING_TOKEN = "<PADDING>"
END_TOKEN = "<END>"

# Character-level vocabulary. Order matters: an entry's position in this list
# is the integer id it is mapped to. Each string below is unpacked into its
# individual characters.
# NOTE(review): ";" and the uppercase letters are not in this list, so any
# sentence containing them fails a vocabulary membership check -- confirm the
# omission of ";" is intended.
english_vocabulary = [
    START_TOKEN,
    *" !\"#$%&'()*+,-./",  # space and punctuation
    *"0123456789",
    *":<=>?@",
    *"[\\]^_`",
    *"abcdefghijklmnopqrstuvwxyz",
    *"{|}~",
    PADDING_TOKEN,
    END_TOKEN,
]

In [57]:
# Lookup tables between vocabulary entries and their positions in
# english_vocabulary.
index_to_english = dict(enumerate(english_vocabulary))  # position -> token
english_to_index = {
    token: position for position, token in enumerate(english_vocabulary)
}  # token -> position
english_to_index  # displayed as the cell output

{'<START>': 0,
 ' ': 1,
 '!': 2,
 '"': 3,
 '#': 4,
 '$': 5,
 '%': 6,
 '&': 7,
 "'": 8,
 '(': 9,
 ')': 10,
 '*': 11,
 '+': 12,
 ',': 13,
 '-': 14,
 '.': 15,
 '/': 16,
 '0': 17,
 '1': 18,
 '2': 19,
 '3': 20,
 '4': 21,
 '5': 22,
 '6': 23,
 '7': 24,
 '8': 25,
 '9': 26,
 ':': 27,
 '<': 28,
 '=': 29,
 '>': 30,
 '?': 31,
 '@': 32,
 '[': 33,
 '\\': 34,
 ']': 35,
 '^': 36,
 '_': 37,
 '`': 38,
 'a': 39,
 'b': 40,
 'c': 41,
 'd': 42,
 'e': 43,
 'f': 44,
 'g': 45,
 'h': 46,
 'i': 47,
 'j': 48,
 'k': 49,
 'l': 50,
 'm': 51,
 'n': 52,
 'o': 53,
 'p': 54,
 'q': 55,
 'r': 56,
 's': 57,
 't': 58,
 'u': 59,
 'v': 60,
 'w': 61,
 'x': 62,
 'y': 63,
 'z': 64,
 '{': 65,
 '|': 66,
 '}': 67,
 '~': 68,
 '<PADDING>': 69,
 '<END>': 70}

In [58]:
# Cap the corpus at the first 13100 transcripts (a no-op if there are fewer).
texts = texts[:13100]
# Normalise each transcript: drop trailing newlines and lowercase it, since the
# character vocabulary only contains lowercase letters.
english_sentences = [sentence.rstrip("\n").lower() for sentence in texts]
# Preview the first few raw (un-normalised) transcripts.
print(texts[:10])

['Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition', 'in being comparatively modern.', 'For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process', 'produced the block books, which were the immediate predecessors of the true printed book,', 'the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.', 'And it is worth mention in passing that, as an example of fine typography,', 'the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about 1455,', 'has never been surpassed.', 'Printing, then, for our purpose, may be considered as the art of making books by means of movable types.', 'Now, as all books not primarily intended as picture-books consist principally of types com

In [59]:
# Which percentile of the sentence-length distribution to report.
PERCENTILE = 97
# Lengths are measured in characters of the normalised sentences.
print(
    f"{PERCENTILE}th percentile length english: "
    f"{np.percentile(list(map(len, english_sentences)), PERCENTILE)}"
)

97th percentile length english: 155.0


In [60]:
# Length cap (in characters) passed to the sentence length filter; the
# 97th-percentile sentence length printed earlier is 155.0.
max_sequence_length = 200


def is_valid_tokens(sentence, vocab):
    """Return True when every distinct token of ``sentence`` appears in ``vocab``.

    Args:
        sentence: Iterable of hashable tokens (iterating a string yields its
            characters).
        vocab: Container supporting ``in`` membership checks.

    Returns:
        bool: False as soon as one token is missing from ``vocab``; True
        otherwise, including for an empty ``sentence``.
    """
    # Deduplicate first so each distinct token is looked up only once.
    return all(token in vocab for token in set(sentence))


def is_valid_length(sentence, max_sequence_length):
    """Return True when ``sentence`` is short enough to leave room for an end token.

    Args:
        sentence: Iterable of tokens (iterating a string yields its characters).
        max_sequence_length: Upper bound on the final sequence length.

    Returns:
        bool: True when the number of tokens is strictly less than
        ``max_sequence_length - 1``, i.e. at most ``max_sequence_length - 2``.
    """
    token_count = sum(1 for _ in sentence)
    # One slot is held back because the end token is re-added later.
    limit = max_sequence_length - 1
    return token_count < limit


# Collect the positions (within english_sentences) of every sentence that is
# both short enough and made up only of vocabulary characters.
valid_sentence_indicies = []
for index, english_sentence in enumerate(english_sentences):
    # The length check runs first; the token check is skipped when it fails.
    if not is_valid_length(english_sentence, max_sequence_length):
        continue
    if not is_valid_tokens(english_sentence, english_vocabulary):
        continue
    valid_sentence_indicies.append(index)

print(f"Number of sentences: {len(english_sentences)}")
print(f"Number of valid sentences: {len(valid_sentence_indicies)}")

Number of sentences: 13100
Number of valid sentences: 12286


In [61]:
# Keep only the sentences that pass the length and vocabulary checks.
# Filtering by predicate selects the same sentences as indexing with
# valid_sentence_indicies on a first run, but stays correct when this cell is
# re-executed: positions recorded against the unfiltered list no longer line
# up once english_sentences has been overwritten, whereas re-filtering an
# already-filtered list leaves it unchanged.
english_sentences = [
    sentence
    for sentence in english_sentences
    if is_valid_length(sentence, max_sequence_length)
    and is_valid_tokens(sentence, english_vocabulary)
]
# Show a short preview rather than dumping every sentence into the output.
english_sentences[:10]

['printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the exhibition',
 'in being comparatively modern.',
 'for although the chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the netherlands, by a similar process',
 'produced the block books, which were the immediate predecessors of the true printed book,',
 'the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.',
 'and it is worth mention in passing that, as an example of fine typography,',
 'the earliest book printed with movable types, the gutenberg, or "forty-two line bible" of about 1455,',
 'has never been surpassed.',
 'printing, then, for our purpose, may be considered as the art of making books by means of movable types.',
 'now, as all books not primarily intended as picture-books consist principally of 