In [42]:
import tensorflow as tf
print(tf.__version__)
tf.random.set_seed(1234)

!pip install tensorflow-datasets==1.2.0

import tensorflow_datasets as tfds
import os
import re
import numpy as np
import matplotlib.pyplot as plt

2.10.0


In [43]:
path_to_zip = tf.keras.utils.get_file(
    "cornell_movie_dialogs.zip",
    origin='https://www.kaggle.com/api/v1/datasets/download/soumikrakshit/cornell-movie-dialogs-corpus',
    extract=True
)
path_to_data = os.path.join(os.path.dirname(path_to_zip), "cornell movie-dialogs corpus")

path_to_movie_lines = os.path.join(path_to_data, "movie_lines.txt")
path_to_movie_conversations = os.path.join(path_to_data, "movie_conversations.txt")

print("Dataset downloaded and extracted.")

Dataset downloaded and extracted.


In [44]:
path_to_data

'C:\\Users\\erand\\.keras\\datasets\\cornell movie-dialogs corpus'

In [45]:
MAX_SAMPLE = 50000

def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()
    sentence = re.sub(r"([?.!,])",r" \1",sentence)
    sentence = re.sub(r'[" "]+'," ",sentence)
    sentence = re.sub(r"[^a-zA-Z?.,!]+"," ",sentence)
    sentence = sentence.strip()
    return sentence

def load_conversations():
    id2line={}
    with open(path_to_movie_lines, errors = "ignore") as file:
        lines = file.readlines()
    for line in lines:
        parts = line.replace("\n","").split(" +++$+++ ")
        id2line[parts[0]] = parts[4]

    inputs, outputs = [],[]

    with open(path_to_movie_conversations, "r") as file:
        lines = file.readlines()
    for line in lines:
        parts = line.replace("\n","").split(" +++$+++ ")
        conversation = [line[1:-1] for line in parts[3][1:-1].split(", ")]
        for i in range(len(conversation) - 1):
            inputs.append(preprocess_sentence(id2line[conversation[i]]))
            outputs.append(preprocess_sentence(id2line[conversation[i+1]]))

            if len(inputs) >= MAX_SAMPLE:
                return inputs,outputs
        return inputs,outputs

In [46]:
questions , answers = load_conversations()

In [51]:
len(questions)


3

In [52]:
len(answers)

3

In [54]:
print("Q : {}".format(questions[0]))
print("A : {}".format(answers[0]))

Q : can we make this quick ? roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad . again .
A : well , i thought we d start with pronunciation , if that s okay with you .


In [55]:
tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    questions+answers,target_vocab_size=2**13
)

START_TOKEN, END_TOKEN = [tokenizer.vocab_size],[tokenizer.vocab_size+1]
VOCAB_SIZE = tokenizer.vocab_size + 2

In [57]:
print("Tokenized Q : {}".format(tokenizer.encode(questions[2])))

Tokenized Q : [19, 7, 22, 1, 23, 1, 13, 17, 5, 16, 2]


In [59]:
def tokenize_and_filter(inputs,outputs):
    MAX_LENGTH = 40  # Example value
    tokenized_inputs,tokenized_outputs = [],[]

    for (sentence1,sentence2) in zip(inputs,outputs):
        sentence1 = START_TOKEN+tokenizer.encode(sentence1)+END_TOKEN
        sentence2 = START_TOKEN+tokenizer.encode(sentence2)+END_TOKEN
        if len(sentence1) <= MAX_LENGTH and len(sentence2) <=MAX_LENGTH:
            tokenized_inputs.append(sentence1)
            tokenized_outputs.append(sentence2)

    tokenized_inputs = tf.keras.preprocessing.sequence.pad_sequences(tokenized_inputs,maxlen=MAX_LENGTH,padding="post")
    tokenized_outputs = tf.keras.preprocessing.sequence.pad_sequences(tokenized_outputs,maxlen=MAX_LENGTH,padding="post")

    return tokenized_inputs,tokenized_outputs

questions,answers = tokenize_and_filter(questions,answers)


In [60]:
len(questions)

3

In [61]:
VOCAB_SIZE

316

In [66]:
BATCH_SIZE = 64
BUFFER_SIZE = 20000

dataset = tf.data.Dataset.from_tensor_slices((
    {
        "inputs": questions,
        "dec_inputs": answers[:,:-1]
    },
    {
        "outputs:": answers[:,1:]
    }
))
dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)