In [1]:
filepath = "./dialogs.txt"

In [2]:
from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer("basic_english")

import re
import contractions


def tokenize(text):
    standardized_text = contractions.fix(text)

    standardized_text = (
        standardized_text.replace("’", "'")
        .replace("‘", "'")
        .replace("´", "'")
        .replace("“", '"')
        .replace("”", '"')
        .replace("´´", '"')
    )

    tokens = tokenizer(standardized_text)

    filtered_tokens = [
        token
        for token in tokens
        if re.match(
            r"^[a-zA-Z0-9.,!?]+(-[a-zA-Z0-9.,!?]+)*(_[a-zA-Z0-9.,!?]+)*$", token
        )
    ]
    return filtered_tokens

In [3]:
import yaml
from torchtext.vocab import build_vocab_from_iterator


def corpus_iterator(filepath):
    with open(filepath, "r", encoding="utf-8") as file:
        lines = file.readlines()

        prev_reply = None
        for line in lines:

            query, reply = line.strip().split("\t")

            # Check if not the last line and if the current reply is identical to the next query
            if query == prev_reply:
                out = reply
            else:
                out = query + reply
            prev_reply = reply

            yield tokenize(out)


# Add EOS, SOS, and PAD to the specials list
special_tokens = ["<pad>", "<sos>", "<eos>", "<unk>"]

vocab = build_vocab_from_iterator(
    corpus_iterator(filepath),
    specials=special_tokens,
    min_freq=2,
)
vocab.set_default_index(vocab["<unk>"])

In [4]:
vocab.__len__()

1486

In [5]:
vocab.lookup_token(100)

'love'

In [6]:
import numpy as np

MAX_LENGTH = 30

queries, responses, masks_r = [], [], []


def all_words_in_vocab(sentence, vocab):
    return all(word in vocab for word in sentence)


def process_sentence(sequence, max_length=MAX_LENGTH):
    # Calculate the length needed for padding. Subtract 2 for <sos> and <eos> tokens
    padding_length = max_length - len(sequence) + 1

    # Processed sequence with <sos>, <eos>, and <pad>
    processed = ["<sos>"] + sequence + ["<eos>"] + ["<pad>"] * padding_length

    # Create a mask: 1s for actual tokens and 0s for padding
    # The mask length is len(sequence) + 2 for <sos> and <eos> tokens. The rest are 0s for padding.
    mask = [1] * (len(sequence) + 2) + [0] * padding_length

    return processed, mask


with open(filepath, "r", encoding="utf-8") as file:
    lines = file.readlines()

    prev_reply = None
    for line in lines:

        q, r = line.strip().split("\t")

        query = tokenize(q)
        response = tokenize(r)

        if (
            all_words_in_vocab(query + response, vocab)
            and len(query) <= MAX_LENGTH
            and len(response) <= MAX_LENGTH
        ):
            query, _ = process_sentence(query)
            response, mask_r = process_sentence(response)

            queries.append(vocab(query))
            responses.append(vocab(response))
            masks_r.append(mask_r)

queries = np.asarray(queries)
responses = np.asarray(responses)
masks_r = np.asarray(masks_r)

print(f"Number of queries/responses: {len(queries)}")

Number of queries/responses: 2470


In [7]:
len(queries[0])

33

In [8]:
import torch

# The dimensionality of GloVe embeddings
embedding_dim = 300

from torchtext.vocab import GloVe

# Load GloVe embeddings
glove = GloVe(name="42B", dim=embedding_dim, cache="./.vector_cache")

# Get GloVe embeddings for the vocabulary tokens
# Assuming 'vocab' is a list of vocabulary tokens including special tokens at the beginning
glove_embeddings = glove.get_vecs_by_tokens(vocab.get_itos(), lower_case_backup=True)


# Special tokens
special_tokens = ["<pad>", "<sos>", "<eos>"]
num_special_tokens = len(special_tokens)

# Initialize a tensor to hold the embeddings for special tokens
# Here, PAD is initialized to zeros, and SOS, EOS to random values
special_embeddings = torch.zeros(num_special_tokens, embedding_dim)
special_embeddings[1:] = (
    torch.rand(num_special_tokens - 1, embedding_dim) * 0.01
)  # Small random numbers for SOS and EOS


# Concatenate the special token embeddings with the GloVe embeddings
extended_embeddings = torch.cat([special_embeddings, glove_embeddings], dim=0)

In [43]:
import torch
import torch.nn as nn
import torch.optim as optim
import deeplay as dl
from deeplay import DeeplayModule, Classifier

hidden_features = 150


class MyClassifier(Classifier):

    def training_step(self, batch, batch_idx):
        x1, x2, m = batch
        y = torch.cat((x2[:, 1:], x2[:, -1:]), dim=1)
        y_hat = self(x1, x2)
        loss = self.loss(y_hat, y, m)
        # loss = self.loss(y_hat.view(-1, y_hat.size(-1)), y.view(-1))
        self.log(
            f"train_loss",
            loss,
            on_step=True,
            on_epoch=True,
            prog_bar=True,
            logger=True,
        )

        self.log_metrics(
            "train", y_hat, y, on_step=True, on_epoch=True, prog_bar=True, logger=True
        )

        return loss

    def forward(self, x1, x2):
        return self.model(x1, x2)


class Encoder(DeeplayModule):
    def __init__(self, vocab_size, embedding_dim, lstm_units):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, lstm_units, batch_first=True, dropout=0.1)

    def forward(self, x):
        x = self.embedding(x)
        # x = torch.nn.utils.rnn.pack_padded_sequence(x, lengths)
        outputs, (hidden, cell) = self.lstm(x)
        return hidden, cell


class Decoder(DeeplayModule):
    def __init__(self, vocab_size, embedding_dim, lstm_units):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, lstm_units, batch_first=True, dropout=0.1)
        self.dense = nn.Linear(lstm_units, vocab_size)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, hidden, cell):
        x = self.embedding(x)
        outputs, (hidden, cell) = self.lstm(x, (hidden, cell))
        outputs = self.dense(outputs)
        outputs = self.softmax(outputs)
        return outputs, hidden, cell


class Seq2Seq(DeeplayModule):
    def __init__(self, vocab_size, embedding_dim, lstm_units):
        super(Seq2Seq, self).__init__()
        self.encoder = Encoder(vocab_size, embedding_dim, lstm_units)
        self.decoder = Decoder(vocab_size, embedding_dim, lstm_units)
        self.vocab_size = vocab_size
        self.lstm_units = lstm_units

    def forward(self, encoder_input_data, decoder_input_data):
        encoder_hidden, encoder_cell = self.encoder(encoder_input_data)

        decoder_hidden = encoder_hidden
        decoder_cell = encoder_cell

        outputs = torch.zeros(
            (decoder_input_data.size(0), decoder_input_data.size(1), self.vocab_size)
        ).to("mps")
        for t in range(decoder_input_data.size(1)):  # Iterate through the sequence
            output, decoder_hidden, decoder_cell = self.decoder(
                decoder_input_data[:, t].unsqueeze(-1), decoder_hidden, decoder_cell
            )
            outputs[:, t, :] = output.squeeze(1)
        return outputs


seq2seq = Seq2Seq(len(vocab), embedding_dim, hidden_features)


def NLLLoss(inp, target, mask):
    crossEntropy = -torch.log(
        torch.gather(inp.view(-1, inp.shape[-1]), 1, target.view(-1, 1))
    )
    loss = crossEntropy.masked_select(mask.view(-1, 1)).mean()
    return loss  # , nTotal.item()


seq2seq_classifier = MyClassifier(
    model=seq2seq,
    loss=NLLLoss,  # nn.CrossEntropyLoss(),
    optimizer=dl.Adam(),
).create()

seq2seq_classifier.model.encoder.embedding.weight.data = extended_embeddings
seq2seq_classifier.model.encoder.embedding.weight.requires_grad = False
seq2seq_classifier.model.decoder.embedding.weight.data = extended_embeddings
seq2seq_classifier.model.decoder.embedding.weight.requires_grad = False



In [44]:
import deeptrack as dt
import torch

sources = dt.sources.Source(inputs=queries, targets=responses, masks=masks_r)

inputs_pl = dt.Value(sources.inputs) >> dt.pytorch.ToTensor(dtype=torch.int)
targets_pl = dt.Value(sources.targets) >> dt.pytorch.ToTensor(dtype=torch.int)
masks_pl = dt.Value(sources.masks) >> dt.pytorch.ToTensor(dtype=torch.bool)

In [45]:
from torch.utils.data import DataLoader

train_dataset = dt.pytorch.Dataset(inputs_pl & targets_pl & masks_pl, inputs=sources)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)

In [46]:
trainer = dl.Trainer(max_epochs=100, accelerator="mps")

In [47]:
trainer.fit(seq2seq_classifier, train_loader)

/Users/841602/Documents/GitHub/Environments/deeplay_env/lib/python3.10/site-packages/lightning/pytorch/trainer/configuration_validator.py:74: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.


Output()

/Users/841602/Documents/GitHub/Environments/deeplay_env/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


In [48]:
def make_inference(model, source_text, max_length=MAX_LENGTH):
    # Tokenize the source text
    query_tokens = tokenize(source_text)

    # Process the tokens into the model's expected format, including adding <sos>, <eos>, and padding
    query, _ = process_sentence(query_tokens)

    # Convert tokens to indices using the vocabulary
    query = np.array(vocab(query))

    # Convert list of indices to a tensor and add a batch dimension
    source_sequence = torch.tensor(query, dtype=torch.int)

    # Move tensor to the same device as the model
    source_sequence = source_sequence.to(next(model.parameters()).device)

    # Encoder inference
    with torch.no_grad():
        hidden, cell = model.encoder(source_sequence)

    # Prepare the initial input to the decoder: <sos> token index
    target_index = torch.tensor(vocab(["<sos>"]), device=source_sequence.device)

    predictions = []
    for _ in range(max_length):
        with torch.no_grad():
            output, hidden, cell = model.decoder(target_index, hidden, cell)
            top1 = output.argmax(1)  # Adjust indexing based on output shape
            if top1.item() == vocab(["<eos>"])[0]:  # Stop if <eos> token is generated
                break
            predictions.append(top1.item())
            target_index = top1

    # Convert indices back to tokens
    predicted_tokens = [vocab.lookup_token(idx) for idx in predictions]

    return " ".join(predicted_tokens)

In [54]:
source_text = "who is the president?"
response = make_inference(seq2seq_classifier.model, source_text)
print(response)

i am not sure .
