I will be using PyTorch for my deep learning module, as I have the most experience with it from coursework and a summer program.

In [None]:
# Import the necessary modules
import string

import numpy as np
import torch
import torch.nn as nn

# Task 1: Sentence Transformer Implementation

For this task, I referenced the textbook Dive into Deep Learning Chapter 11 Section 7 <https://d2l.ai/chapter_attention-mechanisms-and-transformers/transformer.html>, as well as my coursework from COMP SCI 539: Introduction to Artificial Neural Networks.

We want to implement the transformer architecture, modeled by the figure below:

<img src=transformer.png>

If we were to implement this from scratch, we would need to implement encoder and decoder layers, which would require implementing multi-head attention, feed-forward networks, etc. Fortunately, PyTorch provides a default Transformer module, which we can use. However, we will still need to provide masked source and target sequences.

First, we want to be able to convert a sentence to a fixed-size list of tokens; for convenience, we will use words as tokens and remove punctuation characters. (This means each contraction becomes a single token with the apostrophe removed, e.g. "DON'T" becomes "DONT".) We can use padding tokens to attain the specified word count. Next, we will want to convert the tokens into a list of indices, each of which will correspond to a certain word. These indices will be input to an Embedding layer, which will provide the embedded sequences.

To do this, we will need to create a dictionary of words, as well as some auxiliary functions and constants.

In [None]:
# Special tokens: each one pairs a surface word with a fixed vocabulary index.
START_TOKEN_WORD, START_TOKEN_IDX = "SOS", 0  # start of sentence
END_TOKEN_WORD, END_TOKEN_IDX = "EOS", 1  # end of sentence
PAD_TOKEN_WORD, PAD_TOKEN_IDX = "PAD", 2  # padding
# TODO: remove the unknown-word token if unused
UNKNOWN_TOKEN_WORD, UNKNOWN_TOKEN_IDX = "UNK", 3  # unknown word


class WordDictionary:
    """Two-way vocabulary between words and integer indices, with usage counts.

    The four special tokens are registered up front at their reserved indices
    with a count of zero.  Any other word receives the next free index the
    first time it is added; adding a known word only bumps its count.
    """

    def __init__(self):
        specials = (
            (START_TOKEN_WORD, START_TOKEN_IDX),
            (END_TOKEN_WORD, END_TOKEN_IDX),
            (PAD_TOKEN_WORD, PAD_TOKEN_IDX),
            (UNKNOWN_TOKEN_WORD, UNKNOWN_TOKEN_IDX),
        )
        self.word_to_index = dict(specials)
        self.index_to_word = {idx: word for word, idx in specials}
        self.word_to_count = {word: 0 for word, _ in specials}
        # Also serves as the index handed to the next unseen word.
        self.n_words = len(self.word_to_index)

    def add_word_list(self, sentence: list[str]):
        """Register every word of ``sentence``, in order."""
        for token in sentence:
            self.add_word(token)

    def add_word(self, word: str):
        """Register one occurrence of ``word``, assigning an index if it is new."""
        if word not in self.word_to_index:
            new_index = self.n_words
            self.word_to_index[word] = new_index
            self.index_to_word[new_index] = word
            self.word_to_count[word] = 1
            self.n_words = new_index + 1
            return
        self.word_to_count[word] += 1


# Translation table that deletes every ASCII punctuation character in one pass.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def tokenize_and_pad(sentence: str, token_count: int) -> list[str]:
    """Tokenize the sentence and truncate or pad to a list of fixed length.

    The sentence is upper-cased, every ASCII punctuation character is deleted
    (so "DON'T" becomes "DONT" and "well-done" becomes "WELLDONE"), and the
    result is split on whitespace.  The start token always comes first.  If
    there is room, the end token follows the last word and padding tokens
    fill the remaining slots; otherwise the list is cut to ``token_count``
    entries and carries no end token.

    Args:
        sentence: Raw input text.
        token_count: Exact length of the returned list; must be >= 0.

    Returns:
        A list of exactly ``token_count`` word tokens.

    Raises:
        ValueError: If ``token_count`` is negative.
    """
    # A negative count would otherwise silently slice words off the end.
    if token_count < 0:
        raise ValueError(f"token_count must be non-negative, got {token_count}")

    words = sentence.upper().translate(_PUNCTUATION_TABLE).split()
    word_list = [START_TOKEN_WORD, *words]
    if len(word_list) < token_count:
        word_list.append(END_TOKEN_WORD)
        word_list.extend([PAD_TOKEN_WORD] * (token_count - len(word_list)))
    return word_list[:token_count]


def word_list_to_indices(word_list: list[str], word_dict: WordDictionary):
    """Map each word to its vocabulary index.

    Every word is first registered in ``word_dict`` (new words get an index,
    known words get their count incremented), so the lookup cannot miss.
    """
    word_dict.add_word_list(word_list)
    return [word_dict.word_to_index[token] for token in word_list]


def indices_to_word_list(indices: list[int], word_dict: WordDictionary):
    """Map each vocabulary index back to its word, preserving order."""
    return [word_dict.index_to_word[position] for position in indices]

Now let's create a WordDictionary and add a list of sentences we will use. For creating a large language model with real world applications, we would want to add all the English words we will use into the dictionary; however, for the sake of simplicity, we will demonstrate our model using only the words of a few sample sentences.

In [None]:
# Shared vocabulary; the loop below fills it as a side effect of word_list_to_indices.
word_dict = WordDictionary()
# Fixed length of every token list, counting the SOS/EOS/PAD markers.
token_count = 10

# Sample corpus: the only sentences whose words are added to the vocabulary here.
sentences = [
    "Hello World!",
    "Roses are red, violets are blue.",
    "Pineapple belongs on pizza.",
    "Milk first, then cereal.",
    "A well-done steak is a steak well-done.",
    "The ocean is a soup, and soup is a drink.",
    "Ergo, the ocean is a drink.",
    "How many times have I told you, Kevin?",
    "DON'T EAT FOOD OFF THE FLOOR!",
    "Eyes of mine with fire burn.",
    "Heart of mine freezing in the blizzard; I am numb.",
]

# Tokenize each sentence, print the tokens, then print the matching indices.
# Sentences with nine or more words fill all ten slots, so they are truncated
# and end up without an EOS token.
for sentence in sentences:
    tokens = tokenize_and_pad(sentence=sentence, token_count=token_count)
    print(tokens)
    indices = word_list_to_indices(tokens, word_dict)
    print(indices)

Now we can create our Transformer model class.

In [None]:
# nn.Transformer supplies the encoder/decoder stacks but no embedding layers,
# so this subclass adds token and position embeddings and builds the padding
# and causal masks directly from the index tensors.


class SentenceTransformerModel(nn.Transformer):
    """Encoder-decoder transformer that consumes padded word-index sequences.

    Word indices are turned into vectors by a token embedding plus a learned
    position embedding, then passed through the stock ``nn.Transformer`` with
    padding masks (positions equal to ``PAD_TOKEN_IDX``) and a causal mask on
    the target side.
    """

    def __init__(
        self,
        sentence_length: int,
        word_dict: WordDictionary,
        d_model: int = 512,
        nhead: int = 8,
        **transformer_kwargs,
    ):
        """Build the transformer and its embedding layers.

        Args:
            sentence_length: Maximum number of tokens per sequence; sizes the
                position-embedding table, so longer inputs are rejected by
                the embedding lookup.
            word_dict: Vocabulary whose current ``n_words`` sizes the token
                embedding.  Words added to it afterwards have no embedding.
            d_model: Width of the embeddings and of the transformer; must be
                divisible by ``nhead``.
            nhead: Number of attention heads.
            **transformer_kwargs: Forwarded to ``nn.Transformer`` (for example
                ``num_encoder_layers``).  ``batch_first`` is fixed to True and
                must not be passed.
        """
        # batch_first=True: inputs are (batch, sequence) index tensors.
        super().__init__(
            d_model=d_model, nhead=nhead, batch_first=True, **transformer_kwargs
        )
        self.sentence_length = sentence_length
        # The embedding width must equal d_model, not the sentence length.
        self.embedding = nn.Embedding(
            num_embeddings=word_dict.n_words, embedding_dim=d_model
        )
        # nn.Transformer has no notion of token order, so positions are learned.
        self.position_embedding = nn.Embedding(
            num_embeddings=sentence_length, embedding_dim=d_model
        )

    def _embed(self, indices: torch.Tensor) -> torch.Tensor:
        """Return token embeddings plus position embeddings for ``indices``."""
        positions = torch.arange(indices.size(-1), device=indices.device)
        return self.embedding(indices) + self.position_embedding(positions)

    def forward(self, x, tgt=None):
        """Run the transformer on source indices ``x`` and target indices ``tgt``.

        Args:
            x: Source word indices, shape (batch, sequence) or (sequence,);
                a tensor or a (nested) list of ints.
            tgt: Target word indices in the same layout.  Defaults to ``x``,
                i.e. the model is asked to reproduce its own input.

        Returns:
            The decoder output; per the ``nn.Transformer`` documentation this
            has shape (batch, target sequence, d_model) for batched input.
        """
        device = self.embedding.weight.device
        src = torch.as_tensor(x, dtype=torch.long, device=device)
        if tgt is None:
            tgt = src
        else:
            tgt = torch.as_tensor(tgt, dtype=torch.long, device=device)

        # Boolean masks: True marks a position that must NOT be attended to.
        src_padding = src == PAD_TOKEN_IDX
        tgt_padding = tgt == PAD_TOKEN_IDX
        tgt_len = tgt.size(-1)
        # Strict upper triangle hides future target positions from each query.
        causal_mask = torch.triu(
            torch.ones(tgt_len, tgt_len, dtype=torch.bool, device=device),
            diagonal=1,
        )
        return super().forward(
            self._embed(src),
            self._embed(tgt),
            tgt_mask=causal_mask,
            src_key_padding_mask=src_padding,
            tgt_key_padding_mask=tgt_padding,
            memory_key_padding_mask=src_padding,
        )


transformer = SentenceTransformerModel(sentence_length=token_count, word_dict=word_dict)

# "Hello World!" as a batch of one: SOS, HELLO, WORLD, EOS, then PAD up to token_count.
transformer([[0, 4, 5, 1, 2, 2, 2, 2, 2, 2]])

In [None]:
# TODO: implement

# Task 2: Multi-Task Learning Expansion

In [None]:
# TODO: implement

# Task 3: Training Considerations

In [None]:
# TODO: implement

# Task 4: Training Loop Implementation (BONUS)

In [None]:
# TODO: implement