# Week 14: Colab Experiment

# I. Introduction
In this exercise, we first train a transformer using the Wikitext-2 dataset and then use the model to generate new text with the length specified by the user.  

# II. Methods

What is the model architecture?

In [30]:

import time
import math
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer

In [31]:
# Select the compute backend automatically: CUDA if present, then Apple MPS,
# otherwise fall back to the CPU. To force a particular backend, replace this
# block with e.g. `device = torch.device("cpu")`.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    # `torch.backends.mps` is missing on older PyTorch builds, hence the getattr guard.
    device = torch.device("mps")
else:
    device = torch.device("cpu")

In [32]:
# Seed the global RNG manually for reproducibility.
torch.manual_seed(0)

# ---- batching ----
batch_size = 20        # batch width used for the training split
eval_batch_size = 10   # batch width used for the validation / test splits
bptt = 35              # sequence length

# ---- model ----
emsize = 200           # size of word embeddings
nhead = 2              # number of attention heads
nhid = 200             # feed-forward width inside each encoder layer
nlayers = 2            # number of encoder layers
dropout = 0.2

# ---- optimisation ----
lr = 5                 # initial learning rate (20 was tried previously)
epochs = 10            # upper epoch limit
clip = 0.25            # gradient clipping
log_interval = 200     # report interval

# ---- output ----
save = 'model.pt'      # path to save the final model

## Load data

In [33]:
# Colab only: mount Google Drive so the course files are reachable.
# from google.colab import drive
# drive.mount('/content/drive')
import sys
# Colab only: add the Drive folder holding the `data` module (change to your own path).
# sys.path.append('/content/drive/MyDrive/Week14/Week14/') # Change to your own path
# Local run: put the current working directory on the module search path so the
# project-local `data` module imported below can be found, and echo that
# directory so a wrong working directory is easy to spot.
sys.path.append(os.getcwd())
print(os.getcwd())
import data

/home/codeamon/desktop/ML/homework


In [34]:
# Build the corpus object from the WikiText-2 directory. The path is relative
# to the current working directory. The rest of the notebook reads
# `corpus.train`, `corpus.valid`, `corpus.test` and `corpus.dictionary` from it.
# Colab alternative (Drive path):
# corpus = data.Corpus('/content/drive/My Drive/Week14/Week14/data/wikitext-2')
corpus = data.Corpus('../data/wikitext-2') 

def batchify(data, bsz):
    """Fold a token stream into ``bsz`` side-by-side columns.

    The stream is cut to the largest multiple of ``bsz`` (leftover tokens at
    the end are discarded), laid out as ``bsz`` rows, and transposed so that
    each column is one contiguous slice of the original stream. The result
    has ``data.size(0) // bsz`` rows and ``bsz`` columns and is moved to the
    module-level ``device``.
    """
    rows = data.size(0) // bsz
    usable = data[:rows * bsz]              # drop the tail that does not fit
    columns = usable.view(bsz, -1).t()      # one stream slice per column
    return columns.contiguous().to(device)

# Batch every split in one pass; training uses `batch_size` columns while the
# validation and test splits share `eval_batch_size`.
train_data, val_data, test_data = (
    batchify(tokens, width)
    for tokens, width in (
        (corpus.train, batch_size),
        (corpus.valid, eval_batch_size),
        (corpus.test, eval_batch_size),
    )
)
# Vocabulary size: sets the embedding table and output layer dimensions.
ntokens = len(corpus.dictionary)

## Build the model

In [35]:
# Define positional encoding used in the transformer model

#################################################################################################
# [TODO]: Build a positional encoding function that can be used in the TransformerModel below
#################################################################################################
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence of embeddings.

    A table ``pe`` of shape ``(max_len, 1, d_model)`` is precomputed with
    ``sin`` in the even feature columns and ``cos`` in the odd ones, and
    registered as a buffer so it moves with the module but is not trained.

    Args:
        d_model: embedding dimension (may be even or odd).
        dropout: dropout probability applied after adding the encoding.
        max_len: number of positions in the precomputed table; inputs longer
            than this cannot be encoded.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd-indexed column than
        # even-indexed ones, so trim the angle table to fit; for an even
        # d_model the slice is a no-op.
        pe[:, 0, 1::2] = torch.cos(angles[:, :d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:len(x)])``.

        ``x`` is indexed by position along dim 0 and the table broadcasts
        over dim 1, i.e. a sequence-first ``(seq_len, batch, d_model)`` layout.
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)


In [36]:
# Define the transformer model

class TransformerModel(nn.Transformer):
    """Encoder-only Transformer language model.

    Token ids are embedded, scaled by ``sqrt(ninp)``, combined with positional
    encodings, passed through a stack of encoder layers under a causal mask,
    and projected to vocabulary size. ``forward`` returns log-probabilities.

    Args:
        ntoken: vocabulary size.
        ninp: embedding / model dimension.
        nhead: number of attention heads.
        nhid: feed-forward width inside each encoder layer.
        nlayers: number of encoder layers.
        dropout: dropout used by the positional encoding and encoder layers.
    """

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super().__init__(
            d_model=ninp,
            nhead=nhead,
            dim_feedforward=nhid,
            num_encoder_layers=nlayers,
        )
        self.model_type = 'Transformer'
        self.src_mask = None  # causal mask, cached between forward() calls
        self.pos_encoder = PositionalEncoding(ninp, dropout)

        # Assigning `encoder` / `decoder` below overrides the attributes of the
        # same name that were set up by the parent constructor.
        layer = TransformerEncoderLayer(
            d_model=ninp, nhead=nhead, dim_feedforward=nhid, dropout=dropout
        )
        self.encoder = TransformerEncoder(encoder_layer=layer, num_layers=nlayers)

        self.input_emb = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        """Return an ``sz x sz`` float mask: 0 on/below the diagonal, -inf above."""
        blocked = torch.full((sz, sz), float('-inf'))
        return torch.triu(blocked, diagonal=1)

    def init_weights(self):
        """Uniformly initialise embedding and output weights; zero the output bias."""
        bound = 0.1
        nn.init.uniform_(self.input_emb.weight, -bound, bound)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -bound, bound)

    def forward(self, src, has_mask=True):
        """Compute next-token log-probabilities for every position of ``src``.

        Args:
            src: token-id tensor whose first dimension is the sequence position.
            has_mask: when true, apply the causal mask so a position cannot
                attend to later positions; when false, the cached mask is
                cleared and attention is unrestricted.

        Returns:
            ``log_softmax`` over the last (vocabulary) dimension of the
            projected encoder output.
        """
        if not has_mask:
            self.src_mask = None
        else:
            seq_len = len(src)
            cached = self.src_mask
            # Rebuild the mask only when the sequence length changes.
            if cached is None or cached.size(0) != seq_len:
                self.src_mask = self._generate_square_subsequent_mask(seq_len).to(src.device)

        embedded = self.input_emb(src) * math.sqrt(self.ninp)
        embedded = self.pos_encoder(embedded)
        hidden = self.encoder(embedded, mask=self.src_mask)
        logits = self.decoder(hidden)
        return F.log_softmax(logits, dim=-1)

In [37]:
# Build the language model from the hyperparameters above and place it on the
# selected device.
model = TransformerModel(
    ntoken=ntokens,
    ninp=emsize,
    nhead=nhead,
    nhid=nhid,
    nlayers=nlayers,
    dropout=dropout,
)
model = model.to(device)
# The model's forward() already returns log-probabilities, so negative
# log-likelihood is the matching loss.
criterion = nn.NLLLoss()

## Training

In [38]:


def get_batch(source, i):
    """Slice one training window starting at row ``i`` of ``source``.

    The window is at most ``bptt`` rows long and is shortened near the end so
    that a target exists for every input row. Targets are the same rows
    shifted forward by one and flattened to 1-D.

    Returns:
        ``(data, target)``: the input rows and the flattened next-token targets.
    """
    remaining = len(source) - 1 - i
    seq_len = min(bptt, remaining)
    stop = i + seq_len
    inputs = source[i:stop]
    labels = source[i + 1:stop + 1].view(-1)
    return inputs, labels


def evaluate(data_source):
    """Return the average per-row loss of the global ``model`` on ``data_source``.

    The model is switched to evaluation mode (disabling dropout) and run
    without gradient tracking over consecutive ``bptt``-sized windows. Each
    window's mean loss is weighted by its number of rows, and the total is
    divided by ``len(data_source) - 1``.
    """
    model.eval()
    vocab_size = len(corpus.dictionary)
    running = 0.
    with torch.no_grad():
        for start in range(0, data_source.size(0) - 1, bptt):
            inputs, labels = get_batch(data_source, start)
            log_probs = model(inputs).view(-1, vocab_size)
            window_loss = criterion(log_probs, labels).item()
            # Weight by window length: the final window may be shorter.
            running += len(inputs) * window_loss
    return running / (len(data_source) - 1)


def train():
    """Run one pass of plain SGD over ``train_data`` with the global ``model``.

    For every ``bptt``-sized window: clear gradients, compute the loss,
    back-propagate, clip the gradient norm to ``clip``, and apply
    ``p -= lr * p.grad`` to each parameter. Every ``log_interval`` batches the
    mean loss and timing for that interval are printed.

    Reads the module-level ``model``, ``criterion``, ``corpus``, ``train_data``,
    ``bptt``, ``clip``, ``lr``, ``log_interval`` and ``epoch`` (the latter is
    set by the surrounding epoch loop and used only for logging).
    """
    # Training mode enables dropout.
    model.train()
    vocab_size = len(corpus.dictionary)
    window_loss = 0.
    window_start = time.time()
    for batch, offset in enumerate(range(0, train_data.size(0) - 1, bptt)):
        inputs, labels = get_batch(train_data, offset)
        # Gradients accumulate by default, so reset them for each window.
        model.zero_grad()
        log_probs = model(inputs).view(-1, vocab_size)
        loss = criterion(log_probs, labels)
        loss.backward()

        # Clip, then take a manual SGD step on every parameter.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        for param in model.parameters():
            param.data.add_(param.grad, alpha=-lr)

        window_loss += loss.item()

        if batch % log_interval != 0 or batch <= 0:
            continue

        cur_loss = window_loss / log_interval
        elapsed = time.time() - window_start
        print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
            epoch, batch, len(train_data) // bptt, lr,
            elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss)))
        window_loss = 0
        window_start = time.time()



# Loop over epochs.
best_val_loss = None

# At any point you can hit Ctrl + C to break out of training early.
try:
    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        # Compare against None explicitly: a truthiness test would treat a
        # best loss of exactly 0.0 as "nothing recorded yet".
        if best_val_loss is None or val_loss < best_val_loss:
            with open(save, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr /= 4.0
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# Load the best saved model.
# The checkpoint is a fully pickled nn.Module (see torch.save(model, f) above),
# so weights_only must be disabled explicitly; newer PyTorch releases default
# to weights_only=True and refuse such files.
# SECURITY: unpickling can execute arbitrary code -- only load checkpoints that
# this notebook itself produced.
if os.path.isfile(save):
    with open(save, 'rb') as f:
        model = torch.load(f, weights_only=False)
else:
    # Training was interrupted before the first checkpoint was written; keep
    # the in-memory model instead of crashing on a missing file.
    print('No checkpoint found at {}; evaluating the in-memory model'.format(save))


# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)




| epoch   1 |   200/ 2983 batches | lr 5.00 | ms/batch  2.87 | loss  7.59 | ppl  1978.65
| epoch   1 |   400/ 2983 batches | lr 5.00 | ms/batch  2.81 | loss  6.79 | ppl   892.93
| epoch   1 |   600/ 2983 batches | lr 5.00 | ms/batch  2.82 | loss  6.50 | ppl   663.33
| epoch   1 |   800/ 2983 batches | lr 5.00 | ms/batch  2.81 | loss  6.36 | ppl   575.48
| epoch   1 |  1000/ 2983 batches | lr 5.00 | ms/batch  2.83 | loss  6.25 | ppl   520.25
| epoch   1 |  1200/ 2983 batches | lr 5.00 | ms/batch  2.79 | loss  6.22 | ppl   503.55
| epoch   1 |  1400/ 2983 batches | lr 5.00 | ms/batch  2.78 | loss  6.14 | ppl   465.24
| epoch   1 |  1600/ 2983 batches | lr 5.00 | ms/batch  2.79 | loss  6.15 | ppl   470.38
| epoch   1 |  1800/ 2983 batches | lr 5.00 | ms/batch  2.79 | loss  6.03 | ppl   415.69
| epoch   1 |  2000/ 2983 batches | lr 5.00 | ms/batch  2.79 | loss  6.03 | ppl   415.04
| epoch   1 |  2200/ 2983 batches | lr 5.00 | ms/batch  2.79 | loss  5.92 | ppl   372.02
| epoch   1 |  2400/ 

  model = torch.load(f)


| End of training | test loss  5.19 | test ppl   178.85


# III. Results
Here we generate text of length 100 words.

In [39]:
num_words = 100   # number of tokens to generate
temperature = 1   # divisor applied to the log-probabilities before sampling


# Dedicated generator whose initial state is recorded so that it can be
# restored later with g.set_state(initial_state).
g = torch.Generator().manual_seed(0)
initial_state = g.get_state()

# The checkpoint stores the whole pickled module, so weights_only must be
# disabled explicitly; newer PyTorch releases default to weights_only=True and
# refuse such files.
# SECURITY: unpickling can execute arbitrary code -- only load checkpoints that
# this notebook itself produced.
with open('./model.pt', 'rb') as f:
    model = torch.load(f, map_location=device, weights_only=False)
model.eval()

  model = torch.load(f, map_location=device)


TransformerModel(
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=200, out_features=200, bias=True)
        )
        (linear1): Linear(in_features=200, out_features=200, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=200, out_features=200, bias=True)
        (norm1): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
      )
    )
  )
  (decoder): Linear(in_features=200, out_features=33278, bias=True)
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (input_emb): Embedding(33278, 200)
)

In [40]:
g.set_state(initial_state)
# Seed token of shape (1, 1), drawn with the dedicated generator `g`.
# Named `input_ids` rather than `input` so the builtin input() is not shadowed.
input_ids = torch.randint(ntokens, (1, 1), dtype=torch.long, generator=g).to(device)

# Dictionary to map indices to words
idx_to_word = corpus.dictionary.idx2word

##################################################################################
# Use the transformer model to generate new text, one token at a time
##################################################################################
generated_words = []
with torch.no_grad():
    for _ in range(num_words):
        # 1. Log-probabilities for the token following the current sequence
        #    (last position, single batch column).
        log_probs = model(input_ids)[-1, 0, :]
        # 2. Temperature scaling: dividing log-probabilities by T and
        #    re-normalising yields a distribution proportional to p ** (1 / T).
        word_probs = F.softmax(log_probs / temperature, dim=-1)
        # 3. Sample the next word index. NOTE: this draws from the global RNG
        #    of the tensor's device; `g` is only used for the seed token.
        next_word = torch.multinomial(word_probs, num_samples=1)
        # 4. Append the sampled token along the sequence dimension; the tensor
        #    is reused directly instead of round-tripping through a Python int.
        input_ids = torch.cat([input_ids, next_word.view(1, 1)], dim=0)
        # 5. Look up the word for the sampled index.
        generated_words.append(idx_to_word[next_word.item()])

# 6. Assemble the output text; every word is followed by a single space.
generated_text = "".join(word + " " for word in generated_words)

print(generated_text)

Africa against Maryland , Ireland , Georgia in Scotland , Spain , and when 2002 was replaced by Union Mech Div , bridges were the largest user in which land . Among those who developed Tigernach records other players are large numbers were advocated such as a poor starlings were slowed @-@ enhancing the island 's wasps , West Branch as Dr. Patrick 's colours are also linking South Wales are still bears a big fewer males drowned . By 1997 USD ) returns to South Africa in such as part of greater concern to them thereafter , making the 


# IV. Conclusion and Discussion

What did you find and learn in this exercise?

The purpose of the task is to generate new text sequences using a trained language model. Generation starts with a random initial word and builds a sequence by repeatedly predicting and sampling the next word, ultimately producing a piece of text that resembles the training corpus. The model we trained is a standard Transformer based on the paper `Attention is All You Need`.

1. Import required libraries, initialize random state, and hyperparameters.
2. Load `wikitext-2` dataset and split into training, testing, and validation sets.
3. Build a transformer model based on the architecture introduced in the paper `Attention is All You Need`, with some modifications. Since the language modeling task is to assign a probability to each word that could follow a given sequence of words, only the encoder stack is used in this task, together with a causal (square subsequent) mask so that each position can attend only to itself and earlier positions. Moreover, the PositionalEncoding module adds information about the relative or absolute positions of tokens in the sequence. The positional encodings are designed to have the same dimension as the embeddings, allowing them to be summed together. In this case, sine and cosine functions with varying frequencies are used.
$$
\begin{align*}
PE_{(\text{pos},\,2i)}&=\sin\left(\text{pos}/10000^{2i/d_{\text{model}}}\right) \\
PE_{(\text{pos},\,2i+1)}&=\cos\left(\text{pos}/10000^{2i/d_{\text{model}}}\right)
\end{align*}
$$
4. After building the model, we train it on the `wikitext-2` dataset. The model, based on a transformer architecture with encoder layers, learns to predict the next word in a sequence. It processes input sequences by incorporating positional encodings, which use sine and cosine functions to encode the positions of words in the sequence. This allows the model to handle varying sequence lengths. During training, the model's predicted log-probabilities are compared to the actual next words using the negative log-likelihood loss (equivalent to cross-entropy on the raw scores), gradients are computed through backpropagation and clipped, and the model's parameters are updated by stochastic gradient descent. The learning rate is divided by 4 whenever the validation loss does not improve.
5. Once the model is trained, it can generate text by starting with a random word. The model predicts the next word in the sequence, appends it to the input, and continues predicting and appending words to generate a coherent piece of text.
