# **Building and Training Sequence-to-Sequence Models for Language Processing with PyTorch and fastai**

In [34]:
# Import necessary libraries from fastai for text processing
from fastai.text.all import *

# Download and prepare the Human Numbers dataset.
# `path` is whatever untar_data returns for URLs.HUMAN_NUMBERS; load_data()
# below reads train.txt and valid.txt from it.
path = untar_data(URLs.HUMAN_NUMBERS)
# NOTE(review): presumably this makes fastai display paths relative to the
# dataset folder — confirm against the fastai docs.
Path.BASE_PATH = path

In [35]:
# Load and prepare data
def load_data(path):
    """Read the dataset's text files and return one flat list of tokens.

    ``train.txt`` and ``valid.txt`` are read, in that order, from ``path``.
    Every line is stripped, the lines are joined with ``' . '`` (so a ``'.'``
    token marks each line boundary) and the resulting stream is split on
    single spaces.

    Args:
        path: Directory holding the two files; must support the ``/``
            operator (e.g. a ``pathlib.Path``).

    Returns:
        list[str]: The tokens in file order.
    """
    stripped = []
    for name in ('train.txt', 'valid.txt'):
        # Explicit encoding so decoding does not depend on the platform
        # locale; iterating the handle avoids the extra copy readlines() makes.
        with open(path/name, encoding='utf-8') as f:
            stripped.extend(line.strip() for line in f)
    return ' . '.join(stripped).split(' ')

# Flat token stream built from train.txt followed by valid.txt.
tokens = load_data(path)

In [36]:

# Vocabulary = distinct tokens (L.unique); word2idx maps each token to its
# position in the vocabulary; nums is the token stream encoded as those ids.
vocab = L(*tokens).unique()
word2idx = dict(zip(vocab, range(len(vocab))))
nums = L([word2idx[tok] for tok in tokens])

# Prepare sequences for the model
def create_sequences(nums, seq_len=3):
    """Build (context, next-token) pairs with a stride-1 sliding window.

    Args:
        nums: Sliceable sequence of token ids.
        seq_len: Number of consecutive ids in each context window. Defaults
            to 3, the value that used to be hard-coded.

    Returns:
        L: One ``(tensor(nums[i:i+seq_len]), nums[i+seq_len])`` tuple for
        every start index ``i`` that leaves room for a target; empty when
        ``nums`` has ``seq_len`` items or fewer.

    Raises:
        ValueError: If ``seq_len`` is smaller than 1.
    """
    if seq_len < 1:
        raise ValueError(f"seq_len must be >= 1, got {seq_len}")
    n_windows = len(nums) - seq_len  # non-positive -> empty range below
    return L((tensor(nums[i:i+seq_len]), nums[i+seq_len]) for i in range(n_windows))

# (context tensor, next token id) pairs over the whole numericalized stream.
sequences = create_sequences(nums)

In [37]:
# Split data into training and validation sets and create DataLoader
def split_data(sequences, train_pct=0.8):
    """Cut ``sequences`` positionally into a training and a validation part.

    No shuffling is done: the first ``int(len(sequences) * train_pct)`` items
    form the training part and everything after them the validation part.

    Args:
        sequences: Any sliceable sequence.
        train_pct: Fraction of items assigned to the training part.

    Returns:
        tuple: ``(train_part, valid_part)`` slices of ``sequences``.
    """
    n_train = int(train_pct * len(sequences))
    train_part = sequences[:n_train]
    valid_part = sequences[n_train:]
    return train_part, valid_part

# Positional split using split_data's default train_pct (0.8).
train_seqs, valid_seqs = split_data(sequences)

# Batch size; reused later in the file when the data loaders are rebuilt.
bs = 64
# shuffle=False is passed so samples are served in their stored order.
dls = DataLoaders.from_dsets(train_seqs, valid_seqs, bs=bs, shuffle=False)

In [38]:
# Define a simple RNN model architecture
class SimpleRNN(Module):
    """Stateful RNN that predicts the token following a short context.

    The hidden state is kept on the instance between calls. It is detached
    after every forward pass so that a later ``backward()`` does not try to
    walk through the (already freed) graph of earlier batches.

    Args:
        vocab_sz: Number of distinct token ids.
        n_hidden: Width of the embedding and of the hidden state.
    """
    def __init__(self, vocab_sz, n_hidden):
        # Explicit base init: harmless with fastai's Module, required with a
        # plain torch.nn.Module.
        super().__init__()
        self.i_h = nn.Embedding(vocab_sz, n_hidden)  # Input to hidden
        self.h_h = nn.Linear(n_hidden, n_hidden)     # Hidden to hidden
        self.h_o = nn.Linear(n_hidden, vocab_sz)     # Hidden to output
        self.h = 0                                   # Hidden state; 0 broadcasts on first use

    def forward(self, x):
        """Return next-token scores for ``x`` (indexed as ``x[:, i]``)."""
        # A state left over from a batch of another size cannot be added to
        # this batch's embeddings, so start from scratch in that case.
        if torch.is_tensor(self.h) and self.h.size(0) != x.size(0):
            self.h = 0
        h = self.h
        for i in range(x.size(1)):  # one step per token column
            h = F.relu(self.h_h(h + self.i_h(x[:, i])))
        out = self.h_o(h)
        # Keep the values for the next batch but cut the autograd history.
        self.h = h.detach()
        return out

    def reset(self):
        """Forget the carried-over hidden state."""
        self.h = 0

In [39]:

import torch.nn.functional as F
from torch import nn

# Define the first simple language model
class LMModel1(Module):
    """Language model over exactly three tokens, written fully unrolled.

    Each token is embedded with ``i_h``, added to the running activation
    (from the second token on), and passed through the shared ``h_h`` layer
    followed by ReLU; ``h_o`` turns the final activation into token scores.
    """
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.h_h = nn.Linear(n_hidden, n_hidden)
        self.h_o = nn.Linear(n_hidden, vocab_sz)

    def forward(self, x):
        """Return next-token scores from the columns 0, 1 and 2 of ``x``."""
        tok0, tok1, tok2 = x[:, 0], x[:, 1], x[:, 2]
        hidden = F.relu(self.h_h(self.i_h(tok0)))
        hidden = F.relu(self.h_h(hidden + self.i_h(tok1)))
        hidden = F.relu(self.h_h(hidden + self.i_h(tok2)))
        return self.h_o(hidden)

In [40]:
# Our First Recurrent Neural Network
class LMModel2(Module):
    """Stateless recurrent language model: one shared step per input token.

    Args:
        vocab_sz: Number of distinct token ids.
        n_hidden: Width of the embedding and of the hidden activation.
    """
    def __init__(self, vocab_sz, n_hidden):
        # Explicit base init: harmless with fastai's Module, required with a
        # plain torch.nn.Module.
        super().__init__()
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.h_h = nn.Linear(n_hidden, n_hidden)
        self.h_o = nn.Linear(n_hidden, vocab_sz)

    def forward(self, x):
        """Return next-token scores after consuming every column of ``x``."""
        h = 0  # broadcasts against the first embedding
        # Follow the input's own length instead of a hard-coded 3, as
        # LMModel4 in this file already does.
        for i in range(x.size(1)):
            h = h + self.i_h(x[:, i])
            h = F.relu(self.h_h(h))
        return self.h_o(h)


In [41]:
class LMModel3(Module):
    """Recurrent language model that carries its hidden state across batches.

    The state is (re)created as zeros whenever it is missing or its batch
    dimension differs from the incoming batch, and it is detached after each
    forward pass so gradients never flow into earlier batches.

    Args:
        vocab_sz: Number of distinct token ids.
        n_hidden: Width of the embedding and of the hidden state.
    """
    def __init__(self, vocab_sz, n_hidden):
        # Explicit base init: harmless with fastai's Module, required with a
        # plain torch.nn.Module.
        super().__init__()
        self.i_h = nn.Embedding(vocab_sz, n_hidden)  # Input to hidden layer
        self.h_h = nn.Linear(n_hidden, n_hidden)     # Hidden to hidden layer
        self.h_o = nn.Linear(n_hidden, vocab_sz)     # Hidden to output layer
        self.n_hidden = n_hidden
        self.h = None  # Created lazily in forward()

    def forward(self, x):
        """Return next-token scores after consuming every column of ``x``."""
        if self.h is None or self.h.size(0) != x.size(0):
            # Initialize hidden state to zeros if it is None or batch size has changed
            self.h = torch.zeros(x.size(0), self.n_hidden, device=x.device)
        h = self.h
        # Follow the input's own length instead of a hard-coded 3, as
        # LMModel4 in this file already does.
        for i in range(x.size(1)):
            h = F.relu(self.h_h(h + self.i_h(x[:, i])))
        out = self.h_o(h)
        self.h = h.detach()  # Detach to prevent backprop through the entire history
        return out

    def reset(self):
        """Drop the hidden state; the next forward() starts from zeros."""
        self.h = None

In [42]:
# Callback to reset model state at the beginning of training and validation
class ModelResetter(Callback):
    """Callback that calls ``self.model.reset()`` before each phase.

    Hooked on ``before_train`` and ``before_validate``, so the model must
    provide a ``reset()`` method.
    """
    def _reset_model(self):
        self.model.reset()

    def before_train(self):
        self._reset_model()

    def before_validate(self):
        self._reset_model()

# Training the improved RNN: LMModel3 with 64 hidden units, plain
# cross-entropy loss, accuracy metric, and ModelResetter so the carried-over
# hidden state is cleared before each training and validation phase.
learn = Learner(dls, LMModel3(len(vocab), 64), loss_func=F.cross_entropy, metrics=accuracy, cbs=ModelResetter)
# 10 epochs of fit_one_cycle with a learning-rate argument of 3e-3.
learn.fit_one_cycle(10, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.335075,1.789224,0.484824,00:09
1,1.03608,1.5619,0.502258,00:09
2,1.025811,1.718748,0.507964,00:10
3,1.003425,1.97814,0.514779,00:11
4,0.987475,2.286434,0.520168,00:10
5,1.026209,2.152428,0.507172,00:10
6,0.99238,2.056885,0.530866,00:10
7,1.035084,2.183841,0.52207,00:09
8,1.096464,1.738006,0.525636,00:09
9,1.217491,1.404073,0.51256,00:09


In [43]:
def group_chunks(ds, bs):
    """Reorder ``ds`` so consecutive unshuffled batches continue each other.

    ``ds`` is viewed as ``bs`` contiguous runs of ``m = len(ds) // bs`` items.
    The result lists, for each offset ``i`` in ``range(m)``, the ``i``-th item
    of every run (``ds[i]``, ``ds[i+m]``, ``ds[i+2*m]``, ...). Read in batches
    of ``bs``, row ``j`` of one batch is followed in ``ds`` by row ``j`` of
    the next batch. Items beyond ``m * bs`` are left out.

    Args:
        ds: Indexable dataset.
        bs: Batch size (number of runs).

    Returns:
        L: The reordered items, ``m * bs`` in total.
    """
    run_len = len(ds) // bs
    return L(ds[offset + run_len * row]
             for offset in range(run_len)
             for row in range(bs))

In [44]:

# Window length for the sequence-to-sequence-targets setup.
sl = 16
# Non-overlapping windows (stride sl): each sample pairs sl token ids with
# the same window shifted one position right, so every input position has
# the following token as its target.
seqs = L((tensor(nums[i:i+sl]), tensor(nums[i+1:i+sl+1]))
         for i in range(0,len(nums)-sl-1,sl))
# Positional 80/20 split; both parts are reordered with group_chunks and
# served unshuffled, dropping the last incomplete batch, so every batch has
# exactly bs rows in the arranged order.
cut = int(len(seqs) * 0.8)
dls = DataLoaders.from_dsets(group_chunks(seqs[:cut], bs),
                             group_chunks(seqs[cut:], bs),
                             bs=bs, drop_last=True, shuffle=False)

In [45]:
class LMModel4(Module):
    """Stateful RNN that emits token scores at every input position.

    The hidden state starts as the scalar 0 (which broadcasts against the
    first embedding), is carried across calls, and is detached at the end of
    each forward pass.
    """
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.h_h = nn.Linear(n_hidden, n_hidden)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h = 0  # scalar placeholder until the first forward pass

    def forward(self, x):
        """Return the per-step scores stacked along dim 1."""
        per_step = []
        for step in range(x.size(1)):  # x is indexed as (batch, position)
            self.h = F.relu(self.h_h(self.h + self.i_h(x[:, step])))
            per_step.append(self.h_o(self.h))
        # Keep the values for the next batch, drop the autograd history.
        self.h = self.h.detach()
        return torch.stack(per_step, dim=1)

    def reset(self):
        """Return the hidden state to its initial scalar 0."""
        self.h = 0


def loss_func(inp, targ):
    """Cross-entropy over every position of a batch of sequences.

    All leading dimensions of ``inp`` are flattened into one, and ``targ`` is
    flattened to match. The class count is read from ``inp`` itself rather
    than from a module-level vocabulary, and ``reshape`` is used so
    non-contiguous tensors are accepted too.

    Args:
        inp: Scores whose last dimension indexes the classes.
        targ: Integer class ids, one per leading position of ``inp``.

    Returns:
        The value of ``F.cross_entropy`` on the flattened tensors.
    """
    n_classes = inp.size(-1)
    return F.cross_entropy(inp.reshape(-1, n_classes), targ.reshape(-1))
# LMModel4 (64 hidden units) trained with the flattening loss_func above;
# ModelResetter clears the carried-over hidden state before each phase.
learn = Learner(dls, LMModel4(len(vocab), 64), loss_func=loss_func, metrics=accuracy, cbs=ModelResetter)
# 15 epochs of fit_one_cycle with a learning-rate argument of 3e-3.
learn.fit_one_cycle(15, 3e-3)


epoch,train_loss,valid_loss,accuracy,time
0,3.329795,3.128267,0.180094,00:02
1,2.393921,1.968232,0.469727,00:02
2,1.778201,1.87135,0.457031,00:03
3,1.504304,1.817366,0.489502,00:02
4,1.332533,1.851684,0.526204,00:02
5,1.195112,1.843094,0.533203,00:02
6,1.082083,2.012559,0.581868,00:03
7,1.001093,2.035439,0.58667,00:02
8,0.931485,2.102063,0.597819,00:02
9,0.869821,2.269284,0.625977,00:02


## **LSTM**

In [46]:
class LSTMCell(Module):
    """Single LSTM step with all four gates computed by two fused layers.

    ``ih`` and ``hh`` each produce ``4*nh`` features; their sum is split along
    dim 1 into four equal chunks used, in order, as input gate, forget gate,
    output gate (all sigmoid) and candidate cell values (tanh).
    """
    def __init__(self, ni, nh):
        self.ih = nn.Linear(ni,4*nh)
        self.hh = nn.Linear(nh,4*nh)

    def forward(self, input, state):
        """Advance one step.

        Args:
            input: Features for this step, fed to ``ih``.
            state: ``(h, c)`` pair from the previous step.

        Returns:
            ``(h_next, (h_next, c_next))``.
        """
        h_prev, c_prev = state
        # One fused projection for all gates instead of four separate ones.
        fused = self.ih(input) + self.hh(h_prev)
        raw_in, raw_forget, raw_out, raw_cell = fused.chunk(4, 1)
        in_gate = torch.sigmoid(raw_in)
        forget_gate = torch.sigmoid(raw_forget)
        out_gate = torch.sigmoid(raw_out)
        candidate = raw_cell.tanh()

        c_next = (forget_gate*c_prev) + (in_gate*candidate)
        h_next = out_gate * c_next.tanh()
        return h_next, (h_next, c_next)

In [47]:
# Small demo tensor (0..9) used to illustrate Tensor.chunk in the next cell.
t = torch.arange(0,10); t

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [48]:
# Split t into two pieces (see the output below: 0-4 and 5-9).
t.chunk(2)

(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9]))

In [49]:
class LMModel6(Module):
    """Multi-layer LSTM language model that carries its state across batches.

    The (hidden, cell) state is built lazily from the first batch it sees, so
    it always matches that batch's size and the embeddings' device and dtype
    instead of relying on a module-level batch size. It is rebuilt as zeros
    whenever the batch size or device changes, and detached after every
    forward pass.

    Args:
        vocab_sz: Number of distinct token ids.
        n_hidden: Width of the embedding and of the LSTM state.
        n_layers: Number of stacked LSTM layers.
    """
    def __init__(self, vocab_sz, n_hidden, n_layers):
        # Explicit base init: harmless with fastai's Module, required with a
        # plain torch.nn.Module.
        super().__init__()
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h = None  # [hidden, cell]; created on first forward()

    def _state_for(self, emb):
        """Return a state compatible with ``emb``, creating zeros if needed."""
        n_batch = emb.size(0)
        stale = (self.h is None
                 or self.h[0].size(1) != n_batch
                 or self.h[0].device != emb.device)
        if stale:
            shape = (self.rnn.num_layers, n_batch, self.rnn.hidden_size)
            self.h = [emb.new_zeros(shape) for _ in range(2)]
        return self.h

    def forward(self, x):
        """Return token scores for every position of ``x`` (batch first)."""
        emb = self.i_h(x)
        res,h = self.rnn(emb, tuple(self._state_for(emb)))
        # Keep the values for the next batch, drop the autograd history.
        self.h = [h_.detach() for h_ in h]
        return self.h_o(res)

    def reset(self):
        """Zero the carried-over state (no-op before the first forward)."""
        if self.h is None: return
        for h in self.h: h.zero_()


# Two-layer LSTM language model with 64 hidden units; CrossEntropyLossFlat
# is used as the loss and ModelResetter clears the state before each phase.
learn = Learner(dls, LMModel6(len(vocab), 64, 2),
                loss_func=CrossEntropyLossFlat(),
                metrics=accuracy, cbs=ModelResetter)
# 15 epochs of fit_one_cycle with a learning-rate argument of 1e-2.
learn.fit_one_cycle(15, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,3.021744,2.702758,0.318115,00:04
1,2.22038,2.398314,0.265544,00:03
2,1.619505,1.872986,0.479492,00:03
3,1.289589,2.184089,0.523519,00:04
4,1.015464,2.09879,0.587321,00:03
5,0.759935,1.98451,0.620443,00:03
6,0.545575,1.74931,0.665609,00:03
7,0.35604,1.66323,0.688151,00:04
8,0.233045,1.645964,0.732992,00:03
9,0.145595,1.657074,0.735352,00:03


## **Regularizing an LSTM**

In [50]:
class Dropout(Module):
    """Inverted dropout: zero activations with probability ``p`` in training.

    Surviving activations are scaled by ``1 / (1 - p)`` so the expected value
    of the output matches the input. In eval mode the input is returned
    unchanged.

    Args:
        p: Probability of zeroing each element.
    """
    def __init__(self, p):
        # Explicit base init: harmless with fastai's Module, required with a
        # plain torch.nn.Module.
        super().__init__()
        self.p = p

    def forward(self, x):
        """Apply the random mask to ``x`` (training) or pass it through (eval)."""
        if not self.training: return x
        keep = 1 - self.p  # the instance attribute, not a bare (undefined) `p`
        mask = torch.empty_like(x).bernoulli_(keep)
        # With p == 1 the mask is already all zeros; dividing by 0 would
        # turn it into NaNs.
        if keep > 0: mask.div_(keep)
        return x * mask

In [51]:
class LMModel7(Module):
    """LSTM language model with output dropout and tied embedding weights.

    ``h_o`` shares its weight matrix with the ``i_h`` embedding. The
    (hidden, cell) state is built lazily from the first batch it sees, so it
    matches that batch's size and the embeddings' device and dtype instead of
    relying on a module-level batch size. It is rebuilt as zeros whenever the
    batch size or device changes, and detached after every forward pass.

    Args:
        vocab_sz: Number of distinct token ids.
        n_hidden: Width of the embedding and of the LSTM state.
        n_layers: Number of stacked LSTM layers.
        p: Dropout probability applied to the LSTM output.
    """
    def __init__(self, vocab_sz, n_hidden, n_layers, p):
        # Explicit base init: harmless with fastai's Module, required with a
        # plain torch.nn.Module.
        super().__init__()
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)
        self.drop = nn.Dropout(p)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h_o.weight = self.i_h.weight  # weight tying
        self.h = None  # [hidden, cell]; created on first forward()

    def _state_for(self, emb):
        """Return a state compatible with ``emb``, creating zeros if needed."""
        n_batch = emb.size(0)
        stale = (self.h is None
                 or self.h[0].size(1) != n_batch
                 or self.h[0].device != emb.device)
        if stale:
            shape = (self.rnn.num_layers, n_batch, self.rnn.hidden_size)
            self.h = [emb.new_zeros(shape) for _ in range(2)]
        return self.h

    def forward(self, x):
        """Return ``(scores, raw, out)``.

        ``raw`` is the LSTM output, ``out`` is ``raw`` after dropout, and
        ``scores`` is ``h_o(out)``.
        """
        emb = self.i_h(x)
        raw,h = self.rnn(emb, tuple(self._state_for(emb)))
        out = self.drop(raw)
        # Keep the values for the next batch, drop the autograd history.
        self.h = [h_.detach() for h_ in h]
        return self.h_o(out),raw,out

    def reset(self):
        """Zero the carried-over state (no-op before the first forward)."""
        if self.h is None: return
        for h in self.h: h.zero_()

In [52]:
# Plain Learner with explicit callbacks: ModelResetter plus
# RNNRegularizer(alpha=2, beta=1); dropout probability is 0.5 here.
# NOTE(review): LMModel7.forward returns (scores, raw, out); presumably
# RNNRegularizer consumes the last two — confirm against the fastai docs.
learn = Learner(dls, LMModel7(len(vocab), 64, 2, 0.5),
                loss_func=CrossEntropyLossFlat(), metrics=accuracy,
                cbs=[ModelResetter, RNNRegularizer(alpha=2, beta=1)])

In [53]:
# Rebinds `learn`: the Learner from the previous cell is discarded and a
# fresh LMModel7 (dropout 0.4) is wrapped in a TextLearner with no explicit
# callbacks.
# NOTE(review): presumably TextLearner adds the resetter/regularizer
# callbacks itself — confirm against the fastai docs.
learn = TextLearner(dls, LMModel7(len(vocab), 64, 2, 0.4),
                    loss_func=CrossEntropyLossFlat(), metrics=accuracy)

In [54]:
# 15 epochs of fit_one_cycle with a learning-rate argument of 1e-2 and
# weight decay 0.1.
learn.fit_one_cycle(15, 1e-2, wd=0.1)

epoch,train_loss,valid_loss,accuracy,time
0,2.610863,2.077192,0.455322,00:03
1,1.597773,1.432384,0.596598,00:04
2,0.846239,0.89436,0.802409,00:03
3,0.413939,0.82646,0.839518,00:03
4,0.211952,0.700753,0.848307,00:03
5,0.11456,0.652205,0.864502,00:04
6,0.068979,0.537934,0.87915,00:03
7,0.047844,0.62162,0.876628,00:03
8,0.03806,0.594036,0.877197,00:03
9,0.028884,0.687566,0.871012,00:03
