In [1]:
import os
import pickle
import torch
from torch.utils.data import DataLoader

In [6]:
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device}")

Using cuda


In [2]:
def _load_pickle(path):
    """Return the object unpickled from the file at `path`."""
    # pickle can run arbitrary code while loading, so only open trusted files.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Training pairs plus the two vocabulary lookup tables.
xy_pairs = _load_pickle("data/XY_pairs.pkl")
token_to_int = _load_pickle("data/token_to_int.pkl")
int_to_token = _load_pickle("data/int_to_token.pkl")

Convert the feature–label pairs to PyTorch tensors, then wrap them in a `DataLoader` for batching.

In [4]:
def _as_tensor_pair(pair):
    """Turn one (x, y) sample into a tuple of two tensors."""
    feature, label = pair
    return torch.tensor(feature), torch.tensor(label)


fea_lbl_pairs = [_as_tensor_pair(pair) for pair in xy_pairs]

In [5]:
BATCH_SIZE = 32
# Seed PyTorch's global RNG before building the shuffling loader.
torch.manual_seed(42)
loader = DataLoader(
    dataset=fea_lbl_pairs,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

# Build the Model

In [7]:
from torch import nn

In [14]:
class NextTokenLSTM(nn.Module):
    """Next-token predictor: token embedding -> stacked LSTM -> linear layer producing vocabulary logits.

    Args:
        vocab_size: number of distinct token ids (embedding rows and output logits).
        input_size: width of each token embedding, i.e. the LSTM's input feature size.
        n_embed: LSTM hidden-state size, which is also the input width of the output layer.
        n_layers: number of stacked LSTM layers.
        drop_prob: dropout probability applied between LSTM layers.
    """

    def __init__(self, vocab_size, input_size=128, n_embed=128, n_layers=3, drop_prob=0.2):
        super().__init__()
        self.vocab_size = vocab_size
        self.input_size = input_size
        self.n_embed = n_embed
        self.n_layers = n_layers
        self.drop_prob = drop_prob
        # The embedding feeds the LSTM, so its width must equal the LSTM's input_size.
        self.embedding = nn.Embedding(self.vocab_size, self.input_size)
        # nn.LSTM only applies dropout between layers; with a single layer it has no
        # effect and just emits a warning, so pass 0 in that case.
        lstm_dropout = self.drop_prob if self.n_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            input_size=self.input_size,
            hidden_size=self.n_embed,
            num_layers=self.n_layers,
            dropout=lstm_dropout,
            batch_first=True,
        )
        # The LSTM emits hidden_size (= n_embed) features per step, so the projection
        # to the vocabulary must accept n_embed inputs.
        self.full_connected = nn.Linear(self.n_embed, self.vocab_size)

    def forward(self, x, hc=None):
        """Run the network on a batch of token ids.

        Args:
            x: integer token ids, batch dimension first (the LSTM uses batch_first=True).
            hc: optional (hidden, cell) tuple as returned by `initalize_hidden` or a
                previous call; when None the LSTM starts from a zero state.

        Returns:
            (logits, (hidden, cell)): logits have vocab_size entries in the last
            dimension; the state tuple can be fed back into the next call.
        """
        embed = self.embedding(x)
        o, hc = self.lstm(embed, hc)
        y = self.full_connected(o)
        return y, hc

    def initalize_hidden(self, batch_size):
        """Return a zero (hidden, cell) pair of shape (n_layers, batch_size, n_embed).

        The tensors take their dtype and device from the model's parameters and do
        not require gradients.
        """
        weight = next(self.parameters())
        state_shape = (self.n_layers, batch_size, self.n_embed)
        return (weight.new_zeros(state_shape), weight.new_zeros(state_shape))

    # Correctly spelled alias; the original (misspelled) name is kept for existing callers.
    initialize_hidden = initalize_hidden

## Train the model

In [19]:
# One embedding row / output logit per entry in the token lookup table.
vocab_size = len(token_to_int)
model = NextTokenLSTM(vocab_size)
model = model.to(device)
print(model)

NextTokenLSTM(
  (embedding): Embedding(14094, 128)
  (lstm): LSTM(128, 128, num_layers=3, batch_first=True, dropout=0.2)
  (full_connected): Linear(in_features=128, out_features=14094, bias=True)
)


In [20]:
# Adam step size.
lr = 1e-4
# Cross-entropy over the vocabulary logits.
loss_func = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)

In [None]:
N_EPOCHS = 50     # full passes over the loader
CLIP_NORM = 5     # upper bound on the gradient norm before each optimizer step
LOG_EVERY = 1000  # iterations between running-loss reports

# Make sure training mode (LSTM dropout active) is on even if an earlier cell called model.eval().
model.train()
for epoch in range(N_EPOCHS):
    total_loss = 0.0
    n_batches = 0  # batches that actually contributed to total_loss
    # NOTE(review): the state is carried from one batch to the next, but the loader
    # shuffles, so consecutive batches are unrelated samples — confirm this is intended.
    sh, sc = model.initalize_hidden(BATCH_SIZE)
    for idx, (x, y) in enumerate(loader):
        # The carried state is sized for BATCH_SIZE, so a short batch is skipped.
        if x.shape[0]==BATCH_SIZE:
            inputs, targets = x.to(device), y.to(device)
            optimizer.zero_grad()
            outputs, (sh, sc) = model(inputs, (sh, sc))
            # CrossEntropyLoss wants the class dimension second, so move the
            # vocabulary axis from last to position 1.
            losses = loss_func(outputs.transpose(1,2), targets)
            # Cut the graph so the next batch does not backpropagate into this one.
            sh, sc = sh.detach(), sc.detach()
            losses.backward()
            nn.utils.clip_grad_norm_(model.parameters(), CLIP_NORM)
            optimizer.step()
            total_loss += losses.item()
            n_batches += 1
        if (idx+1)%LOG_EVERY==0:
            # Average over processed batches only; skipped batches add nothing to total_loss.
            print(f"at epoch {epoch+1} iteration {idx+1} average loss = {total_loss/max(n_batches, 1)}")

at epoch 1 iteration 1000 average loss = 6.42337814950943
at epoch 1 iteration 2000 average loss = 6.238867068052292


Save the trained model.

In [23]:
# torch.save does not create missing folders, so make sure the target directory exists first.
os.makedirs("models", exist_ok=True)
# Only the weights (state_dict) are written, not the model object itself.
torch.save(model.state_dict(), "models/wordLSTM.pth")