In [1]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import numpy as np
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import random
import time
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence

In [2]:
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

PreProcessing

In [3]:

def preprocess(data):
    data = re.sub(r'\n|\s+', ' ', data) #newline and multiple spaces -> single space
    data = re.sub(r'[’‘]', '\'', data) #apostrophes
    data = re.sub(r'[“”`\' ]|[–—-]', ' ', data) #quotes and dashes
    data = re.sub(r'(?<!\w)([.!?])(?!\w)', r' \1 ', data) #dont remove punctuation
    data = re.sub(r'[™•]', ' ', data) #remove other unwanted symbols
    return data.strip() #strip extra spaces

Tokenization

In [4]:
def tokenize(data, min_length_sentences):
    sentences = sent_tokenize(data)
    sentences = [sentence for sentence in sentences if len(sentence.split()) >= min_length_sentences]

    print("Length of sentences:", len(sentences))

    words_sentences = []

    for sentence in sentences:
        words = word_tokenize(sentence)
        words = [word.lower() for word in words if word.lower() not in ['.', ',', '!', '?', ';', ':']]
        words = ['<s>'] + words + ['</s>']
        words_sentences.append(words)

    return sentences, words_sentences

Data Preparations

In [64]:
def train_val_test_split(sentences, train_ratio=0.7, val_ratio=0.125, seed=None, num_shuffles=1):
    if seed is not None:
        random.seed(seed)

    for _ in range(num_shuffles):
        random.shuffle(sentences)

    total_sentences = len(sentences)

    train_size = int(total_sentences * train_ratio)
    val_size = int(total_sentences * val_ratio)
    test_size = total_sentences - train_size - val_size

    train_sentences = sentences[:train_size]
    val_sentences = sentences[train_size:train_size + val_size]
    test_sentences = sentences[train_size + val_size:]

    return train_sentences, val_sentences, test_sentences

Load Glove Embeddings

In [65]:
def create_glove_embeddings(glove_path):
    glove = {}
    embedding_dim = 0

    with open(glove_path, 'r') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = torch.tensor([float(val) for val in values[1:]])
            glove[word] = vector
            embedding_dim = len(values[1:])

    glove['<UNK>'] = torch.mean(torch.stack(list(glove.values())), dim=0)
    glove['<PAD>'] = torch.zeros(embedding_dim)
    glove['<s>'] = torch.rand(embedding_dim)
    glove['</s>'] = torch.rand(embedding_dim)

    return glove

Creation of Vocab and Embeddings

In [66]:
def create_embeddings_and_encode(train_sentences, val_sentences, test_sentences, glove):
    embedding_dim = len(list(glove.values())[0])
    vocab = set()

    vocab.update(['<UNK>', '<PAD>', '<s>', '</s>'])
    for sentence in train_sentences:
        for word in sentence:
            if word in glove:
                vocab.add(word)
            else:
                sentence[sentence.index(word)] = '<UNK>'

    embeddings = np.zeros((len(vocab), embedding_dim))
    word_to_idx = {word: idx for idx, word in enumerate(vocab)}

    for word in vocab:
        if word in glove:
            embeddings[word_to_idx[word]] = glove[word]
        else:
            embeddings[word_to_idx[word]] = np.random.rand(embedding_dim)

    def encode_sentences(sentences, word_to_idx):
        encoded_sentences = []
        for sentence in sentences:
            encoded_sentence = [word_to_idx[word] if word in word_to_idx else word_to_idx['<UNK>'] for word in sentence]
            encoded_sentences.append(encoded_sentence)
        return encoded_sentences

    encoded_train_sentences = encode_sentences(train_sentences, word_to_idx)
    encoded_val_sentences = encode_sentences(val_sentences, word_to_idx)
    encoded_test_sentences = encode_sentences(test_sentences, word_to_idx)

    return torch.FloatTensor(embeddings), encoded_train_sentences, encoded_val_sentences, encoded_test_sentences, word_to_idx, list(vocab)

Dataset for training LSTM

In [67]:
class LSTMDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence = self.data[idx]
        input_sentence = torch.tensor(sentence[:-1], dtype=torch.long)
        target = torch.tensor(sentence[1:], dtype=torch.long)
        return input_sentence, target

In [68]:
def collate_fn(batch, pad_idx):
    input_sentences, targets = zip(*batch)
    input_sentences = pad_sequence(input_sentences, batch_first=True, padding_value=pad_idx)
    targets = pad_sequence(targets, batch_first=True, padding_value=pad_idx)
    return input_sentences, targets

LSTM

In [69]:
class LSTM(nn.Module):
    def __init__(self, embeddings, hidden_dim, dropout, num_layers=1):
        super(LSTM, self).__init__()
        # freeze embeddings
        self.embeddings = nn.Embedding.from_pretrained(embeddings, freeze=True)
        self.vocab_size = embeddings.shape[0]
        self.embedding_dim = embeddings.shape[1]
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        self.lstm = nn.LSTM(input_size=self.embedding_dim, hidden_size=self.hidden_dim, num_layers=self.num_layers, dropout=self.dropout, batch_first=True)
        self.fc1 = nn.Linear(self.hidden_dim, self.vocab_size)
        self.dropout = nn.Dropout(self.dropout)

    def forward(self, input_seq, hidden=None):
        input_seq = self.embeddings(input_seq)

        if hidden is None:
            lstm_out, hidden = self.lstm(input_seq)
        else:
            lstm_out, hidden = self.lstm(input_seq, hidden)

        return self.fc1(self.dropout(lstm_out)), hidden

Model Testing

In [70]:
def test_model(model, val_loader, criterion, pad_idx):
    model.eval()
    total_loss = 0
    hidden = None

    with torch.no_grad():
        for x, y in val_loader:
            x, y = x.to(device), y.to(device)
            batch_size = x.size(0)

            if hidden is not None and batch_size != hidden[0].size(1):
                hidden = None

            output, hidden = model(x, hidden)
            loss = criterion(output.view(-1, output.shape[2]), y.view(-1))
            total_loss += loss.item()

            if hidden is not None:
                hidden = (hidden[0].detach(), hidden[1].detach())

    avg_val_loss = total_loss / len(val_loader)
    val_perplexity = torch.exp(torch.tensor(avg_val_loss))
    return avg_val_loss, val_perplexity

Train Model

In [71]:

def train_model(model, train_loader, val_loader, optimizer, criterion, num_epochs, patience=2, pad_idx=0):
    model.to(device)
    early_stopping_counter = 0
    best_val_loss = float('inf')

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        hidden = None

        for x, y in tqdm(train_loader):
            x, y = x.to(device), y.to(device)
            batch_size = x.size(0)

            if hidden is not None and batch_size != hidden[0].size(1):
                hidden = None

            optimizer.zero_grad()
            output, hidden = model(x, hidden)
            loss = criterion(output.view(-1, output.shape[2]), y.view(-1))
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            if hidden is not None:
                hidden = (hidden[0].detach(), hidden[1].detach())

        avg_train_loss = total_loss / len(train_loader)
        perplexity = torch.exp(torch.tensor(avg_train_loss))

        print(f'Train Loss: {avg_train_loss:.4f}')
        print(f'Train Perplexity: {perplexity:.4f}')

        avg_val_loss, val_perplexity = test_model(model, val_loader, criterion, pad_idx)

        print(f'Val Loss: {avg_val_loss:.4f}')
        print(f'Val Perplexity: {val_perplexity:.4f}')

        # check for early stopping
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            early_stopping_counter = 0
            torch.save(model.state_dict(), '2024201073_LSTM.pt')
        else:
            early_stopping_counter += 1
            if early_stopping_counter >= patience:
                print(f'Early stopping at epoch {epoch + 1}')
                break

    return model

Save perplexities in files

In [72]:
def save_perplexities_lstm(model, sentences, criterion, filename, idx_to_word):
    model.eval()
    total_loss = 0
    all_sentences = []
    perplexity_scores = []

    with torch.no_grad():
        for sentence in sentences:
            sentence_loss = 0
            sentence_length = 0
            input_indices = sentence[:-1]
            target_indices = sentence[1:]

            input_tensor = torch.tensor(input_indices, dtype=torch.long).unsqueeze(0).to(device)
            targets = torch.tensor(target_indices, dtype=torch.long).to(device)
            outputs, _ = model(input_tensor)

            for i in range(outputs.shape[1]):
                output = outputs[0, i]
                target_word = targets[i]

                loss = criterion(output.unsqueeze(0), target_word.unsqueeze(0))
                sentence_loss += loss.item()
                sentence_length += 1

            avg_loss_per_sentence = sentence_loss / sentence_length
            sentence_perplexity = torch.exp(torch.tensor(avg_loss_per_sentence)).item()
            perplexity_scores.append(sentence_perplexity)

            sentence_words = [idx_to_word[idx] for idx in sentence]
            full_sentence = " ".join(sentence_words)
            all_sentences.append(full_sentence)

        avg_perplexity = sum(perplexity_scores) / len(perplexity_scores)

    with open(filename, 'w') as f:
        for i, sentence in enumerate(all_sentences):
            f.write(f"{sentence}\t{perplexity_scores[i]:.4f}\n")

        f.write(f"Average\t{avg_perplexity:.4f}\n")

    return avg_perplexity

In [73]:
def save_model(model, path):
    torch.save(model.state_dict(), path)
    print(f"Model saved to {path}")

Running the model

In [74]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [75]:
from google.colab import files
uploaded = files.upload()

In [76]:
with open('/content/Pride and Prejudice - Jane Austen.txt', 'r') as f:
    corpus = f.read()

corpus = preprocess(corpus)

sentences, word_sentences = tokenize(corpus, 2)

train_sentences, val_sentences, test_sentences = train_val_test_split(word_sentences)

print("Train size:", len(train_sentences))
print("Validation size:", len(val_sentences))
print("Test size:", len(test_sentences))

Length of sentences: 6085
Train size: 4259
Validation size: 760
Test size: 1066


In [18]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2025-02-17 15:22:01--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-02-17 15:22:01--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-02-17 15:22:01--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [77]:
glove = create_glove_embeddings('/content/glove.6B.100d.txt')

embeddings, encoded_train, encoded_val, encoded_test, word_to_idx, vocab = create_embeddings_and_encode(train_sentences, val_sentences, test_sentences, glove)

In [78]:
pad_idx = word_to_idx['<PAD>']

train_dataset = LSTMDataset(encoded_train)
val_dataset = LSTMDataset(encoded_val)
test_dataset = LSTMDataset(encoded_test)

train_loader = DataLoader(train_dataset, batch_size=64, collate_fn=lambda batch: collate_fn(batch, pad_idx), shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, collate_fn=lambda batch: collate_fn(batch, pad_idx), shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, collate_fn=lambda batch: collate_fn(batch, pad_idx), shuffle=True)

print(f'Train dataset size: {len(train_dataset)}')
print(f'Validation dataset size: {len(val_dataset)}')
print(f'Test dataset size: {len(test_dataset)}')

Train dataset size: 4259
Validation dataset size: 760
Test dataset size: 1066


In [133]:
learning_rate = 0.001
num_epochs = 30
patience = 3
model = LSTM(embeddings, 300, 0.6, 2)
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = optim.Adam(model.parameters(), learning_rate)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

In [134]:
model = train_model(model, train_loader, val_loader, optimizer, criterion, num_epochs, patience,scheduler)

100%|██████████| 67/67 [00:02<00:00, 24.64it/s]


Train Loss: 6.6787
Train Perplexity: 795.2762
Val Loss: 6.1043
Val Perplexity: 447.7665


100%|██████████| 67/67 [00:02<00:00, 25.31it/s]


Train Loss: 6.2771
Train Perplexity: 532.2511
Val Loss: 6.0435
Val Perplexity: 421.3833


100%|██████████| 67/67 [00:02<00:00, 24.96it/s]


Train Loss: 6.1627
Train Perplexity: 474.6846
Val Loss: 5.9584
Val Perplexity: 386.9861


100%|██████████| 67/67 [00:02<00:00, 24.47it/s]


Train Loss: 6.0534
Train Perplexity: 425.5695
Val Loss: 5.8279
Val Perplexity: 339.6581


100%|██████████| 67/67 [00:02<00:00, 25.19it/s]


Train Loss: 5.9237
Train Perplexity: 373.7765
Val Loss: 5.6836
Val Perplexity: 294.0176


100%|██████████| 67/67 [00:02<00:00, 24.84it/s]


Train Loss: 5.7866
Train Perplexity: 325.9071
Val Loss: 5.5591
Val Perplexity: 259.5934


100%|██████████| 67/67 [00:02<00:00, 24.03it/s]


Train Loss: 5.6655
Train Perplexity: 288.7353
Val Loss: 5.4521
Val Perplexity: 233.2475


100%|██████████| 67/67 [00:02<00:00, 24.18it/s]


Train Loss: 5.5737
Train Perplexity: 263.4079
Val Loss: 5.3821
Val Perplexity: 217.4773


100%|██████████| 67/67 [00:02<00:00, 24.65it/s]


Train Loss: 5.4903
Train Perplexity: 242.3327
Val Loss: 5.3196
Val Perplexity: 204.3043


100%|██████████| 67/67 [00:02<00:00, 24.69it/s]


Train Loss: 5.4206
Train Perplexity: 226.0228
Val Loss: 5.2582
Val Perplexity: 192.1346


100%|██████████| 67/67 [00:02<00:00, 24.06it/s]


Train Loss: 5.3623
Train Perplexity: 213.2093
Val Loss: 5.2141
Val Perplexity: 183.8463


100%|██████████| 67/67 [00:02<00:00, 24.40it/s]


Train Loss: 5.3142
Train Perplexity: 203.1936
Val Loss: 5.1813
Val Perplexity: 177.9165


100%|██████████| 67/67 [00:02<00:00, 24.79it/s]


Train Loss: 5.2614
Train Perplexity: 192.7569
Val Loss: 5.1480
Val Perplexity: 172.0832


100%|██████████| 67/67 [00:02<00:00, 24.67it/s]


Train Loss: 5.2197
Train Perplexity: 184.8770
Val Loss: 5.1199
Val Perplexity: 167.3129


100%|██████████| 67/67 [00:02<00:00, 24.62it/s]


Train Loss: 5.1859
Train Perplexity: 178.7327
Val Loss: 5.0945
Val Perplexity: 163.1187


100%|██████████| 67/67 [00:02<00:00, 24.76it/s]


Train Loss: 5.1467
Train Perplexity: 171.8627
Val Loss: 5.0781
Val Perplexity: 160.4649


100%|██████████| 67/67 [00:02<00:00, 25.02it/s]


Train Loss: 5.1112
Train Perplexity: 165.8641
Val Loss: 5.0565
Val Perplexity: 157.0383


100%|██████████| 67/67 [00:02<00:00, 25.18it/s]


Train Loss: 5.0842
Train Perplexity: 161.4455
Val Loss: 5.0398
Val Perplexity: 154.4359


100%|██████████| 67/67 [00:02<00:00, 25.07it/s]


Train Loss: 5.0517
Train Perplexity: 156.2902
Val Loss: 5.0221
Val Perplexity: 151.7250


100%|██████████| 67/67 [00:02<00:00, 25.53it/s]


Train Loss: 5.0175
Train Perplexity: 151.0314
Val Loss: 5.0139
Val Perplexity: 150.4832


100%|██████████| 67/67 [00:02<00:00, 24.74it/s]


Train Loss: 4.9950
Train Perplexity: 147.6780
Val Loss: 5.0018
Val Perplexity: 148.6737


100%|██████████| 67/67 [00:02<00:00, 25.45it/s]


Train Loss: 4.9629
Train Perplexity: 143.0062
Val Loss: 4.9883
Val Perplexity: 146.6883


100%|██████████| 67/67 [00:02<00:00, 25.19it/s]


Train Loss: 4.9387
Train Perplexity: 139.5920
Val Loss: 4.9733
Val Perplexity: 144.5027


100%|██████████| 67/67 [00:02<00:00, 25.43it/s]


Train Loss: 4.9156
Train Perplexity: 136.3988
Val Loss: 4.9651
Val Perplexity: 143.3231


100%|██████████| 67/67 [00:02<00:00, 25.11it/s]


Train Loss: 4.8945
Train Perplexity: 133.5493
Val Loss: 4.9548
Val Perplexity: 141.8501


100%|██████████| 67/67 [00:02<00:00, 24.96it/s]


Train Loss: 4.8656
Train Perplexity: 129.7436
Val Loss: 4.9426
Val Perplexity: 140.1272


100%|██████████| 67/67 [00:02<00:00, 25.88it/s]


Train Loss: 4.8418
Train Perplexity: 126.7014
Val Loss: 4.9352
Val Perplexity: 139.1050


100%|██████████| 67/67 [00:02<00:00, 25.25it/s]


Train Loss: 4.8182
Train Perplexity: 123.7380
Val Loss: 4.9367
Val Perplexity: 139.3101


100%|██████████| 67/67 [00:02<00:00, 25.90it/s]


Train Loss: 4.7982
Train Perplexity: 121.2948
Val Loss: 4.9244
Val Perplexity: 137.6084


100%|██████████| 67/67 [00:02<00:00, 24.88it/s]


Train Loss: 4.7779
Train Perplexity: 118.8538
Val Loss: 4.9083
Val Perplexity: 135.4132


Perplexity Scores

In [135]:
loss, perplexity = test_model(model, train_loader, criterion, pad_idx)
print(f'\nTrain Loss: {loss}')
print(f'Train Perplexity: {perplexity}')


Train Loss: 4.505523183452549
Train Perplexity: 90.51569366455078


In [136]:
loss, perplexity = test_model(model, val_loader, criterion, pad_idx)
print(f'\nVal Loss: {loss}')
print(f'Val Perplexity: {perplexity}')


Val Loss: 4.912246982256572
Val Perplexity: 135.94456481933594


In [137]:
loss, perplexity = test_model(model, test_loader, criterion, pad_idx)
print(f'\nTest Loss: {loss}')
print(f'Test Perplexity: {perplexity}')


Test Loss: 4.926246250376982
Test Perplexity: 137.8610382080078


In [139]:
save_perplexities_lstm(model, encoded_train, criterion, '2024201073_LSTM_pp_train_perplexity.txt', vocab)
save_perplexities_lstm(model, encoded_val, criterion, '2024201073_LSTM_pp_val_perplexity.txt', vocab)
save_perplexities_lstm(model, encoded_test, criterion, '2024201073_LSTM_pp_test_perplexity.txt', vocab)

163.99615225514597

In [138]:
import pickle

with open('data_store_lstm.pkl', 'wb') as f:
    pickle.dump({
        'embeddings': embeddings,
        'vocab': vocab,
        'word_to_idx': word_to_idx,
        'encoded_train': encoded_train,
        'encoded_val': encoded_val,
        'encoded_test': encoded_test,
    }, f)

print("Data saved successfully!")

Data saved successfully!
