In [None]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import numpy as np
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import random
import time
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence

In [None]:
# Fetch the Punkt tokenizer data used by NLTK's sent_tokenize / word_tokenize,
# which the tokenization step below calls.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

Preprocessing

In [None]:

def preprocess(data):
    """Clean raw corpus text with a fixed chain of regex substitutions.

    The substitutions run in this order, each on the result of the previous one:

    1. newlines and runs of whitespace are replaced by a space;
    2. curly apostrophes are rewritten as a straight apostrophe;
    3. quotes, backticks, apostrophes and dashes/hyphens are replaced by a
       space (so the apostrophes from step 2 end up as spaces too);
    4. a ``.``, ``!`` or ``?`` that has no word character directly before it
       and none directly after it is padded with a space on both sides;
    5. the trademark and bullet symbols are replaced by a space.

    Args:
        data: the text to clean.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
        Interior runs of spaces created by steps 3-5 are not collapsed.
    """
    substitutions = (
        (r'\n|\s+', ' '),                      # newlines / whitespace runs
        (r'[’‘]', '\''),                       # curly apostrophes
        (r'[“”`\' ]|[–—-]', ' '),              # quotes and dashes
        (r'(?<!\w)([.!?])(?!\w)', r' \1 '),    # stand-alone sentence punctuation
        (r'[™•]', ' '),                        # other unwanted symbols
    )

    cleaned = data
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

Tokenization

In [None]:
def tokenize(data, min_length_sentences):
    """Split text into sentences and into lower-cased, boundary-marked tokens.

    Args:
        data: preprocessed corpus text.
        min_length_sentences: minimum number of whitespace-separated words a
            sentence must have to be kept.

    Returns:
        A pair ``(sentences, words_sentences)``: the kept sentence strings, and
        for each of them a token list of the form ``['<s>', ..., '</s>']`` with
        the punctuation tokens ``. , ! ? ; :`` removed.

    Side effects:
        Prints the number of kept sentences.
    """
    dropped_tokens = ['.', ',', '!', '?', ';', ':']

    sentences = []
    for candidate in sent_tokenize(data):
        if len(candidate.split()) >= min_length_sentences:
            sentences.append(candidate)

    print("Length of sentences:", len(sentences))

    def _to_tokens(sentence):
        # Lower-case first, then drop punctuation, then wrap in boundary markers.
        lowered = (token.lower() for token in word_tokenize(sentence))
        kept = [token for token in lowered if token not in dropped_tokens]
        return ['<s>', *kept, '</s>']

    words_sentences = [_to_tokens(sentence) for sentence in sentences]

    return sentences, words_sentences

Data Preparation

In [None]:
def train_val_test_split(sentences, train_ratio=0.8, val_ratio=0.15, seed=None, num_shuffles=1):
    """Shuffle ``sentences`` and split them into train / validation / test lists.

    The input is copied before shuffling, so the caller's list keeps its
    original order.

    Args:
        sentences: sequence of items to split.
        train_ratio: fraction of items for the training split.
        val_ratio: fraction of items for the validation split.
        seed: if not ``None``, seeds Python's global ``random`` module before
            shuffling so the split is reproducible.
        num_shuffles: how many times the copy is shuffled.

    Returns:
        ``(train, val, test)`` lists. The train and validation sizes are
        ``int(len(sentences) * ratio)``; the test split receives every
        remaining item.
    """
    if seed is not None:
        random.seed(seed)

    # Work on a private copy: shuffling in place would silently reorder the
    # caller's data (and re-running a notebook cell would reshuffle it again).
    shuffled = list(sentences)
    for _ in range(num_shuffles):
        random.shuffle(shuffled)

    total_sentences = len(shuffled)
    train_end = int(total_sentences * train_ratio)
    val_end = train_end + int(total_sentences * val_ratio)

    train_sentences = shuffled[:train_end]
    val_sentences = shuffled[train_end:val_end]
    test_sentences = shuffled[val_end:]

    return train_sentences, val_sentences, test_sentences

Load GloVe Embeddings

In [None]:
def create_glove_embeddings(glove_path):
    """Load GloVe vectors from a text file into a ``{word: tensor}`` dict.

    Each non-blank line is expected to be ``word v1 v2 ... vd`` separated by
    whitespace. Four special entries are added after loading:

    * ``<UNK>``: the mean of all loaded vectors,
    * ``<PAD>``: an all-zero vector,
    * ``<s>`` and ``</s>``: vectors drawn with ``torch.rand``.

    Args:
        glove_path: path to the GloVe text file.

    Returns:
        dict mapping each word (and the four special tokens) to a 1-D tensor.

    Raises:
        ValueError: if the file contains no embedding lines.
    """
    glove = {}
    embedding_dim = 0

    # Read as UTF-8 explicitly so non-ASCII tokens load the same way on every
    # platform instead of depending on the locale's default encoding.
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                continue  # tolerate blank lines, e.g. a trailing empty line
            word = values[0]
            vector = torch.tensor([float(val) for val in values[1:]])
            glove[word] = vector
            embedding_dim = len(values[1:])

    if not glove:
        raise ValueError(f"No embeddings found in {glove_path!r}")

    # <UNK> must be computed before the other special tokens are inserted so
    # that it averages only the vectors read from the file.
    glove['<UNK>'] = torch.mean(torch.stack(list(glove.values())), dim=0)
    glove['<PAD>'] = torch.zeros(embedding_dim)
    glove['<s>'] = torch.rand(embedding_dim)
    glove['</s>'] = torch.rand(embedding_dim)

    return glove

Creation of Vocab and Embeddings

In [None]:
def create_embeddings_and_encode(train_sentences, val_sentences, test_sentences, glove):
    """Build the vocabulary and embedding matrix, and index-encode all splits.

    The vocabulary holds the four special tokens followed by every training
    word that has a GloVe vector, in first-seen order. The ordering is
    deterministic, so ``word_to_idx`` is identical across interpreter runs and
    ``<PAD>`` is always index 0. Words outside the vocabulary are encoded as
    ``<UNK>``. The input sentence lists are not modified.

    Args:
        train_sentences: list of token lists; defines the vocabulary.
        val_sentences: list of token lists.
        test_sentences: list of token lists.
        glove: non-empty dict mapping words to equal-length vectors.

    Returns:
        ``(embeddings, encoded_train, encoded_val, encoded_test, word_to_idx,
        vocab)`` where ``embeddings`` is a float tensor with one row per
        vocabulary entry and ``vocab[i]`` is the word with index ``i``.

    Raises:
        ValueError: if ``glove`` is empty.
    """
    if not glove:
        raise ValueError("glove must contain at least one vector")

    # Peek at a single vector instead of materialising the full value list.
    embedding_dim = len(next(iter(glove.values())))

    # A list plus a membership set keeps insertion order; iterating a plain set
    # of strings yields a different order (and thus different indices) per run.
    vocab = ['<PAD>', '<UNK>', '<s>', '</s>']
    seen = set(vocab)
    for sentence in train_sentences:
        for word in sentence:
            if word in glove and word not in seen:
                seen.add(word)
                vocab.append(word)

    word_to_idx = {word: idx for idx, word in enumerate(vocab)}

    embeddings = np.zeros((len(vocab), embedding_dim))
    for word, idx in word_to_idx.items():
        if word in glove:
            embeddings[idx] = glove[word]
        else:
            # Only reachable for special tokens missing from `glove`.
            embeddings[idx] = np.random.rand(embedding_dim)

    unk_idx = word_to_idx['<UNK>']

    def encode_sentences(sentences, word_to_idx):
        # Out-of-vocabulary words fall back to the <UNK> index.
        return [[word_to_idx.get(word, unk_idx) for word in sentence] for sentence in sentences]

    encoded_train_sentences = encode_sentences(train_sentences, word_to_idx)
    encoded_val_sentences = encode_sentences(val_sentences, word_to_idx)
    encoded_test_sentences = encode_sentences(test_sentences, word_to_idx)

    return torch.FloatTensor(embeddings), encoded_train_sentences, encoded_val_sentences, encoded_test_sentences, word_to_idx, vocab

Dataset for training LSTM

In [None]:
class LSTMDataset(Dataset):
    """Next-token prediction pairs built from encoded sentences.

    Item ``i`` is ``(tokens[:-1], tokens[1:])`` for the i-th sequence, both as
    ``torch.long`` tensors, so the target at each position is the token that
    follows the corresponding input token.
    """

    def __init__(self, data):
        # Indexable collection of token-id sequences.
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        token_ids = self.data[idx]
        inputs, shifted = token_ids[:-1], token_ids[1:]
        return (
            torch.tensor(inputs, dtype=torch.long),
            torch.tensor(shifted, dtype=torch.long),
        )

In [None]:
def collate_fn(batch, pad_idx):
    """Pad a batch of ``(input, target)`` tensor pairs to a common length.

    Args:
        batch: sequence of ``(input_tensor, target_tensor)`` pairs.
        pad_idx: value used to fill the padded positions of both outputs.

    Returns:
        ``(inputs, targets)``, each stacked batch-first and right-padded with
        ``pad_idx`` to the longest sequence in the batch.
    """
    inputs, labels = map(list, zip(*batch))
    return tuple(
        pad_sequence(sequences, batch_first=True, padding_value=pad_idx)
        for sequences in (inputs, labels)
    )

LSTM

In [None]:
class LSTM(nn.Module):
    """LSTM language model over frozen pretrained embeddings.

    Pipeline: embedding lookup -> (stacked) LSTM -> dropout -> linear layer
    producing one logit per vocabulary entry at every time step.

    Args:
        embeddings: 2-D float tensor ``(vocab_size, embedding_dim)`` used as
            the frozen embedding table.
        hidden_dim: LSTM hidden size.
        dropout: dropout probability applied to the LSTM output and, when
            ``num_layers > 1``, between the stacked LSTM layers.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, embeddings, hidden_dim, dropout, num_layers=1):
        super().__init__()
        # freeze embeddings
        self.embeddings = nn.Embedding.from_pretrained(embeddings, freeze=True)
        self.vocab_size = embeddings.shape[0]
        self.embedding_dim = embeddings.shape[1]
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        # nn.LSTM applies its dropout only between stacked layers; handing a
        # non-zero value to a single-layer LSTM has no effect and merely emits
        # a UserWarning, so it is passed on only when there is more than one layer.
        inter_layer_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.hidden_dim,
            num_layers=self.num_layers,
            dropout=inter_layer_dropout,
            batch_first=True,
        )
        self.fc1 = nn.Linear(self.hidden_dim, self.vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_seq, hidden=None):
        """Return ``(logits, hidden)`` for a batch of token-id sequences.

        Args:
            input_seq: integer tensor of token ids, batch first.
            hidden: optional ``(h, c)`` initial state; ``None`` starts from the
                LSTM's default zero state.

        Returns:
            Logits of shape ``(batch, seq_len, vocab_size)`` and the final
            ``(h, c)`` state returned by the LSTM.
        """
        embedded = self.embeddings(input_seq)
        # nn.LSTM itself treats hidden=None as "start from zeros".
        lstm_out, hidden = self.lstm(embedded, hidden)
        return self.fc1(self.dropout(lstm_out)), hidden

Model Testing

In [None]:
def test_model(model, val_loader, criterion, pad_idx):
    model.eval()
    total_loss = 0
    hidden = None

    with torch.no_grad():
        for x, y in val_loader:
            x, y = x.to(device), y.to(device)
            batch_size = x.size(0)

            if hidden is not None and batch_size != hidden[0].size(1):
                hidden = None

            output, hidden = model(x, hidden)
            loss = criterion(output.view(-1, output.shape[2]), y.view(-1))
            total_loss += loss.item()

            if hidden is not None:
                hidden = (hidden[0].detach(), hidden[1].detach())

    avg_val_loss = total_loss / len(val_loader)
    val_perplexity = torch.exp(torch.tensor(avg_val_loss))
    return avg_val_loss, val_perplexity

Train Model

In [None]:
def train_model(model, train_loader, val_loader, optimizer, criterion, num_epochs, patience=2, pad_idx=0, max_grad_norm=1.0, scheduler=None, checkpoint_path='2024201073_LSTM_u.pt'):
    """Train ``model`` with gradient clipping, validation and early stopping.

    After each epoch the model is evaluated with ``test_model``. Whenever the
    validation loss improves, the weights are saved to ``checkpoint_path``.
    Training stops once the loss has failed to improve for ``patience``
    consecutive epochs. The best checkpoint is loaded back before returning,
    so the returned model is the best one seen, not the last-epoch one.

    Args:
        model: module called as ``model(x, None)`` and returning
            ``(logits, hidden)`` with logits of shape ``(batch, seq, vocab)``.
        train_loader: iterable of ``(inputs, targets)`` training batches.
        val_loader: iterable of ``(inputs, targets)`` validation batches.
        optimizer: optimizer over the model's parameters.
        criterion: loss called as ``criterion(logits_2d, targets_1d)``.
        num_epochs: maximum number of epochs.
        patience: epochs without validation improvement before stopping.
        pad_idx: forwarded to ``test_model``.
        max_grad_norm: maximum gradient norm used for clipping.
        scheduler: optional learning-rate scheduler, stepped once per epoch
            (with the validation loss for ``ReduceLROnPlateau``).
        checkpoint_path: file the best weights are written to.

    Returns:
        The model (on the notebook-level ``device``) holding the best weights.

    Raises:
        ValueError: if ``train_loader`` yields no batches.
    """
    # `device` is the notebook-level torch.device chosen before training.
    model.to(device)
    early_stopping_counter = 0
    best_val_loss = float('inf')
    checkpoint_saved = False

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        num_batches = 0

        for x, y in tqdm(train_loader):
            x, y = x.to(device), y.to(device)

            optimizer.zero_grad()
            # Each batch starts from a fresh LSTM state: the batches contain
            # independent sentences, so carrying state across them would only
            # inject the tail of unrelated (and padded) sequences.
            output, _ = model(x, None)
            loss = criterion(output.view(-1, output.shape[2]), y.view(-1))
            loss.backward()

            # Clip before the optimizer step to keep updates bounded.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_grad_norm)

            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError("train_loader yielded no batches; nothing to train on")

        avg_train_loss = total_loss / num_batches
        perplexity = torch.exp(torch.tensor(avg_train_loss))

        print(f'Train Loss: {avg_train_loss:.4f}')
        print(f'Train Perplexity: {perplexity:.4f}')

        avg_val_loss, val_perplexity = test_model(model, val_loader, criterion, pad_idx)

        print(f'Val Loss: {avg_val_loss:.4f}')
        print(f'Val Perplexity: {val_perplexity:.4f}')

        if scheduler is not None:
            # ReduceLROnPlateau needs the monitored metric; other schedulers
            # are stepped without arguments.
            if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
                scheduler.step(avg_val_loss)
            else:
                scheduler.step()

        # check for early stopping
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            early_stopping_counter = 0
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True
        else:
            early_stopping_counter += 1
            if early_stopping_counter >= patience:
                print(f'Early stopping at epoch {epoch + 1}')
                break

    # Hand back the best weights rather than those of the final epoch, which
    # is `patience` non-improving epochs past the best one after early stopping.
    if checkpoint_saved:
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))

    return model


Save perplexities in files

In [None]:
def save_perplexities_lstm(model, sentences, criterion, filename, idx_to_word):
    """Write per-sentence perplexities to ``filename`` and return their mean.

    Each sentence is scored on its own: the model receives all tokens but the
    last, the per-token losses against the shifted targets are averaged, and
    the perplexity is the exponential of that average. Sentences with fewer
    than two tokens have nothing to predict and are skipped.

    The file holds one ``<sentence>\\t<perplexity>`` line per scored sentence
    followed by a final ``Average\\t<mean perplexity>`` line. It is written as
    UTF-8.

    Args:
        model: module called as ``model(input_ids)`` and returning
            ``(logits, hidden)`` with logits of shape ``(1, seq, vocab)``.
        sentences: iterable of token-id sequences.
        criterion: loss called on one ``(1, vocab)`` logit row and one target.
        filename: output path.
        idx_to_word: indexable mapping from token id to word.

    Returns:
        The mean of the per-sentence perplexities, or ``nan`` if no sentence
        could be scored.
    """
    model.eval()

    # Run on the device the model already lives on rather than relying on a
    # notebook-level global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    all_sentences = []
    perplexity_scores = []

    with torch.no_grad():
        for sentence in sentences:
            if len(sentence) < 2:
                continue  # no (input, target) pair can be formed

            input_tensor = torch.tensor(sentence[:-1], dtype=torch.long).unsqueeze(0).to(run_device)
            targets = torch.tensor(sentence[1:], dtype=torch.long).to(run_device)
            outputs, _ = model(input_tensor)

            sentence_loss = 0.0
            num_tokens = outputs.shape[1]
            for i in range(num_tokens):
                loss = criterion(outputs[0, i].unsqueeze(0), targets[i].unsqueeze(0))
                sentence_loss += loss.item()

            avg_loss_per_sentence = sentence_loss / num_tokens
            perplexity_scores.append(torch.exp(torch.tensor(avg_loss_per_sentence)).item())
            all_sentences.append(" ".join(idx_to_word[idx] for idx in sentence))

    if perplexity_scores:
        avg_perplexity = sum(perplexity_scores) / len(perplexity_scores)
    else:
        avg_perplexity = float('nan')

    # Explicit UTF-8: the vocabulary may contain non-ASCII words, which the
    # platform default encoding cannot always represent.
    with open(filename, 'w', encoding='utf-8') as f:
        for text, score in zip(all_sentences, perplexity_scores):
            f.write(f"{text}\t{score:.4f}\n")

        f.write(f"Average\t{avg_perplexity:.4f}\n")

    return avg_perplexity

In [None]:
def save_model(model, path):
    """Save ``model``'s state dict to ``path`` and print a confirmation line."""
    weights = model.state_dict()
    torch.save(weights, path)
    print(f"Model saved to {path}")

Running the model

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# The training and evaluation helpers in this notebook read this global.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [None]:
# Colab-only upload widget. In the recorded run it was used to upload
# 'Ulysses - James Joyce.txt', which the next cell reads from /content.
from google.colab import files
uploaded = files.upload()

Saving Ulysses - James Joyce.txt to Ulysses - James Joyce.txt


In [None]:
# Read the corpus as UTF-8 explicitly: the text contains curly quotes and
# dashes (handled by preprocess), so decoding must not depend on the locale.
with open('/content/Ulysses - James Joyce.txt', 'r', encoding='utf-8') as f:
    corpus = f.read()

corpus = preprocess(corpus)

# Keep only sentences with at least 2 whitespace-separated words.
sentences, word_sentences = tokenize(corpus, 2)

# Fixed seed so the train/val/test split is the same on every run.
train_sentences, val_sentences, test_sentences = train_val_test_split(word_sentences, seed=42)

print("Train size:", len(train_sentences))
print("Validation size:", len(val_sentences))
print("Test size:", len(test_sentences))

Length of sentences: 22226
Train size: 17780
Validation size: 3333
Test size: 1113


In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2025-02-18 05:54:08--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-02-18 05:54:08--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-02-18 05:54:08--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
# Load the 100-dimensional GloVe vectors (plus the special-token vectors added
# by create_glove_embeddings).
glove = create_glove_embeddings('/content/glove.6B.100d.txt')

# Build the vocabulary from the training split only, then index-encode all
# three splits; `vocab` is the index -> word list matching `word_to_idx`.
embeddings, encoded_train, encoded_val, encoded_test, word_to_idx, vocab = create_embeddings_and_encode(train_sentences, val_sentences, test_sentences, glove)

In [None]:
# Step 1: Define the maximum sequence length
# The limit counts every token, including the leading <s>. Sequences longer
# than this are cut from the end, which also drops their closing </s>.
MAX_SEQ_LENGTH = 20  # Adjust based on your memory constraints

# Step 2: Truncate the sequences to the maximum length
def truncate_sequences(sequences, max_length):
    """Return a new list holding the first ``max_length`` items of each sequence."""
    clipped = []
    for seq in sequences:
        clipped.append(seq[:max_length])
    return clipped

# Apply truncation to your data
encoded_train = truncate_sequences(encoded_train, MAX_SEQ_LENGTH)
encoded_val = truncate_sequences(encoded_val, MAX_SEQ_LENGTH)
encoded_test = truncate_sequences(encoded_test, MAX_SEQ_LENGTH)

# Step 3: Create the datasets using the truncated sequences
pad_idx = word_to_idx['<PAD>']

train_dataset = LSTMDataset(encoded_train)
val_dataset = LSTMDataset(encoded_val)
test_dataset = LSTMDataset(encoded_test)

# Step 4: DataLoaders over the truncated sequences.
# Only the training loader shuffles. Validation and test keep a fixed order so
# batch composition, and therefore the reported loss, is the same on every
# evaluation pass.
train_loader = DataLoader(train_dataset, batch_size=64, collate_fn=lambda batch: collate_fn(batch, pad_idx), shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, collate_fn=lambda batch: collate_fn(batch, pad_idx), shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, collate_fn=lambda batch: collate_fn(batch, pad_idx), shuffle=False)

# Print dataset sizes
print(f'Train dataset size: {len(train_dataset)}')
print(f'Validation dataset size: {len(val_dataset)}')
print(f'Test dataset size: {len(test_dataset)}')


Train dataset size: 17780
Validation dataset size: 3333
Test dataset size: 1113


In [None]:
# Training hyperparameters.
learning_rate = 3e-4
num_epochs = 40
patience = 6  # early-stopping patience, in epochs
# LSTM(embeddings, hidden_dim=256, dropout=0.7, num_layers=2)
model = LSTM(embeddings, 256, 0.7, 2)
# Padded target positions are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = optim.AdamW(model.parameters(), learning_rate, weight_decay=1e-6)
# NOTE(review): this scheduler only takes effect if the training loop calls
# scheduler.step(<validation loss>) — confirm that train_model does so.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.2, patience=3)


In [None]:
# Pass the padding index by keyword. The positional slot after `patience` in
# train_model is `pad_idx`, so a scheduler object must not be passed there.
model = train_model(model, train_loader, val_loader, optimizer, criterion, num_epochs, patience=patience, pad_idx=pad_idx)

100%|██████████| 278/278 [00:05<00:00, 48.30it/s]


Train Loss: 7.2555
Train Perplexity: 1415.8820
Val Loss: 6.3935
Val Perplexity: 597.9263


100%|██████████| 278/278 [00:05<00:00, 49.51it/s]


Train Loss: 6.7117
Train Perplexity: 821.9260
Val Loss: 6.3072
Val Perplexity: 548.4813


100%|██████████| 278/278 [00:05<00:00, 48.67it/s]


Train Loss: 6.6085
Train Perplexity: 741.3588
Val Loss: 6.2185
Val Perplexity: 501.9615


100%|██████████| 278/278 [00:05<00:00, 49.57it/s]


Train Loss: 6.5345
Train Perplexity: 688.5223
Val Loss: 6.1406
Val Perplexity: 464.3492


100%|██████████| 278/278 [00:05<00:00, 48.59it/s]


Train Loss: 6.4694
Train Perplexity: 645.1131
Val Loss: 6.0983
Val Perplexity: 445.0926


100%|██████████| 278/278 [00:05<00:00, 48.74it/s]


Train Loss: 6.4096
Train Perplexity: 607.6559
Val Loss: 6.0325
Val Perplexity: 416.7523


100%|██████████| 278/278 [00:05<00:00, 48.17it/s]


Train Loss: 6.3524
Train Perplexity: 573.8439
Val Loss: 5.9937
Val Perplexity: 400.8958


100%|██████████| 278/278 [00:05<00:00, 49.41it/s]


Train Loss: 6.2991
Train Perplexity: 544.0565
Val Loss: 5.9418
Val Perplexity: 380.6158


100%|██████████| 278/278 [00:05<00:00, 48.81it/s]


Train Loss: 6.2502
Train Perplexity: 518.1146
Val Loss: 5.8973
Val Perplexity: 364.0689


100%|██████████| 278/278 [00:05<00:00, 49.26it/s]


Train Loss: 6.2063
Train Perplexity: 495.8718
Val Loss: 5.8595
Val Perplexity: 350.5460


100%|██████████| 278/278 [00:05<00:00, 49.26it/s]


Train Loss: 6.1660
Train Perplexity: 476.2798
Val Loss: 5.8497
Val Perplexity: 347.1448


100%|██████████| 278/278 [00:05<00:00, 48.91it/s]


Train Loss: 6.1307
Train Perplexity: 459.7787
Val Loss: 5.8115
Val Perplexity: 334.1357


100%|██████████| 278/278 [00:05<00:00, 49.36it/s]


Train Loss: 6.0958
Train Perplexity: 443.9982
Val Loss: 5.8068
Val Perplexity: 332.5602


100%|██████████| 278/278 [00:05<00:00, 48.82it/s]


Train Loss: 6.0649
Train Perplexity: 430.4660
Val Loss: 5.7740
Val Perplexity: 321.8265


100%|██████████| 278/278 [00:05<00:00, 49.39it/s]


Train Loss: 6.0333
Train Perplexity: 417.1023
Val Loss: 5.7668
Val Perplexity: 319.4987


100%|██████████| 278/278 [00:05<00:00, 49.10it/s]


Train Loss: 6.0064
Train Perplexity: 405.9999
Val Loss: 5.7402
Val Perplexity: 311.1207


100%|██████████| 278/278 [00:05<00:00, 49.43it/s]


Train Loss: 5.9774
Train Perplexity: 394.4090
Val Loss: 5.7262
Val Perplexity: 306.7945


100%|██████████| 278/278 [00:05<00:00, 48.69it/s]


Train Loss: 5.9551
Train Perplexity: 385.7310
Val Loss: 5.7329
Val Perplexity: 308.8727


100%|██████████| 278/278 [00:05<00:00, 49.49it/s]


Train Loss: 5.9280
Train Perplexity: 375.4166
Val Loss: 5.7122
Val Perplexity: 302.5428


100%|██████████| 278/278 [00:05<00:00, 48.65it/s]


Train Loss: 5.9070
Train Perplexity: 367.6172
Val Loss: 5.7048
Val Perplexity: 300.3163


100%|██████████| 278/278 [00:05<00:00, 49.41it/s]


Train Loss: 5.8796
Train Perplexity: 357.6651
Val Loss: 5.6953
Val Perplexity: 297.4626


100%|██████████| 278/278 [00:05<00:00, 48.82it/s]


Train Loss: 5.8605
Train Perplexity: 350.8948
Val Loss: 5.6731
Val Perplexity: 290.9449


100%|██████████| 278/278 [00:05<00:00, 49.28it/s]


Train Loss: 5.8334
Train Perplexity: 341.5160
Val Loss: 5.6806
Val Perplexity: 293.1217


100%|██████████| 278/278 [00:05<00:00, 49.14it/s]


Train Loss: 5.8117
Train Perplexity: 334.1987
Val Loss: 5.6687
Val Perplexity: 289.6563


100%|██████████| 278/278 [00:05<00:00, 48.87it/s]


Train Loss: 5.7910
Train Perplexity: 327.3542
Val Loss: 5.6572
Val Perplexity: 286.3390


100%|██████████| 278/278 [00:05<00:00, 49.40it/s]


Train Loss: 5.7718
Train Perplexity: 321.1241
Val Loss: 5.6628
Val Perplexity: 287.9491


100%|██████████| 278/278 [00:05<00:00, 48.70it/s]


Train Loss: 5.7551
Train Perplexity: 315.7995
Val Loss: 5.6678
Val Perplexity: 289.3861


100%|██████████| 278/278 [00:05<00:00, 49.52it/s]


Train Loss: 5.7363
Train Perplexity: 309.9171
Val Loss: 5.6695
Val Perplexity: 289.9003


100%|██████████| 278/278 [00:05<00:00, 48.43it/s]


Train Loss: 5.7145
Train Perplexity: 303.2189
Val Loss: 5.6284
Val Perplexity: 278.2160


100%|██████████| 278/278 [00:05<00:00, 49.41it/s]


Train Loss: 5.6942
Train Perplexity: 297.1455
Val Loss: 5.6452
Val Perplexity: 282.9299


100%|██████████| 278/278 [00:05<00:00, 48.41it/s]


Train Loss: 5.6745
Train Perplexity: 291.3364
Val Loss: 5.6599
Val Perplexity: 287.1203


100%|██████████| 278/278 [00:05<00:00, 49.42it/s]


Train Loss: 5.6568
Train Perplexity: 286.2261
Val Loss: 5.6263
Val Perplexity: 277.6201


100%|██████████| 278/278 [00:05<00:00, 48.70it/s]


Train Loss: 5.6407
Train Perplexity: 281.6699
Val Loss: 5.6428
Val Perplexity: 282.2606


100%|██████████| 278/278 [00:05<00:00, 49.41it/s]


Train Loss: 5.6231
Train Perplexity: 276.7411
Val Loss: 5.6405
Val Perplexity: 281.6035


100%|██████████| 278/278 [00:05<00:00, 49.03it/s]


Train Loss: 5.6057
Train Perplexity: 271.9763
Val Loss: 5.6440
Val Perplexity: 282.5957


100%|██████████| 278/278 [00:05<00:00, 49.03it/s]


Train Loss: 5.5915
Train Perplexity: 268.1312
Val Loss: 5.6602
Val Perplexity: 287.2141


100%|██████████| 278/278 [00:05<00:00, 49.28it/s]


Train Loss: 5.5739
Train Perplexity: 263.4576
Val Loss: 5.6596
Val Perplexity: 287.0284


100%|██████████| 278/278 [00:05<00:00, 48.62it/s]


Train Loss: 5.5555
Train Perplexity: 258.6489
Val Loss: 5.6580
Val Perplexity: 286.5733
Early stopping at epoch 38


Perplexity Scores

In [None]:
# Loss and perplexity on the training data, measured in eval mode
# (test_model calls model.eval(), so dropout is disabled).
loss, perplexity = test_model(model, train_loader, criterion, pad_idx)
print(f'\nTrain Loss: {loss}')
print(f'Train Perplexity: {perplexity}')


Train Loss: 5.207592650283154
Train Perplexity: 182.65379333496094


In [None]:
# Loss and perplexity on the validation split.
loss, perplexity = test_model(model, val_loader, criterion, pad_idx)
print(f'\nVal Loss: {loss}')
print(f'Val Perplexity: {perplexity}')


Val Loss: 5.659929743352926
Val Perplexity: 287.12847900390625


In [None]:
# Loss and perplexity on the held-out test split.
loss, perplexity = test_model(model, test_loader, criterion, pad_idx)
print(f'\nTest Loss: {loss}')
print(f'Test Perplexity: {perplexity}')


Test Loss: 5.7002099884880915
Test Perplexity: 298.9302062988281


In [None]:
# Write one "<sentence>\t<perplexity>" file per split. `vocab` is passed as the
# index -> word lookup. Only the last call's return value (the test-split
# average of per-sentence perplexities) is echoed as the cell output.
save_perplexities_lstm(model, encoded_train, criterion, '2024201073_LSTM_u_train_perplexity.txt', vocab)
save_perplexities_lstm(model, encoded_val, criterion, '2024201073_LSTM_u_val_perplexity.txt', vocab)
save_perplexities_lstm(model, encoded_test, criterion, '2024201073_LSTM_u_test_perplexity.txt', vocab)

641.7235306348226

In [None]:
import pickle

# Persist everything needed to reuse the saved model weights later: the
# embedding matrix, the vocabulary / index mapping and the encoded splits.
# NOTE: pickle files should only ever be loaded from trusted sources.
with open('data_store_LSTM_u.pkl', 'wb') as f:
    pickle.dump({
        'embeddings': embeddings,
        'vocab': vocab,
        'word_to_idx': word_to_idx,
        'encoded_train': encoded_train,
        'encoded_val': encoded_val,
        'encoded_test': encoded_test,
    }, f)

print("Data saved successfully!")

Data saved successfully!
