In [None]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import numpy as np
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import random
import time
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence

In [None]:
# Download the NLTK tokenizer resources ('punkt' and 'punkt_tab') into the local
# nltk_data directory; nltk.download returns True, which is what the cell displays.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

PreProcess

In [None]:
def preprocess(data):
    """Normalise raw corpus text before sentence/word tokenization.

    Steps, in order:
      1. curly apostrophes are mapped to a plain ``'``;
      2. quotes, apostrophes, backticks and dashes are replaced by a space;
      3. ``.``, ``!`` and ``?`` that have no word character directly before or
         after them are padded with spaces so they stand alone;
      4. the symbols ``™`` and ``•`` are replaced by a space;
      5. every run of whitespace (newlines included) is collapsed to one space
         and the result is stripped.

    Args:
        data: Raw text.

    Returns:
        The cleaned text, with single spaces between tokens and no leading or
        trailing whitespace.
    """
    data = re.sub(r'[’‘]', '\'', data) #apostrophes
    data = re.sub(r'[“”`\' ]|[–—-]', ' ', data) #quotes and dashes
    data = re.sub(r'(?<!\w)([.!?])(?!\w)', r' \1 ', data) #dont remove punctuation
    data = re.sub(r'[™•]', ' ', data) #remove other unwanted symbols
    # Collapse whitespace LAST: the substitutions above insert spaces, and the
    # previous pattern r'\n|\s+' matched one newline at a time, so blank lines
    # and removed symbols used to leave runs of spaces in the output.
    data = re.sub(r'\s+', ' ', data)
    return data.strip() #strip extra spaces

Tokenization

In [None]:
def tokenize(data, min_length_sentences):
    """Split text into sentences and into lower-cased word lists.

    Sentences with fewer than ``min_length_sentences`` whitespace-separated
    pieces are dropped. Each kept sentence is word-tokenized, lower-cased,
    stripped of the punctuation tokens ``. , ! ? ; :`` and wrapped in the
    ``<s>`` / ``</s>`` boundary markers.

    Args:
        data: Pre-processed corpus text.
        min_length_sentences: Minimum number of whitespace-separated pieces a
            sentence must have to be kept.

    Returns:
        ``(sentences, words_sentences)``: the kept sentence strings and, in the
        same order, their token lists.
    """
    punctuation = ['.', ',', '!', '?', ';', ':']

    sentences = []
    for candidate in sent_tokenize(data):
        if len(candidate.split()) >= min_length_sentences:
            sentences.append(candidate)

    print("Length of sentences:", len(sentences))

    words_sentences = []
    for sentence in sentences:
        lowered = [token.lower() for token in word_tokenize(sentence)]
        kept = [token for token in lowered if token not in punctuation]
        words_sentences.append(['<s>', *kept, '</s>'])

    return sentences, words_sentences

Data Preparation

In [None]:
def train_val_test_split(sentences, train_ratio=0.8, val_ratio=0.15, seed=None, num_shuffles=1):
    """Shuffle the sentences and cut them into train / validation / test parts.

    The input list is left untouched: shuffling happens on a shallow copy, so
    the caller's ordering is preserved (the sentence objects themselves are
    shared, not copied).

    Args:
        sentences: Sequence of sentences (any objects).
        train_ratio: Fraction of the data used for training.
        val_ratio: Fraction of the data used for validation; whatever is left
            after the train and validation parts goes to the test part.
        seed: If not ``None``, seeds Python's global ``random`` module before
            shuffling so the split is reproducible.
        num_shuffles: How many times the copy is shuffled.

    Returns:
        ``(train_sentences, val_sentences, test_sentences)`` as new lists.
    """
    if seed is not None:
        random.seed(seed)

    # Work on a copy so the caller's list is not reordered as a side effect.
    shuffled = list(sentences)
    for _ in range(num_shuffles):
        random.shuffle(shuffled)

    total_sentences = len(shuffled)
    train_end = int(total_sentences * train_ratio)
    val_end = train_end + int(total_sentences * val_ratio)

    train_sentences = shuffled[:train_end]
    val_sentences = shuffled[train_end:val_end]
    test_sentences = shuffled[val_end:]  # remainder after train and validation

    return train_sentences, val_sentences, test_sentences

Load Glove Embeddings

In [None]:
def create_glove_embeddings(glove_path):
    """Load GloVe vectors from a text file into a ``{word: tensor}`` dict.

    Each non-empty line is expected to hold a word followed by its
    whitespace-separated float components. Four special entries are appended
    after loading:

    * ``<UNK>``  - mean of all loaded vectors,
    * ``<PAD>``  - all zeros,
    * ``<s>`` / ``</s>`` - uniform random vectors from ``torch.rand``.

    Args:
        glove_path: Path to the GloVe text file (read as UTF-8).

    Returns:
        Dict mapping each word (and the four special tokens) to a 1-D tensor.

    Raises:
        ValueError: If the file contains no vectors.
    """
    glove = {}
    embedding_dim = 0

    # Explicit UTF-8 so the result does not depend on the platform's default
    # encoding.
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if len(values) < 2:
                # Blank or malformed line (no vector components): skip it
                # instead of failing on values[0].
                continue
            word = values[0]
            vector = torch.tensor([float(val) for val in values[1:]])
            glove[word] = vector
            embedding_dim = len(vector)

    if not glove:
        raise ValueError(f"No embedding vectors found in {glove_path!r}")

    # The mean is taken before the special tokens are added so they do not
    # influence the <UNK> vector.
    glove['<UNK>'] = torch.mean(torch.stack(list(glove.values())), dim=0)
    glove['<PAD>'] = torch.zeros(embedding_dim)
    glove['<s>'] = torch.rand(embedding_dim)
    glove['</s>'] = torch.rand(embedding_dim)

    return glove

Creation of vocab and embeddings

In [None]:
def create_embeddings_and_encode(train_sentences, val_sentences, test_sentences, glove):
    """Build the vocabulary and embedding matrix, then index-encode all splits.

    The vocabulary consists of the special tokens ``<PAD>``, ``<UNK>``,
    ``<s>``, ``</s>`` (in that order, so ``<PAD>`` is index 0) followed by the
    sorted training words that have a GloVe vector. Token ids are therefore
    the same on every run. Words outside the vocabulary are encoded as
    ``<UNK>``. The input sentence lists are not modified.

    Args:
        train_sentences: Token lists used to build the vocabulary.
        val_sentences: Validation token lists (encoded only).
        test_sentences: Test token lists (encoded only).
        glove: Mapping from word to embedding vector; every vector is assumed
            to have the same length as the first one.

    Returns:
        ``(embeddings, encoded_train, encoded_val, encoded_test, word_to_idx,
        vocab)`` where ``embeddings`` is a float tensor with one row per
        vocabulary entry and ``vocab[i]`` is the word with id ``i``.
    """
    special_tokens = ['<PAD>', '<UNK>', '<s>', '</s>']

    # Peek at one vector rather than materialising every GloVe value.
    embedding_dim = len(next(iter(glove.values())))

    train_words = {word for sentence in train_sentences for word in sentence if word in glove}
    # Sorting fixes the id of every word; iterating a set of strings would
    # yield a different order from one interpreter run to the next.
    vocab = special_tokens + sorted(train_words.difference(special_tokens))
    word_to_idx = {word: idx for idx, word in enumerate(vocab)}

    embeddings = np.zeros((len(vocab), embedding_dim))
    for word, idx in word_to_idx.items():
        if word in glove:
            embeddings[idx] = glove[word]
        else:
            # Only reachable for a special token missing from `glove`.
            embeddings[idx] = np.random.rand(embedding_dim)

    unk_idx = word_to_idx['<UNK>']

    def encode_sentences(sentences):
        """Map every token to its id, falling back to the <UNK> id."""
        return [[word_to_idx.get(word, unk_idx) for word in sentence] for sentence in sentences]

    encoded_train_sentences = encode_sentences(train_sentences)
    encoded_val_sentences = encode_sentences(val_sentences)
    encoded_test_sentences = encode_sentences(test_sentences)

    return torch.FloatTensor(embeddings), encoded_train_sentences, encoded_val_sentences, encoded_test_sentences, word_to_idx, vocab

Dataset for training RNN

In [None]:
class RNNDataset(Dataset):
    """Next-token prediction dataset over index-encoded sentences.

    Item ``i`` is the pair ``(tokens[:-1], tokens[1:])`` of sentence ``i``:
    the model input and the same sequence shifted one position to the left,
    both as ``torch.long`` tensors.
    """

    def __init__(self, data):
        # data: sequence of token-id lists, one per sentence
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        token_ids = self.data[idx]
        shifted_pairs = (token_ids[:-1], token_ids[1:])
        inputs, targets = (torch.tensor(ids, dtype=torch.long) for ids in shifted_pairs)
        return inputs, targets

In [None]:
def collate_fn(batch, pad_idx):
    """Pad a batch of ``(input, target)`` tensor pairs to a common length.

    Args:
        batch: Iterable of ``(input_tensor, target_tensor)`` pairs.
        pad_idx: Value used to fill the padded positions.

    Returns:
        ``(inputs, targets)``, each padded batch-first with ``pad_idx``.
    """
    inputs, labels = zip(*batch)
    pad_kwargs = {"batch_first": True, "padding_value": pad_idx}
    return pad_sequence(inputs, **pad_kwargs), pad_sequence(labels, **pad_kwargs)

RNN

In [None]:
class RNN(nn.Module):
    """Word-level language model: frozen embeddings -> nn.RNN -> linear decoder.

    Args:
        embeddings: Float tensor with one row per vocabulary entry; used as
            the frozen embedding table.
        hidden_dim: Size of the recurrent hidden state.
        dropout: Dropout probability applied to the RNN outputs before the
            decoder and, for stacked RNNs, between recurrent layers.
        num_layers: Number of stacked recurrent layers.
    """

    def __init__(self, embeddings, hidden_dim, dropout, num_layers=1):
        super().__init__()
        # Freeze embeddings
        self.embeddings = nn.Embedding.from_pretrained(embeddings, freeze=True)
        self.vocab_size, self.embedding_dim = embeddings.shape
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout_prob = dropout  # Avoid overwriting nn.Dropout later

        # nn.RNN only applies its own dropout BETWEEN stacked layers; passing a
        # non-zero value with a single layer does nothing except emit a
        # warning, so it is only forwarded when there is more than one layer.
        inter_layer_dropout = self.dropout_prob if self.num_layers > 1 else 0.0
        self.rnn = nn.RNN(input_size=self.embedding_dim, hidden_size=self.hidden_dim,
                          num_layers=self.num_layers, dropout=inter_layer_dropout, batch_first=True)

        self.fc1 = nn.Linear(self.hidden_dim, self.vocab_size)
        self.dropout = nn.Dropout(self.dropout_prob)  # Keep dropout for regularization

    def forward(self, input_seq, hidden=None):
        """Return ``(logits, hidden)`` for a batch of token ids.

        Args:
            input_seq: Long tensor of token ids, batch first.
            hidden: Optional initial hidden state for ``nn.RNN``; ``None``
                starts from zeros.

        Returns:
            Logits over the vocabulary for every position, and the final
            hidden state returned by ``nn.RNN``.
        """
        embedded = self.embeddings(input_seq)  # Convert tokens to embeddings
        # nn.RNN treats hx=None as a zero initial state, so no branch is needed.
        rnn_out, hidden = self.rnn(embedded, hidden)
        return self.fc1(self.dropout(rnn_out)), hidden

Model Testing

In [None]:
def test_model(model, val_loader, criterion, pad_idx):
    model.eval()
    total_loss = 0
    hidden = None

    with torch.no_grad():
        for x, y in val_loader:
            x, y = x.to(device), y.to(device)
            batch_size = x.size(0)

            if hidden is not None and batch_size != hidden[0].size(1):
                hidden = None

            output, hidden = model(x, hidden)
            loss = criterion(output.view(-1, output.shape[2]), y.view(-1))
            total_loss += loss.item()

            if hidden is not None:
                hidden = (hidden[0].detach(), hidden[1].detach())

    avg_val_loss = total_loss / len(val_loader)
    val_perplexity = torch.exp(torch.tensor(avg_val_loss))
    return avg_val_loss, val_perplexity

Train Model

In [None]:
def train_model(model, train_loader, val_loader, optimizer, criterion, num_epochs, patience=2, pad_idx=0,
                checkpoint_path='2024201073_RNN_u.pt'):
    """Train with early stopping on validation loss and return the best model.

    After each epoch the model is evaluated with ``test_model``. Whenever the
    validation loss improves, the weights are written to ``checkpoint_path``
    and remembered in memory. Training stops once the loss has failed to
    improve for ``patience`` consecutive epochs. Before returning, the
    best-scoring weights are loaded back, so the returned model matches the
    saved checkpoint rather than the last (possibly worse) epoch.

    Every batch starts from a fresh hidden state, because batches contain
    independent, shuffled sentences.

    Args:
        model: Module called as ``model(x, hidden)`` -> ``(logits, hidden)``.
        train_loader: Sized iterable of ``(inputs, targets)`` training batches.
        val_loader: Loader passed to ``test_model`` after each epoch.
        optimizer: Optimizer over the model's parameters.
        criterion: Loss applied to flattened logits and targets.
        num_epochs: Maximum number of epochs.
        patience: Number of non-improving epochs tolerated before stopping.
        pad_idx: Padding index forwarded to ``test_model``.
        checkpoint_path: Where the best weights are saved.

    Returns:
        The model, on the module-level ``device``, holding the best weights.
    """
    model.to(device)
    early_stopping_counter = 0
    best_val_loss = float('inf')
    best_state = None

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0

        for x, y in tqdm(train_loader):
            x, y = x.to(device), y.to(device)

            optimizer.zero_grad()
            # The returned state was previously indexed like an LSTM (h, c)
            # pair, which breaks for nn.RNN (IndexError with a single layer);
            # the state is not reused, so it is simply discarded.
            output, _ = model(x, None)
            loss = criterion(output.view(-1, output.shape[2]), y.view(-1))
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        perplexity = torch.exp(torch.tensor(avg_train_loss))

        print(f'Train Loss: {avg_train_loss:.4f}')
        print(f'Train Perplexity: {perplexity:.4f}')

        avg_val_loss, val_perplexity = test_model(model, val_loader, criterion, pad_idx)

        print(f'Val Loss: {avg_val_loss:.4f}')
        print(f'Val Perplexity: {val_perplexity:.4f}')

        # check for early stopping
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            early_stopping_counter = 0
            torch.save(model.state_dict(), checkpoint_path)
            # Clone: state_dict() returns references that later optimizer
            # steps would overwrite.
            best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
        else:
            early_stopping_counter += 1
            if early_stopping_counter >= patience:
                print(f'Early stopping at epoch {epoch + 1}')
                break

    if best_state is not None:
        model.load_state_dict(best_state)

    return model

Save perplexities in File

In [None]:
def save_perplexities_lstm(model, sentences, criterion, filename, idx_to_word):
    """Write per-sentence perplexities to a file and return their average.

    For every sentence the model is run on ``sentence[:-1]``; the loss of each
    predicted token against ``sentence[1:]`` is computed with ``criterion``
    and averaged, and the sentence perplexity is ``exp`` of that average. The
    output file has one ``"<sentence>\\t<perplexity>"`` line per sentence
    followed by a final ``"Average\\t<value>"`` line.

    Args:
        model: Module called as ``model(x)`` -> ``(logits, hidden)``.
        sentences: Iterable of token-id lists. Sentences with fewer than two
            tokens have nothing to predict and are skipped.
        criterion: Loss applied to one token's logits and target at a time.
        filename: Output path (written as UTF-8).
        idx_to_word: Indexable mapping from token id to word.

    Returns:
        The mean of the per-sentence perplexities, or ``nan`` when no sentence
        could be scored.
    """
    model.eval()
    all_sentences = []
    perplexity_scores = []

    # Use the device the model lives on rather than a module-level global.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    with torch.no_grad():
        for sentence in sentences:
            if len(sentence) < 2:
                continue

            input_tensor = torch.tensor(sentence[:-1], dtype=torch.long).unsqueeze(0).to(model_device)
            targets = torch.tensor(sentence[1:], dtype=torch.long).to(model_device)
            outputs, _ = model(input_tensor)

            # One criterion call per token, exactly as before, so the numbers
            # do not depend on the criterion's reduction mode.
            token_losses = [
                criterion(logits.unsqueeze(0), target_word.unsqueeze(0)).item()
                for logits, target_word in zip(outputs[0], targets)
            ]

            avg_loss_per_sentence = sum(token_losses) / len(token_losses)
            sentence_perplexity = torch.exp(torch.tensor(avg_loss_per_sentence)).item()
            perplexity_scores.append(sentence_perplexity)

            all_sentences.append(" ".join(idx_to_word[idx] for idx in sentence))

    if perplexity_scores:
        avg_perplexity = sum(perplexity_scores) / len(perplexity_scores)
    else:
        avg_perplexity = float('nan')

    # Explicit UTF-8: the vocabulary may contain non-ASCII words.
    with open(filename, 'w', encoding='utf-8') as f:
        for text, score in zip(all_sentences, perplexity_scores):
            f.write(f"{text}\t{score:.4f}\n")

        f.write(f"Average\t{avg_perplexity:.4f}\n")

    return avg_perplexity

In [None]:
def save_model(model, path):
    """Save the model's ``state_dict`` to ``path`` and print a confirmation."""
    weights = model.state_dict()
    torch.save(weights, path)
    print(f"Model saved to {path}")

Running the model

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
print(device)

cuda


In [None]:
# Colab-only step: upload the corpus text file into the runtime through the
# browser file picker (the recorded output shows the Ulysses text being saved).
from google.colab import files
uploaded = files.upload()

Saving Ulysses - James Joyce.txt to Ulysses - James Joyce.txt


In [None]:
# Download and unpack the GloVe 6B vectors (the archive is about 822 MB).
# -nc skips the download when glove.6B.zip is already present, and -n makes unzip
# keep files that were already extracted, so re-running this cell neither fetches
# the archive again nor stops at an interactive overwrite prompt.
!wget -nc http://nlp.stanford.edu/data/glove.6B.zip
!unzip -n glove.6B.zip

--2025-02-18 05:39:09--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-02-18 05:39:09--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-02-18 05:39:09--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
# Read the uploaded corpus. The encoding is given explicitly so the curly quotes
# and other non-ASCII symbols handled by preprocess() are decoded the same way
# on every platform.
with open('/content/Ulysses - James Joyce.txt', 'r', encoding='utf-8') as f:
    corpus = f.read()

corpus = preprocess(corpus)

# Keep only sentences with at least 2 whitespace-separated pieces.
sentences, word_sentences = tokenize(corpus, 2)

# A fixed seed makes the train/validation/test split (and therefore every
# number reported below) reproducible across kernel restarts.
SPLIT_SEED = 42
train_sentences, val_sentences, test_sentences = train_val_test_split(word_sentences, seed=SPLIT_SEED)

print("Train size:", len(train_sentences))
print("Validation size:", len(val_sentences))
print("Test size:", len(test_sentences))

Length of sentences: 22226
Train size: 17780
Validation size: 3333
Test size: 1113


In [None]:
# Load the GloVe vectors (the file name suggests the 100-dimensional set) from the
# archive extracted earlier.
glove = create_glove_embeddings('/content/glove.6B.100d.txt')

# Build the vocabulary and embedding matrix from the training split and convert
# all three splits to lists of token ids.
embeddings, encoded_train, encoded_val, encoded_test, word_to_idx, vocab = create_embeddings_and_encode(train_sentences, val_sentences, test_sentences, glove)

In [None]:
# Step 1: Define the maximum sequence length
MAX_SEQ_LENGTH = 30  # You can adjust this based on your memory constraints

# Step 2: Truncate the sequences to the maximum length
def truncate_sequences(sequences, max_length):
    """Return a new list holding the first ``max_length`` items of each sequence."""
    truncated = []
    for seq in sequences:
        truncated.append(seq[:max_length])
    return truncated

# Apply truncation to your data
encoded_train = truncate_sequences(encoded_train, MAX_SEQ_LENGTH)
encoded_val = truncate_sequences(encoded_val, MAX_SEQ_LENGTH)
encoded_test = truncate_sequences(encoded_test, MAX_SEQ_LENGTH)

# Step 3: Create the datasets using the truncated sequences
pad_idx = word_to_idx['<PAD>']

train_dataset = RNNDataset(encoded_train)
val_dataset = RNNDataset(encoded_val)
test_dataset = RNNDataset(encoded_test)

# Step 4: Build the loaders. All three pad with the vocabulary's <PAD> index.
def collate_batch(batch):
    """Pad one batch using the <PAD> index of this vocabulary."""
    return collate_fn(batch, pad_idx)

# Only the training loader is shuffled. Evaluation loss is averaged per batch, so
# shuffling the validation/test loaders changed the batch composition and made the
# reported loss and perplexity differ from one evaluation call to the next.
train_loader = DataLoader(train_dataset, batch_size=64, collate_fn=collate_batch, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, collate_fn=collate_batch, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, collate_fn=collate_batch, shuffle=False)

# Print dataset sizes
print(f'Train dataset size: {len(train_dataset)}')
print(f'Validation dataset size: {len(val_dataset)}')
print(f'Test dataset size: {len(test_dataset)}')


Train dataset size: 17780
Validation dataset size: 3333
Test dataset size: 1113


In [None]:
# Training hyper-parameters
learning_rate = 0.001
num_epochs = 10
patience = 2

# 3-layer RNN with a 300-unit hidden state and dropout 0.6 on top of the frozen embeddings
model = RNN(embeddings, hidden_dim=300, dropout=0.6, num_layers=3)

# Padded target positions are excluded from the loss
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)

In [None]:
# Pass the real <PAD> index explicitly; when omitted, train_model falls back to its
# default of 0, which need not be the index assigned to <PAD> in this vocabulary.
model = train_model(model, train_loader, val_loader, optimizer, criterion, num_epochs,
                    patience=patience, pad_idx=pad_idx)

100%|██████████| 278/278 [00:11<00:00, 24.84it/s]


Train Loss: 7.0277
Train Perplexity: 1127.3984
Val Loss: 6.1846
Val Perplexity: 485.1961


100%|██████████| 278/278 [00:10<00:00, 27.10it/s]


Train Loss: 6.5840
Train Perplexity: 723.4504
Val Loss: 6.0142
Val Perplexity: 409.2041


100%|██████████| 278/278 [00:10<00:00, 26.74it/s]


Train Loss: 6.4048
Train Perplexity: 604.7437
Val Loss: 5.8986
Val Perplexity: 364.5115


100%|██████████| 278/278 [00:10<00:00, 26.40it/s]


Train Loss: 6.2710
Train Perplexity: 529.0291
Val Loss: 5.8302
Val Perplexity: 340.4391


100%|██████████| 278/278 [00:10<00:00, 26.18it/s]


Train Loss: 6.1755
Train Perplexity: 480.8091
Val Loss: 5.7870
Val Perplexity: 326.0424


100%|██████████| 278/278 [00:10<00:00, 26.03it/s]


Train Loss: 6.0975
Train Perplexity: 444.7273
Val Loss: 5.7471
Val Perplexity: 313.2726


100%|██████████| 278/278 [00:10<00:00, 26.16it/s]


Train Loss: 6.0329
Train Perplexity: 416.9249
Val Loss: 5.7275
Val Perplexity: 307.1981


100%|██████████| 278/278 [00:10<00:00, 26.05it/s]


Train Loss: 5.9795
Train Perplexity: 395.2502
Val Loss: 5.7244
Val Perplexity: 306.2501


100%|██████████| 278/278 [00:10<00:00, 25.68it/s]


Train Loss: 5.9313
Train Perplexity: 376.6302
Val Loss: 5.7062
Val Perplexity: 300.7333


100%|██████████| 278/278 [00:10<00:00, 25.40it/s]


Train Loss: 5.8946
Train Perplexity: 363.0608
Val Loss: 5.6817
Val Perplexity: 293.4407


Perplexity Scores

In [None]:
# Re-score the training set with test_model, which switches the model to eval mode
# first, so this figure is not the same as the train loss printed during training.
loss, perplexity = test_model(model, train_loader, criterion, pad_idx)
print(f'\nTrain Loss: {loss}')
print(f'Train Perplexity: {perplexity}')


Train Loss: 5.488025901986541
Train Perplexity: 241.77938842773438


In [None]:
# Loss and perplexity of the trained model on the validation split.
loss, perplexity = test_model(model, val_loader, criterion, pad_idx)
print(f'\nVal Loss: {loss}')
print(f'Val Perplexity: {perplexity}')


Val Loss: 5.685312190145816
Val Perplexity: 294.5097961425781


In [None]:
# Loss and perplexity of the trained model on the held-out test split.
loss, perplexity = test_model(model, test_loader, criterion, pad_idx)
print(f'\nTest Loss: {loss}')
print(f'Test Perplexity: {perplexity}')


Test Loss: 5.660770840115017
Test Perplexity: 287.3700866699219


In [None]:
# Write one "<sentence>\t<perplexity>" line per sentence for each split.
# `vocab` is passed as idx_to_word: it is the list whose position i holds the word
# with id i. Only the value of the last call (the test split's average
# perplexity) is shown as the cell output.
save_perplexities_lstm(model, encoded_train, criterion, '2024201073_RNN_u_train_perplexity.txt', vocab)
save_perplexities_lstm(model, encoded_val, criterion, '2024201073_RNN_u_val_perplexity.txt', vocab)
save_perplexities_lstm(model, encoded_test, criterion, '2024201073_RNN_u_test_perplexity.txt', vocab)

450.8836490561699

In [None]:
import pickle

# Persist the embedding matrix, the vocabulary/index mapping and the encoded
# (truncated) splits so they can be reloaded alongside the saved model weights.
# NOTE: pickle files should only be loaded back from trusted sources.
with open('data_store_rnn_u.pkl', 'wb') as f:
    pickle.dump({
        'embeddings': embeddings,
        'vocab': vocab,
        'word_to_idx': word_to_idx,
        'encoded_train': encoded_train,
        'encoded_val': encoded_val,
        'encoded_test': encoded_test,
    }, f)

print("Data saved successfully!")

Data saved successfully!
