In [20]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=d2310199f770fd1ac3b66141f2def57b39ac0de212de18dc2686faa2d7193977
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import time
import math
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Read the text file
file_path = '/content/sherlock-holm.es_stories_plain-text_advs.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Tokenize the text and create word-to-index mapping
word_to_ix = {}
tokenized_text = []
index = 0

for word in text.split():
    if word not in word_to_ix:
        word_to_ix[word] = index
        index += 1
    tokenized_text.append(word_to_ix[word])

total_words = len(word_to_ix)

class NextWordDataset(Dataset):
    def __init__(self, text, word_to_ix):
        self.text = text
        self.word_to_ix = word_to_ix
        self.sequences = self.generate_sequences()

    def generate_sequences(self):
        input_sequences = []
        for line in self.text.split('\n'):
            token_list = [self.word_to_ix[word] for word in line.split()]
            for i in range(1, len(token_list)):
                n_gram_sequence = token_list[:i+1]
                input_sequences.append(n_gram_sequence)
        return input_sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        X = torch.tensor(sequence[:-1])
        y = torch.tensor(sequence[-1])
        return X, y

# Create the dataset and dataloaders
dataset = NextWordDataset(text, word_to_ix)
train_dataset, test_dataset = train_test_split(dataset, test_size=0.2, random_state=42)

def collate_fn(batch):
    # Pad sequences to the length of the longest sequence in the batch
    max_len = max(len(seq[0]) for seq in batch)
    padded_X = [torch.cat([torch.tensor(seq[0]), torch.zeros(max_len - len(seq[0]))]) for seq in batch]
    padded_y = torch.tensor([seq[1] for seq in batch])
    return torch.stack(padded_X), padded_y

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)

# Define the PyTorch model
class NextWordModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(NextWordModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        embedded = self.embedding(x.long())  # Ensure x is of type torch.LongTensor
        lstm_out, _ = self.lstm(embedded.permute(1, 0, 2))  # Permute dimensions
        output = self.fc(lstm_out[-1, :, :])
        return output

# Move the model to GPU
embedding_dim = 50
hidden_dim = 100
model = NextWordModel(total_words, embedding_dim, hidden_dim).to(device)

# Instantiate the model, define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop with time tracking
num_epochs = 100
start_time = time.time()

for epoch in tqdm(range(num_epochs)):
    model.train()
    for X, y in train_loader:
        # Move data to GPU
        X, y = X.to(device), y.to(device)

        optimizer.zero_grad()
        output = model(X)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()

# Save the model after training
torch.save(model.state_dict(), 'next_word_model.pth')




cuda


  padded_X = [torch.cat([torch.tensor(seq[0]), torch.zeros(max_len - len(seq[0]))]) for seq in batch]
100%|██████████| 100/100 [08:11<00:00,  4.91s/it]


Input: was brown, rather darker than coffee THE THE THE THE THE THE THE, Target: colour,, Predicted: seems
Input: here, unless I am THE THE THE THE THE THE THE THE THE, Target: much, Predicted: mistaken,
Input: variable, THE THE THE THE THE THE THE THE THE THE THE THE, Target: geology, Predicted: singular
Input: the second waiting-maid, THE THE THE THE THE THE THE THE THE THE, Target: has, Predicted: of
Input: "'Is £4 a THE THE THE THE THE THE THE THE THE THE, Target: week.', Predicted: few
Input: marriage might have changed his THE THE THE THE THE THE THE THE THE, Target: whole, Predicted: white
Input: Your affection for Arthur blinds you as to THE THE THE THE THE THE, Target: the, Predicted: these
Input: their position, and the loss of it THE THE THE THE THE THE THE, Target: would, Predicted: with
Input: determined to have a look at it anyhow, so I bought a penny bottle, Target: of, Predicted: lady
Input: "G" with a small "t" THE THE THE THE THE THE THE THE THE, Target: woven, Predic

In [21]:
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
from rouge_score import rouge_scorer


# print(word_to_ix.values())

# Evaluation loop with predictions
model.eval()

total_loss = 0.0
total_tokens = 0

references_bleu = []
hypotheses_bleu = []
references_rouge = []
hypotheses_rouge = []

rouge_1_scores = []
rouge_2_scores = []
rouge_l_scores = []
bleu_scores = []

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeLsum'], use_stemmer=True)

with torch.no_grad():
    for X, y in test_loader:
        X, y = X.to(device), y.to(device)

        # print(idx for idx in y[0])

        output = model(X)
        predictions = torch.argmax(output, dim=1)

        # Calculate perplexity
        loss = criterion(output, y)
        total_loss += loss.item()
        total_tokens += y.numel()
        perplexity = math.exp(total_loss / total_tokens)

        for i in range(len(X)):
            if y[i].dim() == 0:  # Check if y[i] is a scalar tensor
                reference_bleu = [list(word_to_ix.keys())[list(word_to_ix.values()).index(y[i].item())]]
            else:
                reference_bleu = [list(word_to_ix.keys())[list(word_to_ix.values()).index(idx.item())] for idx in y[i]]

            # Check if predictions[i] is a scalar tensor
            if predictions[i].dim() == 0:
                hypothesis_bleu = [list(word_to_ix.keys())[list(word_to_ix.values()).index(predictions[i].item())]]
            else:
                hypothesis_bleu = [list(word_to_ix.keys())[list(word_to_ix.values()).index(idx.item())] for idx in predictions[i]]


            # Calculate BLEU for each sentence
            bleu_score = sentence_bleu([reference_bleu], hypothesis_bleu)
            bleu_scores.append(bleu_score)

            reference_rouge = ' '.join(reference_bleu)
            hypothesis_rouge = ' '.join(hypothesis_bleu)

            references_rouge.append(reference_rouge)
            hypotheses_rouge.append(hypothesis_rouge)

            # Calculate ROUGE scores
            rouge_scores = scorer.score(reference_rouge, hypothesis_rouge)
            rouge_1_scores.append(rouge_scores['rouge1'].fmeasure)
            rouge_2_scores.append(rouge_scores['rouge2'].fmeasure)
            rouge_l_scores.append(rouge_scores['rougeLsum'].fmeasure)

            # Print some predictions during evaluation
            if i < 5:
                input_text = ' '.join([list(word_to_ix.keys())[list(word_to_ix.values()).index(idx.item())] for idx in X[i]])
                target_word = list(word_to_ix.keys())[list(word_to_ix.values()).index(y[i].item())]
                predicted_word = list(word_to_ix.keys())[list(word_to_ix.values()).index(predictions[i].item())]
                print(f"Input: {input_text}, Target: {target_word}, Predicted: {predicted_word}")



# Calculate perplexity
perplexity = math.exp(total_loss / total_tokens)
print(f"Perplexity on test set: {perplexity:.2f}")

# Calculate average BLEU score
average_bleu = sum(bleu_scores) / len(bleu_scores)

# Calculate average ROUGE scores
average_rouge_1 = sum(rouge_1_scores) / len(rouge_1_scores)
average_rouge_2 = sum(rouge_2_scores) / len(rouge_2_scores)
average_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)

print(f"Average BLEU score: {average_bleu:.4f}")
print(f"Average ROUGE-1 score: {average_rouge_1:.4f}")
print(f"Average ROUGE-2 score: {average_rouge_2:.4f}")
print(f"Average ROUGE-L score: {average_rouge_l:.4f}")
print(f"Perplexity on test set: {perplexity:.2f}")

# Calculate and print the total training time
end_time = time.time()
training_time = end_time - start_time
print(f"Total training time: {training_time:.2f} seconds")






# Prediction loop
seed_text = "I will leave if they"
next_words = 3

for _ in range(next_words):
    token_list = [word_to_ix[word] for word in seed_text.split()]
    token_tensor = torch.tensor(token_list, dtype=torch.long).unsqueeze(0).to(device)
    predicted = torch.argmax(model(token_tensor), dim=1).item()
    output_word = list(word_to_ix.keys())[list(word_to_ix.values()).index(predicted)]
    seed_text += " " + output_word

print(seed_text)

  padded_X = [torch.cat([torch.tensor(seq[0]), torch.zeros(max_len - len(seq[0]))]) for seq in batch]
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Input: was brown, rather darker than coffee THE THE THE THE THE THE THE, Target: colour,, Predicted: seems
Input: here, unless I am THE THE THE THE THE THE THE THE THE, Target: much, Predicted: mistaken,
Input: variable, THE THE THE THE THE THE THE THE THE THE THE THE, Target: geology, Predicted: singular
Input: the second waiting-maid, THE THE THE THE THE THE THE THE THE THE, Target: has, Predicted: of
Input: "'Is £4 a THE THE THE THE THE THE THE THE THE THE, Target: week.', Predicted: few
Input: marriage might have changed his THE THE THE THE THE THE THE THE THE, Target: whole, Predicted: white
Input: Your affection for Arthur blinds you as to THE THE THE THE THE THE, Target: the, Predicted: these
Input: their position, and the loss of it THE THE THE THE THE THE THE, Target: would, Predicted: with
Input: determined to have a look at it anyhow, so I bought a penny bottle, Target: of, Predicted: lady
Input: "G" with a small "t" THE THE THE THE THE THE THE THE THE, Target: woven, Predic