In [1]:
import re
from nltk.tokenize import sent_tokenize, TreebankWordTokenizer
import pickle

ABBREVIATIONS = {"Mr.", "Mrs.", "Dr.", "Ms.", "Prof.", "Sr.", "Jr.", "vs.", "etc."}

def preprocess_text(text):
    """Replace URLs, mentions, hashtags and numeric expressions with placeholders.

    The rules are applied one after another, each on the output of the previous
    one.  Because the AGE rule runs before the TIME_PERIOD rule and both accept
    "<number> years", such a span is rewritten to AGE and never reaches
    TIME_PERIOD.

    Args:
        text: Raw input string.

    Returns:
        The string with every match replaced by its placeholder token.
    """
    # (pattern, placeholder, regex flags), applied top to bottom.
    substitutions = (
        (r"http\S+|www\S+|https\S+", "URL", 0),
        (r"@\w+", "MENTION", 0),
        (r"#\w+", "HASHTAG", 0),
        (r"\b\d+(\.\d+)?\s?%", "PERCENTAGE", 0),
        (r"\b\d+\s?(years old|yrs old|yo|years|yrs)\b", "AGE", re.IGNORECASE),
        (r"\b\d{1,2}:\d{2}(?:\s?[APap][Mm])?\b", "TIME", 0),
        (r"\b\d+\s?(seconds|minutes|hours|days|weeks|months|years)\b", "TIME_PERIOD", re.IGNORECASE),
    )
    for pattern, placeholder, flags in substitutions:
        text = re.sub(pattern, placeholder, text, flags=flags)
    return text

def clean_text(text):
    """Unwrap hard line breaks inside paragraphs while keeping paragraph breaks.

    The text is split on blank lines ("\\n\\n").  Inside each piece every
    newline, together with the whitespace around it, becomes a single space,
    and the piece is stripped.  The pieces are re-joined with "\\n\\n".

    Args:
        text: Input string.

    Returns:
        The reflowed string.
    """
    return "\n\n".join(
        re.sub(r"\s*\n\s*", " ", block).strip()
        for block in text.split("\n\n")
    )

def fix_abbreviation_splits(text):
    """Mask the period of known abbreviations ahead of sentence splitting.

    For every entry of ABBREVIATIONS that is followed by a whitespace
    character, the "." is replaced by the "__TEMP__" marker and that whitespace
    character by a single space.  restore_abbreviations() reverses the marker.

    Args:
        text: Input string.

    Returns:
        The string with abbreviation periods masked.
    """
    for abbreviation in ABBREVIATIONS:
        matcher = re.compile(rf"\b{re.escape(abbreviation)}\s")
        masked = abbreviation.replace(".", "__TEMP__") + " "
        text = matcher.sub(masked, text)
    return text

def restore_abbreviations(text):
    """Turn every "__TEMP__" marker in ``text`` back into a period."""
    pieces = text.split("__TEMP__")
    return ".".join(pieces)

def custom_nlp_tokenizer(text):
    """Split raw text into sentences of word tokens wrapped in <s> ... </s>.

    Steps: placeholder substitution (preprocess_text), line unwrapping
    (clean_text), abbreviation masking, NLTK sentence splitting, abbreviation
    restoring, then Treebank word tokenisation of each sentence.

    Args:
        text: Raw input string.

    Returns:
        A list with one token list per sentence; each token list starts with
        '<s>' and ends with '</s>'.
    """
    word_tokenizer = TreebankWordTokenizer()

    normalized = clean_text(preprocess_text(text))
    # Mask abbreviation periods so sent_tokenize does not see them.
    protected = fix_abbreviation_splits(normalized)

    tokenized_sentences = []
    for raw_sentence in sent_tokenize(protected):
        sentence = restore_abbreviations(raw_sentence)
        tokenized_sentences.append(['<s>', *word_tokenizer.tokenize(sentence), '</s>'])
    return tokenized_sentences

# Quick demo of custom_nlp_tokenizer on a short sample; the result is printed below the cell.
if __name__ == '__main__':
    # The sample contains line breaks in the middle of sentences, the
    # abbreviation "Mr." and a doubled "!!" at the end.
    text = '''
I am Shravan. What are
you doing here?
Are you
good, Mr. Bingley!!'''
    result = custom_nlp_tokenizer(text)
    print("Tokenized text:", result)

Tokenized text: [['<s>', 'I', 'am', 'Shravan', '.', '</s>'], ['<s>', 'What', 'are', 'you', 'doing', 'here', '?', '</s>'], ['<s>', 'Are', 'you', 'good', ',', 'Mr.', 'Bingley', '!', '</s>'], ['<s>', '!', '</s>']]


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
from torch.utils.data import DataLoader, Dataset
# from tokenizer import custom_nlp_tokenizer
import sys
import os

class FFNNLanguageModel(nn.Module):
    """Feed-forward language model over a fixed-size context window.

    The context token ids are embedded, the embeddings are concatenated, and
    the result goes through Linear -> ReLU -> Dropout -> Linear to produce one
    logit per vocabulary entry.

    Args:
        vocab_size: Number of rows in the embedding table and number of output logits.
        embedding_dim: Size of each token embedding.
        hidden_dim: Width of the hidden layer.
        n_gram: Number of context tokens per example; fixes the input width of
            the first linear layer (n_gram * embedding_dim).
        dropout: Dropout probability applied after the hidden layer.  Defaults
            to 0.2, the value that used to be hard-coded.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_gram, dropout=0.2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc1 = nn.Linear(n_gram * embedding_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return logits of shape (batch, vocab_size) for token ids ``x`` of shape (batch, n_gram)."""
        # Concatenate the per-token embeddings into one vector per example.
        x = self.embedding(x).view(x.shape[0], -1)
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x


class RNNLanguageModel(nn.Module):
    """Next-word model: embedding -> single nn.RNN (batch_first) -> linear projection.

    Args:
        vocab_size: Number of rows in the embedding table and number of output logits.
        embedding_dim: Size of each token embedding (RNN input size).
        hidden_dim: RNN hidden size.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return logits of shape (batch, vocab_size) for token ids ``x`` of shape (batch, seq_len)."""
        embedded = self.embedding(x)
        hidden_states = self.rnn(embedded)[0]
        # Only the output at the final time step is projected to the vocabulary.
        final_step = hidden_states[:, -1, :]
        return self.fc(final_step)


class LSTMLanguageModel(nn.Module):
    """Next-word model: embedding -> single nn.LSTM (batch_first) -> linear projection.

    Args:
        vocab_size: Number of rows in the embedding table and number of output logits.
        embedding_dim: Size of each token embedding (LSTM input size).
        hidden_dim: LSTM hidden size.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return logits of shape (batch, vocab_size) for token ids ``x`` of shape (batch, seq_len)."""
        embedded = self.embedding(x)
        sequence_out, _state = self.lstm(embedded)
        # Only the output at the final time step is projected to the vocabulary.
        final_step = sequence_out[:, -1, :]
        return self.fc(final_step)


class TextDataset(Dataset):
    """Dataset of (context words, target word) pairs served as index tensors.

    Args:
        ngrams: Sequence of (context, target) pairs, where context is an
            iterable of words and target is a single word.
        vocab: Mapping from word to integer id.  Words missing from the
            mapping are encoded as id 0.
    """

    def __init__(self, ngrams, vocab):
        self.vocab = vocab
        self.ngrams = ngrams

    def __len__(self):
        return len(self.ngrams)

    def __getitem__(self, idx):
        """Return (context ids, target id) for example ``idx`` as long tensors."""
        context_words, target_word = self.ngrams[idx]
        lookup = self.vocab.get
        context_ids = [lookup(word, 0) for word in context_words]
        return (
            torch.tensor(context_ids, dtype=torch.long),
            torch.tensor(lookup(target_word, 0), dtype=torch.long),
        )


def generate_ngrams(tokenized_text, n):
    """Build (context, target) training pairs from tokenised sentences.

    Every window of ``n`` consecutive tokens that is followed by at least one
    more token yields a pair: the window as context and the following token as
    target.  Sentences with fewer than ``n`` tokens are skipped.

    Args:
        tokenized_text: Iterable of token lists, one per sentence.
        n: Number of context tokens per pair.

    Returns:
        A list of (context token list, target token) tuples in sentence order.
    """
    return [
        (sentence[start:start + n], sentence[start + n])
        for sentence in tokenized_text
        if len(sentence) >= n
        for start in range(len(sentence) - n)
    ]


class Training:
    """Train a language model with Adam and cross-entropy, then save its weights.

    Args:
        model: The nn.Module to train; it is moved to CUDA when available, else CPU.
        dataloader: Iterable of (context, target) tensor batches that supports len().
        vocab: Word-to-id mapping (stored on the instance; not used by train_model).
        inv_vocab: Id-to-word mapping (stored on the instance; not used by train_model).
        epochs: Number of passes over ``dataloader``.
        lr: Adam learning rate.
        save_path: Directory in which ``model.pt`` is written after training.
    """

    def __init__(self, model, dataloader, vocab, inv_vocab, epochs=10, lr=0.001, save_path="/kaggle/working/"):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(self.device)
        self.model = model.to(self.device)
        self.dataloader = dataloader
        self.vocab = vocab
        self.inv_vocab = inv_vocab
        self.epochs = epochs
        self.lr = lr
        self.save_path = save_path

    def train_model(self):
        """Run the training loop, print the mean batch loss per epoch and save the state dict."""
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        num_batches = len(self.dataloader)

        # Make sure layers such as Dropout are in training mode even if the
        # model was previously switched to eval().
        self.model.train()

        for epoch in range(self.epochs):
            total_loss = 0.0
            for context, target in self.dataloader:
                context, target = context.to(self.device), target.to(self.device)

                optimizer.zero_grad()
                output = self.model(context)
                loss = criterion(output, target)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

            # An empty loader has no mean loss; report NaN instead of dividing by zero.
            mean_loss = total_loss / num_batches if num_batches else float("nan")
            print(f"Epoch {epoch + 1}, Loss: {mean_loss}")

        # Create the target directory itself (not just its parent) before writing into it.
        os.makedirs(self.save_path, exist_ok=True)
        torch.save(self.model.state_dict(), os.path.join(self.save_path,'model.pt'))
        print(f"Full model saved at {self.save_path}")


def compute_perplexity2(model, dataloader):
    """Compute perplexity of ``model`` over all targets yielded by ``dataloader``.

    Perplexity is exp of the negative mean log-probability the model assigns to
    the target ids.  The model is evaluated in eval mode (so Dropout is
    inactive) under no_grad; its previous train/eval mode is restored before
    returning.

    Args:
        model: nn.Module mapping a batch of context ids to (batch, vocab) logits.
        dataloader: Iterable of (context, target) tensor batches.

    Returns:
        A 0-d tensor holding the perplexity, or NaN when the loader yields no
        targets.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    total_log_prob = 0.0
    total_words = 0

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for context, target in dataloader:
                context, target = context.to(device), target.to(device)
                output = model(context)
                log_probs = torch.log_softmax(output, dim=1)
                # Pick, for every row, the log-probability of its target id.
                batch_log_prob = log_probs.gather(1, target.unsqueeze(1)).squeeze(1)
                total_log_prob += batch_log_prob.sum().item()
                total_words += target.shape[0]
    finally:
        model.train(was_training)

    if total_words == 0:
        return torch.tensor(float("nan"))
    return torch.exp(torch.tensor(-total_log_prob / total_words))



def create_vocab(tokenized_text, path):
    """Build word<->id mappings from tokenised sentences and pickle them under ``path``.

    Ids are assigned in sorted token order, so the same corpus always yields
    the same mapping (set iteration order of strings can differ between
    interpreter runs).  Tokens are assumed to be mutually comparable, e.g. all
    strings.

    Args:
        tokenized_text: Iterable of token lists, one per sentence.
        path: Directory that receives ``vocab.pkl`` and ``inv_vocab.pkl``; it is
            created when missing.

    Returns:
        (vocab, inv_vocab): word -> id and id -> word dictionaries.
    """
    # A set comprehension is linear in corpus size; sum(lists, []) re-copies
    # the accumulated list for every sentence.
    unique_tokens = sorted({token for sentence in tokenized_text for token in sentence})
    vocab = {word: idx for idx, word in enumerate(unique_tokens)}
    inv_vocab = {idx: word for word, idx in vocab.items()}

    os.makedirs(path, exist_ok=True)

    vocab_path = os.path.join(path, "vocab.pkl")
    inv_vocab_path = os.path.join(path, "inv_vocab.pkl")

    with open(vocab_path, "wb") as f:
        pickle.dump(vocab, f)

    with open(inv_vocab_path, "wb") as f:
        pickle.dump(inv_vocab, f)

    print(f"Vocabulary saved at {vocab_path} and {inv_vocab_path}")
    return vocab, inv_vocab


def _ngram_key(pair):
    """Return a hashable stand-in for ``pair``: lists and tuples become nested tuples."""
    if isinstance(pair, (list, tuple)):
        return tuple(_ngram_key(part) for part in pair)
    return pair


def split_dataset(ngram_data, test_size=1000):
    """Randomly hold out ``test_size`` pairs and return (train_set, test_set).

    The test set is drawn with random.sample.  The train set keeps, in their
    original order, all pairs that are not equal to any held-out pair, so
    duplicates of a held-out pair are removed from training as well.

    Args:
        ngram_data: List of (context, target) pairs as produced by generate_ngrams.
        test_size: Number of pairs to hold out; random.sample raises ValueError
            when it exceeds len(ngram_data).

    Returns:
        (train_set, test_set) as lists.
    """
    test_set = random.sample(ngram_data, test_size)
    try:
        # Hashed membership instead of scanning the whole test list for every pair.
        held_out = {_ngram_key(pair) for pair in test_set}
        train_set = [pair for pair in ngram_data if _ngram_key(pair) not in held_out]
    except TypeError:
        # Some element is not hashable even after conversion; fall back to the
        # plain equality scan.
        train_set = [pair for pair in ngram_data if pair not in test_set]
    return train_set, test_set


def predict_next_word(model, vocab, inv_vocab, n_gram, sentence, k):
    """Print the ``k`` most probable next words for the last ``n_gram`` tokens of ``sentence``.

    The model is run in eval mode (Dropout inactive) without gradient tracking
    and its previous train/eval mode is restored afterwards.  ``k`` is capped
    at the number of output classes.  One "word probability" line is printed
    per candidate, most probable first.

    Args:
        model: nn.Module mapping (1, n_gram) token ids to (1, vocab) logits.
        vocab: Word-to-id mapping; unknown words are encoded as id 0.
        inv_vocab: Id-to-word mapping used for printing.
        n_gram: Number of trailing tokens used as context.
        sentence: List of tokens.
        k: Number of candidates to print.

    Returns:
        None.  Prints "Sentence too short!" and returns when ``sentence`` has
        fewer than ``n_gram`` tokens.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    if len(sentence) < n_gram:
        print("Sentence too short!")
        return

    context = sentence[-n_gram:]
    context_tensor = torch.tensor([vocab.get(word, 0) for word in context], dtype=torch.long).unsqueeze(0).to(device)

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output = model(context_tensor)
    finally:
        model.train(was_training)

    # Drop only the batch dimension so a one-word vocabulary stays 1-d.
    probs = torch.softmax(output, dim=1).squeeze(0)
    # topk raises when asked for more entries than exist.
    top_k = torch.topk(probs, min(k, probs.shape[0]))

    for idx, prob in zip(top_k.indices, top_k.values):
        print(f"{inv_vocab[idx.item()]} {prob.item():.4f}")


# def main():
# if __name__ == "__main__":
#     main()

In [3]:
import os
import torch
import torch.nn.functional as F

def compute_and_save_perplexities(model, train_loader, test_loader, vocab, inv_vocab, model_name, n_gram, corpus_name,
                                  results_dir="/kaggle/working/results"):
    """Write per-example and average perplexities for the train and test loaders.

    Two files are written into ``results_dir``:
    ``{model_name}_train_{n_gram}_{corpus_name}.txt`` and
    ``{model_name}_test_{n_gram}_{corpus_name}.txt``.  Each starts with an
    "Average Perplexity" line followed by one "context -> target<TAB>value"
    line per example, where value is exp(-log p(target | context)).

    The model is moved to CUDA when available and switched to eval mode; it is
    left in eval mode on return.

    Args:
        model: nn.Module mapping a batch of context ids to (batch, vocab) logits.
        train_loader: Iterable of (context, target) tensor batches.
        test_loader: Iterable of (context, target) tensor batches.
        vocab: Word-to-id mapping (accepted for interface compatibility; unused).
        inv_vocab: Id-to-word mapping used to render contexts and targets.
        model_name: Prefix for the output file names.
        n_gram: Context size, used only in the file names.
        corpus_name: Corpus label, used only in the file names.
        results_dir: Output directory, created when missing.  Defaults to the
            location that used to be hard-coded.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()  # Set model to evaluation mode

    os.makedirs(results_dir, exist_ok=True)

    def compute_perplexity(loader, filename):
        path = os.path.join(results_dir, filename)
        print(filename)

        total_log_prob = 0.0
        total_words = 0
        perplexity_lines = []

        with torch.no_grad():
            for context, target in loader:
                context, target = context.to(device), target.to(device)
                output = model(context)
                log_probs = F.log_softmax(output, dim=1)
                # Log-probability assigned to each row's target id.
                batch_log_prob = log_probs.gather(1, target.unsqueeze(1)).squeeze(1)

                total_log_prob += batch_log_prob.sum().item()
                total_words += target.shape[0]

                # Convert context indices back to words
                context_words = [" ".join(inv_vocab[idx.item()] for idx in sent) for sent in context]
                target_words = [inv_vocab[idx.item()] for idx in target]

                for ctx, tgt, log_prob in zip(context_words, target_words, batch_log_prob):
                    perplexity_lines.append(f"{ctx} -> {tgt}\t{torch.exp(-log_prob).item():.4f}\n")

        # Compute the average perplexity (NaN when the loader produced no examples)
        if total_words:
            avg_perplexity = torch.exp(torch.tensor(-total_log_prob / total_words)).item()
        else:
            avg_perplexity = float("nan")

        # Write to file with average perplexity at the top; tokens may be non-ASCII.
        with open(path, "w", encoding="utf-8") as f:
            f.write(f"Average Perplexity: {avg_perplexity:.4f}\n")
            f.writelines(perplexity_lines)

        print(f"Saved perplexity scores with sentences in: {path}")

    # Create filenames
    train_filename = f"{model_name}_train_{n_gram}_{corpus_name}.txt"
    test_filename = f"{model_name}_test_{n_gram}_{corpus_name}.txt"

    # Compute perplexities for train and test
    compute_perplexity(train_loader, train_filename)
    compute_perplexity(test_loader, test_filename)

In [4]:
# if len(sys.argv) < 4:
#     print("Usage: python3 generator.py <lm_type> <corpus_path> <k>")
#     sys.exit(1)

# lm_type = '-f'
# corpus_path = '/kaggle/input/external/Pride and Prejudice - Jane Austen.txt'
# corpus_path = '/kaggle/input/external/Ulysses - James Joyce.txt'
import os

k = 3  # top-k size for the interactive predict_next_word loop (currently commented out)

# Seed the RNGs behind the train/test split, weight initialisation and batch
# shuffling so that reruns start from the same random state.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Reading and tokenising a corpus does not depend on the model type or on n,
# so each corpus is processed once and reused by the inner loops.
tokenized_corpora = {}

for lm_type in ['-r', '-l', '-f']:
    for corpus_path in ['/kaggle/input/external/Pride and Prejudice - Jane Austen.txt', '/kaggle/input/external/Ulysses - James Joyce.txt']:
        for n_gram in [3, 5]:

            if corpus_path not in tokenized_corpora:
                with open(corpus_path, 'r', encoding='utf-8') as f:
                    tokenized_corpora[corpus_path] = custom_nlp_tokenizer(f.read())
            tokenized_text = tokenized_corpora[corpus_path]
            corpus_name = os.path.basename(corpus_path).replace(" ", "_").replace(".txt", "")

            # One directory per (model, corpus, n) run; it receives both the
            # vocabulary pickles and model.pt.  create_vocab creates it.
            save_path = f"/kaggle/working/{lm_type[1]}nn_{corpus_name}_n_{n_gram}"
            vocab, inv_vocab = create_vocab(tokenized_text, path=save_path)

            ngram_data = generate_ngrams(tokenized_text, n_gram)  # Correct n_gram usage
            train_set, test_set = split_dataset(ngram_data)

            train_dataset = TextDataset(train_set, vocab)
            test_dataset = TextDataset(test_set, vocab)
            train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
            test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False)

            if lm_type == '-f':
                model = FFNNLanguageModel(len(vocab), embedding_dim=100, hidden_dim=200, n_gram=n_gram)
            elif lm_type == '-r':
                model = RNNLanguageModel(len(vocab), embedding_dim=100, hidden_dim=200)
            elif lm_type == '-l':
                model = LSTMLanguageModel(len(vocab), embedding_dim=100, hidden_dim=200)
            else:
                print("Invalid model type! Use '-f' for FFNN, '-r' for RNN, '-l' for LSTM")
                sys.exit(1)

            epochs = 5 if corpus_path=='/kaggle/input/external/Pride and Prejudice - Jane Austen.txt' else 15
            Trainer = Training(model, train_loader, vocab, inv_vocab, epochs=epochs, lr=0.001, save_path=save_path)
            Trainer.train_model()

            # Evaluate with Dropout switched off; training leaves the model in train mode.
            model.eval()

            perplexity = compute_perplexity2(model, train_loader)
            print(f"Train Perplexity for {lm_type} on {corpus_name} (n={n_gram}): {perplexity}")

            perplexity = compute_perplexity2(model, test_loader)
            print(f"Test Perplexity for {lm_type} on {corpus_name} (n={n_gram}): {perplexity}")

            compute_and_save_perplexities(model=model, train_loader=train_loader, test_loader=test_loader, vocab=vocab, inv_vocab=inv_vocab, model_name=lm_type[1] + "nn", n_gram=n_gram, corpus_name=corpus_name)

# while True:
#     sentence = input("Input sentence: ").strip().split()
#     predict_next_word(model, vocab, inv_vocab, n_gram, sentence, k)

Vocabulary saved at /kaggle/working/rnn_Pride_and_Prejudice_-_Jane_Austen_n_3/vocab.pkl and /kaggle/working/rnn_Pride_and_Prejudice_-_Jane_Austen_n_3/inv_vocab.pkl
cuda
Epoch 1, Loss: 5.550588242790916
Epoch 2, Loss: 4.744100943478671
Epoch 3, Loss: 4.411708566925743
Epoch 4, Loss: 4.152533916126598
Epoch 5, Loss: 3.9281965780258177
Full model saved at /kaggle/working/rnn_Pride_and_Prejudice_-_Jane_Austen_n_3
Train Perplexity for -r on Pride_and_Prejudice_-_Jane_Austen (n=3): 37.40426254272461
Test Perplexity for -r on Pride_and_Prejudice_-_Jane_Austen (n=3): 143.08645629882812
rnn_train_3_Pride_and_Prejudice_-_Jane_Austen.txt
Saved perplexity scores with sentences in: /kaggle/working/results/rnn_train_3_Pride_and_Prejudice_-_Jane_Austen.txt
rnn_test_3_Pride_and_Prejudice_-_Jane_Austen.txt
Saved perplexity scores with sentences in: /kaggle/working/results/rnn_test_3_Pride_and_Prejudice_-_Jane_Austen.txt
Vocabulary saved at /kaggle/working/rnn_Pride_and_Prejudice_-_Jane_Austen_n_5/vocab

In [5]:
# Shell escape: recursively archive everything under /kaggle/working into everything.zip.
!zip -r everything.zip /kaggle/working

  adding: kaggle/working/ (stored 0%)
  adding: kaggle/working/lnn_Ulysses_-_James_Joyce_n_5/ (stored 0%)
  adding: kaggle/working/lnn_Ulysses_-_James_Joyce_n_5/model.pt (deflated 7%)
  adding: kaggle/working/lnn_Ulysses_-_James_Joyce_n_5/inv_vocab.pkl (deflated 42%)
  adding: kaggle/working/lnn_Ulysses_-_James_Joyce_n_5/vocab.pkl (deflated 42%)
  adding: kaggle/working/fnn_Pride_and_Prejudice_-_Jane_Austen_n_3/ (stored 0%)
  adding: kaggle/working/fnn_Pride_and_Prejudice_-_Jane_Austen_n_3/model.pt (deflated 8%)
  adding: kaggle/working/fnn_Pride_and_Prejudice_-_Jane_Austen_n_3/inv_vocab.pkl (deflated 45%)
  adding: kaggle/working/fnn_Pride_and_Prejudice_-_Jane_Austen_n_3/vocab.pkl (deflated 45%)
  adding: kaggle/working/rnn_Ulysses_-_James_Joyce_n_5/ (stored 0%)
  adding: kaggle/working/rnn_Ulysses_-_James_Joyce_n_5/model.pt (deflated 8%)
  adding: kaggle/working/rnn_Ulysses_-_James_Joyce_n_5/inv_vocab.pkl (deflated 42%)
  adding: kaggle/working/rnn_Ulysses_-_James_Joyce_n_5/vocab.pkl