In [10]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory, in the same
# order os.walk visits them, and print one full path per line.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [11]:
import torch
import torch.nn.functional as F
from torch import nn
import torch.optim as optim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint
import time
from sklearn.manifold import TSNE
from tqdm import tqdm
import pickle
import json
import nltk
import urllib.request

In [12]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [13]:
# Download the raw training corpus as one UTF-8 string.
url = "https://cs.stanford.edu/people/karpathy/char-rnn/warpeace_input.txt"
# The context manager closes the HTTP connection as soon as the body has been
# read, instead of leaving the socket open until garbage collection.
with urllib.request.urlopen(url) as response:
    text = response.read().decode("utf-8")

# Tokenizer data used by nltk.tokenize.word_tokenize.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [14]:
def preprocess_text(text):
    """Return the alphabetic word tokens of ``text``, lower-cased.

    The text is lower-cased, split with NLTK's ``word_tokenize`` and every
    token that is not purely alphabetic (``str.isalpha``) is discarded.
    """
    lowered = text.lower()
    return list(filter(str.isalpha, nltk.tokenize.word_tokenize(lowered)))

# Tokenize the corpus once; the vocabulary and lookup tables derive from it.
words = preprocess_text(text)

# Sorted unique corpus words, followed by the two special entries at the end.
vocab = [*sorted(set(words)), 'unknown token', 'padding']
vocab_size = len(vocab)

# Lookup tables in both directions (index -> word and word -> index).
idx_to_word = dict(enumerate(vocab))
word_to_idx = {word: idx for idx, word in idx_to_word.items()}

In [15]:
class RNNTextGenerator(nn.Module):
    """Next-token scorer: embedding -> single LSTM layer -> linear projection.

    ``forward`` takes a batch of token-index sequences (batch dimension first)
    and returns one row of ``vocab_size`` unnormalised scores per sequence,
    computed from the LSTM output at the last time step.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.rnn = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x):
        embedded = self.embedding(x)
        outputs, _state = self.rnn(embedded)
        # Only the final time step is used to score the next token.
        last_step = outputs[:, -1, :]
        return self.fc(last_step)



In [16]:
def prepare_data(words, context_length, vocab_index=None):
    """Build (context, next-word) training pairs as integer index arrays.

    Parameters
    ----------
    words : list of str
        Tokenized corpus, in reading order.
    context_length : int
        Number of preceding words in each input window; must be >= 1.
    vocab_index : dict, optional
        Mapping word -> integer index. Defaults to the module-level
        ``word_to_idx``.

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, context_length), dtype int64
        Row ``j`` holds the indices of ``words[j:j + context_length]``.
    y : numpy.ndarray, shape (n_samples,), dtype int64
        ``y[j]`` is the index of ``words[j + context_length]``.

    ``n_samples`` is ``max(len(words) - context_length, 0)``; when it is zero
    the arrays are empty but keep the shapes above.

    Raises
    ------
    ValueError
        If ``context_length`` is smaller than 1.
    KeyError
        If a word is missing from the mapping and the mapping has no
        ``'unknown token'`` entry to fall back on.
    """
    if context_length < 1:
        raise ValueError(f"context_length must be >= 1, got {context_length}")
    if vocab_index is None:
        vocab_index = word_to_idx

    # Encode every word exactly once instead of once per window it appears in.
    # Out-of-vocabulary words fall back to the 'unknown token' entry when the
    # mapping provides one.
    unk_idx = vocab_index.get('unknown token')
    if unk_idx is None:
        encoded = [vocab_index[word] for word in words]
    else:
        encoded = [vocab_index.get(word, unk_idx) for word in words]

    n_samples = len(encoded) - context_length
    if n_samples <= 0:
        # Keep the 2-D shape so downstream tensor code sees consistent ranks.
        return (np.empty((0, context_length), dtype=np.int64),
                np.empty((0,), dtype=np.int64))

    X = [encoded[i - context_length:i] for i in range(context_length, len(encoded))]
    y = encoded[context_length:]
    return np.array(X, dtype=np.int64), np.array(y, dtype=np.int64)

In [17]:
def train_model(context_length, k, random_seed, epochs, *,
                embedding_dim=64, hidden_dim=128, batch_size=32, lr=0.001):
    """Train one RNNTextGenerator on the module-level corpus and save it.

    Parameters
    ----------
    context_length : int
        Number of preceding words fed to the model for each prediction.
    k : int
        Only used to label the output file names; it does not influence
        training in this function.
    random_seed : int
        Seed passed to ``torch.manual_seed`` before the model is created and
        the data is shuffled.
    epochs : int
        Number of full passes over the training pairs.
    embedding_dim, hidden_dim, batch_size, lr : keyword-only, optional
        Model and optimiser hyperparameters. The defaults are the values that
        were previously hard-coded.

    Returns
    -------
    (model_path, vocab_path) : tuple of str
        Paths of the saved ``state_dict`` and of the JSON vocabulary file,
        both written to the current working directory.

    Raises
    ------
    ValueError
        If the corpus yields no training pairs for ``context_length``.
    """
    torch.manual_seed(random_seed)  # Set random seed for reproducibility

    X, y = prepare_data(words, context_length)
    if len(X) == 0:
        # Without this guard the epoch-average below divides by zero.
        raise ValueError(
            f"No training samples: corpus has {len(words)} words, "
            f"context_length is {context_length}"
        )

    model = RNNTextGenerator(vocab_size, embedding_dim, hidden_dim).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    train_data = torch.utils.data.TensorDataset(
        torch.tensor(X, dtype=torch.long),
        torch.tensor(y, dtype=torch.long),
    )
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)

    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        for X_batch, y_batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            output = model(X_batch)
            loss = criterion(output, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        # Reported value is the mean of the per-batch losses.
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")

    model_path = f'rnn_text_generator_c{context_length}_k{k}_seed{random_seed}.pth'
    torch.save(model.state_dict(), model_path)

    # NOTE: JSON object keys are always strings, so the integer keys of
    # idx_to_word are written as "0", "1", ...; convert them back with int()
    # after loading this file.
    vocab_data = {
        "word_to_idx": word_to_idx,
        "idx_to_word": idx_to_word
    }
    vocab_path = f'vocab_c{context_length}_k{k}_seed{random_seed}.json'
    with open(vocab_path, 'w') as f:
        json.dump(vocab_data, f)

    return model_path, vocab_path

In [18]:
# Hyperparameter grid; one model is trained per (context_length, k, seed).
context_lengths = [3, 4]
k_values = [1, 3]
random_seeds = [42, 123]
# Epoch budget per run, indexed by the run's position in the grid order below.
epochs_list = [10, 10, 5, 5, 3, 3, 3, 3]

# Flatten the grid with context_length outermost and seed innermost.
# NOTE(review): the logged losses for k=1 and k=3 are identical for a given
# context length and seed, so the k runs appear to repeat the same training
# - confirm whether k is meant to affect training.
run_grid = [
    (c, k_value, seed)
    for c in context_lengths
    for k_value in k_values
    for seed in random_seeds
]

model_count = 0
for context_length, k, random_seed in run_grid:
    model_path, vocab_path = train_model(
        context_length, k, random_seed, epochs_list[model_count]
    )
    print(f"Model trained and saved: {model_path}, Vocabulary saved: {vocab_path}")
    model_count += 1

Epoch 1/10: 100%|██████████| 17514/17514 [00:53<00:00, 324.41it/s]


Epoch 1, Loss: 5.9636


Epoch 2/10: 100%|██████████| 17514/17514 [00:52<00:00, 330.49it/s]


Epoch 2, Loss: 5.4729


Epoch 3/10: 100%|██████████| 17514/17514 [00:52<00:00, 331.13it/s]


Epoch 3, Loss: 5.2711


Epoch 4/10: 100%|██████████| 17514/17514 [00:52<00:00, 332.46it/s]


Epoch 4, Loss: 5.1257


Epoch 5/10: 100%|██████████| 17514/17514 [00:53<00:00, 329.18it/s]


Epoch 5, Loss: 5.0093


Epoch 6/10: 100%|██████████| 17514/17514 [00:52<00:00, 330.82it/s]


Epoch 6, Loss: 4.9114


Epoch 7/10: 100%|██████████| 17514/17514 [00:52<00:00, 332.35it/s]


Epoch 7, Loss: 4.8251


Epoch 8/10: 100%|██████████| 17514/17514 [00:53<00:00, 330.11it/s]


Epoch 8, Loss: 4.7526


Epoch 9/10: 100%|██████████| 17514/17514 [00:52<00:00, 330.73it/s]


Epoch 9, Loss: 4.6836


Epoch 10/10: 100%|██████████| 17514/17514 [00:52<00:00, 333.28it/s]


Epoch 10, Loss: 4.6167
Model trained and saved: rnn_text_generator_c3_k1_seed42.pth, Vocabulary saved: vocab_c3_k1_seed42.json


Epoch 1/10: 100%|██████████| 17514/17514 [00:52<00:00, 330.94it/s]


Epoch 1, Loss: 5.9683


Epoch 2/10: 100%|██████████| 17514/17514 [00:52<00:00, 331.34it/s]


Epoch 2, Loss: 5.4681


Epoch 3/10: 100%|██████████| 17514/17514 [00:52<00:00, 330.99it/s]


Epoch 3, Loss: 5.2704


Epoch 4/10: 100%|██████████| 17514/17514 [00:53<00:00, 330.43it/s]


Epoch 4, Loss: 5.1315


Epoch 5/10: 100%|██████████| 17514/17514 [00:52<00:00, 331.46it/s]


Epoch 5, Loss: 5.0200


Epoch 6/10: 100%|██████████| 17514/17514 [00:52<00:00, 332.19it/s]


Epoch 6, Loss: 4.9243


Epoch 7/10: 100%|██████████| 17514/17514 [00:52<00:00, 331.40it/s]


Epoch 7, Loss: 4.8360


Epoch 8/10: 100%|██████████| 17514/17514 [00:52<00:00, 330.61it/s]


Epoch 8, Loss: 4.7637


Epoch 9/10: 100%|██████████| 17514/17514 [00:52<00:00, 331.20it/s]


Epoch 9, Loss: 4.6983


Epoch 10/10: 100%|██████████| 17514/17514 [00:52<00:00, 331.95it/s]


Epoch 10, Loss: 4.6354
Model trained and saved: rnn_text_generator_c3_k1_seed123.pth, Vocabulary saved: vocab_c3_k1_seed123.json


Epoch 1/5: 100%|██████████| 17514/17514 [00:52<00:00, 330.61it/s]


Epoch 1, Loss: 5.9636


Epoch 2/5: 100%|██████████| 17514/17514 [00:52<00:00, 331.26it/s]


Epoch 2, Loss: 5.4729


Epoch 3/5: 100%|██████████| 17514/17514 [00:52<00:00, 331.94it/s]


Epoch 3, Loss: 5.2711


Epoch 4/5: 100%|██████████| 17514/17514 [00:52<00:00, 332.35it/s]


Epoch 4, Loss: 5.1257


Epoch 5/5: 100%|██████████| 17514/17514 [00:52<00:00, 331.03it/s]


Epoch 5, Loss: 5.0093
Model trained and saved: rnn_text_generator_c3_k3_seed42.pth, Vocabulary saved: vocab_c3_k3_seed42.json


Epoch 1/5: 100%|██████████| 17514/17514 [00:52<00:00, 333.05it/s]


Epoch 1, Loss: 5.9683


Epoch 2/5: 100%|██████████| 17514/17514 [00:52<00:00, 331.18it/s]


Epoch 2, Loss: 5.4681


Epoch 3/5: 100%|██████████| 17514/17514 [00:52<00:00, 331.89it/s]


Epoch 3, Loss: 5.2704


Epoch 4/5: 100%|██████████| 17514/17514 [00:52<00:00, 332.46it/s]


Epoch 4, Loss: 5.1315


Epoch 5/5: 100%|██████████| 17514/17514 [00:52<00:00, 331.42it/s]


Epoch 5, Loss: 5.0200
Model trained and saved: rnn_text_generator_c3_k3_seed123.pth, Vocabulary saved: vocab_c3_k3_seed123.json


Epoch 1/3: 100%|██████████| 17514/17514 [00:53<00:00, 330.13it/s]


Epoch 1, Loss: 5.9596


Epoch 2/3: 100%|██████████| 17514/17514 [00:52<00:00, 332.19it/s]


Epoch 2, Loss: 5.4531


Epoch 3/3: 100%|██████████| 17514/17514 [00:52<00:00, 331.75it/s]


Epoch 3, Loss: 5.2492
Model trained and saved: rnn_text_generator_c4_k1_seed42.pth, Vocabulary saved: vocab_c4_k1_seed42.json


Epoch 1/3: 100%|██████████| 17514/17514 [00:52<00:00, 331.67it/s]


Epoch 1, Loss: 5.9622


Epoch 2/3: 100%|██████████| 17514/17514 [00:52<00:00, 331.51it/s]


Epoch 2, Loss: 5.4491


Epoch 3/3: 100%|██████████| 17514/17514 [00:52<00:00, 332.79it/s]


Epoch 3, Loss: 5.2470
Model trained and saved: rnn_text_generator_c4_k1_seed123.pth, Vocabulary saved: vocab_c4_k1_seed123.json


Epoch 1/3: 100%|██████████| 17514/17514 [00:53<00:00, 330.32it/s]


Epoch 1, Loss: 5.9596


Epoch 2/3: 100%|██████████| 17514/17514 [00:52<00:00, 333.79it/s]


Epoch 2, Loss: 5.4531


Epoch 3/3: 100%|██████████| 17514/17514 [00:52<00:00, 332.31it/s]


Epoch 3, Loss: 5.2492
Model trained and saved: rnn_text_generator_c4_k3_seed42.pth, Vocabulary saved: vocab_c4_k3_seed42.json


Epoch 1/3: 100%|██████████| 17514/17514 [00:52<00:00, 332.22it/s]


Epoch 1, Loss: 5.9622


Epoch 2/3: 100%|██████████| 17514/17514 [00:52<00:00, 333.48it/s]


Epoch 2, Loss: 5.4491


Epoch 3/3: 100%|██████████| 17514/17514 [00:52<00:00, 333.28it/s]


Epoch 3, Loss: 5.2470
Model trained and saved: rnn_text_generator_c4_k3_seed123.pth, Vocabulary saved: vocab_c4_k3_seed123.json
