In [34]:
import re
# Load the corpus and lowercase it so the vocabulary is case-insensitive.
with open("combined.txt", mode='r', encoding='utf-8') as f:
    text = f.read().lower()

# Whitespace tokenisation: punctuation stays attached to the neighbouring word.
words = text.split()

# Word-level vocabulary: each distinct token, in sorted order, gets a dense id.
unique_words = sorted(set(words))
vocab_size = len(unique_words)
idx2word = dict(enumerate(unique_words))
word2idx = {token: token_id for token_id, token in idx2word.items()}

# The whole corpus re-expressed as a list of token ids.
encoded_text = list(map(word2idx.__getitem__, words))


In [32]:
# Number of distinct whitespace-delimited tokens found in the corpus.
print(vocab_size)

5752


In [33]:

# Dump the complete word -> id mapping (one entry per vocabulary word).
# NOTE(review): the "chars" label looks like a leftover from a character-level
# version; word2idx maps whole words, and printing every entry floods the
# cell output -- consider printing only a small sample.
print("chars",word2idx)

chars {"'artificial": 0, "'can": 1, "'computing": 2, "'inspect'": 3, "'machines": 4, "'what": 5, '(': 6, '(1)': 7, '(1951)': 8, '(1958)': 9, '(1960)': 10, '(1969).': 11, '(1970).': 12, '(1984)': 13, '(1985)': 14, '(1986)': 15, '(1990),': 16, '(1991)': 17, '(1994)': 18, '(1998),': 19, '(1999).': 20, '(1st': 21, '(2)': 22, '(20': 23, '(2010).': 24, '(2012)': 25, '(2013).': 26, '(2014)': 27, '(2015)': 28, '(2015),': 29, '(2017)': 30, '(2017).': 31, '(2018)': 32, '(2021)': 33, '(2021).': 34, '(2022)': 35, '(2022).': 36, '(2024).': 37, '(2nd': 38, '(3)': 39, '(3rd': 40, '(4)': 41, '(4th': 42, '(5)': 43, '(a': 44, '(above': 45, '(acm': 46, '(adsense,': 47, '(agi)': 48, '(ai)': 49, '(ai).': 50, '(also': 51, '(analogous': 52, '(and': 53, '(ann):': 54, '(anns)': 55, "(apple's": 56, '(arl)': 57, '(as': 58, '(asr)': 59, '(asr).': 60, '(based': 61, '(both': 62, '(bsde).': 63, '(called': 64, '(cao).': 65, '(cap': 66, '(cap)': 67, '(captions)': 68, '(cerebellar': 69, '(cnns)': 70, '(commercial': 71,

In [2]:

import torch
from torch.utils.data import Dataset, DataLoader

class WordDataset(Dataset):
    """Sliding-window next-word dataset over a sequence of token ids.

    Item ``i`` is the pair ``(x, y)`` where ``x`` is
    ``data[i : i + block_size]`` and ``y`` is the same window shifted one
    position to the right, so ``y[t]`` is the token that follows ``x[t]``.

    Args:
        data: Sequence of integer token ids (a list or a 1-D tensor).
        block_size: Number of tokens in each input/target window.
    """

    def __init__(self, data, block_size):
        # Convert once up front so each item access slices a tensor instead of
        # rebuilding a tensor from a Python list slice.
        self.data = torch.as_tensor(data, dtype=torch.long)
        self.block_size = block_size

    def __len__(self):
        # Each item needs block_size + 1 tokens (inputs plus the last target).
        # Clamp at zero: len() raises ValueError on a negative result, which
        # happened whenever the data was shorter than block_size.
        return max(0, len(self.data) - self.block_size)

    def __getitem__(self, idx):
        """Return the ``(x, y)`` window pair at position ``idx``.

        Negative indices count from the end.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        if idx < 0:
            idx += size
        # Without this check an out-of-range index silently produced truncated
        # or empty windows, and plain iteration over the dataset never stopped.
        if not 0 <= idx < size:
            raise IndexError(f"index out of range for WordDataset of length {size}")
        # clone() hands the caller independent tensors rather than views into
        # the shared token buffer.
        x = self.data[idx:idx + self.block_size].clone()
        y = self.data[idx + 1:idx + self.block_size + 1].clone()
        return x, y

# Window length (in words) of every training example.
block_size = 20
# Wrap the encoded corpus in sliding windows and serve them in shuffled
# mini-batches of 64.
dataset = WordDataset(data=encoded_text, block_size=block_size)
dataloader = DataLoader(dataset=dataset, batch_size=64, shuffle=True)


In [2]:

import torch
from torch.utils.data import Dataset, DataLoader

class WordDataset(Dataset):
    """Sliding-window next-word dataset over a sequence of token ids.

    Item ``i`` is the pair ``(x, y)`` where ``x`` is
    ``data[i : i + block_size]`` and ``y`` is the same window shifted one
    position to the right, so ``y[t]`` is the token that follows ``x[t]``.

    Args:
        data: Sequence of integer token ids (a list or a 1-D tensor).
        block_size: Number of tokens in each input/target window.
    """

    def __init__(self, data, block_size):
        # Convert once up front so each item access slices a tensor instead of
        # rebuilding a tensor from a Python list slice.
        self.data = torch.as_tensor(data, dtype=torch.long)
        self.block_size = block_size

    def __len__(self):
        # Each item needs block_size + 1 tokens (inputs plus the last target).
        # Clamp at zero: len() raises ValueError on a negative result, which
        # happened whenever the data was shorter than block_size.
        return max(0, len(self.data) - self.block_size)

    def __getitem__(self, idx):
        """Return the ``(x, y)`` window pair at position ``idx``.

        Negative indices count from the end.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        if idx < 0:
            idx += size
        # Without this check an out-of-range index silently produced truncated
        # or empty windows, and plain iteration over the dataset never stopped.
        if not 0 <= idx < size:
            raise IndexError(f"index out of range for WordDataset of length {size}")
        # clone() hands the caller independent tensors rather than views into
        # the shared token buffer.
        x = self.data[idx:idx + self.block_size].clone()
        y = self.data[idx + 1:idx + self.block_size + 1].clone()
        return x, y

# Window length (in words) of every training example.
block_size = 20
# Wrap the encoded corpus in sliding windows and serve them in shuffled
# mini-batches of 64.
dataset = WordDataset(data=encoded_text, block_size=block_size)
dataloader = DataLoader(dataset=dataset, batch_size=64, shuffle=True)


In [3]:
import torch.nn as nn

class LSTMWordModel(nn.Module):
    """Word-level language model: embedding -> single-layer LSTM -> linear head.

    The embedding width and the LSTM hidden width are both ``hidden_size``;
    the head projects every time step back onto the vocabulary.
    """

    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=hidden_size)
        self.lstm = nn.LSTM(
            input_size=hidden_size,
            hidden_size=hidden_size,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x, hidden=None):
        """Map a batch of token ids to per-step vocabulary logits.

        Returns:
            ``(logits, hidden)`` where ``hidden`` is the LSTM state after the
            last step, suitable for passing back in on the next call.
        """
        embedded = self.embed(x)
        lstm_out, next_hidden = self.lstm(embedded, hidden)
        return self.fc(lstm_out), next_hidden


In [4]:
def train_model(model, data_loader, total_iters, word2idx, idx2word, device='cpu', max_batches_per_iter=10, checkpoints=(500, 1000, 2000)):
    """Train ``model`` with Adam and cross-entropy, saving periodic checkpoints.

    Each iteration makes a fresh pass over ``data_loader`` but stops after
    ``max_batches_per_iter`` batches, then prints the loss of the last batch
    it processed.

    Args:
        model: Module whose forward returns ``(logits, hidden)``.
        data_loader: Iterable of ``(x, y)`` batches of token ids.
        total_iters: Number of training iterations to run.
        word2idx: Word -> id mapping, stored in each checkpoint.
        idx2word: Id -> word mapping, stored in each checkpoint.
        device: Device the model and batches are moved to.
        max_batches_per_iter: Upper bound on batches consumed per iteration.
        checkpoints: Iteration numbers at which to save the model and print a
            generated sample. ``None`` or an empty collection disables both.
    """
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.003)
    criterion = nn.CrossEntropyLoss()

    checkpoint_iters = set(checkpoints or ())
    model.train()

    for iteration in range(1, total_iters + 1):
        print(f"\nIteration {iteration}/{total_iters}")
        # Reset per iteration so a stale (or never-assigned) loss is not
        # reported when no batch was processed.
        loss = None
        for i, (x, y) in enumerate(data_loader):
            if i >= max_batches_per_iter:
                break
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            logits, _ = model(x)
            # Flatten (batch, step) so every position is one classification.
            loss = criterion(logits.reshape(-1, logits.size(-1)), y.reshape(-1))
            loss.backward()
            optimizer.step()

        if loss is None:
            print("Loss: n/a (no batches processed)")
        else:
            print(f"Loss: {loss.item():.4f}")

        if iteration in checkpoint_iters:
            save_path = f'word_LSTM_{iteration}_iters.pth'
            torch.save({
                'model_state_dict': model.state_dict(),
                'word2idx': word2idx,
                'idx2word': idx2word
            }, save_path)
            print(f"Model checkpoint saved at iteration {iteration}: {save_path}")

            sample = generate_text(model, start_text="a", length=50, device=device)
            print(f"\n Sample after {iteration} iterations:\n{sample}\n")
            # Sampling may leave the model in eval mode; switch back so the
            # remaining iterations run in training mode.
            model.train()


In [5]:

# Step 7: Text Generation
def generate_text(model, start_text, length, device='cpu', word2idx=None, idx2word=None):
    """Sample ``length`` words from ``model``, continuing ``start_text``.

    The prompt is lowercased and split on whitespace, fed through the model
    once to build up the recurrent state, and then one word at a time is drawn
    from the softmax distribution over the last step's logits.

    Args:
        model: Module whose forward takes ``(token_ids, hidden)`` and returns
            ``(logits, hidden)``.
        start_text: Prompt; must contain at least one in-vocabulary word.
        length: Number of words to generate after the prompt.
        device: Device on which the input tensors are created.
        word2idx: Word -> id mapping. Defaults to the module-level
            ``word2idx`` when omitted.
        idx2word: Id -> word mapping. Defaults to the module-level
            ``idx2word`` when omitted.

    Returns:
        The prompt words followed by the generated words, joined by spaces.

    Raises:
        ValueError: If ``start_text`` contains no words.
        KeyError: If a prompt word is not in the vocabulary.
    """
    # Fall back to the notebook-level vocabularies so existing calls keep
    # working; explicit mappings allow use with vocabularies from a checkpoint.
    module_globals = globals()
    token_to_id = word2idx if word2idx is not None else module_globals["word2idx"]
    id_to_token = idx2word if idx2word is not None else module_globals["idx2word"]

    words = start_text.lower().split()
    if not words:
        # An empty prompt would feed a zero-length sequence to the model.
        raise ValueError("start_text must contain at least one word")
    unknown = [w for w in words if w not in token_to_id]
    if unknown:
        raise KeyError(f"start_text contains words not in the vocabulary: {unknown}")

    # Remember the caller's mode so sampling does not permanently flip a
    # model that is being trained into eval mode.
    was_training = model.training
    model.eval()
    try:
        input_eval = torch.tensor([[token_to_id[w] for w in words]], dtype=torch.long, device=device)
        hidden = None
        output_words = words[:]

        with torch.no_grad():
            for _ in range(length):
                logits, hidden = model(input_eval, hidden)
                # Only the last time step predicts the next word.
                probs = torch.softmax(logits[:, -1, :], dim=-1)
                next_word_idx = torch.multinomial(probs, num_samples=1).item()
                output_words.append(id_to_token[next_word_idx])
                # The recurrent state carries the history, so only the newly
                # sampled word is fed back in.
                input_eval = torch.tensor([[next_word_idx]], dtype=torch.long, device=device)
    finally:
        model.train(was_training)

    return ' '.join(output_words)



In [6]:
# Build a word-level LSTM with 256 hidden units sized to the corpus vocabulary,
# then train it on CPU for 2000 iterations.
model = LSTMWordModel(vocab_size, hidden_size=256)
print("\n--- Training for 2000 iterations with checkpoints at 500, 1000, and 2000 ---")
train_model(
    model,
    dataloader,
    2000,
    word2idx,
    idx2word,
    device='cpu',
)



--- Training for 2000 iterations with checkpoints at 500, 1000, and 2000 ---

Iteration 1/2000
Loss: 7.4512

Iteration 2/2000
Loss: 7.1317

Iteration 3/2000
Loss: 6.7071

Iteration 4/2000
Loss: 6.8218

Iteration 5/2000
Loss: 6.5241

Iteration 6/2000
Loss: 6.1308

Iteration 7/2000
Loss: 5.8558

Iteration 8/2000
Loss: 5.7200

Iteration 9/2000
Loss: 5.3731

Iteration 10/2000
Loss: 5.2271

Iteration 11/2000
Loss: 4.8849

Iteration 12/2000
Loss: 4.6351

Iteration 13/2000
Loss: 4.2989

Iteration 14/2000
Loss: 4.1877

Iteration 15/2000
Loss: 3.8474

Iteration 16/2000
Loss: 3.6387

Iteration 17/2000
Loss: 3.4920

Iteration 18/2000
Loss: 3.2361

Iteration 19/2000
Loss: 3.0500

Iteration 20/2000
Loss: 2.8012

Iteration 21/2000
Loss: 2.5597

Iteration 22/2000
Loss: 2.5019

Iteration 23/2000
Loss: 2.3237

Iteration 24/2000
Loss: 2.3471

Iteration 25/2000
Loss: 2.1521

Iteration 26/2000
Loss: 1.9500

Iteration 27/2000
Loss: 1.9897

Iteration 28/2000
Loss: 1.7798

Iteration 29/2000
Loss: 1.7207

It