In [1]:
import torch

In [2]:
import wget
import zipfile

# Download the dataset
url = "http://mattmahoney.net/dc/text8.zip"
wget.download(url, "text8.zip")

# Extract it
with zipfile.ZipFile("text8.zip", "r") as z:
    z.extractall()

# Read the file
with open("text8", "r", encoding="utf-8") as f:
    data = f.read()
print(data[:500])  # Print a sample


 anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans culottes of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the organization of society it has also been taken up as a positive label by self defined anarchists the word anarchism is derived from the greek without archons ruler chief king anarchism as a political philoso


In [3]:
dic=data.split(' ')

In [4]:

dic

['',
 'anarchism',
 'originated',
 'as',
 'a',
 'term',
 'of',
 'abuse',
 'first',
 'used',
 'against',
 'early',
 'working',
 'class',
 'radicals',
 'including',
 'the',
 'diggers',
 'of',
 'the',
 'english',
 'revolution',
 'and',
 'the',
 'sans',
 'culottes',
 'of',
 'the',
 'french',
 'revolution',
 'whilst',
 'the',
 'term',
 'is',
 'still',
 'used',
 'in',
 'a',
 'pejorative',
 'way',
 'to',
 'describe',
 'any',
 'act',
 'that',
 'used',
 'violent',
 'means',
 'to',
 'destroy',
 'the',
 'organization',
 'of',
 'society',
 'it',
 'has',
 'also',
 'been',
 'taken',
 'up',
 'as',
 'a',
 'positive',
 'label',
 'by',
 'self',
 'defined',
 'anarchists',
 'the',
 'word',
 'anarchism',
 'is',
 'derived',
 'from',
 'the',
 'greek',
 'without',
 'archons',
 'ruler',
 'chief',
 'king',
 'anarchism',
 'as',
 'a',
 'political',
 'philosophy',
 'is',
 'the',
 'belief',
 'that',
 'rulers',
 'are',
 'unnecessary',
 'and',
 'should',
 'be',
 'abolished',
 'although',
 'there',
 'are',
 'differing

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):  
    def __init__(self, embedding_dim, hidden_dim, vocab_size, context_size):
        super().__init__()  
        self.hid = hidden_dim
        self.voc = vocab_size
        self.emb = embedding_dim 
        self.context = context_size
        self.embeddings = nn.Embedding(self.voc, self.emb)  
        self.linear = nn.Linear(self.context * self.emb, self.hid)
        self.output = nn.Linear(self.hid, self.voc)
    
    def forward(self, inp):
        embed = self.embeddings(inp)
        embed_flat = embed.view(embed.shape[0], -1)  
        hidden_output = torch.tanh(self.linear(embed_flat))  
        logits = self.output(hidden_output)
        output = F.softmax(logits, dim=1)  
        return output

In [6]:
from torch.utils.data import Dataset, DataLoader

class dataloader(Dataset):
    def __init__(self,text,context_size):
        self.text=text
        self.context_size=context_size

        self.data = []
        for i in range(len(text) - context_size):
            context = text[i:i+context_size]
            target = text[i+context_size]
            self.data.append((context, target))
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        context, target = self.data[idx]
        return torch.tensor(context, dtype=torch.long), torch.tensor(target, dtype=torch.long)

In [7]:
def train(text, embed_size, hidden_dim, context_size, batch_size=32, epochs=10, learning_rate=0.001):
    vocab = list(set(text))
    word_to_idx = {word: idx for idx, word in enumerate(vocab)}
    idx_to_word = {idx: word for idx, word in enumerate(vocab)}
    
    text_indices = [word_to_idx[word] for word in text]
    
    vocab_size = len(vocab)
    dataset = dataloader(text_indices, context_size)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    
    # Initialize model, loss function and optimizer
    model = Model(embed_size, hidden_dim, vocab_size, context_size)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    
    # Training loop
    print(f"Starting training with vocabulary size: {vocab_size}")
    print(f"Parameters: embed_size={embed_size}, hidden_dim={hidden_dim}, context_size={context_size}")
    
    for epoch in range(epochs):
        total_loss = 0
        model.train()
        
        for batch_idx, (contexts, targets) in enumerate(data_loader):
            # Forward pass
            optimizer.zero_grad()
            outputs = model(contexts)
            
            loss = criterion(outputs, targets)
            
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            
            if batch_idx % 100 == 0:
                print(f"Epoch {epoch+1}/{epochs}, Batch {batch_idx}/{len(data_loader)}, Loss: {loss.item():.4f}")
        
        avg_loss = total_loss / len(data_loader)
        print(f"Epoch {epoch+1}/{epochs} completed, Average Loss: {avg_loss:.4f}")
    
    print("Training completed!")
    return model, word_to_idx, idx_to_word


In [None]:
def generate_text(model, seed_text, word_to_idx, idx_to_word, context_size, num_words=50):
    model.eval()
    
    if len(seed_text) < context_size:
        print(f"Warning: Seed text must contain at least {context_size} words. Padding with initial words.")
        while len(seed_text) < context_size:
            seed_text = [seed_text[0]] + seed_text
    
    context = [word_to_idx.get(word, 0) for word in seed_text[-context_size:]]
    generated_text = list(seed_text)
    
    with torch.no_grad():
        for _ in range(num_words):
            input_tensor = torch.tensor(context, dtype=torch.long).unsqueeze(0)
            
            output = model(input_tensor)
            
            word_idx = torch.argmax(output, dim=1).item()
            
            word = idx_to_word[word_idx]
            
            generated_text.append(word)
            
            context = context[1:] + [word_idx]
    
    return generated_text

# Example usage
if __name__ == "__main__":
    
    # Set model parameters
    embed_size = 50
    hidden_dim = 128
    context_size = 10
    
    # Train the model
    trained_model, word_to_idx, idx_to_word = train(
        dic, 
        embed_size=embed_size, 
        hidden_dim=hidden_dim, 
        context_size=context_size,
        epochs=1  # More epochs for this small dataset
    )
    
    # Generate text
    seed = ["the", "fox", "jumps"]
    generated = generate_text(trained_model, seed, word_to_idx, idx_to_word, context_size, num_words=20)
    
    print("\nGenerated text:")
    print(" ".join(generated))
    
    # Save the model
    torch.save({
        'model_state_dict': trained_model.state_dict(),
        'word_to_idx': word_to_idx,
        'idx_to_word': idx_to_word,
        'embed_size': embed_size,
        'hidden_dim': hidden_dim,
        'context_size': context_size,
        'vocab_size': len(word_to_idx)
    }, 'nnlm_model.pth')
    
    print("Model saved to 'nnlm_model.pth'")


Starting training with vocabulary size: 253855
Parameters: embed_size=50, hidden_dim=128, context_size=10
Epoch 1/1, Batch 0/531413, Loss: 12.4445
