In [2]:
!pip install pypdf2



In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

# Step 1: Load Text from File
def load_text_from_file(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()
    return text

# Step 2: Tokenize Data
tokenizer = AutoTokenizer.from_pretrained("gpt2")  # Use GPT-2 tokenizer for simplicity
def tokenize_text(text, block_size=128):
    tokens = tokenizer.encode(text, add_special_tokens=False)
    sequences = [tokens[i : i + block_size] for i in range(0, len(tokens) - block_size, block_size)]
    return sequences

# Step 3: Create Dataset Class
class TextDataset(Dataset):
    def __init__(self, sequences):
        self.data = torch.tensor(sequences, dtype=torch.long)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx][:-1], self.data[idx][1:]  # Input and target shift by 1

# Step 4: Define a Minimal GPT Model
class MiniGPT(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, num_heads=4, num_layers=5, block_size=128):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.pos_encoding = nn.Parameter(torch.zeros(1, block_size, embed_dim))
        self.transformer_blocks = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads), num_layers
        )
        self.fc_out = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        x = self.embed(x) + self.pos_encoding[:, :x.shape[1], :]
        x = self.transformer_blocks(x)
        return self.fc_out(x)

# Step 5: Train the Model
def train_model(model, dataloader, epochs=3000, lr=1e-3):
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        total_loss = 0
        for x, y in dataloader:
            x, y = x.cuda(), y.cuda()
            optimizer.zero_grad()
            outputs = model(x)
            loss = loss_fn(outputs.view(-1, outputs.shape[-1]), y.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch + 1}: Loss = {total_loss / len(dataloader)}")
        print("\nGenerated Text:")
        if ((epoch + 1) % 100) == 0:
          print(generate_text(model, "Hello!", length=50))

# Step 6: Generate Text
def generate_text(model, start_text, length=100, temperature=0.8):
    model.eval()
    input_ids = tokenizer.encode(start_text, return_tensors="pt").cuda()
    generated = input_ids

    for _ in range(length):
        with torch.no_grad():
            logits = model(generated[:, -128:])
            logits = logits[:, -1, :] / temperature  # Apply temperature scaling
            probs = torch.nn.functional.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            generated = torch.cat([generated, next_token], dim=-1)

    return tokenizer.decode(generated[0].tolist())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [5]:

file_path = "harry_potter.txt"  # Replace with your text file path
text_data = load_text_from_file(file_path)
sequences = tokenize_text(text_data)

dataset = TextDataset(sequences)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

vocab_size = tokenizer.vocab_size
model = MiniGPT(vocab_size).cuda()

Token indices sequence length is longer than the specified maximum sequence length for this model (113793 > 1024). Running this sequence through the model will result in indexing errors


In [6]:
train_model(model, dataloader)

print("\nFinal Generated Text:")
print(generate_text(model, "Harry!", length=100))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Generated Text:
Epoch 1375: Loss = 1.1128737181425095

Generated Text:
Epoch 1376: Loss = 1.1135821299893516

Generated Text:
Epoch 1377: Loss = 1.1142951399087906

Generated Text:
Epoch 1378: Loss = 1.113010483128684

Generated Text:
Epoch 1379: Loss = 1.1127500981092453

Generated Text:
Epoch 1380: Loss = 1.1134565770626068

Generated Text:
Epoch 1381: Loss = 1.1144828924110957

Generated Text:
Epoch 1382: Loss = 1.1137261859008245

Generated Text:
Epoch 1383: Loss = 1.1153813345091683

Generated Text:
Epoch 1384: Loss = 1.1150969586202077

Generated Text:
Epoch 1385: Loss = 1.1161192804574966

Generated Text:
Epoch 1386: Loss = 1.1155880285160882

Generated Text:
Epoch 1387: Loss = 1.1152023524045944

Generated Text:
Epoch 1388: Loss = 1.1149595017944063

Generated Text:
Epoch 1389: Loss = 1.1155968925782613

Generated Text:
Epoch 1390: Loss = 1.1155740129096168

Generated Text:
Epoch 1391: Loss = 1.1150573598487037

