In [2]:
# STEP 1: IMPORTS
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from tqdm import tqdm

# STEP 2: GPT-2 COMPONENTS
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=2048):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div_term)
        pe[:, 1::2] = torch.cos(pos * div_term)
        self.pe = pe.unsqueeze(0)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)].to(x.device)

class MultiHeadSelfAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super().__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        self.query = nn.Linear(embed_size, embed_size)
        self.key = nn.Linear(embed_size, embed_size)
        self.value = nn.Linear(embed_size, embed_size)
        self.fc_out = nn.Linear(embed_size, embed_size)

    def forward(self, x):
        B, T, C = x.shape
        H = self.heads
        q = self.query(x).view(B, T, H, C // H).transpose(1, 2)
        k = self.key(x).view(B, T, H, C // H).transpose(1, 2)
        v = self.value(x).view(B, T, H, C // H).transpose(1, 2)

        scores = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        attn = torch.softmax(scores, dim=-1)
        out = attn @ v
        out = out.transpose(1, 2).contiguous().view(B, T, C)
        return self.fc_out(out)

class FeedForward(nn.Module):
    def __init__(self, embed_size, hidden_size):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(embed_size, hidden_size),
            nn.GELU(),
            nn.Linear(hidden_size, embed_size)
        )

    def forward(self, x):
        return self.net(x)

class TransformerBlock(nn.Module):
    def __init__(self, embed_size, heads, ff_hidden_size):
        super().__init__()
        self.attn = MultiHeadSelfAttention(embed_size, heads)
        self.ln1 = nn.LayerNorm(embed_size)
        self.ff = FeedForward(embed_size, ff_hidden_size)
        self.ln2 = nn.LayerNorm(embed_size)

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.ff(self.ln2(x))
        return x

class GPT2(nn.Module):
    def __init__(self, vocab_size, embed_size=768, heads=12, ff_hidden_size=3072, num_layers=12, max_len=512):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, embed_size)
        self.pos_emb = PositionalEncoding(embed_size, max_len)
        self.blocks = nn.Sequential(*[
            TransformerBlock(embed_size, heads, ff_hidden_size)
            for _ in range(num_layers)
        ])
        self.ln_f = nn.LayerNorm(embed_size)
        self.head = nn.Linear(embed_size, vocab_size)

    def forward(self, x):
        x = self.token_emb(x)
        x = self.pos_emb(x)
        x = self.blocks(x)
        x = self.ln_f(x)
        return self.head(x)

# STEP 3: Dataset
class TextDataset(Dataset):
    def __init__(self, file_path, tokenizer, seq_len=128, max_samples=500000):
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        self.samples = []
        for line in lines[:max_samples]:
            tokens = tokenizer.encode(line.strip())
            for i in range(0, len(tokens) - seq_len - 1):
                x = tokens[i:i+seq_len]
                y = tokens[i+1:i+seq_len+1]
                self.samples.append((torch.tensor(x), torch.tensor(y)))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]

# STEP 4: إعدادات
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
data_path = "/content/TinyStories-train.txt"  # بعد الرفع من جهازك
dataset = TextDataset(data_path, tokenizer, seq_len=128)
loader = DataLoader(dataset, batch_size=8, shuffle=True)

model = GPT2(vocab_size=tokenizer.vocab_size).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4)
loss_fn = nn.CrossEntropyLoss()

# STEP 5: Training
for epoch in range(5):
    total_loss = 0
    for xb, yb in tqdm(loader, desc=f"Epoch {epoch+1}"):
        xb, yb = xb.to(device), yb.to(device)
        out = model(xb)
        loss = loss_fn(out.view(-1, out.size(-1)), yb.view(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(loader)
    print(f"✅ Epoch {epoch+1} Avg Loss: {avg_loss:.4f}")

# STEP 6: حساب الـ Perplexity
def compute_perplexity(model, data_loader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for xb, yb in data_loader:
            xb, yb = xb.to(device), yb.to(device)
            out = model(xb)
            loss = loss_fn(out.view(-1, out.size(-1)), yb.view(-1))
            total_loss += loss.item()
    avg_loss = total_loss / len(data_loader)
    perplexity = torch.exp(torch.tensor(avg_loss))
    return perplexity.item()

perplexity = compute_perplexity(model, loader)
print(f"\n📊 Perplexity: {perplexity:.2f}")

# STEP 7: توليد 5 عينات
def generate_text(prompt, model, tokenizer, max_new_tokens=50):
    model.eval()
    tokens = tokenizer.encode(prompt, return_tensors="pt").to(next(model.parameters()).device)
    for _ in range(max_new_tokens):
        with torch.no_grad():
            out = model(tokens)
            probs = torch.softmax(out[:, -1, :], dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            tokens = torch.cat([tokens, next_token], dim=1)
    return tokenizer.decode(tokens[0])

print("\n📝 Generated Samples:")
for i in range(5):
    sample = generate_text("Once upon a time", model, tokenizer, max_new_tokens=50)
    print(f"\nSample {i+1}:\n{sample}")


Epoch 1: 100%|██████████| 165/165 [44:43<00:00, 16.26s/it]


✅ Epoch 1 Avg Loss: 2.3585


Epoch 2: 100%|██████████| 165/165 [44:47<00:00, 16.29s/it]


✅ Epoch 2 Avg Loss: 0.4539


Epoch 3: 100%|██████████| 165/165 [45:05<00:00, 16.40s/it]


✅ Epoch 3 Avg Loss: 0.2025


Epoch 4: 100%|██████████| 165/165 [45:28<00:00, 16.53s/it]


✅ Epoch 4 Avg Loss: 0.1177


Epoch 5: 100%|██████████| 165/165 [46:07<00:00, 16.77s/it]


✅ Epoch 5 Avg Loss: 0.0704

📊 Perplexity: 1.07

📝 Generated Samples:

Sample 1:
Once upon a timeloo. Sometimes things die and was full of fruits and up at the thanked grassy was full of water bubOnt Grandma lovesOldInsp set coils total on top. Daisy sighed and told truly would go inside. She loved his feet on top.

Sample 2:
Once upon a time Conj. McCluggled under at home from then said, "Can you forgive me byimony smiled and told my friends again?" Her friends again?" Her friends looked at her. They said too." They said, "We are happy you are happy

Sample 3:
Once upon a time garden something special momentAngNAT 1893 MET went to get closer. But we can't want to me?" The furry animal looked up at her and started to move. Sarah was by her and she bent down closer.Ground little girl could not wait,

Sample 4:
Once upon a time As and pushed and very serious.video very healthy because she ate lots of fruits and people in each other. She stopped and Lat itself A, furry animal. Sarah was ve

In [3]:
torch.save(model.state_dict(), "gpt2_modell_weights.pth")

In [6]:
from google.colab import files
files.download("gpt2_modell_weights.pth")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>