<a href="https://colab.research.google.com/github/EldarsUP/semantic_methods/blob/main/homework3%7C3final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
# 1. Загрузка текста
with open("/gpt_data (1).txt", "r", encoding="utf-8") as f:
    text = f.read()

# Создание уникальных символов + специальные токены
chars = ['<BOS>', '<EOS>'] + sorted(list(set(text)))
vocab_size = len(chars)

# Маппинг символов в индексы и наоборот
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for ch, i in stoi.items()}

# Функции кодирования и декодирования
encode = lambda s: [stoi.get(c, stoi['<EOS>']) for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

# Кодируем текст (с добавлением специальных токенов)
encoded_data = encode("<BOS> " + text + " <EOS>")
data = torch.tensor(encoded_data, dtype=torch.long)

# 3. Параметры
block_size = 128
batch_size = 32
embedding_dim = 128
n_heads = 4
n_layers = 2
ff_hidden = 256
n_epochs = 80
dropout_rate=0.1
lr = 3e-4

# 4. Dataset
class CharDataset(Dataset):
    def __init__(self, data, block_size):
        self.data = data
        self.block_size = block_size
    def __len__(self):
        return len(self.data) - self.block_size
    def __getitem__(self, idx):
        x = self.data[idx:idx + self.block_size]
        y = self.data[idx + 1:idx + self.block_size + 1]
        return x, y

dataloader = DataLoader(CharDataset(data, block_size), batch_size=batch_size, shuffle=True)

# 5. Модель
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.head_dim = embed_dim // num_heads
        self.num_heads = num_heads
        self.q = nn.Linear(embed_dim, embed_dim)
        self.k = nn.Linear(embed_dim, embed_dim)
        self.v = nn.Linear(embed_dim, embed_dim)
        self.out = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        B, T, C = x.shape
        q = self.q(x).view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
        k = self.k(x).view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
        v = self.v(x).view(B, T, self.num_heads, self.head_dim).transpose(1, 2)

        att = (q @ k.transpose(-2, -1)) / (self.head_dim ** 0.5)
        att = att.masked_fill(torch.tril(torch.ones(T, T, device=att.device)) == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.dropout(att)
        out = (att @ v).transpose(1, 2).contiguous().view(B, T, C)
        return self.out(out)

class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, ff_hidden):
        super().__init__()
        self.attn = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ln1 = nn.LayerNorm(embed_dim)
        self.ff = nn.Sequential(
            nn.Linear(embed_dim, ff_hidden),
            nn.ReLU(),
            nn.Linear(ff_hidden, embed_dim),
        )
        self.ln2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout_rate)


    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = self.dropout(x)
        x = x + self.ff(self.ln2(x))
        x = self.dropout(x)
        return x

class MiniGPT(nn.Module):
    def __init__(self, vocab_size, embed_dim, block_size, n_heads, n_layers, ff_hidden):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(block_size, embed_dim)
        self.blocks = nn.Sequential(*[
            TransformerBlock(embed_dim, n_heads, ff_hidden)
            for _ in range(n_layers)
        ])
        self.ln_f = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        B, T = x.shape
        tok_emb = self.token_embedding(x)
        pos_emb = self.position_embedding(torch.arange(T, device=x.device))[None, :, :]
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.head(x)
        return logits


# 6. Обучение
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MiniGPT(vocab_size, embedding_dim, block_size, n_heads, n_layers, ff_hidden).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

for epoch in range(n_epochs):
    for x, y in dataloader:
        x, y = x.to(device), y.to(device)
        logits = model(x)
        B, T, C = logits.shape
        loss = F.cross_entropy(logits.view(B * T, C), y.view(B * T))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if epoch % 10 == 0 or epoch == n_epochs - 1:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

def generate(model, start_text="Привет", max_new_tokens=100, temperature=1.0, top_k=None):
    model.eval()
    context = torch.tensor(encode(start_text), dtype=torch.long)[None, :].to(device)

    for _ in range(max_new_tokens):
        context_condensed = context[:, -block_size:]
        logits = model(context_condensed)
        logits = logits[:, -1, :] / temperature

        probs = F.softmax(logits, dim=-1)

        if top_k is not None:
            values, indices = torch.topk(probs, top_k)
            probs = torch.zeros_like(probs).scatter_(1, indices, values)
            probs /= probs.sum(dim=-1, keepdim=True)

        next_token = torch.multinomial(probs, num_samples=1)
        context = torch.cat((context, next_token), dim=1)

    return decode(context[0].tolist())

print("\n=== Сгенерированный текст ===")
print(generate(model, start_text="А вы ноктюрн сыграть смогли бы ", max_new_tokens=200, temperature=0.7, top_k=10))


Epoch 0, Loss: 2.5954
Epoch 10, Loss: 1.5939
Epoch 20, Loss: 1.1618
Epoch 30, Loss: 1.1152
Epoch 40, Loss: 1.0406
Epoch 50, Loss: 0.9410
Epoch 60, Loss: 0.9439
Epoch 70, Loss: 0.8533
Epoch 79, Loss: 0.9491

=== Сгенерированный текст ===
А вы ноктюрн сыграть смогли бы отными краными размывами, и скраными рамочными цилинниками; горловины на шарнирах были сложены пополам, как сетую двагентом в бортироваться. Вешь обочине.

– Всё простор день из гредумал. Всё, бы орто
