In [3]:
from encoder import tokenizer as t1,m as m1
from decoder import tokenizer as t2,m as m2,batch_size,block_size,n_embd,device,stoi,decode_en

import os
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
import torch
import torch.nn as nn
from torch.nn import functional as F
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
source_text = "First Citizen"
source_texted=[source_text+"<EOS>"]
source_tokens = torch.tensor(t1.texts_to_sequences(source_texted), dtype=torch.long).to(device)  # (1, T)

# Encoder çıkışı (kaynak metnin bağlam temsilini çıkarıyoruz)
with torch.no_grad():
    source_embeddings = m1(source_tokens)


In [5]:
target_seq = torch.tensor(t2.texts_to_sequences(["<BOS>"]), dtype=torch.long).to(device)  # <BOS> ile başlat
generated = target_seq.clone()
generated



tensor([[1]], device='cuda:0')

In [7]:

batch_size = 32 # how many independent sequences will we process in parallel?
block_size = 64 # what is the maximum context length for predictions?
max_iters = 1000
eval_interval = 100
learning_rate = 1e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 128
n_head = 4
n_layer = 4
dropout = 0.0
vocab_size = len(stoi)

output=source_embeddings
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out
class AltanMultiHead(nn.Module):
    def __init__(self,embed_dim,num_head):
        super().__init__()
        self.multihead_attn = nn.MultiheadAttention(embed_dim, num_head, dropout=dropout)
    def forward(self, x):
        B, T, C = x.shape  # B = batch_size, T = sequence_length, C = embed_dim

        # Burada query, key, value aynı girdi verisinden alınır
        query = key = value = x.permute(1, 0, 2)  # (sequence_length, batch_size, embed_dim) şeklinde permütasyon yapıyoruz
        
        # Multihead attention hesaplama
        attn_output, attn_output_weights = self.multihead_attn(query, key, value)
        
       
        
        # Çıkış
        return attn_output.permute(1, 0, 2)  # Çıkışı (B, T, C) şeklinde geri dönüyoruz

class CrossAttention(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.dropout = nn.Dropout(dropout)

    """def forward(self, x_1, x_2=output):
        B,T,C=x_1.shape
        queries_1 = self.query(x_1)
        keys_2 = self.key(x_2)
        values_2 = self.value(x_2)
        
        attn_scores = queries_1 @ keys_2.transpose(-2, -1)
        attn_weights = F.softmax(attn_scores / queries_1.size(-1) ** 0.5, dim=-1)
        context_vec = attn_weights @ values_2
        return context_vec"""
    def forward(self, x_1, x_2=output):
        # x_1: (batch_size, target_seq_len, embed_dim)
        # x_2: (batch_size, source_seq_len, embed_dim)
        B, T1, C = x_1.shape
        _, T2, _ = x_2.shape

        # Linearly project input tensors to query, key, and value
        queries = self.query(x_1)  # (B, T1, head_size)
        keys = self.key(x_2)      # (B, T2, head_size)
        values = self.value(x_2)  # (B, T2, head_size)

        # Compute attention scores (scaled dot-product attention)
        attn_scores = torch.matmul(queries, keys.transpose(-2, -1))  # (B, T1, T2)
        attn_scores = attn_scores / (C ** 0.5)  # Scale by the square root of head_size

        # Compute attention weights
        attn_weights = F.softmax(attn_scores, dim=-1)  # (B, T1, T2)
        attn_weights = self.dropout(attn_weights)

        # Compute context vectors
        context_vec = torch.matmul(attn_weights, values)  # (B, T1, head_size)

        return context_vec
        
"""class MultiCrossAttention(nn.Module):
    

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([CrossAttention(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out"""
class MultiCrossAttention(nn.Module):
    def __init__(self, num_heads, embed_dim):
        super().__init__()
        assert embed_dim % num_heads == 0, "Embed dim must be divisible by num_heads"
        head_size = embed_dim // num_heads
        self.heads = nn.ModuleList([CrossAttention(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(embed_dim, embed_dim)  # Combine all heads
        self.dropout = nn.Dropout(dropout)

    def forward(self, x_1, x_2=output):
        # Multi-Head Cross Attention
        out = torch.cat([head(x_1, x_2) for head in self.heads], dim=-1)  # (B, T, embed_dim)
        out = self.dropout(self.proj(out))  # Final projection
        return out


class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = AltanMultiHead(n_embd,n_head)
        self.ffwd = FeedFoward(n_embd)
        self.cratt = MultiCrossAttention(n_head,n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)
        self.ln3 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.cratt(self.ln3(x),output)
        x = x + self.ffwd(self.ln2(x))
        return x

# super simple bigram model
class AltanTranslator(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(4)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)
        self.apply(self._init_weights)
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
    def forward(self, idx, targets=None):
        B,T=idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss

model = AltanTranslator()
m = model.to(device)


In [8]:
def generate(idx,temperature, max_new_tokens):
        eos_token = t2.texts_to_sequences(["<EOS>"])[0][0]
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = m(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            
            # apply softmax to get probabilities
            probs = F.softmax(logits/temperature, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)
            if idx_next.item() == eos_token:
              break  
             # (B, T+1)
        return idx

In [20]:
result=generate(generated,0.7,40)

In [21]:
result

tensor([[  1,  78, 476, 224,  13, 335,  32,  92, 202, 444,  23,   8, 406, 420,
          72, 344, 207, 154,  27, 229, 237, 108, 196, 130,  12, 474,  82, 284,
         440, 404, 158, 250, 227, 276, 303,  35,   0, 211,  44, 110, 326]],
       device='cuda:0')

In [22]:
result.shape

torch.Size([1, 41])

In [23]:
result[0][0]

tensor(1, device='cuda:0')

In [24]:
itos=dict(zip(stoi.values(), stoi.keys()))


In [25]:
def translate(result):
    eleman=""
    for i in result[0]:
        eleman+=" "+itos[int(i)]
    return eleman

In [26]:
translated=translate(result)

KeyError: 0

In [27]:
print(translated)

 bos acılar unutmayın verseler suçlayacaksınız verseler göstereceğiz gidelim biz gidiyorsunuz bağırışlar söz kanınızın yine anlatmayı itibar ama aldıklarımı iyi konuşmayalım düşündüğümüzü anlatmayı iyisini suçlanmalıdır ettik avantaj kim suçlayacaksınız niyetle tüm tefeciliği fiyattan gönderirim karar yoksul insanca değil çektiğimiz arzusu kendinizi oysa
