In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import math

In [48]:
class FractionalPositionalEmbedding(nn.Module):
    def __init__(self, d_model, max_len=5000, alpha=0.8):
        super().__init__()
        self.alpha = alpha
        self.d_model = d_model
        self.positional_embeddings = self.generate_positional_embeddings(d_model, max_len)
    
    def generate_positional_embeddings(self, d_model, max_len):
        positions = torch.arange(0, max_len).unsqueeze(1)
        freq = torch.pow(10000, -2 * (torch.arange(0, d_model, 2) / d_model))
        embeddings = positions * freq
        embeddings = torch.sin(embeddings) + torch.cos(embeddings) ** self.alpha
        embeddings = embeddings.unsqueeze(1)  # Shape (max_len, 1, d_model)
        return embeddings

    def forward(self, x):
        # x: (batch_size, seq_len)
        batch_size, seq_len = x.size()
        
        # Expand x to have 3 dimensions: (batch_size, seq_len, d_model)
        x = x.unsqueeze(-1).repeat(1, 1, self.d_model)  # Shape (batch_size, seq_len, d_model)
        
        # Get the positional embedding for this specific sequence length
        positional_emb = self.positional_embeddings[:seq_len, 0, :].unsqueeze(0).repeat(batch_size, 1, 1).to(x.device)
        print(f"positional_emb: {positional_emb.shape}")
        positional_emb = positional_emb.repeat_interleave(2,dim=-1)
        print(f"positional_emb: {positional_emb.shape}")
        
        # Check tensor shapes before addition
        print(f"x.shape: {x.shape}, positional_emb.shape: {positional_emb.shape}")
        
        # Ensure the tensors have the same shape before addition
        return x + positional_emb

In [50]:
#Code Try 3
import torch
import torch.nn as nn
import math

class FractionalAttention(nn.Module):
    def __init__(self, d_model, heads, alpha=0.8, epsilon=1e-6):
        super().__init__()
        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.alpha = alpha
        self.epsilon = epsilon  # Small epsilon for numerical stability

    def forward(self, q, k, v):
        Q = self.query(q)
        K = self.key(k)
        V = self.value(v)

        # Compute attention scores with numerical stability
        attention_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(Q.size(-1))
        
        try:
            # Add small diagonal to improve condition number
            attention_scores = attention_scores + self.epsilon * torch.eye(attention_scores.size(-1), 
                                                                           device=attention_scores.device)
            
            # Safe eigendecomposition with robust method
            eigvals, eigvecs = torch.linalg.eigh(attention_scores)
            
            # Clip eigenvalues to prevent overflow
            eigvals = torch.clamp(eigvals, min=self.epsilon)
            
            # Fractional power with clipped values
            fractional_eigvals = torch.pow(eigvals, self.alpha)
            
            # Reconstruct matrix
            attention_scores = eigvecs @ torch.diag_embed(fractional_eigvals) @ eigvecs.transpose(-2, -1)
        
        except Exception as e:
            # Fallback to a stable method if eigendecomposition fails
            print(f"Eigendecomposition failed: {e}")
            attention_scores = torch.softmax(attention_scores, dim=-1)
        
        # Normalize attention weights
        attention_weights = torch.nn.functional.softmax(attention_scores, dim=-1)
        
        return torch.matmul(attention_weights, V)

In [15]:
class FractionalCrossEntropyLoss(nn.Module):
    def __init__(self, alpha=0.8):
        super().__init__()
        self.alpha = alpha

    def forward(self, logits, targets):
        log_probs = torch.log_softmax(logits, dim=-1)
        fractional_log_probs = log_probs ** self.alpha
        return -torch.mean(torch.sum(fractional_log_probs * targets, dim=-1))

In [16]:
class FractionalGPT(nn.Module):
    def __init__(self, vocab_size, d_model, n_layers, n_heads, alpha=0.8):
        super().__init__()
        self.embedding = FractionalPositionalEmbedding(d_model, alpha=alpha)
        self.layers = nn.ModuleList([FractionalAttention(d_model, n_heads, alpha=alpha) for _ in range(n_layers)])
        self.decoder = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        for layer in self.layers:
            x = layer(x, x, x)
        x = self.decoder(x)
        return x

In [6]:
class FractionalGradientDescent(optim.Optimizer):
    def __init__(self, params, lr=1e-3, alpha=0.8):
        defaults = dict(lr=lr, alpha=alpha)
        super(FractionalGradientDescent, self).__init__(params, defaults)

    def step(self):
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is not None:
                    alpha = group['alpha']
                    grad = p.grad.data
                    fractional_grad = grad ** alpha
                    p.data -= group['lr'] * fractional_grad


In [51]:
def train_fractional_gpt():
    model = FractionalGPT(vocab_size=1000, d_model=64, n_layers=4, n_heads=4, alpha=0.8)
    criterion = FractionalCrossEntropyLoss(alpha=0.8)
    optimizer = FractionalGradientDescent(model.parameters(), lr=1e-4, alpha=0.8)

    for epoch in range(10):
        # Inputs: batch_size x seq_len
        inputs = torch.randint(0, 1000, (32, 20))
        targets = torch.randint(0, 1000, (32, 20))

        # Ensure model input matches expected shape
        logits = model(inputs)
        
        # Compute loss
        loss = criterion(logits.view(-1, 1000), targets.view(-1))
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

train_fractional_gpt()

positional_emb: torch.Size([32, 20, 32])
positional_emb: torch.Size([32, 20, 64])
x.shape: torch.Size([32, 20, 64]), positional_emb.shape: torch.Size([32, 20, 64])
Eigendecomposition failed: linalg.eigh: (Batch element 0): The algorithm failed to converge because the input matrix is ill-conditioned or has too many repeated eigenvalues (error code: 19).
Eigendecomposition failed: linalg.eigh: (Batch element 0): The algorithm failed to converge because the input matrix is ill-conditioned or has too many repeated eigenvalues (error code: 19).
Eigendecomposition failed: linalg.eigh: (Batch element 0): The algorithm failed to converge because the input matrix is ill-conditioned or has too many repeated eigenvalues (error code: 19).
Eigendecomposition failed: linalg.eigh: (Batch element 0): The algorithm failed to converge because the input matrix is ill-conditioned or has too many repeated eigenvalues (error code: 19).


RuntimeError: The size of tensor a (1000) must match the size of tensor b (640) at non-singleton dimension 1

In [None]:
#Fractional
import torch.nn as nn
import math

class FractionalAttention(nn.Module):
    def __init__(self, d_model, alpha=0.8, epsilon=1e-8):
        super().__init__()
        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.alpha = alpha
        self.epsilon = epsilon

    def robust_eigendecomposition(self, attention_scores):
        try:
            # Method 1: Stabilized Eigendecomposition
            stabilized_scores = attention_scores + self.epsilon * torch.eye(
                attention_scores.size(-1), 
                device=attention_scores.device, 
                dtype=attention_scores.dtype
            )
            
            eigvals, eigvecs = torch.linalg.eigh(stabilized_scores)
            
            # Clip and process eigenvalues
            eigvals = torch.clamp(eigvals, min=self.epsilon)
            fractional_eigvals = torch.pow(eigvals, self.alpha)
            
            reconstructed_scores = eigvecs @ torch.diag_embed(fractional_eigvals) @ eigvecs.transpose(-2, -1)
            
            return reconstructed_scores

        except Exception as e:
            # Fallback to SVD if eigendecomposition fails
            try:
                U, S, Vh = torch.linalg.svd(attention_scores)
                S_clamped = torch.clamp(S, min=self.epsilon)
                S_fractional = torch.pow(S_clamped, self.alpha)
                
                reconstructed_scores = U @ torch.diag_embed(S_fractional) @ Vh
                
                return reconstructed_scores

            except Exception:
                # Final fallback to softmax
                return torch.softmax(attention_scores, dim=-1)

    def forward(self, q, k, v):
        # Linear projections
        Q = self.query(q)
        K = self.key(k)
        V = self.value(v)
        
        # Compute attention scores
        attention_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(Q.size(-1))
        
        # Robust Eigendecomposition
        processed_scores = self.robust_eigendecomposition(attention_scores)
        
        # Compute attention weights
        attention_weights = torch.nn.functional.softmax(processed_scores, dim=-1)
        
        # Final output
        output = torch.matmul(attention_weights, V)
        
        return output

In [None]:
#Error Handling Mechnaisms

import torch
import torch.nn.functional as F

# Original tensor with shape [32, 30, 32]
positional_emb = torch.randn(32, 30, 32)

# Method 1: Simple Repetition
repeated_emb = positional_emb.repeat_interleave(2, dim=-1)

# Method 2: Zero Padding
zero_padded = F.pad(positional_emb, (0, 32))

# Method 3: Concatenation with itself
concat_self = torch.cat([positional_emb, positional_emb], dim=-1)

# Method 4: Using interpolation techniques
def custom_interpolation(tensor, target_dim):
    # Linear interpolation between original values
    original = tensor
    expanded = torch.zeros(tensor.shape[0], tensor.shape[1], target_dim, device=tensor.device)
    expanded[:, :, :tensor.shape[-1]] = original
    
    # Fill the rest with interpolated values
    for i in range(tensor.shape[-1], target_dim):
        # Simple linear interpolation
        prev = expanded[:, :, i-1]
        expanded[:, :, i] = prev * 1.1  # Simple scaling
    
    return expanded

interpolated_emb = custom_interpolation(positional_emb, 64)

# Method 5: Tiling with a pattern
tiled_emb = torch.tile(positional_emb, (1, 1, 2))

# Verify shape
print("Method 1 (Repeat Interleave) shape:", repeated_emb.shape)
print("Method 2 (Zero Padding) shape:", zero_padded.shape)
print("Method 3 (Concatenation) shape:", concat_self.shape)
print("Method 4 (Interpolation) shape:", interpolated_emb.shape)
print("Method 5 (Tiling) shape:", tiled_emb.shape)

Method 1 (Repeat Interleave) shape: torch.Size([32, 30, 64])
Method 2 (Zero Padding) shape: torch.Size([32, 30, 64])
Method 3 (Concatenation) shape: torch.Size([32, 30, 64])
Method 4 (Interpolation) shape: torch.Size([32, 30, 64])
Method 5 (Tiling) shape: torch.Size([32, 30, 64])
