In [1]:
import numpy as np
from scipy.linalg import fractional_matrix_power

def fractional_power_matrix(A, alpha):
    """
    Compute the fractional power of a symmetric matrix A to the power alpha.

    Args:
        A (array-like): The input square, symmetric matrix. Anything that
            ``np.asarray`` accepts (ndarray, nested lists, ...) is allowed.
        alpha (float): The fractional power to which the matrix should be raised.
    Returns:
        np.array: The result of ``scipy.linalg.fractional_matrix_power(A, alpha)``.
    Raises:
        ValueError: If A is not a square 2-D matrix, or if it is not symmetric.
    """
    # Coerce first so nested lists work and `.T` below is always available.
    A = np.asarray(A)

    # Check the shape explicitly: a 1-D array equals its own transpose and would
    # otherwise slip through the symmetry test, while a non-square 2-D array
    # would fail inside np.allclose with an unrelated broadcasting message.
    if A.ndim != 2 or A.shape[0] != A.shape[1]:
        raise ValueError("Matrix must be square for fractional powers.")

    if not np.allclose(A, A.T):
        raise ValueError("Matrix must be symmetric for fractional powers.")

    return fractional_matrix_power(A, alpha)

# Example usage: raise a small symmetric 2x2 matrix to the power `alpha`
# and print the result.
A = np.array([[4, 1], [1, 3]])
alpha = 0.5  # Example fractional power
A_fractional = fractional_power_matrix(A, alpha)
# NOTE: the label below hard-codes "0.5"; it does not follow the value of `alpha`.
print("Fractional Power of A^0.5:")
print(A_fractional)

Fractional Power of A^0.5:
[[1.98157763 0.27083221]
 [0.27083221 1.71074543]]


In [3]:
import torch
import torch.nn.functional as F

class FractionalAttention(torch.nn.Module):
    """
    Single-head self-attention whose softmax weights are raised to a power.

    The softmax probabilities are raised element-wise to ``alpha`` before being
    multiplied with the values. The powered weights are not re-normalised, so
    for ``alpha != 1`` the rows of the weight matrix generally no longer sum to 1.
    """

    def __init__(self, embed_dim, alpha=0.5):
        """
        Initialize a fractional attention module.
        Args:
            embed_dim (int): Dimension of embeddings.
            alpha (float): Fractional power for the attention weights.
        """
        super().__init__()
        self.query = torch.nn.Linear(embed_dim, embed_dim)
        self.key = torch.nn.Linear(embed_dim, embed_dim)
        self.value = torch.nn.Linear(embed_dim, embed_dim)
        self.alpha = alpha

    def forward(self, x):
        """
        Forward pass for fractional attention.
        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, seq_len, embed_dim).
                Must be 3-D because ``torch.bmm`` is used.
        Returns:
            torch.Tensor: The output tensor of shape (batch_size, seq_len, embed_dim).
        """
        Q = self.query(x)
        K = self.key(x)
        V = self.value(x)

        # Scaled dot-product scores between every pair of positions.
        attention_scores = torch.bmm(Q, K.transpose(1, 2)) / (Q.size(-1) ** 0.5)

        # Softmax to get probabilities
        attention_probs = F.softmax(attention_scores, dim=-1)

        # Fractional power of the attention probabilities.
        #
        # For alpha < 1 the derivative alpha * p**(alpha - 1) is infinite at
        # p == 0, and softmax can underflow to exactly 0 for very negative
        # scores. Back-propagating through torch.pow at those entries turns the
        # whole gradient into NaN. The entries that are exactly zero are
        # therefore routed around torch.pow: their forward value is still
        # 0 ** alpha, but they contribute a zero (finite) gradient.
        # `== 0` (rather than `> 0`) keeps NaN probabilities on the pow path so
        # an upstream NaN is still visible in the output.
        is_zero = attention_probs == 0
        safe_probs = torch.where(
            is_zero, torch.ones_like(attention_probs), attention_probs
        )
        value_at_zero = torch.zeros_like(attention_probs).pow(self.alpha)
        fractional_attention_probs = torch.where(
            is_zero, value_at_zero, safe_probs.pow(self.alpha)
        )

        # Multiply the attention weights by the value
        output = torch.bmm(fractional_attention_probs, V)
        return output

# Example usage: run a random (batch_size, seq_len, embed_dim) tensor through
# the attention layer and show the shape and values of the result.
batch_size = 2
seq_len = 4
embed_dim = 8
x = torch.rand(batch_size, seq_len, embed_dim)  # uniform random values in [0, 1)
attention = FractionalAttention(embed_dim=embed_dim, alpha=0.8)
output = attention(x)
print("Output from Fractional Attention Layer:", output.shape)
print(output)

Output from Fractional Attention Layer: torch.Size([2, 4, 8])
tensor([[[ 0.1760,  0.0748, -0.8958,  0.3870, -0.1142,  0.3949, -0.1325,
           0.1630],
         [ 0.1727,  0.0773, -0.8998,  0.3854, -0.1154,  0.3950, -0.1323,
           0.1638],
         [ 0.1763,  0.0746, -0.8938,  0.3874, -0.1151,  0.3941, -0.1317,
           0.1630],
         [ 0.1787,  0.0728, -0.8896,  0.3887, -0.1152,  0.3931, -0.1312,
           0.1624]],

        [[ 0.3159, -0.3146, -0.6623,  0.5485,  0.0555,  0.3512, -0.0403,
           0.3671],
         [ 0.3172, -0.3143, -0.6630,  0.5481,  0.0567,  0.3498, -0.0368,
           0.3687],
         [ 0.3152, -0.3135, -0.6644,  0.5494,  0.0543,  0.3512, -0.0444,
           0.3657],
         [ 0.3169, -0.3134, -0.6633,  0.5480,  0.0560,  0.3498, -0.0385,
           0.3677]]], grad_fn=<BmmBackward0>)


In [5]:
import torch

class FractionalLinear(torch.nn.Module):
    """
    Linear layer (``x @ W.T + b``) that also carries a fractional-power
    gradient transform in :meth:`backward`.

    NOTE: PyTorch's autograd does not call a method named ``backward`` on an
    ``nn.Module``. Gradients produced by ``loss.backward()`` for this layer are
    the ordinary ones; the transform below only takes effect if it is invoked
    explicitly (for example from a tensor hook or a ``torch.autograd.Function``).
    """

    def __init__(self, in_features, out_features, alpha=0.9):
        """
        Initialize the layer.
        Args:
            in_features (int): Number of input features.
            out_features (int): Number of output features.
            alpha (float): Fractional order used by :meth:`backward`.
        """
        super().__init__()
        # Weights and bias are drawn from a standard normal distribution.
        self.weight = torch.nn.Parameter(torch.randn(out_features, in_features))
        self.bias = torch.nn.Parameter(torch.randn(out_features))
        self.alpha = alpha

    def forward(self, x):
        """
        Apply the affine map.
        Args:
            x (torch.Tensor): Input whose last dimension is ``in_features``.
        Returns:
            torch.Tensor: ``x @ weight.T + bias``.
        """
        return torch.matmul(x, self.weight.T) + self.bias

    def backward(self, grad_output):
        """
        Sign-preserving fractional power of a gradient tensor.

        Computes ``sign(g) * |g| ** alpha`` element-wise. A plain
        ``torch.pow(g, alpha)`` returns NaN for every negative entry when
        ``alpha`` is fractional, which would destroy any signed gradient.
        Non-negative entries give exactly the same result as ``torch.pow``.

        Args:
            grad_output (torch.Tensor): The gradient of the loss.
        Returns:
            torch.Tensor: Fractional gradient, same shape as ``grad_output``.
        """
        magnitude = torch.pow(grad_output.abs(), self.alpha)
        # Restore the sign that abs() removed.
        grad_input = torch.where(grad_output < 0, -magnitude, magnitude)
        return grad_input

# Example usage: push a random (10, 5) batch through the layer, reduce the
# output to a scalar and back-propagate it.
x = torch.randn(10, 5)
layer = FractionalLinear(5, 3, alpha=0.8)
output = layer(x)
loss = torch.sum(output)
# This is Tensor.backward() on the scalar loss (the autograd pass); it is a
# different call from the FractionalLinear.backward method defined on the layer.
loss.backward()
output

tensor([[-2.5563, -2.5354, -6.7174],
        [ 0.7297, -0.0386,  2.5425],
        [-4.2383, -2.0661, -4.6462],
        [-1.9240, -2.8116, -2.7333],
        [-3.0439, -1.3420, -3.8347],
        [ 0.7035,  0.5866, -2.1987],
        [ 0.3729, -1.7287,  0.3559],
        [ 0.5532,  2.7181,  1.1819],
        [ 2.4136,  3.3529, -0.3646],
        [-1.4591, -0.9820, -3.9150]], grad_fn=<AddBackward0>)

In [6]:
import torch
import torch.nn as nn

class FractionalTransformerBlock(nn.Module):
    """
    Post-norm transformer block: fractional attention followed by a
    two-layer feed-forward network, each wrapped in a residual connection
    and a LayerNorm.
    """

    def __init__(self, embed_dim, num_heads, alpha=0.8):
        """
        Args:
            embed_dim (int): Width of the token representations.
            num_heads (int): Accepted for interface compatibility; it is not
                referenced anywhere in this block.
            alpha (float): Fractional order forwarded to the attention module
                and to both feed-forward layers.
        """
        super().__init__()
        hidden_dim = embed_dim * 4  # feed-forward expansion width

        self.attention = FractionalAttention(embed_dim, alpha)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.ffn = nn.Sequential(
            FractionalLinear(embed_dim, hidden_dim, alpha),
            nn.ReLU(),
            FractionalLinear(hidden_dim, embed_dim, alpha),
        )
        self.norm2 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        """Apply attention and feed-forward sub-layers; output has the shape of ``x``."""
        attended = self.norm1(x + self.attention(x))
        return self.norm2(attended + self.ffn(attended))

class FractionalGPT(nn.Module):
    """
    Token embedding, a stack of ``FractionalTransformerBlock`` layers and a
    linear head that maps hidden states to vocabulary logits.
    """

    def __init__(self, num_layers, embed_dim, num_heads, vocab_size, alpha=0.8):
        """
        Args:
            num_layers (int): Number of transformer blocks to stack.
            embed_dim (int): Width of the token embeddings / hidden states.
            num_heads (int): Forwarded to every transformer block.
            vocab_size (int): Number of distinct token ids (embedding rows and
                output logits).
            alpha (float): Fractional order forwarded to every block.
        """
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)

        blocks = []
        for _ in range(num_layers):
            blocks.append(FractionalTransformerBlock(embed_dim, num_heads, alpha))
        self.layers = nn.ModuleList(blocks)

        self.output_head = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        """
        Args:
            x (torch.Tensor): Integer token ids.
        Returns:
            torch.Tensor: Logits with a trailing dimension of ``vocab_size``.
        """
        hidden = self.embed(x)
        for block in self.layers:
            hidden = block(hidden)
        return self.output_head(hidden)

# Example usage: build a small model and run one batch of random token ids
# through it to inspect the logits.
vocab_size = 1000
seq_len = 10
batch_size = 2
embed_dim = 32
num_layers = 2
num_heads = 4

# Random integer token ids in [0, vocab_size), shape (batch_size, seq_len).
x = torch.randint(0, vocab_size, (batch_size, seq_len))
model = FractionalGPT(num_layers, embed_dim, num_heads, vocab_size, alpha=0.8)
logits = model(x)
print("Output logits:", logits.shape)
logits

Output logits: torch.Size([2, 10, 1000])


tensor([[[ 5.4700e-01, -3.1152e-01,  4.8622e-02,  ..., -1.6796e-03,
          -1.4546e-01, -5.1911e-01],
         [-3.1036e-01,  8.2553e-01, -1.5342e-01,  ..., -5.8901e-01,
           3.7547e-01,  2.9392e-01],
         [ 5.8428e-01,  1.0994e-01, -1.1075e-01,  ...,  3.3377e-01,
           1.0201e-01, -1.9540e-01],
         ...,
         [ 1.9379e-02,  4.7495e-01,  3.9328e-01,  ..., -1.4321e-01,
           6.3253e-01, -9.5563e-01],
         [ 5.9369e-01, -1.0885e-01,  1.9067e-01,  ..., -4.3912e-02,
           1.3999e-01, -9.0806e-01],
         [ 7.2416e-03,  2.9712e-01, -3.1221e-01,  ..., -1.4748e-01,
          -3.2595e-01,  1.2130e-01]],

        [[ 5.3162e-01,  6.1521e-03, -3.3096e-02,  ...,  4.4141e-01,
          -2.7279e-01, -2.0244e-02],
         [ 2.7498e-01, -3.4813e-01,  4.4897e-01,  ...,  3.4805e-01,
           2.4139e-01, -5.0521e-01],
         [ 2.0578e-01,  1.2037e-01, -1.0812e-01,  ..., -1.2177e-01,
           2.4607e-01,  4.9796e-01],
         ...,
         [ 2.3163e-01,  4

In [7]:
def any_keyword_in_string(string, keywords):
    """
    Report whether at least one keyword occurs in ``string``.

    Args:
        string: The text to search (anything supporting the ``in`` operator).
        keywords: Iterable of candidate substrings.
    Returns:
        bool: True as soon as one keyword is found, False if none is
        (including when ``keywords`` is empty).
    """
    return any(keyword in string for keyword in keywords)

In [8]:
%pip install datasets

Defaulting to user installation because normal site-packages is not writeable
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-18.1.0-cp312-cp312-win_amd64.whl.metadata (3.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Using cached multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
   ---------------------------------------- 0.0/480.6 kB ? eta -:--:--
   ----- ---------------------------------- 61.4/480.6 kB 1.7 MB/s eta 0:00:01
   ------------------------- -------------- 307.2/480.6 kB 3.8 MB/s eta 0:00:01
   ---------------------------------------- 480.6/480.6 kB 4.3 MB/s eta 0:00:00
Using cached dill-0.3.8-py3-n


[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
from datasets import load_dataset

# Load the "glaiveai/glaive-code-assistant" dataset via `datasets.load_dataset`.
# NOTE(review): presumably this fetches from the Hugging Face Hub on first use
# and therefore needs network access — confirm before relying on offline runs.
ds = load_dataset("glaiveai/glaive-code-assistant")

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# ======= Data Preparation =======
def tokenize(text, vocab):
    """
    Map every character of ``text`` to its index in ``vocab``.

    Characters missing from ``vocab`` are replaced by ``vocab['<unk>']``.
    The ``'<unk>'`` entry is looked up for every character, so a non-empty
    ``text`` requires it to be present even if no character is unknown.

    Returns:
        list: One vocabulary index per character, in order.
    """
    indices = []
    for symbol in text:
        fallback = vocab['<unk>']
        indices.append(vocab.get(symbol, fallback))
    return indices

def prepare_data(texts, vocab, seq_len):
    """
    Convert multiple texts into next-token training pairs (X, Y).

    Each text is tokenized with ``tokenize`` and cut into every window of
    ``seq_len`` consecutive tokens; the target for a window is the same window
    shifted one token to the right. Texts with ``seq_len`` tokens or fewer
    contribute no samples.

    Args:
        texts: Iterable of strings.
        vocab: Mapping passed through to ``tokenize``.
        seq_len (int): Window length.
    Returns:
        tuple[torch.Tensor, torch.Tensor]: ``(X, Y)``, both of dtype
        ``torch.long`` and shape ``(num_samples, seq_len)``. When no text is
        long enough the tensors have shape ``(0, seq_len)``.
    """
    tokenized_data = [tokenize(text, vocab) for text in texts]
    X, Y = [], []
    for tokens in tokenized_data:
        for i in range(len(tokens) - seq_len):
            X.append(tokens[i:i + seq_len])  # Input
            Y.append(tokens[i + 1:i + seq_len + 1])  # Target

    # torch.tensor([]) would otherwise come back as a float32 tensor of shape
    # (0,), which cannot be used as embedding indices and has the wrong rank.
    # Pinning dtype and shape keeps the no-sample case consistent with the
    # regular one.
    inputs = torch.tensor(X, dtype=torch.long).reshape(len(X), seq_len)
    targets = torch.tensor(Y, dtype=torch.long).reshape(len(Y), seq_len)
    return inputs, targets

# Model Definition
class FractionalAttention(nn.Module):
    """
    Multi-head self-attention followed by a sign-preserving fractional power
    applied to the attention *output* (not to the attention weights).
    """

    def __init__(self, embed_dim, num_heads, alpha=0.8):
        """
        Args:
            embed_dim (int): Width of the token representations.
            num_heads (int): Number of attention heads.
            alpha (float): Exponent applied to the magnitude of the attention output.
        """
        super().__init__()
        # batch_first=True: the model in this notebook feeds tensors laid out as
        # (batch, seq, embed). With the default (batch_first=False) the first
        # axis is treated as the sequence, so attention would mix different
        # samples of the batch instead of different positions.
        self.attention = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
        self.alpha = alpha

    def forward(self, x):
        """
        Args:
            x (torch.Tensor): Input of shape (batch, seq, embed_dim).
        Returns:
            torch.Tensor: ``sign(a) * |a| ** alpha`` where ``a`` is the
            self-attention output; same shape as ``x``.
        """
        # NOTE: no attention mask is passed, so every position can attend to
        # every other position, including later ones.
        attn_output, _ = self.attention(x, x, x)

        # The attention output is signed, and a negative base raised to a
        # fractional exponent is NaN. Apply the power to the magnitude and put
        # the sign back. Exact zeros are swapped for 1 before pow so that the
        # infinite derivative of |a| ** alpha at 0 (for alpha < 1) cannot leak
        # NaN into the gradients; sign(0) == 0 keeps their forward value at 0.
        is_zero = attn_output == 0
        magnitude = torch.where(is_zero, torch.ones_like(attn_output), attn_output.abs())
        attn_output = torch.sign(attn_output) * magnitude.pow(self.alpha)
        return attn_output

class FractionalTransformerBlock(nn.Module):
    """
    Post-norm transformer block: fractional attention and a GELU feed-forward
    network, each added back to its input and then layer-normalised.
    """

    def __init__(self, embed_dim, num_heads, alpha=0.8):
        """
        Args:
            embed_dim (int): Width of the token representations.
            num_heads (int): Number of heads forwarded to the attention module.
            alpha (float): Fractional order forwarded to the attention module.
        """
        super().__init__()
        hidden_dim = 4 * embed_dim  # feed-forward expansion width

        self.attention = FractionalAttention(embed_dim, num_heads, alpha)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, embed_dim),
        )
        self.norm2 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        """Run both sub-layers; the result has the same shape as ``x``."""
        # Attention sub-layer: residual add, then normalise.
        x = self.norm1(x + self.attention(x))
        # Feed-forward sub-layer: residual add, then normalise.
        return self.norm2(x + self.ffn(x))

class FractionalGPT(nn.Module):
    """
    Token + learned position embeddings, a stack of
    ``FractionalTransformerBlock`` layers and a linear head producing
    vocabulary logits.
    """

    def __init__(self, vocab_size, embed_dim, num_layers, num_heads, seq_len, alpha=0.8):
        """
        Args:
            vocab_size (int): Number of distinct token ids.
            embed_dim (int): Width of the embeddings / hidden states.
            num_layers (int): Number of transformer blocks.
            num_heads (int): Forwarded to every block.
            seq_len (int): Number of rows in the position-embedding table, i.e.
                the longest sequence that can be embedded.
            alpha (float): Fractional order forwarded to every block.
        """
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(seq_len, embed_dim)

        blocks = []
        for _ in range(num_layers):
            blocks.append(FractionalTransformerBlock(embed_dim, num_heads, alpha))
        self.layers = nn.ModuleList(blocks)

        self.output_head = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        """
        Args:
            x (torch.Tensor): Integer token ids of shape (batch_size, seq_len).
        Returns:
            torch.Tensor: Logits of shape (batch_size, seq_len, vocab_size).
        """
        n_rows, n_cols = x.shape

        # One row of position ids 0..n_cols-1 per sample, on the input's device.
        position_ids = torch.arange(n_cols, device=x.device).repeat(n_rows, 1)

        hidden = self.token_embedding(x) + self.position_embedding(position_ids)
        for block in self.layers:
            hidden = block(hidden)

        return self.output_head(hidden)

# Model Training 
def train_model(model, data_loader, optimizer, loss_fn, num_epochs):
    """
    Run a plain supervised training loop and report the mean loss per epoch.

    Args:
        model: Module mapping a batch ``X`` to logits whose last dimension is
            the number of classes.
        data_loader: Iterable yielding ``(X, Y)`` batches. It is iterated once
            per epoch and does not need to support ``len()``.
        optimizer: Optimizer over the model's parameters.
        loss_fn: Callable taking ``(logits_2d, targets_1d)`` and returning a
            scalar loss tensor.
        num_epochs (int): Number of passes over ``data_loader``.
    Returns:
        list[float]: Mean batch loss of every epoch. An epoch that saw no
        batches contributes ``nan``.
    """
    model.train()
    epoch_losses = []
    for epoch in range(num_epochs):
        total_loss = 0.0
        num_batches = 0
        for X, Y in data_loader:
            optimizer.zero_grad()
            logits = model(X)
            # Flatten all leading dimensions so the loss sees one row per token.
            # reshape (unlike view) also accepts non-contiguous tensors.
            loss = loss_fn(logits.reshape(-1, logits.size(-1)), Y.reshape(-1))
            # Standard autograd backward pass; nothing in this function applies
            # a fractional transform to the gradients.
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        # Count batches ourselves: avoids ZeroDivisionError on an empty loader
        # and the need for len() on the loader.
        mean_loss = total_loss / num_batches if num_batches else float("nan")
        epoch_losses.append(mean_loss)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {mean_loss}")
    return epoch_losses