In [1]:
import torch
import torch.nn as nn
import transformers
import torch.nn.utils.prune as prune
from torch.nn import TransformerDecoderLayer
import math
import torch.distributions as dist
from torch.distributions import Categorical

In [2]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Define the device to use (GPU)
device1 = torch.device("cuda:0")  # First GPU
device2 = torch.device("cuda:1")  # Second GPU

# Load the GPT-2 model and tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Move the model to the respective GPUs
model.to(device1)
model.to(device2)

# Input text
input_text1 = "Hello, this is GPU 0:"
input_text2 = "Hello, this is GPU 1:"

# Encode the input text
input_ids1 = tokenizer.encode(input_text1, return_tensors="pt").to(device1)
input_ids2 = tokenizer.encode(input_text2, return_tensors="pt").to(device2)

# Generate text on GPU 0
output1 = model.generate(input_ids1, max_length=50, num_return_sequences=1)
decoded_output1 = tokenizer.decode(output1[0], skip_special_tokens=True)

# Generate text on GPU 1
output2 = model.generate(input_ids2, max_length=50, num_return_sequences=1)
decoded_output2 = tokenizer.decode(output2[0], skip_special_tokens=True)

# Print the generated text
print("GPU 0 Output:", decoded_output1)
print("GPU 1 Output:", decoded_output2)



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /root/anaconda3/envs/py39/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /root/anaconda3/envs/py39/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


  warn(msg)
  warn(msg)
  warn(msg)
Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)
2023-09-28 10:35:04.156539: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-28 10:35:04.202769: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
The attention mask and the pad token id 

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [2]:
import torch
import torch.nn as nn

class TransformerBlock(nn.Module):
    def __init__(self, hidden_size, num_heads, feedforward_size, dropout_rate=0.1):
        super(TransformerBlock, self).__init__()
        
        # Multi-head self-attention layer
        self.self_attention = nn.MultiheadAttention(hidden_size, num_heads, dropout=dropout_rate)
        
        # Layer normalization
        self.norm1 = nn.LayerNorm(hidden_size)
        
        # Position-wise feedforward network
        self.feedforward = nn.Sequential(
            nn.Linear(hidden_size, feedforward_size),
            nn.ReLU(),
            nn.Linear(feedforward_size, hidden_size)
        )
        
        # Layer normalization
        self.norm2 = nn.LayerNorm(hidden_size)
        
        # Dropout
        self.dropout = nn.Dropout(dropout_rate)
    
    def forward(self, inputs, attention_mask=None):
        # Multi-head self-attention
        # Compute multi-head self-attention using the provided inputs
        attention_output, _ = self.self_attention(inputs, inputs, inputs, attn_mask=attention_mask)
        # Apply dropout to the attention output and add it to the original inputs
        attention_output = self.dropout(attention_output) + inputs
        # Apply layer normalization to the attention output
        attention_output = self.norm1(attention_output)
        
        # Position-wise feedforward
        # Pass the attention output through the position-wise feedforward network
        ff_output = self.feedforward(attention_output)
        # Apply dropout to the feedforward output and add it to the attention output
        ff_output = self.dropout(ff_output) + attention_output
        # Apply layer normalization to the feedforward output
        ff_output = self.norm2(ff_output)
        
        return ff_output

class LargeLanguageModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, num_heads, feedforward_size, dropout_rate=0.1, max_sequence_length=512):
        super(LargeLanguageModel, self).__init__()
        
        # Set the maximum sequence length and vocabulary size
        self.max_sequence_length = max_sequence_length
        self.vocab_size = vocab_size
        
        # Create the embedding layer (on a separate device, e.g., GPU)
        self.embedding = nn.Embedding(vocab_size, embedding_dim).to('cuda:0')
        
        # Create a list of transformer blocks (split across devices)
        self.transformer_blocks = nn.ModuleList([
            TransformerBlock(hidden_dim, num_heads, feedforward_size, dropout_rate).to(f'cuda:{i % 2}')  # Assuming 2 GPUs
            for i in range(num_layers)
        ])
        
        # Create the final linear layer for prediction (on a separate device)
        self.linear = nn.Linear(hidden_dim, vocab_size).to('cuda:1')  # Use the second GPU

        
    def forward(self, input_ids, attention_mask=None):
        # Embed the input tokens
        embedded = self.embedding(input_ids)
        
        # Apply each transformer block to the embedded input
        transformer_output = embedded
        for i, block in enumerate(self.transformer_blocks):
            transformer_output = block(transformer_output, attention_mask=attention_mask)
        
        # Reshape the transformer_output for the linear layer
        batch_size, seq_length, hidden_dim = transformer_output.size()
        transformer_output = transformer_output.view(batch_size * seq_length, hidden_dim)
        
        # Pass the transformer_output through the linear layer for prediction
        logits = self.linear(transformer_output)
        
        return logits
    
    def generate_text(self, input_ids, max_length, temperature=1.0, top_k=None, top_p=None):
        # Clone the input_ids to avoid modifying the original
        generated_ids = input_ids.clone()
    
        # Loop to generate text up to max_length
        for _ in range(max_length):
            # Forward pass to get logits for the next token
            logits = self.forward(generated_ids)
    
            # Apply temperature for token sampling
            logits = logits[-1, :] / temperature
    
            # Sampling logic based on top_k and top_p
            if top_k is not None:
                # Apply top-k sampling
                logits, indices = torch.topk(logits, top_k)
                probs = torch.softmax(logits, dim=-1)
                predicted_id = torch.multinomial(probs, num_samples=1).squeeze()
            elif top_p is not None:
                # Apply nucleus (top-p) sampling
                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1]
                sorted_indices_to_remove[:, 0] = 0
                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = float('-inf')
                probs = torch.softmax(logits, dim=-1)
                predicted_id = torch.multinomial(probs, num_samples=1).squeeze()
            else:
                # Regular softmax-based sampling
                probs = torch.softmax(logits, dim=-1)
                predicted_id = torch.multinomial(probs, num_samples=1).squeeze()
    
    
            # Append the predicted_id to generated_ids
            generated_ids = torch.cat((generated_ids, predicted_id.unsqueeze(0).unsqueeze(0)), dim=1)
    
            # Check if the generated token is the end token
            if predicted_id == self.vocab_size - 1:
                break
    
        return generated_ids


In [3]:
# Example usage
vocab_size = 51160
embedding_dim = 768
hidden_dim = 768
num_layers = 40
num_heads = 12
feedforward_size = 4*hidden_dim
dropout_rate = 0.1
model = LargeLanguageModel(vocab_size, embedding_dim, hidden_dim, num_layers, num_heads, feedforward_size, dropout_rate)
input_ids = torch.tensor([[1, 2, 3, 4, 5, 6, 7]])
logits = model(input_ids)
print(logits.shape)

# Generate text
generated_text = model.generate_text(input_ids, max_length=50, temperature=0.8)
#decoded_text = tokenizer.decode(generated_text)
print("Generated Text:", generated_text)
print(generated_text.shape)
# Decoding
decoded = tokenizer.decode(generated_text)
print("Decoded:", decoded)

NameError: name 'vocab_size' is not defined

In [None]:
import numpy as np

class LiquidStateNetwork:
    def __init__(self, input_size, reservoir_size, spectral_radius=0.9):
        self.input_size = input_size
        self.reservoir_size = reservoir_size
        self.spectral_radius = spectral_radius
        self.reservoir = np.random.rand(reservoir_size, reservoir_size) - 0.5
        self.reservoir *= spectral_radius / max(np.abs(np.linalg.eigvals(self.reservoir)))

    def process(self, input_data):
        # Process input data through the liquid state network
        # Compute the network's dynamic state
        dynamic_state = np.dot(self.reservoir, input_data)
        return dynamic_state

# Example usage
input_size = 10
reservoir_size = 100
lsn = LiquidStateNetwork(input_size, reservoir_size)

# Training the LSN (You would need to implement your own training algorithm)
training_data = np.random.rand(input_size, 100)
lsn.train(training_data)

# Processing input data through the LSN
input_data = np.random.rand(input_size)
dynamic_state = lsn.process(input_data)

# You can use the dynamic state for further processing or prediction tasks.
