In [2]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW
from tqdm import tqdm
import pandas as pd


In [3]:
# Load model and tokenizer
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

In [5]:
df = pd.read_csv("/Users/michaelsklar/aiplay/wikipedia/wikipedia_three_sentences.csv")
 
# Define a fixed sequence length
max_length = 50

# Prepare dataset
class SentenceDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        first_sentence = str(self.dataframe.iloc[idx]['first_sentences'])
        second_sentence = str(self.dataframe.iloc[idx]['second_sentences'])
        third_sentence = str(self.dataframe.iloc[idx]['third_sentences'])
        
        hyper_input_encodings = tokenizer(first_sentence, max_length=max_length, padding='max_length', truncation=True, return_tensors="pt")
        input_encodings = tokenizer(second_sentence, max_length=max_length, padding='max_length', truncation=True, return_tensors="pt")
        target_encodings = tokenizer(third_sentence, max_length=max_length, padding='max_length', truncation=True, return_tensors="pt")

        #return hyper_input_encodings, input_encodings, target_encodings
        return input_encodings, target_encodings

dataset = SentenceDataset(df)

#Must modify the GPT-2 architecture to work with the hypernetwork!

#Must modify the below code to work with the hypernetwork!

# Evaluation loop (simplified)
model.eval()
total_loss = 0
for input_encodings, target_encodings in tqdm(DataLoader(dataset, batch_size=10)):
    input_ids = input_encodings['input_ids'].squeeze(0)  # Squeeze to remove batch dimension
    attention_mask = input_encodings['attention_mask'].squeeze(0)
    labels = target_encodings['input_ids'].squeeze(0)

    # Align the target tensor to the model's output size
    labels = labels[:, :input_ids.size(1)]
    labels_padded = torch.full(input_ids.shape, tokenizer.pad_token_id, dtype=torch.long)
    labels_padded[:, :labels.size(1)] = labels
    
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels_padded)
        total_loss += outputs.loss.item()

avg_loss = total_loss / len(dataset)
perplexity = torch.exp(torch.tensor(avg_loss))

print(f"Perplexity: {perplexity}")

  0%|          | 2/24546 [00:41<140:25:21, 20.60s/it]


KeyboardInterrupt: 

In [20]:
import torch
import torch.nn as nn
from transformers import GPT2LMHeadModel, GPT2Tokenizer

class LoRAModule(nn.Module):
    def __init__(self, input_dim, output_dim, rank):
        super(LoRAModule, self).__init__()
        self.rank = rank
        self.lora_A = nn.Parameter(torch.randn(input_dim, rank))
        self.lora_B = nn.Parameter(torch.randn(rank, output_dim))

    def forward(self, weight):
        lora_modification = self.lora_A @ self.lora_B
        return weight + lora_modification

class HyperNetwork(nn.Module):
    def __init__(self, input_dim, output_dim, num_layers, rank):
        super(HyperNetwork, self).__init__()
        self.rank = rank
        self.output_dim = output_dim
        self.num_layers = num_layers
        self.param_generator = nn.Linear(input_dim, num_layers * 2 * rank * output_dim)

    def forward(self, hyper_input):
        generated_params = self.param_generator(hyper_input)
        generated_params = generated_params.view(self.num_layers, 2, self.rank, self.output_dim)

        lora_layers = []
        for i in range(self.num_layers):
            lora_A_params = generated_params[i, 0, :, :].transpose(0, 1)  # Transpose to match shape of lora_A
            lora_B_params = generated_params[i, 1, :, :]  # Already matches shape of lora_B
            lora_layer = LoRAModule(self.output_dim, 3 * self.output_dim, self.rank)  # Adjusted dimensions
            lora_layer.lora_A.data = lora_A_params
            lora_layer.lora_B.data = lora_B_params
            lora_layers.append(lora_layer)

        return lora_layers



class GPT2WithHyperNetLoRA(nn.Module):
    def __init__(self, gpt2_model_name, hyper_input_dim, rank):
        super(GPT2WithHyperNetLoRA, self).__init__()
        self.rank = rank
        self.gpt2 = GPT2LMHeadModel.from_pretrained(gpt2_model_name)
        num_layers = len(self.gpt2.transformer.h)
        hidden_size = self.gpt2.config.n_embd
        self.hypernet = HyperNetwork(hyper_input_dim, 3 * hidden_size, num_layers, rank)

    def forward(self, input_ids, attention_mask, hyper_input):
        lora_layers = self.hypernet(hyper_input)

        for i, layer in enumerate(self.gpt2.transformer.h):
            lora_mod = lora_layers[i]
            original_weight = layer.attn.c_attn.weight
            hidden_size = self.gpt2.config.n_embd

            # Correctly split the original weight tensor into query, key, and value components
            q_weight, k_weight, v_weight = original_weight.chunk(3, dim=0)

            # Apply LoRA modification separately to each component
            q_weight_modified = lora_mod(q_weight)
            k_weight_modified = lora_mod(k_weight)
            v_weight_modified = lora_mod(v_weight)

            # Concatenate the modified components back into a single weight tensor
            modified_weight = torch.cat([q_weight_modified, k_weight_modified, v_weight_modified], dim=0)
            layer.attn.c_attn.weight = modified_weight

        outputs = self.gpt2(input_ids=input_ids, attention_mask=attention_mask)
        return outputs

# Example usage
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2WithHyperNetLoRA("gpt2", tokenizer.model_max_length, rank=10)

# Dummy data
input_ids = torch.tensor(tokenizer.encode("Example sentence", return_tensors="pt"))
attention_mask = torch.ones_like(input_ids)
hyper_input = torch.randn(1, tokenizer.model_max_length)

# Forward pass
outputs = model(input_ids, attention_mask, hyper_input)

  input_ids = torch.tensor(tokenizer.encode("Example sentence", return_tensors="pt"))


RuntimeError: The size of tensor a (256) must match the size of tensor b (2304) at non-singleton dimension 0