In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data.dataset import random_split
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn.functional as F
from taker import Model
id = 1


Load the texts

In [3]:
# Load texts
id = 1
loaded_texts  = torch.load(f'generated_texts-{id}.pth')
prompts = [i[0] for i in loaded_texts]
completions = [i[1] for i in loaded_texts]
full_texts = [i[0] + i[1] for i in loaded_texts]

# prompt len is roughly 150 characters


Load the model

In [5]:
# Generate activations if there are none
m = Model("mistralai/Mistral-7B-v0.1", dtype="int4")
m.to("cuda")
m.do_activations["mlp_pre_out"] = False
m.do_activations["attn_pre_out"] = False


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

- Loaded mistralai/Mistral-7B-v0.1
 - Registered 32 Attention Layers


Get activations.
- Get the output of each MLP layer and save them
- Only save every "layers_to_skip"-th layer
- I.e: for layers_to_skip = 15, save layers 0, 15, and 30

In [None]:

file_name = f"/mnt/vol_b/text_activations-{id}.pth"

a = 0
layers_to_skip = 15

activations = []
# Open the file once and append data in each iteration
with torch.no_grad():
    for text in [i for i in full_texts[0:300]]:
        print(a)

        act = m.get_text_activations(text=text)
        atten_out = activation[1]
        ff_out = activation[2]

        # Take every 15th layer starting from the first one
        activations.append(ff_out[0::layers_to_skip, :, :].to("cpu"))

        # Append the data to file
        # if (a % 10 == 0):
        #     torch.save( activations, f"file_name-{a}" )
        #     activations = []
        a = a + 1


In [35]:
torch.save(activations, f"/mnt/vol_b/text_activations-{id}.pth")

In [None]:
# Inspect text


a = 0
for i in loaded_texts:
    print("TEXTS: ", a )
    print(i[0] + "//" +  i[1])

    a = a + 1


Define predictor model
- it is a simple single-layer transformer model
- I think it likely has ATTN + MLP but unsure 
- Output from running the model is the final activation from the final token 
- I.e: [n_texts, n_tokens, d_model] -> [n_texts, -1, d_model]

In [75]:
# Takes in batch, seq_len, features
class CustomTransformerModel(nn.Module):
    def __init__(self, input_size, output_size):
        super(CustomTransformerModel, self).__init__()
        self.embedding = nn.Linear(input_size, 4096) # Example embedding size
        self.transformer_block = nn.TransformerEncoderLayer(d_model=4096, nhead=8)
        self.output_linear = nn.Linear(4096, output_size)
        self.sigmoid = nn.Sigmoid()
        self.output_size = output_size

    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(1, 0, 2) # Transformer expects [seq_len, batch, features]
        x = self.transformer_block(x)
        x = x.permute(1, 0, 2) # Revert permutation
        x = self.output_linear(x)
        x = x[:, -1, :] # Output is the final token
        return x

    # loss is ( x - y.mean() )**2


Define a method to train

In [82]:
# train


def train(data_in, data_out, model, criterion, num_epochs, save=False, learning_rate=0.001):


    # Create a complete dataset
    full_dataset = TensorDataset(data_in.to("cuda").float(), data_out.to("cuda").float())

    # Define the sizes for your training and validation sets
    total_size = len(full_dataset)
    train_size = int(0.8 * total_size)
    val_size = total_size - train_size

    # Split the dataset
    train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])

    # Create DataLoaders for both training and validation sets
    train_dataloader = DataLoader(train_dataset, batch_size=400, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=500)  # No need to shuffle the validation data


    model.to("cuda")

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)



    for epoch in range(num_epochs):
        # Training phase
        model.train()
        running_loss = 0.0
        for inputs, targets in train_dataloader:
            inputs, targets = inputs.to("cuda"), targets.to("cuda")

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * inputs.size(0)

        epoch_loss = running_loss / len(train_dataloader.dataset)
        # Validation phase
        model.eval()  # Set the model to evaluation mode
        running_loss = 0.0
        with torch.no_grad():  # No gradients need to be calculated
            for inputs, targets in val_dataloader:
                inputs, targets = inputs.to("cuda").float(), targets.to("cuda").float()
                outputs = model(inputs)

                loss = criterion(outputs, targets)
                running_loss += loss.item() * inputs.size(0)
        val_loss = running_loss / len(val_dataloader.dataset)

        print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {epoch_loss:.8f}, Validation Loss: {val_loss:.8f}')

    if(save):
        # Ensure the model is in evaluation mode
        model.eval()

        predictions = []
        with torch.no_grad():  # No gradients needed for inference
            for inputs, _ in full_dataset:  # Assuming your dataset returns inputs and targets
                inputs = inputs.to('cuda').float().unsqueeze(0)

                # Get the model output
                outputs = model(inputs)


                predictions.append(outputs.cpu())

        # Concatenate all batches of predictions
        all_predictions = torch.stack(predictions, dim=0)

        # Save the tensor to a file
        torch.save(all_predictions, 'model_predictions.pth')



In [None]:
# layers are [0, ..., 31] for ff and attn
# we only save 15 layers so need to map: {0<->0, 15<->1, 30<->2}
layer_index = 30 # 0 / 15 / 30
layer_index_mapped = (layer_index % layers_to_skip)

# The prompt ends around token 30-40

# input is tokens 0-100
train_start = 0
train_end = 100

# output is mean of the tokens 100-120
average_start = 100
average_end = 120

# - activations is list of affective shape [text, layer, token, d_model]
# - act has shape [layer, token, d_model]
# we combine this to get the data for training:
# data_in.shape  == [n_texts, n_tokens=100, d_model] (100 "token" position inputs)
# data_out.shape == [n_texts, d_model] (1 "token" position output)
data_in  = torch.stack([act[layer_index_mapped, train_start:train_end, :] for act in activations])
data_out = torch.mean(
    torch.stack([
        act[layer_index_mapped, average_start:average_end, :] for act in activations
    ]),
    dim=1
)

# Define the loss and predictor model we will be using
criterion = nn.MSELoss()
model = CustomTransformerModel(input_size=4096, output_size=4096).to("cuda")

In [88]:

train(data_in, data_out, model, criterion, num_epochs=30, save=True, learning_rate=0.0001)

Epoch [1/30], Training Loss: 0.32469651, Validation Loss: 0.11559182
Epoch [2/30], Training Loss: 0.12473597, Validation Loss: 0.06328046
Epoch [3/30], Training Loss: 0.07575580, Validation Loss: 0.05229685
Epoch [4/30], Training Loss: 0.06461286, Validation Loss: 0.04777841
Epoch [5/30], Training Loss: 0.05988586, Validation Loss: 0.04125999
Epoch [6/30], Training Loss: 0.05334065, Validation Loss: 0.03238321
Epoch [7/30], Training Loss: 0.04462754, Validation Loss: 0.02351909
Epoch [8/30], Training Loss: 0.03600687, Validation Loss: 0.01712981
Epoch [9/30], Training Loss: 0.02973778, Validation Loss: 0.01423174
Epoch [10/30], Training Loss: 0.02686051, Validation Loss: 0.01369918
Epoch [11/30], Training Loss: 0.02615321, Validation Loss: 0.01377128
Epoch [12/30], Training Loss: 0.02605203, Validation Loss: 0.01332568
Epoch [13/30], Training Loss: 0.02543598, Validation Loss: 0.01211494
Epoch [14/30], Training Loss: 0.02401829, Validation Loss: 0.01042738
Epoch [15/30], Training Loss:

In [90]:
res = torch.load("./model_predictions.pth")