In [1]:
import copy
import matplotlib.pyplot as plt
import numpy as np
import random
import time
import torch
import torch.nn.functional as F
from tqdm import tqdm

In [2]:
# seed every RNG this notebook imports so we get the same results on each run
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
# torch goes last: the value displayed by this cell is the Generator it returns
torch.manual_seed(SEED)

<torch._C.Generator at 0x125709cb0>

In [4]:
class TestModel(torch.nn.Module):
    """Tiny embedding -> linear -> LM-head network used to demonstrate LoRA.

    Args:
        hidden_size: width of the embedding and of the hidden linear layer.
        vocab_size: number of distinct token ids the model embeds and
            predicts. Defaults to 10, the size of the toy vocabulary.
    """

    def __init__(self, hidden_size, vocab_size=10):
        super().__init__()
        # Layers are created in a fixed order so that a seeded run always
        # draws the same initial weights for each of them.
        self.embedding = torch.nn.Embedding(vocab_size, hidden_size)
        self.linear = torch.nn.Linear(hidden_size, hidden_size)
        self.lm_head = torch.nn.Linear(hidden_size, vocab_size)

    def forward(self, input_ids):
        """Return logits for integer token ids.

        The output has the shape of `input_ids` with one extra trailing
        dimension of size `vocab_size`.
        """
        x = self.embedding(input_ids)
        x = self.linear(x)
        x = self.lm_head(x)
        return x

In [5]:
# use a realistic hidden size to illustrate how small a fraction of extra
# parameters LoRA needs to add
hidden_size = 1024
model = TestModel(hidden_size)


In [7]:
# dummy inputs: one sequence (batch of 1) holding the token ids 0..7
input_ids = torch.tensor([[0, 1, 2, 3, 4, 5, 6, 7]], dtype=torch.long)

In [8]:
# toy example of a detokenizer
# The vocabulary consists of only 10 words (different colors);
# a token id is used as an index into this list
detokenizer = [
    'red', 'green', 'blue', 'yellow', 'black',
    'white', 'purple', 'orange', 'pink', 'brown',
]


In [17]:
# this is the same generation step as we saw in lesson 2 (batching)
def generate_token(model, **kwargs):
    """Greedily pick one next token per batch row.

    Calls `model(**kwargs)` with gradient tracking disabled, keeps only the
    logits at the last sequence position, and returns, for every row, the
    `detokenizer` entry whose logit is the largest.
    """
    with torch.no_grad():
        all_logits = model(**kwargs)

    # only the final position is used to predict the next token
    final_step_logits = all_logits[:, -1, :]
    best_ids = final_step_logits.argmax(dim=1)

    words = []
    for best_id in best_ids:
        words.append(detokenizer[best_id])
    return words

In [18]:
# generate one token: generate_token returns one word per batch row,
# and the batch here holds a single sequence, so take element 0
next_token = generate_token(model, input_ids=input_ids)[0]
next_token

'orange'

In [19]:
# dummy input tensor filled with random values
# shape: (batch_size, sequence_length, hidden_size)
# it is fed to the LoRA-wrapped linear layer further down
X = torch.randn(1, 8, 1024)

In [20]:
 # LoRA A and B tensors, here with rank 2 and random values
 # A has shape (hidden_size, rank)
 # B has shape (rank, hidden_size)
 # so that A @ B has shape (hidden_size, hidden_size)
 lora_a = torch.randn(1024, 2)
 lora_b = torch.randn(2, 1024)


In [21]:
# weight matrix of the toy model's hidden linear layer
W = model.linear.weight

In [22]:
# square, because the layer maps hidden_size -> hidden_size
W.shape

torch.Size([1024, 1024])

In [23]:
# (1024 x 2) @ (2 x 1024): a full-size matrix whose rank is at most 2
W2 = lora_a @ lora_b

In [24]:
# same shape as W, even though it was built from far fewer numbers
W2.shape

torch.Size([1024, 1024])

In [26]:
# Compare the number of elements in A and B with the number of elements in W
# W here has shape (hidden_size, hidden_size)
lora_numel = sum(factor.numel() for factor in (lora_a, lora_b))
base_numel = W.numel()
print("|A+B| / |W|:", lora_numel / base_numel)

|A+B| / |W|: 0.00390625


In [27]:
class LoraLayer(torch.nn.Module):
    """Wrap a linear layer and add a low-rank update `x @ A @ B` to its output.

    Args:
        base_layer: the linear module to wrap. Its `weight` is expected in the
            `(out_features, in_features)` layout used by `torch.nn.Linear`.
        r: rank of the low-rank update, i.e. the inner dimension of A and B.
    """

    def __init__(self, base_layer, r):
        super().__init__()
        self.base_layer = base_layer

        # nn.Linear stores its weight as (out_features, in_features), so the
        # output size comes first when unpacking the shape.
        d_out, d_in = self.base_layer.weight.shape
        # Register A and B as parameters so that they are trainable, are saved
        # in the state_dict, and follow the module through .to(...) calls.
        self.lora_a = torch.nn.Parameter(torch.randn(d_in, r))
        self.lora_b = torch.nn.Parameter(torch.randn(r, d_out))

    def forward(self, x):
        """Return `base_layer(x) + x @ lora_a @ lora_b`."""
        y1 = self.base_layer(x)
        # Multiplying left to right keeps the intermediate at width r instead
        # of materialising the full (d_in, d_out) product A @ B.
        y2 = x @ self.lora_a @ self.lora_b
        return y1 + y2


In [28]:
# wrap the linear layer of our toy model, use rank 2
# the output shape matches the input shape because the layer is hidden_size -> hidden_size
lora_layer = LoraLayer(model.linear, 2)
lora_layer(X).shape

torch.Size([1, 8, 1024])

In [30]:
lora_model = copy.deepcopy(model)

# `lora_layer` wraps `model.linear` itself, so assigning it directly would make
# `lora_model` share its base weights with `model`. Deep-copying the wrapper
# keeps the exact same values while making the two models fully independent.
lora_model.linear = copy.deepcopy(lora_layer)

In [31]:
# inspect the module tree: `linear` is now a LoraLayer wrapping the Linear layer
lora_model

TestModel(
  (embedding): Embedding(10, 1024)
  (linear): LoraLayer(
    (base_layer): Linear(in_features=1024, out_features=1024, bias=True)
  )
  (lm_head): Linear(in_features=1024, out_features=10, bias=True)
)

In [32]:
# generate one token with the LoRA-wrapped model; the random A/B update is
# added to the linear layer's output, so the prediction can differ from the base model's
next_token = generate_token(lora_model, input_ids=input_ids)
next_token[0]

'white'