In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Step 1: Load and Quantize the Model
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

In [None]:
# Quantize the model weights for efficient memory usage
model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)

In [None]:
# Step 2: Define Low-Rank Adaptation Matrices Manually
# Assuming the model layer has 2048 input and output features
# Define low-rank matrices A and B (rank = 4 for this example)
rank = 4
in_features = 2048
out_features = 2048

# Low-rank matrices for adaptation (A: 2048x4, B: 4x2048)
A = torch.randn(in_features, rank)  # Matrix A with shape (2048, 4)
B = torch.randn(rank, out_features) # Matrix B with shape (4, 2048)


In [None]:
# Define a function for applying the LoRA transformation
def apply_lora(x):
    # Perform low-rank adaptation by applying A and B
    lora_output = torch.matmul(torch.matmul(x, A), B)  # shape remains (batch, seq_len, 2048)
    return lora_output

# Step 3: Define the Forward Pass with LoRA
prompt = "The cat sat on"
inputs = tokenizer(prompt, return_tensors="pt")
print(inputs['input_ids'])
# Perform forward pass and inject LoRA adjustments
with torch.no_grad():  # fine-tuning, so we do not require gradient calculations
    # Step 3a: Get the embeddings for the input
    embeddings = model.model.embed_tokens(inputs["input_ids"])
    print(embeddings)

    # Step 3b: Process through the first attention layer of LLaMA with LoRA
    attention_layer = model.model.layers[0].self_attn  # Get first attention layer

    # Pass embeddings through q_proj layer without transposing
    q_proj_output = attention_layer.q_proj(embeddings)  # Should have shape (batch, seq_len, 2048)
    print(q_proj_output)

    # Apply LoRA adjustment
    lora_adjustment = apply_lora(embeddings)  # This should be (batch, seq_len, 2048)
    attention_output = q_proj_output + lora_adjustment  # Combine with LoRA output

    # Step 4: Pass through LM Head for final output logits
    logits = model.lm_head(attention_output)


tensor([[   1,  450, 6635, 3290,  373]])
tensor([[[-1.0910e-03,  1.9302e-03, -1.6632e-03,  ...,  1.9932e-04,
          -6.5231e-04, -4.9973e-04],
         [-9.8267e-03,  9.6436e-03,  2.0386e-02,  ..., -7.0496e-03,
          -7.1716e-03, -3.8910e-04],
         [-2.2531e-05, -5.3406e-03, -3.5400e-03,  ...,  1.3123e-02,
          -6.5308e-03,  1.1963e-02],
         [ 1.9409e-02,  8.7891e-03,  1.4954e-02,  ..., -3.2196e-03,
           4.7607e-03, -5.0049e-03],
         [ 7.3853e-03,  4.7607e-03,  6.9885e-03,  ..., -3.3722e-03,
          -8.4305e-04,  1.2451e-02]]])
tensor([[[ 0.0011, -0.0008,  0.0003,  ..., -0.0008,  0.0009,  0.0013],
         [-0.0003, -0.0065,  0.0042,  ...,  0.0098, -0.0069, -0.0074],
         [-0.0047,  0.0071, -0.0050,  ...,  0.0131, -0.0112, -0.0116],
         [-0.0079,  0.0164, -0.0023,  ...,  0.0075, -0.0123, -0.0115],
         [ 0.0026,  0.0100,  0.0005,  ...,  0.0010, -0.0017, -0.0010]]])


In [None]:

# Step 4: Obtain Predictions
# Apply softmax to get probabilities for the next token in the vocabulary
probs = torch.softmax(logits, dim=-1)
predicted_token = torch.argmax(probs, dim=-1)

# Decode the predicted token ID to get the word
predicted_word = tokenizer.decode(predicted_token[0])
print("Predicted next word:", predicted_word)


Predicted next word: cdnussenódroundependant


## Code explanation
The code provided outlines a process of fine-tuning a language model with LoRA (Low-Rank Adaptation) by adding low-rank matrices into its attention layers, while quantizing the model for memory efficiency. Here’s a step-by-step breakdown:

### Step 1: Load and Quantize the Model
```python
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
```
1. The `model_name` variable holds the identifier for the TinyLlama model, which is loaded via Hugging Face's `AutoModelForCausalLM`.
2. The tokenizer is also loaded to convert text into token IDs for input.

```python
model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
```
3. The `quantize_dynamic` function converts the model’s linear layers to a quantized version, reducing memory use and possibly speeding up inference. This quantization affects only the linear layers, not all components.

### Step 2: Define LoRA Matrices
```python
rank = 4
in_features = 2048
out_features = 2048
A = torch.randn(in_features, rank)  # Matrix A with shape (2048, 4)
B = torch.randn(rank, out_features) # Matrix B with shape (4, 2048)
```
4. LoRA modifies the model’s weights by introducing low-rank matrices `A` and `B` for adaptation. Here, `rank` is set to 4, meaning that each 2048-dimensional input or output will be projected into a 4-dimensional space.
5. `A` is a `2048x4` matrix (input dimension to low-rank dimension), while `B` is a `4x2048` matrix (low-rank dimension back to the original).

```python
def apply_lora(x):
    lora_output = torch.matmul(torch.matmul(x, A), B)  # shape remains (batch, seq_len, 2048)
    return lora_output
```
6. The `apply_lora` function takes in `x` (embeddings from the model) and transforms it using `A` and `B`, which adapt the model’s behavior while keeping the overall dimension unchanged.

### Step 3: Define the Forward Pass with LoRA
```python
prompt = "The cat sat on"
inputs = tokenizer(prompt, return_tensors="pt")
```
7. The input prompt is tokenized, converting it into tensor form (`inputs["input_ids"]`) for use with the model.

```python
with torch.no_grad():
    embeddings = model.model.embed_tokens(inputs["input_ids"])
```
8. Inside a `no_grad()` block (disables gradient calculations for efficiency), we get embeddings by passing the token IDs through the model’s embedding layer.

```python
attention_layer = model.model.layers[0].self_attn  # Get first attention layer
q_proj_output = attention_layer.q_proj(embeddings)
```
9. We select the first attention layer and pass the embeddings through its query projection (`q_proj`). This transformation returns `q_proj_output`, representing attention-based information of shape `(batch, seq_len, 2048)`.

```python
lora_adjustment = apply_lora(embeddings)  # Apply LoRA adjustment
attention_output = q_proj_output + lora_adjustment  # Combine with LoRA output
```
10. The LoRA transformation `apply_lora(embeddings)` is applied to the embeddings. The resulting `lora_adjustment` is added to `q_proj_output`, creating `attention_output` with the combined effects of the query projection and the LoRA adaptation.

### Step 4: Obtain Predictions
```python
logits = model.lm_head(attention_output)
```
11. The adjusted output from the attention layer is passed to the `lm_head` (language modeling head) of the model to compute `logits`, representing scores for each token in the vocabulary.

```python
probs = torch.softmax(logits, dim=-1)
predicted_token = torch.argmax(probs, dim=-1)
```
12. `softmax` is applied to `logits` to obtain probabilities for each token in the vocabulary, and `argmax` is used to select the token with the highest probability as `predicted_token`.

```python
predicted_word = tokenizer.decode(predicted_token[0])
print("Predicted next word:", predicted_word)
```
13. Finally, the predicted token ID is decoded back into a word, which is printed as the model’s next predicted word based on the prompt "The cat sat on".

### Summary
This code demonstrates the process of LoRA adaptation on an attention layer of a language model. By adding the LoRA matrices `A` and `B` to the attention layer’s query projection, it modifies the behavior of the model to specialize it for specific tasks, all while keeping model memory and computational costs lower than a full fine-tuning would require.

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model and tokenizer
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)  # Move model to GPU
tokenizer = AutoTokenizer.from_pretrained(model_name)


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

In [None]:

# Define low-rank matrices A and B for LoRA with requires_grad=True for training
rank = 8  # LoRA rank, which is typically low
A = torch.randn((2048, rank), requires_grad=True, device=device)  # Low-rank projection down on GPU
B = torch.randn((rank, 2048), requires_grad=True, device=device)  # Low-rank projection up on GPU

# Function to apply LoRA transformation
def apply_lora(x):
    lora_output = x @ A @ B  # Low-rank adaptation
    return lora_output

# Fine-tuning setup: Assuming we are training LoRA parameters
optimizer = torch.optim.Adam([A, B], lr=1e-4)

# Define dummy target labels and loss function for demonstration purposes
# Move target_labels to GPU
loss_fn = torch.nn.CrossEntropyLoss()
target_labels = torch.tensor([1, 2, 3, 4,5], device=device)  # Replace with actual target labels

# Training loop for updating only LoRA matrices (A and B)
num_training_steps=50
for step in range(num_training_steps):
    optimizer.zero_grad()

    # Tokenize input and pass it through the embedding layer
    inputs = torch.tensor([   1,  450, 6635, 3290,  373]).to(device)  # Move inputs to GPU
    print("inputs:=======", inputs)
    embeddings = model.model.embed_tokens(inputs)
    print("embeddings:=======", embeddings)
    # Apply LoRA to the first attention layer’s query projection
    attention_layer = model.model.layers[0].self_attn
    q_proj_output = attention_layer.q_proj(embeddings)
    lora_adjustment = apply_lora(embeddings)
    adjusted_output = q_proj_output + lora_adjustment

    # Pass through LM head to obtain logits
    logits = model.lm_head(adjusted_output)
    print("Prediction======")

    # # Adjust logits to match target_labels if necessary
    # if logits.size(1) > target_labels.size(0):  # Assuming mismatch is in sequence length
    #     logits = logits[:, :-1, :]  # Adjust logits by slicing if one extra token is present

    # Calculate loss and backpropagate to update only LoRA matrices A and B
    loss = loss_fn(logits.view(-1, logits.size(-1)), target_labels.view(-1))
    loss.backward()
    optimizer.step()

print(f"Step {step + 1}/{num_training_steps}, Loss: {loss.item()}")


In [1]:
import torch

In [4]:
model.model.embed_tokens(torch.tensor([   1,  450, 6635, 3290,  373]))

tensor([[-1.0910e-03,  1.9302e-03, -1.6632e-03,  ...,  1.9932e-04,
         -6.5231e-04, -4.9973e-04],
        [-9.8267e-03,  9.6436e-03,  2.0386e-02,  ..., -7.0496e-03,
         -7.1716e-03, -3.8910e-04],
        [-2.2531e-05, -5.3406e-03, -3.5400e-03,  ...,  1.3123e-02,
         -6.5308e-03,  1.1963e-02],
        [ 1.9409e-02,  8.7891e-03,  1.4954e-02,  ..., -3.2196e-03,
          4.7607e-03, -5.0049e-03],
        [ 7.3853e-03,  4.7607e-03,  6.9885e-03,  ..., -3.3722e-03,
         -8.4305e-04,  1.2451e-02]], grad_fn=<EmbeddingBackward0>)