In [None]:
!pip install torch datasets gdown
!pip install transformers==4.53.2 # PLEASE KEEP THIS VERSION
!pip install -U bitsandbytes
!pip install -U datasets



In [None]:
!gdown --id 1JYr9Do94hfzc91NyxKBSJCwsTNw5ygK6 # Our modified LLama model
!gdown --id 17PhF8wGp9X5puN_kr0WzW7r5_ynlShXs # Our evaluation pipeline
!gdown --id 1ZNbNV_ePNckVuNbkzQjJAqJoODy-GSW8 # Eval configuration file

Downloading...
From (original): https://drive.google.com/uc?id=1JYr9Do94hfzc91NyxKBSJCwsTNw5ygK6
From (redirected): https://drive.google.com/uc?id=1JYr9Do94hfzc91NyxKBSJCwsTNw5ygK6&confirm=t&uuid=93c1fdef-cded-49dd-8582-6e73840ede7f
To: /content/modelling_llama_open.py
100% 58.8k/58.8k [00:00<00:00, 94.2MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=17PhF8wGp9X5puN_kr0WzW7r5_ynlShXs
From (redirected): https://drive.google.com/uc?id=17PhF8wGp9X5puN_kr0WzW7r5_ynlShXs&confirm=t&uuid=ccb9c2d8-5964-4b46-a6fc-2fb87d8498ad
To: /content/eval_classification_final.py
100% 75.4k/75.4k [00:00<00:00, 87.7MB/s]
Downloading...
From: https://drive.google.com/uc?id=1ZNbNV_ePNckVuNbkzQjJAqJoODy-GSW8
To: /content/config.json
100% 603/603 [00:00<00:00, 3.30MB/s]


In [None]:
# Our Dataset
!rm -r ./data
!mkdir data
!gdown --id 1LjcvWQ84JqZQKMQrsxxIVnv0uug8hKWP -O data.zip
!unzip data.zip -d data

# 🦙 LLaMA Testing

**Setup your Hugging Face access token**  
Click the 🔑 key icon (**Secrets**) on the left sidebar of Colab, then add a new secret and grant this notebook access to it:

- **Name:** `HF_TOKEN`  
- **Value:** *(your Hugging Face access token)*  
You can generate or manage your token here:  
👉 [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)

> ⚠️ **Note:**  
This notebook tests the **LLaMA 3.1 8B Instruct** model. Please ensure you have been granted access to this model on Hugging Face before proceeding.

---

### ⚙️ Recommended Hardware

To run **LLaMA 3.1 8B** smoothly:

- For **half-precision (16-bit, unquantized)** loading, we recommend using **NVIDIA A100** (Colab Pro/Pro+) or a **3090/4090** if running locally.
- On **Google Colab Free**, please select the **T4 GPU** and use the **8-bit quantized version** of the model for better memory efficiency.

---

### 🧊 Optional: Enable Quantized Loading (8-bit or 4-bit)

To reduce memory usage and enable LLaMA to run on smaller GPUs, use the `BitsAndBytesConfig` for quantized loading:

```python
import torch
from transformers import BitsAndBytesConfig, LlamaForCausalLM

# Enable 8-bit quantization (use 4-bit if needed)
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True  # For 4-bit: load_in_4bit=True
)

# Load the model with quantization
model = LlamaForCausalLM.from_pretrained(
    model_name,
    token=access_token,  # `token` replaces the deprecated `use_auth_token`
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config=quantization_config,
)
```

> 💡 If you want to switch back to unquantized 16-bit loading, simply **comment out or remove the `quantization_config` argument** in `from_pretrained`.

---

In [None]:
import json
import os

import torch
from transformers import AutoTokenizer, BitsAndBytesConfig

# ---- Here is our modified LLama modeling file ---
from modelling_llama_open import LlamaForCausalLM
# --------------------------------------------------------

# ---- Here is our evaluation pipeline ---
from eval_classification_final import ZeroTuning
# --------------------------------------------------------

# `google.colab` only exists inside Colab. Outside of it (e.g. a local GPU box)
# fall back to the HF_TOKEN environment variable so the notebook still runs.
try:
    from google.colab import userdata
except ImportError:
    userdata = None

if userdata is not None:
    # Reads the `HF_TOKEN` secret configured in the Colab sidebar.
    access_token = userdata.get('HF_TOKEN')
else:
    access_token = os.environ.get('HF_TOKEN')

In [None]:
# Hugging Face model ID to load (gated model: the token must have been granted access).
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# True  -> load 8-bit bitsandbytes weights (recommended above for a Colab T4).
# False -> load plain 16-bit weights (recommended above for A100 / 3090 / 4090).
USE_8BIT = True

# Load the tokenizer. `token` replaces the deprecated `use_auth_token` argument.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
    padding_side="left",
    token=access_token,
)

# Quantization settings; `None` makes `from_pretrained` load unquantized weights.
quantization_config = BitsAndBytesConfig(load_in_8bit=True) if USE_8BIT else None

# `device_map="auto"` lets the loader place the weights, so no explicit
# `model.to(device)` call follows.
model = LlamaForCausalLM.from_pretrained(
    model_name,
    token=access_token,
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config=quantization_config,
)

# NOTE(review): ids 0 / 1 look like the LLaMA-2 pad / BOS convention; Llama 3.1
# appears to use a different BOS id (128000) — confirm these overrides are what
# the modified model and the evaluation pipeline expect before changing them.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = 0
tokenizer.bos_token_id = 1

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")




Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Zero-shot sentiment prompt used for the qualitative comparison.
input_text = "Classify the sentence into one of the following sentiments: positive or negative.\nSentence: generally, clockstoppers will fulfill your wildest fantasies about being a different kind of time traveler, while happily killing 94 minutes.\nSentiment:"
# Send the inputs to wherever `device_map="auto"` placed the model.
inputs = tokenizer(input_text, return_tensors="pt", padding=True).to(model.device)
prompt_length = inputs["input_ids"].shape[1]

outputs = model.generate(
    inputs["input_ids"],
    # Pass the mask explicitly instead of letting `generate` guess it from pad ids.
    attention_mask=inputs["attention_mask"],
    max_new_tokens=50,
    pad_token_id=tokenizer.pad_token_id,
    use_cache=True,
    # Our model supports a new parameter: input_len = (start, end, rate, layers, heads).
    # You only need to tune the `rate` value; rate = 1 is the vanilla baseline here.
    input_len=(0, 0, 1, None, None),
)

# Full text (prompt + continuation), kept under its original name.
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
# Decode only the newly generated tokens; slicing the decoded string by
# len(input_text) breaks whenever decoding does not reproduce the prompt exactly.
completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
completion

' negative\nExplanation: The sentence is negative because it expresses a negative opinion about the movie "Clockstoppers". The phrase "killing 94 minutes" is an idiomatic expression meaning "wasting time" or "being a waste of time",'

We can see that with the vanilla LLM, the model's output is negative, which is incorrect.

**Now, let's increase the attention on the initial token (rate = 4) and see how the output changes.**

In [None]:
# Same zero-shot sentiment prompt as in the vanilla run.
input_text = "Classify the sentence into one of the following sentiments: positive or negative.\nSentence: generally, clockstoppers will fulfill your wildest fantasies about being a different kind of time traveler, while happily killing 94 minutes.\nSentiment:"
# Send the inputs to wherever `device_map="auto"` placed the model.
inputs = tokenizer(input_text, return_tensors="pt", padding=True).to(model.device)
prompt_length = inputs["input_ids"].shape[1]

outputs = model.generate(
    inputs["input_ids"],
    # Pass the mask explicitly instead of letting `generate` guess it from pad ids.
    attention_mask=inputs["attention_mask"],
    max_new_tokens=50,
    pad_token_id=tokenizer.pad_token_id,
    use_cache=True,
    # Our model supports a new parameter: input_len = (start, end, rate, layers, heads).
    # You only need to tune the `rate` value; here it is raised to 4.
    input_len=(0, 0, 4, None, None),
)

# Full text (prompt + continuation), kept under its original name.
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
# Decode only the newly generated tokens; slicing the decoded string by
# len(input_text) breaks whenever decoding does not reproduce the prompt exactly.
completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
completion

' positive\nExplanation: The word "happily" is a positive word, which suggests that the outcome of being a time traveler is a good one. Additionally, the phrase "wildest fantasies" implies that the experience will be exciting and fulfilling.'

# 📝 Evaluation Pipeline


✅ **Supported Parameters**

| Parameter | Description | Possible Values |
|----------|-------------|-----------------|
| `model_path` | Path to the pretrained model | - HuggingFace model ID (e.g., `meta-llama/Meta-Llama-3.1-8B-Instruct`)  <br> - Local model path |
| `model_type` | Type of model | - `llama`: For LLaMA models  <br> - `qwen`: For Qwen models |
| `dataset_name` | Name of the dataset to evaluate | **Classification:**  <br> - `sst2`: Sentiment classification  <br> - `sst5`: 5-class sentiment  <br> - `MR`: Movie reviews  <br> - `SUBJ`: Subjectivity  <br> - `TREC`: Question classification  <br> - `CB`: CommitmentBank (textual entailment)  <br> - `BoolQ`: Boolean questions  <br><br> **Multiple Choice:**  <br> - `ARCC`: ARC Challenge  <br> - `PIQA`: Physical intuition  <br> - `CQA`: Commonsense QA  <br> - `AQUA`: Math word problems  <br> - `MMLU`: Multi-task language understanding  <br> - `MathQA`: Math problems  <br> - `LogiQA`: Logical reasoning |
| `data_path` | Path to the dataset | - Local path to dataset directory  <br> - Default: `./data` |
| `num_samples` | Number of samples to evaluate | - Any positive integer  <br> - `-1`: Use all samples |
| `rate` | Attention enhancement rate | - Float ≥ 0  <br> - Default: `1.0` |
| `rate_min`, `rate_max`, `rate_step` | Range and step size for sweeping the `rate` parameter | - `rate_min`: Starting value (float)  <br> - `rate_max`: Ending value (float)  <br> - `rate_step`: Step size (float) |
| `few_shot_number` | Number of few-shot examples | - Non-negative integer  <br> - `0`: No few-shot examples |
| `verbose` | Whether to print detailed logs | - `True`: Print detailed output  <br> - `False`: Print only essential info |
| `heads` | Target attention heads | - Comma-separated list (e.g., `0,1,2`)  <br> - `None`: Use all heads |
| `layers` | Target model layers | - Comma-separated list (e.g., `0,1,2`)  <br> - `None`: Use all layers |
| `exploring_mode` | Exploration strategy for heads/layers | - `0`: Test all layer–head combinations  <br> - `1`: Test each head across specified layers  <br> - `2`: Test each layer across specified heads |
| `output_dir` | Directory for saving results | - Local output directory path  <br> - Default: `./outputs` |


**Let's test the effect of ZeroTuning using SST2 as the test dataset.**

First, you can load the default configuration by specifying the local config path.

In [None]:
from eval_classification_final import ZeroTuning

# Default evaluation settings, read from the config file in the working directory.
config_path = "./config.json"
# Reuse the token fetched in the setup cell rather than querying the secret store
# a second time; later cells keep using the `hftoken` name.
hftoken = access_token

accuracy_vanilla = ZeroTuning(config_path=config_path, hftoken=hftoken, model=model, tokenizer=tokenizer)
accuracy_vanilla

Loading from cache directory: ./data/sst2


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Running single evaluation...

Testing with attention enhancement rate: 1.0
Few-shot examples: 0


Evaluating sst2:   0%|                                                      | 0/500 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   0%|                                              | 1/500 [00:00<01:52,  4.42it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   0%|▏                                             | 2/500 [00:00<01:50,  4.52it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   1%|▎                                             | 3/500 [00:00<01:50,  4.51it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   1%|▎     


Results saved to: ,/outputs/eval_sst2_20250920_053149.json

sst2 Statistics:
Rate: 1.0
Total samples: 500
Valid predictions: 347
Invalid predictions: 153
Correct predictions: 318
Accuracy: 0.6360





0.636

**Alternatively, you can manually create a configuration object by assigning values directly.**

In this example, we set the scaling factor to 4 to increase the attention on the initial token.

In [None]:
# Build the evaluation settings in code (instead of loading them from a file).
config_data = dict(
    model=dict(
        path="meta-llama/Meta-Llama-3.1-8B-Instruct",
        type="llama",
        use_8bit=True,
    ),
    dataset=dict(name="sst2", path="./data", num_samples=500),
    evaluation=dict(
        rate=4,  # Set scaling factor to 4
        rate_min=None,
        rate_max=None,
        rate_step=None,
        few_shot_number=0,
        verbose=False,
    ),
    attention=dict(heads=None, layers=None, exploring_mode=None),
    output=dict(output_dir=None),
)

# Run the evaluation on the already-loaded model and show the returned accuracy.
accuracy_boosted = ZeroTuning(config=config_data, hftoken=hftoken, model=model, tokenizer=tokenizer)
accuracy_boosted

Loading from cache directory: ./data/sst2
Running single evaluation...

Testing with attention enhancement rate: 4
Few-shot examples: 0


Evaluating sst2:   0%|                                                      | 0/500 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   0%|                                              | 1/500 [00:00<01:49,  4.54it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   0%|▏                                             | 2/500 [00:00<01:48,  4.59it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   1%|▎                                             | 3/500 [00:00<01:49,  4.56it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   1%|▎     


sst2 Statistics:
Rate: 4
Total samples: 500
Valid predictions: 498
Invalid predictions: 2
Correct predictions: 443
Accuracy: 0.8860





0.886

**The accuracy clearly improves: from 0.636 (rate = 1) to 0.886 (rate = 4).**

**Next, let's use grid search to explore different scaling factors, ranging from 1 to 4 with a step size of 1.**

In [None]:
# Evaluation settings for a sweep over the scaling factor: `rate` is left unset
# and the range is described by rate_min / rate_max / rate_step instead.
# NOTE(review): the output saved with this cell reports a step of 0.5 (7 rates
# tested), not the rate_step=1 configured here — re-run the cell, or set
# rate_step to 0.5, to make code and saved output agree.
config_data = dict(
    model=dict(
        path="meta-llama/Meta-Llama-3.1-8B-Instruct",
        type="llama",
        use_8bit=True,
    ),
    dataset=dict(name="sst2", path="./data", num_samples=500),
    evaluation=dict(
        rate=None,
        rate_min=1,
        rate_max=4,
        rate_step=1,
        few_shot_number=0,
        verbose=False,
    ),
    attention=dict(heads=None, layers=None, exploring_mode=None),
    output=dict(output_dir=None),
)

# The saved output of this cell shows a (best rate, best accuracy) pair as the result.
accuracy_boosted = ZeroTuning(config=config_data, hftoken=hftoken, model=model, tokenizer=tokenizer)
accuracy_boosted

Loading from cache directory: ./data/sst2
Running rate interval testing from 1 to 4 with step 0.5


Testing rates:   0%|          | 0/7 [00:00<?, ?it/s]


Testing with attention enhancement rate: 1.0
Few-shot examples: 0


Evaluating sst2:   0%|                                                      | 0/500 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   0%|                                              | 1/500 [00:00<01:53,  4.40it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   0%|▏                                             | 2/500 [00:00<01:51,  4.48it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   1%|▎                                             | 3/500 [00:00<01:51,  4.46it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   1%|▎     


sst2 Statistics:
Rate: 1.0
Total samples: 500
Valid predictions: 347
Invalid predictions: 153
Correct predictions: 318
Accuracy: 0.6360

Testing with attention enhancement rate: 1.5
Few-shot examples: 0


Evaluating sst2:   0%|                                                      | 0/500 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   0%|                                              | 1/500 [00:00<01:49,  4.54it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   0%|▏                                             | 2/500 [00:00<01:48,  4.59it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   1%|▎                                             | 3/500 [00:00<01:49,  4.56it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   1%|▎     


sst2 Statistics:
Rate: 1.5
Total samples: 500
Valid predictions: 393
Invalid predictions: 107
Correct predictions: 365
Accuracy: 0.7300

Testing with attention enhancement rate: 2.0
Few-shot examples: 0


Evaluating sst2:   0%|                                                      | 0/500 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   0%|                                              | 1/500 [00:00<01:50,  4.50it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   0%|▏                                             | 2/500 [00:00<01:48,  4.61it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   1%|▎                                             | 3/500 [00:00<01:47,  4.62it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   1%|▎     


sst2 Statistics:
Rate: 2.0
Total samples: 500
Valid predictions: 469
Invalid predictions: 31
Correct predictions: 426
Accuracy: 0.8520

Testing with attention enhancement rate: 2.5
Few-shot examples: 0


Evaluating sst2:   0%|                                                      | 0/500 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   0%|                                              | 1/500 [00:00<01:49,  4.54it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   0%|▏                                             | 2/500 [00:00<01:48,  4.58it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   1%|▎                                             | 3/500 [00:00<01:49,  4.56it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   1%|▎     


sst2 Statistics:
Rate: 2.5
Total samples: 500
Valid predictions: 480
Invalid predictions: 20
Correct predictions: 437
Accuracy: 0.8740

Testing with attention enhancement rate: 3.0
Few-shot examples: 0


Evaluating sst2:   0%|                                                      | 0/500 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   0%|                                              | 1/500 [00:00<01:48,  4.58it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   0%|▏                                             | 2/500 [00:00<01:48,  4.57it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   1%|▎                                             | 3/500 [00:00<01:49,  4.54it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   1%|▎     


sst2 Statistics:
Rate: 3.0
Total samples: 500
Valid predictions: 492
Invalid predictions: 8
Correct predictions: 442
Accuracy: 0.8840

Testing with attention enhancement rate: 3.5
Few-shot examples: 0


Evaluating sst2:   0%|                                                      | 0/500 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   0%|                                              | 1/500 [00:00<01:49,  4.57it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   0%|▏                                             | 2/500 [00:00<01:51,  4.46it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   1%|▎                                             | 3/500 [00:00<01:51,  4.47it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   1%|▎     


sst2 Statistics:
Rate: 3.5
Total samples: 500
Valid predictions: 496
Invalid predictions: 4
Correct predictions: 444
Accuracy: 0.8880

Testing with attention enhancement rate: 4.0
Few-shot examples: 0


Evaluating sst2:   0%|                                                      | 0/500 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   0%|                                              | 1/500 [00:00<01:48,  4.58it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   0%|▏                                             | 2/500 [00:00<01:48,  4.60it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   1%|▎                                             | 3/500 [00:00<01:49,  4.52it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   1%|▎     


sst2 Statistics:
Rate: 4.0
Total samples: 500
Valid predictions: 498
Invalid predictions: 2
Correct predictions: 443
Accuracy: 0.8860

Rate interval testing results saved to: results/rate_interval_sst2_20250920_053545.txt
Best rate: 3.50
Best accuracy: 0.8880





(np.float64(3.5), 0.888)

**Now let's analyze the behavior differences across various attention heads:**

- We set `rate = 2`, which doubles the attention to the initial tokens.  
- Then, we iterate over heads [1, 10, 20, 30].

We observed that head 1 performs significantly worse than the vanilla baseline (0.474 vs. 0.636)—this is what we call a **down-effective head**.  
In contrast, heads 10, 20 and 30 outperform the baseline (0.678, 0.658 and 0.720), making them **up-effective heads**.

In [None]:
# Evaluation settings for the per-head study: rate 2, one run per listed head.
config_data = {
    "model": {
        "path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
        "type": "llama",
        "use_8bit": True,
    },
    "dataset": {
        "name": "sst2",
        "path": "./data",
        "num_samples": 500,
    },
    "evaluation": {
        "rate": 2,
        "rate_min": None,
        "rate_max": None,
        "rate_step": None,
        "few_shot_number": 0,
        "verbose": False,
    },
    "attention": {
        # Heads 1, 10, 20 and 30 are the ones discussed in the text and shown in
        # the saved output of this cell (the list previously read [1, 2, 4, 8]).
        "heads": [1, 10, 20, 30],
        "layers": None,
        # 1 = test each head across the specified layers.
        "exploring_mode": 1,
    },
    "output": {
        "output_dir": None,
    },
}

accuracy_boosted = ZeroTuning(config=config_data, hftoken=hftoken, model=model, tokenizer=tokenizer)

Loading from cache directory: ./data/sst2

Exploring performance across 4 heads with 32 layers
Target heads: [1, 10, 20, 30]
Target layers: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]


Testing heads:   0%|          | 0/4 [00:00<?, ?it/s]


Testing with attention enhancement rate: 2
Few-shot examples: 0
Target layers: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
Target heads: [1]


Evaluating sst2:   0%|                                                      | 0/500 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   0%|                                              | 1/500 [00:00<01:54,  4.37it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   0%|▏                                             | 2/500 [00:00<01:52,  4.44it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   1%|▎                                             | 3/500 [00:00<01:52,  4.42it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   1%|▎     


sst2 Statistics:
Rate: 2
Total samples: 500
Valid predictions: 260
Invalid predictions: 240
Correct predictions: 237
Accuracy: 0.4740

Head 1 accuracy: 0.4740

Testing with attention enhancement rate: 2
Few-shot examples: 0
Target layers: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
Target heads: [10]


Evaluating sst2:   0%|                                                      | 0/500 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   0%|                                              | 1/500 [00:00<01:52,  4.43it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   0%|▏                                             | 2/500 [00:00<01:52,  4.42it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   1%|▎                                             | 3/500 [00:00<01:51,  4.45it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   1%|▎     


sst2 Statistics:
Rate: 2
Total samples: 500
Valid predictions: 364
Invalid predictions: 136
Correct predictions: 339
Accuracy: 0.6780

Head 10 accuracy: 0.6780

Testing with attention enhancement rate: 2
Few-shot examples: 0
Target layers: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
Target heads: [20]


Evaluating sst2:   0%|                                                      | 0/500 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   0%|                                              | 1/500 [00:00<01:49,  4.57it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   0%|▏                                             | 2/500 [00:00<01:51,  4.46it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   1%|▎                                             | 3/500 [00:00<01:56,  4.28it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   1%|▎     


sst2 Statistics:
Rate: 2
Total samples: 500
Valid predictions: 355
Invalid predictions: 145
Correct predictions: 329
Accuracy: 0.6580

Head 20 accuracy: 0.6580

Testing with attention enhancement rate: 2
Few-shot examples: 0
Target layers: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
Target heads: [30]


Evaluating sst2:   0%|                                                      | 0/500 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   0%|                                              | 1/500 [00:00<01:51,  4.46it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   0%|▏                                             | 2/500 [00:00<01:52,  4.44it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   1%|▎                                             | 3/500 [00:00<01:52,  4.43it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating sst2:   1%|▎     


sst2 Statistics:
Rate: 2
Total samples: 500
Valid predictions: 394
Invalid predictions: 106
Correct predictions: 360
Accuracy: 0.7200

Head 30 accuracy: 0.7200





We found that **reducing** the initial token attention for down-effective heads can also **improve** the model's performance.

In [None]:
# Evaluation settings that scale head 1 with rate 0.5 on SST-2.
config_data = dict(
    model=dict(
        path="meta-llama/Meta-Llama-3.1-8B-Instruct",
        type="llama",
        use_8bit=True,
    ),
    dataset=dict(name="sst2", path="./data", num_samples=500),
    evaluation=dict(
        rate=0.5,
        rate_min=None,
        rate_max=None,
        rate_step=None,
        few_shot_number=0,
        verbose=False,
    ),
    attention=dict(heads=[1], layers=None, exploring_mode=1),
    output=dict(output_dir=None),
)

accuracy_boosted = ZeroTuning(config=config_data, hftoken=hftoken, model=model, tokenizer=tokenizer)

Now, you can test other datasets. For example, let's try the BoolQ dataset: first the vanilla baseline (rate = 1), then a run with the initial token's attention decreased (rate = 0.3).

In [None]:
# Evaluation settings for the BoolQ baseline run (rate 1, all heads and layers).
config_data = dict(
    model=dict(
        path="meta-llama/Meta-Llama-3.1-8B-Instruct",
        type="llama",
        use_8bit=True,
    ),
    dataset=dict(name="BoolQ", path="./data", num_samples=500),
    evaluation=dict(
        rate=1,
        rate_min=None,
        rate_max=None,
        rate_step=None,
        few_shot_number=0,
        verbose=False,
    ),
    attention=dict(heads=None, layers=None, exploring_mode=None),
    output=dict(output_dir=None),
)

accuracy_vanilla = ZeroTuning(config=config_data, hftoken=hftoken, model=model, tokenizer=tokenizer)

Loading from cache directory: ./data/BoolQ


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Running single evaluation...

Testing with attention enhancement rate: 1
Few-shot examples: 0


Evaluating BoolQ:   0%|                                                     | 0/500 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating BoolQ:   0%|                                             | 1/500 [00:00<01:56,  4.28it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating BoolQ:   0%|▏                                            | 2/500 [00:00<01:55,  4.33it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating BoolQ:   1%|▎                                            | 3/500 [00:00<01:55,  4.29it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating BoolQ:   1%|▎    


BoolQ Statistics:
Rate: 1
Total samples: 500
Valid predictions: 495
Invalid predictions: 5
Correct predictions: 362
Accuracy: 0.7240





In [None]:
# Evaluation settings for BoolQ with the scaling factor lowered to 0.3.
config_data = dict(
    model=dict(
        path="meta-llama/Meta-Llama-3.1-8B-Instruct",
        type="llama",
        use_8bit=True,
    ),
    dataset=dict(name="BoolQ", path="./data", num_samples=500),
    evaluation=dict(
        rate=0.3,
        rate_min=None,
        rate_max=None,
        rate_step=None,
        few_shot_number=0,
        verbose=False,
    ),
    attention=dict(heads=None, layers=None, exploring_mode=None),
    output=dict(output_dir=None),
)

accuracy_boosted = ZeroTuning(config=config_data, hftoken=hftoken, model=model, tokenizer=tokenizer)

Loading from cache directory: ./data/BoolQ
Running single evaluation...

Testing with attention enhancement rate: 0.3
Few-shot examples: 0


Evaluating BoolQ:   0%|                                                     | 0/500 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating BoolQ:   0%|                                             | 1/500 [00:00<01:50,  4.52it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating BoolQ:   0%|▏                                            | 2/500 [00:00<01:51,  4.46it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating BoolQ:   1%|▎                                            | 3/500 [00:00<01:53,  4.39it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating BoolQ:   1%|▎    


BoolQ Statistics:
Rate: 0.3
Total samples: 500
Valid predictions: 499
Invalid predictions: 1
Correct predictions: 403
Accuracy: 0.8060



