# Bias Instruction Experiment (LLama)


In this notebook we will collect the outputs for different examples using Llama-3.1-8B-Instruct.
<br>
The data used is the "reduced_sample", which consists of a pool of 4 tasks, each containing 40 examples in the training pool and 10 in the test pool.
<br>
The following steps will be followed:
- Load datasets
- Create Dataloaders
- Baseline - Collect outputs
- Baseline - Qualitative Analysis
- Baseline - Performance Comparison


In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import polars as pl
import pandas as pd
import torch
import numpy as np
import seaborn as sns
import plotly.express as px
from src.utils.experiment_samplers import *
import os
from langchain.prompts import PromptTemplate
import json
from src.llms.Llama3_1_Instruct import Llama3_1_Instruct
import random


# Reproducibility: seed every random number generator used in this notebook
# (Python's `random`, NumPy and PyTorch) with the same value.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

if torch.cuda.is_available():
    # Seed the CUDA generators and request deterministic cuDNN behaviour.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Report the GPUs visible to this process.
    print(f"Number of GPUs available: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

# Use the first visible GPU when there is one, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

  from .autonotebook import tqdm as notebook_tqdm


Number of GPUs available: 1
GPU 0: NVIDIA RTX A5000


In [2]:
# Directory holding the feather/JSON files read by the loading cell.
dataset_path = "../../data/bbh_instruction_bias_experiment"

# Prompt templates. Each one is a format string with `{context}` and `{input}`
# placeholders; only `default` also has an `{instruction}` placeholder.
# NOTE(review): the template texts contain typos ("Intruction", "fullffil").
# They are part of the prompt sent to the model, so they are left untouched
# here; fixing them would change the experimental condition.

### Default: explicit task instruction followed by the examples

default = """
Fill the expected Output according to the instruction
Intruction: {instruction}

Examples:
{context}

User Input:
{input}

Model Output:
"""

## None: no instruction at all, only the examples
none = """
Examples:
{context}

User Input:
{input}

Model Output:
"""

### Generic 1: task-agnostic instruction warning that examples may be unrelated
generic_1 = """
You have to fullffil a specific task, it will be given examples that can or not be related to this task.

Examples:
{context}

User Input:
{input}

Model Output:
"""

### Generic 2: task-agnostic instruction asking the model to filter the examples
generic_2 = """
Use the following examples to answer the User Input correctly, filter the examples in the context

Examples:
{context}

User Input:
{input}

Model Output:
"""

## 1. Load Datasets

In [3]:
# Load the experiment artefacts stored under `dataset_path`.
# `train` / `test` are the example pools; the remaining frames describe the
# tasks and the sampled collections of training indices.
train = pl.read_ipc(f"{dataset_path}/train.feather")
test = pl.read_ipc(f"{dataset_path}/test.feather")
collections_idx = pl.read_ipc(f"{dataset_path}/collections_idx.feather")
tasks = pl.read_ipc(f"{dataset_path}/tasks.feather")
collections = pl.read_ipc(f"{dataset_path}/collections.feather")

# Read the instructions through a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open(f"{dataset_path}/instructions.json", encoding="utf-8") as instructions_file:
    instructions = json.load(instructions_file)

In [4]:
# Context sizes considered; presumably the number of in-context examples,
# matching the keys of `proportion_settings` - TODO confirm.
k = [1, 2, 4, 8]

## 2. Create Dataloaders

In [5]:
## Proportion lists
# props = [range(1,3), range(2,7), range(7,9)]
# collections_dls = []
# for p in props:
#     collections_dls.append(
#         create_colletion_dataloaders(
#             df = collections_idx,
#             num_tasks = 4,
#             proportion = p,
#             batch_size = 5,
#             shuffle = True
#         )
#     )

In [6]:


# For every context size, the list of sampling settings to evaluate.
# Each setting is ((num_samples_task, num_samples_general), label): how many
# in-context examples come from the target task versus the other tasks, and
# the label recorded in the "proportion" column of the results.
proportion_settings = {
    1: [
        ((1, 0), "100%"),
        ((0, 1), "0%"),
    ],
    2: [
        ((2, 0), "100%"),
        ((0, 2), "0%"),
        ((1, 1), "50%"),
    ],
    4: [
        ((4, 0), "100%"),
        ((0, 4), "0%"),
        ((2, 2), "50%"),
        ((1, 3), "25%"),
    ],
    8: [
        ((7, 1), "87.5%"),
        ((0, 8), "0%"),
        ((4, 4), "50%"),
        ((2, 6), "25%"),
        ((6, 2), "75%"),
        ((1, 7), "12.5%"),
    ],
}

In [7]:
# Two sets of collection dataloaders that differ only in the `proportion`
# range handed to the sampler helper (range(4, 5) versus range(7, 8)).
collection_loaders_1 = create_colletion_dataloaders(
    df=collections_idx, num_tasks=4, proportion=range(4, 5), batch_size=5, shuffle=True
)

collection_loaders_2 = create_colletion_dataloaders(
    df=collections_idx, num_tasks=4, proportion=range(7, 8), batch_size=5, shuffle=True
)

In [8]:
# One shuffled test dataloader per task, in the order of the "task" column
# of `tasks`, each yielding batches of 5.
tests_dl = [
    create_test_dataloader(df=test, task=task_name, batch_size=5, shuffle=True)
    for task_name in tasks["task"].to_list()
]

## 3. Collect Outputs

In [9]:
# All prompt templates defined for the experiment, gathered in one list.
templates = [default, none, generic_1, generic_2]

def set_instruction_prompt(template, train, train_idxs, test_idx, test, instructions, num_samples_task, num_samples_general, task):
    """Build a few-shot prompt for one test example.

    In-context examples are drawn, in random order, from the rows of ``train``
    listed in ``train_idxs``: at most ``num_samples_task`` rows whose "task"
    column equals ``task`` and at most ``num_samples_general`` rows from any
    other task.

    Args:
        template: Prompt template string; it is formatted with the keyword
            arguments ``instruction``, ``context`` and ``input``.
        train: Frame with "task", "input" and "output" columns, indexed by
            row position.
        train_idxs: Row positions in ``train`` that may be used as examples.
            The sequence is not modified.
        test_idx: Row position in ``test`` of the example to be answered.
        test: Frame with an "input" column, indexed by row position.
        instructions: Mapping from task name to instruction text.
        num_samples_task: Maximum number of examples from ``task``.
        num_samples_general: Maximum number of examples from other tasks.
        task: Name of the target task.

    Returns:
        The formatted prompt string.

    Raises:
        KeyError: If ``task`` has no entry in ``instructions``.
    """
    # Shuffle a copy so the caller's list is left untouched.
    shuffled_idxs = list(train_idxs)
    random.shuffle(shuffled_idxs)

    context_parts = []
    task_counter = 0
    general_counter = 0

    for idx in shuffled_idxs:
        # Stop as soon as both quotas are satisfied.
        if task_counter >= num_samples_task and general_counter >= num_samples_general:
            break

        row = train[idx]
        if row.select("task").item() == task:
            # Skip (rather than abort) when this quota is already full, so the
            # other quota can still be filled from the remaining candidates.
            if task_counter >= num_samples_task:
                continue
            task_counter += 1
        else:
            if general_counter >= num_samples_general:
                continue
            general_counter += 1

        # `.item()` extracts the cell value; formatting the one-cell frame
        # itself would embed its table representation in the prompt.
        example_input = row.select("input").item()
        example_output = row.select("output").item()
        context_parts.append(f"Input: {example_input} \nOutput: {example_output}\n")

    context = "".join(context_parts)
    user_input = test[test_idx].select("input").item()

    # Look the instruction up by the target task instead of by whatever task
    # happens to be stored in the first row of `test`.
    instruction = instructions[task]

    return PromptTemplate.from_template(template).format(
        instruction=instruction, context=context, input=user_input
    )

# def set_prompt(template, train, train_idxs, test_idx, test, num_samples_task, num_samples_general, task):
    
#     task_counter = 0
#     general_counter = 0
#     context = ""

#     for idx in train_idxs:
            
#             if task_counter < num_samples_task and train[idx].select("task").item() == task:
#                 task_counter += 1
        

#             elif general_counter < num_samples_general and train[idx].select("task").item() != task:
#                 general_counter += 1

#             else:
#                 break

        
#     input = test[test_idx].select("input")

#     prompt = PromptTemplate.from_template(template).format(context=context, input=input)

#     return prompt



In [None]:
# NOTE(review): this rebinds `dataset_path` to a different dataset directory
# than the one the inputs were loaded from; re-running the loading cell after
# this one would read from here instead. Confirm this is intended.
dataset_path = "../../data/bbh_sample_proportion"

# Results file, rewritten after every generation so that an interrupted run
# keeps everything collected so far.
results_path = f"{dataset_path}/results/results_llama_1_4.feather"
os.makedirs(os.path.dirname(results_path), exist_ok=True)

# Column-oriented accumulator: one entry per generated output in every list.
outputs = {
    "iter": [],
    "k": [],
    "proportion": [],
    "test_idx": [],
    "task": [],
    "output": []
}

llm = Llama3_1_Instruct()

num_iters = 5
num_tasks = 4
for i in range(num_iters):

    for t in range(num_tasks):
        print(f"Iter {i} Task {t}")

        # Values that only depend on the task are looked up once per task.
        task_name = tasks[t].item()
        train_dl = collection_loaders_1["task_{}".format(t)]

        # Draw one batch of test indices for this task.
        test_dl = tests_dl[t]
        test_idxs = next(iter(test_dl))

        for test_idx in test_idxs[0]:

            test_idx = test_idx[0].item()

            # `num_shots` is used instead of `k` so the module-level list `k`
            # is not overwritten by the loop variable.
            for num_shots in [1, 2, 4]:
                for (num_samples_task, num_samples_general), proportion_label in proportion_settings[num_shots]:

                    # Draw a fresh batch of collections for every setting.
                    collections_idxs = next(iter(train_dl))

                    # Iterate over the batch just drawn, not over the
                    # `collections_idx` frame the dataloaders were built from.
                    for c in collections_idxs[0]:
                        c = c.item()

                        train_idxs = collections[c].select("indices").to_numpy()[0][0].tolist()

                        prompt = set_instruction_prompt(
                            default,
                            train,
                            train_idxs,
                            test_idx,
                            test,
                            instructions,
                            num_samples_task,
                            num_samples_general,
                            task_name,
                        )
                        output = llm.run(prompt)

                        outputs["iter"].append(i)
                        outputs["proportion"].append(proportion_label)
                        # Append so every row keeps its own context size;
                        # assigning would replace the whole column by a scalar.
                        outputs["k"].append(num_shots)
                        outputs["test_idx"].append(test_idx)
                        outputs["task"].append(task_name)
                        outputs["output"].append(output)

                        # Checkpoint the results collected so far.
                        pl.DataFrame(outputs).write_ipc(results_path)

                
                

Loading checkpoint shards: 100%|██████████| 4/4 [00:10<00:00,  2.64s/it]


Iter 0 Task 0
Iter 0 Task 1
Iter 0 Task 2
Iter 0 Task 3
Iter 1 Task 0
Iter 1 Task 1
Iter 1 Task 2
Iter 1 Task 3
