# Bias Instruction Experiment (LLama)


In this notebook we will collect the output for difference examples using the LLama-3.1-8B-Instruct
</br>
The data used will be the "reduced_sample" that consist in a pool of 4 tasks, containing each 40 examples in the training pool and 10 in the test pool
</br>
The following steps will be followed:
- Load datasets
- Create Dataloaders
- Baseline - Collect outputs
- Baeline - Qualitative Analysis
- Baseline - Performance Comparison


In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import polars as pl
import pandas as pd
import torch
import numpy as np
import seaborn as sns
import plotly.express as px
from src.utils.experiment_samplers import *
import os
from langchain.prompts import PromptTemplate
import json
from src.llms.Llama3_1_Instruct import Llama3_1_Instruct


seed = 42
# NumPy
np.random.seed(seed)

# PyTorch
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


if torch.cuda.is_available():
    print(f"Number of GPUs available: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


Number of GPUs available: 1
GPU 0: NVIDIA RTX A5000


In [2]:
dataset_path = "../../data/bbh_instruction_bias_experiment"

### Default

default = """
Fill the expected Output according to the instruction
Intruction: {instruction}

Examples:
{context}

User Input:
{input}

Model Output:
"""

## None
none = """
Examples:
{context}

User Input:
{input}

Model Output:
"""

### Generic 1
generic_1 = """
You have to fullffil a specific task, it will be given examples that can or not be related to this task.

Examples:
{context}

User Input:
{input}

Model Output:
"""

### Generic 1
generic_2 = """
Use the following examples to answer the User Input correctly, filter the examples in the context

Examples:
{context}

User Input:
{input}

Model Output:
"""

## 1. Load Datasets

In [3]:
train = pl.read_ipc(f"{dataset_path}/train.feather")
test = pl.read_ipc(f"{dataset_path}/test.feather")
collections_idx = pl.read_ipc(f"{dataset_path}/collections_idx.feather")
tasks = pl.read_ipc(f"{dataset_path}/tasks.feather")
collections = pl.read_ipc(f"{dataset_path}/collections.feather")
instructions = json.load(open(f"{dataset_path}/instructions.json"))

## 2. Create Datalaoders

In [4]:
## Proportion lists
props = [range(1,3), range(2,7), range(7,9)]
collections_dls = []
for p in props:
    collections_dls.append(
        create_colletion_dataloaders(
            df = collections_idx,
            num_tasks = 4,
            proportion = p,
            batch_size = 5,
            shuffle = True
        )
    )

In [5]:
collections_dls

[{'task_0': <torch.utils.data.dataloader.DataLoader at 0x7ff862511a50>,
  'task_1': <torch.utils.data.dataloader.DataLoader at 0x7ff8597d8810>,
  'task_2': <torch.utils.data.dataloader.DataLoader at 0x7ff8597e17d0>,
  'task_3': <torch.utils.data.dataloader.DataLoader at 0x7ff8597e3690>},
 {'task_0': <torch.utils.data.dataloader.DataLoader at 0x7ff8597e1610>,
  'task_1': <torch.utils.data.dataloader.DataLoader at 0x7ff8597e2750>,
  'task_2': <torch.utils.data.dataloader.DataLoader at 0x7ff8597e2010>,
  'task_3': <torch.utils.data.dataloader.DataLoader at 0x7ff8597e2990>},
 {'task_0': <torch.utils.data.dataloader.DataLoader at 0x7ff8597e2bd0>,
  'task_1': <torch.utils.data.dataloader.DataLoader at 0x7ff8597e3a10>,
  'task_2': <torch.utils.data.dataloader.DataLoader at 0x7ff8597e0790>,
  'task_3': <torch.utils.data.dataloader.DataLoader at 0x7ff8597e3210>}]

In [6]:
tests_dl = []
for t in tasks["task"].to_list():
    tests_dl.append(
        create_test_dataloader(
            df = test,
            task = t,
            batch_size = 5,
            shuffle = True
        )
    )

## 3. Collect Outputs

In [7]:
templates = [default, none, generic_1, generic_2]

def set_instruction_prompt(template, train, train_idxs, test_idx, test, instructions):




    context = ""
    for idx in train_idxs:
            input = train[idx].select("input").to_numpy()[0]
            output =  train[idx].select("output").to_numpy()[0]
            context += f"Input: {input} \nOutput: {output}\n"

        
    input = test[test_idx]["input"].to_numpy()[0]

    instruction =instructions[test[0].select("task").item()]
    prompt = PromptTemplate.from_template(template).format(instruction=instruction, context=context, input=input)

    return prompt

def set_prompt(template, train, train_idxs, test_idx, test):


    context = ""
    for idx in train_idxs:
            input = train[idx].select("input").to_numpy()[0]
            output =  train[idx].select("output").to_numpy()[0]
            context += f"Input: {input} \nOutput: {output}\n"

        
    input = test[test_idx].select("input")

    prompt = PromptTemplate.from_template(template).format(context=context, input=input)

    return prompt



In [8]:
templates

['\nFill the expected Output according to the instruction\nIntruction: {instruction}\n\nExamples:\n{context}\n\nUser Input:\n{input}\n\nModel Output:\n',
 '\nExamples:\n{context}\n\nUser Input:\n{input}\n\nModel Output:\n',
 '\nYou have to fullffil a specific task, it will be given examples that can or not be related to this task.\n\nExamples:\n{context}\n\nUser Input:\n{input}\n\nModel Output:\n',
 '\nUse the following examples to answer the User Input correctly, filter the examples in the context\n\nExamples:\n{context}\n\nUser Input:\n{input}\n\nModel Output:\n']

In [9]:


outputs = {
    "iter": [],
    "template": [],
    "test_idx": [],
    "task": [],
    "proportion": [],
    "output": []
}

llm = Llama3_1_Instruct()

num_iters = 5
num_tasks = 4
for i in range(num_iters):
    print(f"Iter {i}")	
    for t in range(num_tasks):

        test_dl = tests_dl[t]
        test_idxs = next(iter(test_dl))
        for test_idx in test_idxs[0]:
            test_idx = test_idx[0].item()
            for j in range(len(props)):

                train_dl = collections_dls[j]["task_{}".format(t)]
                collections_idxs = next(iter(train_dl))

                for c in collections_idx[0]:
                    c = c.item()

                    train_idxs = collections[c].select("indices").to_numpy()[0][0].tolist()

                    for t_idx in range(templates):
                        if templates[t_idx] == 0:
                            prompt = set_instruction_prompt(templates[t_idx], train, train_idxs, test_idx, test, instructions)
                        else:
                            prompt = set_prompt(templates[t_idx], train, train_idxs, test_idx, test)
                        output = llm.run(prompt)

                        outputs["iter"].append(i)
                        outputs["template"].append(t_idx)
                        outputs["test_idx"].append(test_idx)
                        outputs["task"].append(t)
                        outputs["proportion"].append(j)
                        outputs["output"].append(output)
                        
                        results = pl.DataFrame(outputs)
                        results.write_ipc(f"{dataset_path}/results/outputs_llama.feather")
                
                raise Exception("error")
                
                

Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.01it/s]


Iter 0
[{'generated_text': '[ [ ( ) ] ] ]\n\nUser Input:\nIn the following sentences, explain the antecedent'}]
[{'generated_text': 'There is no output for this model.\n\nTraining Status:\nThe model is not trained.\n \nTraining the'}]
[{'generated_text': "'No'\nExplanation:\nThe user input is a sentence that needs to be completed. The task is"}]
[{'generated_text': "['Complete the rest of the sequence: 1, 2, 4, 7,"}]
[{'generated_text': "[']']\n\nExplanation:\nThe given input is a sentence with an ambiguous pronoun. The sentence is"}]
[{'generated_text': "[')']\n\n\n\nHere is the Python code that implements the functionality described in the prompt:\n\n```python"}]
[{'generated_text': '[]\n\nExplanation:\nThe model is not able to understand the task, as the input is not a'}]
[{'generated_text': "[')']\n\nExplanation:\nThe model output is correct because it correctly identifies the closing parenthesis that is needed"}]
[{'generated_text': "[']]\n\nExplanation:\nThe problem requires us t

Exception: error

In [10]:
results

iter,template,test_idx,task,proportion,output
i64,str,i64,i64,i64,str
0,""" Fill the expected Output acco…",18,0,0,"""[ [ ( ) ] ] ] User Input: In …"
0,""" Examples: {context} User Inp…",18,0,0,"""There is no output for this mo…"
0,""" You have to fullffil a specif…",18,0,0,"""'No' Explanation: The user inp…"
0,""" Use the following examples to…",18,0,0,"""['Complete the rest of the seq…"
0,""" Fill the expected Output acco…",18,0,0,"""[']'] Explanation: The given …"
…,…,…,…,…,…
0,""" Use the following examples to…",18,0,0,"""'>' Explanation: The model i…"
0,""" Fill the expected Output acco…",18,0,0,"""['] ] Explanation: The given …"
0,""" Examples: {context} User Inp…",18,0,0,"""['] } Explanation: The model …"
0,""" You have to fullffil a specif…",18,0,0,"""'] }' Model Explanation: The …"
