# Bias Insturction Analysis - Setup


The goal of this notebook is to prepare the data that will be needed to do the studies described in the README
</br> Here we will execute the following steps:
- Create the collections dataset with indices, string and proportions by task and test dataframe
- Create a DataLoader to sample collections based on it's proportion to target task
- Create a DataLoader to sample X test samples by task
- Create the three different prompts to be tested

In [9]:
import h5py
import numpy as np
import pandas as pd
import polars as pl
import torch

from torch.utils.data import TensorDataset, DataLoader

data_path = "../../data/bbh/datamodels/reduced_sample"
save_path = "../../data/bbh_instruction_bias_experiment"

## 1. Create Collections, Train and Test Dataset

In [10]:
f = h5py.File(f"{data_path}/train_collection.h5", "r")
collections = f["train_collection"][:]
train = pl.read_csv(f"{data_path}/train_set.csv").with_columns(pl.arange(0, pl.len()).alias("index"))
test = pl.read_csv(f"{data_path}/test_set.csv")


In [11]:
tasks = train["task"].unique()
collections_idx = pl.DataFrame({"collection_idx": [i for i in range(len(collections))], "indices": [indices.tolist() for indices in collections]})
_exploded = collections_idx.clone()
_exploded = _exploded.explode("indices").join(train, left_on="indices", right_on="index")

for i in range(0, len(tasks)):
    _exploded = _exploded.with_columns(pl.col("task").str.contains(tasks[i]).alias(f"prop_task_{i}"))

collections_idx = (
    _exploded
    .group_by("collection_idx")
    .agg(
        count_task_0 = pl.col("prop_task_0").sum(),
        count_task_1 = pl.col("prop_task_1").sum(),
        count_task_2 = pl.col("prop_task_2").sum(),
        count_task_3 = pl.col("prop_task_3").sum(),
    )
)

#TODO: Create "task_idx" column in Polars base don the arr tasks

def return_task_idx(x):
    task_to_idx = {task: idx for idx, task in enumerate(tasks)}
    return task_to_idx[x]
task_to_idx = {task: idx for idx, task in enumerate(tasks)}

test = test.with_columns([
    pl.col("task").replace(task_to_idx).cast(int).alias("task_idx")
])

train = train.with_columns([
    pl.col("task").replace(task_to_idx).cast(int).alias("task_idx")
])


In [12]:
## Saving files
collections_idx.write_ipc(f"{save_path}/collections_idx.feather")
train.write_ipc(f"{save_path}/train.feather")
test.write_ipc(f"{save_path}/test.feather")
pl.DataFrame({"task": tasks}).write_ipc(f"{save_path}/tasks.feather")

## 2. Create Collections Dataloader

What it will be done here will be transformed into a utils function

In [13]:
collections_idx.head()

collection_idx,count_task_0,count_task_1,count_task_2,count_task_3
i64,u32,u32,u32,u32
3805,2,2,1,3
3150,1,6,0,1
2757,3,3,1,1
5377,2,3,1,2
3281,1,2,2,3


In [14]:
def create_colletion_dataloaders(
    df: pl.DataFrame,
    num_tasks: int,
    proportion: range,
    batch_size: int,
    shuffle: bool = True,  
):
    
    dataloaders = {}

    for i in range(num_tasks):
        task = i
        _df = (
            df
            .clone()
            .filter(pl.col(f"count_task_{task}").is_in(proportion))
            .select(["collection_idx"])
        )

        dl = DataLoader(
            TensorDataset(_df.to_torch()),
            batch_size=batch_size,
            shuffle=shuffle
        )

        dataloaders[f"task_{task}"] = dl

    return dataloaders


In [15]:
dls = create_colletion_dataloaders(
    collections_idx,
    num_tasks=4,
    proportion=range(7, 8),
    batch_size=4,
    shuffle=True
)

dls

{'task_0': <torch.utils.data.dataloader.DataLoader at 0x7f588262fc10>,
 'task_1': <torch.utils.data.dataloader.DataLoader at 0x7f56f935a910>,
 'task_2': <torch.utils.data.dataloader.DataLoader at 0x7f56f935b410>,
 'task_3': <torch.utils.data.dataloader.DataLoader at 0x7f56f9358150>}

## 3. Create Test Dataloaders

What it will be done here will be transformed into a utils function

In [22]:
def create_test_dataloader(
    df: pl.DataFrame,
    task: str,
    batch_size: int,
    shuffle: bool = True,
):
    



    _df = (
        df
        .clone()
        .with_columns(pl.arange(0, pl.len()).alias("idx"))
        .filter(pl.col(f"task").eq(task))
        .select(["idx"])
    )

    dl = DataLoader(
            TensorDataset(_df.to_torch()),
            batch_size=batch_size,
            shuffle=shuffle
        )


    return dl


In [30]:
d = create_test_dataloader(test, task="word_sorting", batch_size=4, shuffle=True)
next(iter(d))

[tensor([[30],
         [34],
         [31],
         [39]])]

## 4. Prompts

In [None]:
### Default

default = """
Fill the expected Output according to the instruction
Intruction: {instruction}

Examples:
{context}

User Input:
{input}

Model Output:
"""

## None
none = """"""

### Generic 1
generic_1 = """
You have to fullffil a specific task, it will be given examples that can or not be related to this task.

Examples:
{context}

User Input:
{input}

Model Output:
"""

### Generic 1
generic_2 = """
Use the following examples to answer the User Input correctly, filter the examples in the context

Examples:
{context}

User Input:
{input}

Model Output:
"""
