# Datamodels Retriever Experiment

This document has the goal to show the implementation of a context retriever using Datamodels and its comparison against classical approaches

# 1. Run Classical Approaches

First it's needed to run the comparison subject, to do so we will use the script present in this folder, it's just necessary to run

```
python run_classical_retriever.py
```

This will run the retriever for each sample from the test dataset, saving the data at every 50 samples as checkpoint

# 2. Split Data for Datamodeling

Here we will be spliting the data to achieve a dev dataset containing a representative numbe of samples to each subtask
The "k" used here is 15

In [49]:
from src.utils import split_dev_set, subset_df
from src.retriever import DatamodelsRetriever
from src.datamodels import Datamodels, DatamodelConfig
from src.evaluator import Rouge_L_evaluator
from src.llms import Llama3_1
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import os

# Limit available GPUs to GPU 0 and 1
os.environ["CUDA_VISIBLE_DEVICES"] = "4"


In [6]:
train = pd.read_csv("../../data/instruction-induction-data/processed/induce_tasks_examples.csv")
train_subset = subset_df(train, 200, "task")
train_subset.to_csv("../../data/instruction-induction-data/processed/train.csv")

split_dev_set(
    path="../../data/instruction-induction-data/processed/train.csv",
    saving_path="../../data/instruction-induction-data/datamodels",    
    k_samples=15,
    task_column="task",
)

## 3. Split the Collections to be trained

In [9]:
#### First time create collection #####
retriever = DatamodelsRetriever(k=8)
retriever.create_collections_index(
    "../../data/instruction-induction-data/datamodels_15_10_2024/train_set.csv",
    "../../data/instruction-induction-data/datamodels_15_10_2024",
    n_samples=500,
    test_per=0.2,

)

In [10]:
llama = Llama3_1()
llama.run("What is the best vegatable for salad")

Loading checkpoint shards: 100%|██████████| 4/4 [00:07<00:00,  1.86s/it]


'What is the best vegatable for salad?\nI love vegatable salads, I think they are the best thing to'

In [13]:
######### Datamodels config for experiment #########
config = DatamodelConfig(
    k = 8,
    train_collections_idx_path = "../../data/instruction-induction-data/datamodels_15_10_2024/train_collection.h5",
    train_collections_idx = None,
    test_collections_idx_path = "../../data/instruction-induction-data/datamodels_15_10_2024/test_collection.h5",
    test_collections_idx = None,
    test_set = None,
    test_set_path = "../../data/instruction-induction-data/datamodels_15_10_2024/dev_set.csv",
    train_set = None,
    train_set_path = "../../data/instruction-induction-data/datamodels_15_10_2024/train_set.csv",
    collections_path = "../../data/instruction-induction-data/datamodels_15_10_2024/collections/15-10-2024",
    pre_collections_path = "../../data/instruction-induction-data/datamodels_15_10_2024/pre_collections/15-10-2024",
    instructions= None,
    instructions_path= "../../data/instruction-induction-data/datamodels_15_10_2024/intructions.json",
    llm = llama,
    evaluator=Rouge_L_evaluator(),
    model =  None,
)

In [14]:
datamodel = Datamodels(config)
datamodel.set_test_set()
datamodel.set_train_set()
datamodel.set_train_collection_index()
datamodel.set_instructions_from_path()

Loaded test set from  ../../data/instruction-induction-data/datamodels_15_10_2024/dev_set.csv
Loaded train set from  ../../data/instruction-induction-data/datamodels_15_10_2024/train_set.csv
Loaded train collection index from  ../../data/instruction-induction-data/datamodels_15_10_2024/train_collection.h5


In [17]:
datamodel.create_pre_collection(start_idx=0, end_idx=1)

template = """
            Fill the expected Output according to the instruction
            Intruction: {instruction}

            Examples:
            {context}

            User Input:
            {input}

            Model Output:
        """

Collection id: 0
Checkpoint 0 saved


In [3]:
df = pd.read_pickle( "../../data/instruction-induction-data/datamodels_15_10_2024/pre_collections/15-10-2024/pre_collection_0.pickle")
df.head()

Unnamed: 0,collection_idx,test_idx,input,predicted_output,true_output,optinal_output
0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",\n Fill the expected Output accordi...,The manager was mentioned by the judge.,
1,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",\n Fill the expected Output accordi...,The presidents were encouraged by the professors.,
2,0,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",\n Fill the expected Output accordi...,The secretary was recommended by the banker.,
3,0,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",\n Fill the expected Output accordi...,The presidents were thanked by the secretaries.,
4,0,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",\n Fill the expected Output accordi...,The doctor was recognized by the bankers.,


In [4]:




def extract_output(text):
    # Split the string and strip leading/trailing spaces and newlines
    return text.split(" Model Output:\n ", 1)[-1].strip()

df = pd.read_pickle( "../../data/instruction-induction-data/datamodels_15_10_2024/pre_collections/15-10-2024/pre_collection_0.pickle")


result = Rouge_L_evaluator().evaluate(df["true_output"].to_numpy(),  df["predicted_output"].apply(extract_output).to_numpy())


In [7]:
result

array([1.        , 0.71428571, 1.        , 1.        , 1.        ,
       0.77777778, 1.        , 1.        , 0.77777778, 1.        ,
       1.        , 0.77777778, 1.        , 1.        , 1.        ,
       0.28571429, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.66666667, 1.        , 0.        ,
       0.        , 0.        , 0.        , 0.4       , 0.        ,
       0.33333333, 0.        , 1.        , 0.28571429, 0.        ,
       0.5       , 0.        , 0.        , 0.        , 0.5       ,
       0.        , 0.28571429, 0.        , 1.        , 0.        ,
       1.        , 0.        , 0.        , 0.        , 0.4       ,
       0.        , 0.        , 0.18181818, 0.        , 1.        ,
       0.        , 1.        , 1.        , 1.        , 0.        ,
       0.        , 0.        , 0.        , 0.66666667, 0.28571429,
       0.        , 0.4       , 1.        , 0.33333333, 0.        ,
       0.33333333, 0.        , 0.66666667, 0.28571429, 0.33333

In [8]:
df_collection = pd.read_pickle( "../../data/instruction-induction-data/datamodels_15_10_2024/collections/15-10-2024/pre_collection_10.pickle")
df_collection.head()


Unnamed: 0,collection_idx,test_idx,input,evaluation
0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.0
1,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.714286
2,0,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.0
3,0,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.823529
4,0,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.777778


## Result Analysis

In [10]:
traind_df = pd.read_csv("../../data/instruction-induction-data/datamodels_15_10_2024/train_set.csv")
traind_df.head()

Unnamed: 0,task,input,output,possible_outputs
0,active_to_passive,The lawyer avoided the secretaries.,The secretaries were avoided by the lawyer.,
1,active_to_passive,The judges supported the scientists.,The scientists were supported by the judges.,
2,active_to_passive,The secretary recommended the professor.,The professor was recommended by the secretary.,
3,active_to_passive,The artists recommended the secretary.,The secretary was recommended by the artists.,
4,active_to_passive,The secretary avoided the senator.,The senator was avoided by the secretary.,


In [11]:
dev_df = pd.read_csv("../../data/instruction-induction-data/datamodels_15_10_2024/dev_set.csv")
dev_df.head()

Unnamed: 0,task,input,output,possible_outputs
0,active_to_passive,The judge mentioned the manager.,The manager was mentioned by the judge.,
1,active_to_passive,The professors encouraged the presidents.,The presidents were encouraged by the professors.,
2,active_to_passive,The banker recommended the secretary.,The secretary was recommended by the banker.,
3,active_to_passive,The secretaries thanked the presidents.,The presidents were thanked by the secretaries.,
4,active_to_passive,The bankers recognized the doctor.,The doctor was recognized by the bankers.,


In [12]:
pre_collection = pd.read_pickle( "../../data/instruction-induction-data/datamodels_15_10_2024/pre_collections/15-10-2024/pre_collection_10.pickle")
pre_collection.head()

Unnamed: 0,collection_idx,test_idx,input,predicted_output,true_output,optinal_output
0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",\n Fill the expected Output accordi...,The manager was mentioned by the judge.,
1,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",\n Fill the expected Output accordi...,The presidents were encouraged by the professors.,
2,0,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",\n Fill the expected Output accordi...,The secretary was recommended by the banker.,
3,0,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",\n Fill the expected Output accordi...,The presidents were thanked by the secretaries.,
4,0,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",\n Fill the expected Output accordi...,The doctor was recognized by the bankers.,


In [13]:
collection = pd.read_pickle( "../../data/instruction-induction-data/datamodels_15_10_2024/collections/15-10-2024/pre_collection_10.pickle")
collection.head()

Unnamed: 0,collection_idx,test_idx,input,evaluation
0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.0
1,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.714286
2,0,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.0
3,0,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.823529
4,0,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.777778


## Single Sample Demonstration

In [19]:
### Anlyse a sing sample
sample = pre_collection.loc[0]
sample["evaluation"] = collection.copy().loc[0]["evaluation"]
sample["task"] = dev_df.loc[sample["test_idx"]]["task"]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample["evaluation"] = collection.copy().loc[0]["evaluation"]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample["evaluation"] = collection.copy().loc[0]["evaluation"]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample["task"] = dev_df.loc[sample["test_idx"]]["task"]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-vers

In [25]:
print(sample["input"])

[0 0 0 ... 0 0 0]


In [24]:
print(sample["predicted_output"])


            Fill the expected Output according to the instruction
            Intruction: Write the input sentence in passive form.

            Examples:
            Input: 5040 
Output: five thousand and forty
Input: 1 64 
Output: 65
Input: grenade 
Output: r
Input: Sentence 1: A school bus is driving uphill on a rural road. Sentence 2: A race care driving along a dirt road. 
Output: 1 - probably not
Input: souvenir 
Output: near
Input: The student recognized the professors. 
Output: The professors were recognized by the student.
Input: Sentence 1: White House in damage control over Obama Supreme Court remarks Sentence 2: Fact check: Obama's Supreme Court remarks 
Output: 4 - almost perfectly
Input: camouflage 
Output: c


            User Input:
            The judge mentioned the manager.

            Model Output:
         The manager was mentioned by the judge.




In [26]:
print(sample["true_output"])

The manager was mentioned by the judge.


In [27]:
print(sample["evaluation"])

1.0


## Evaluation Results by Task

In [43]:
samples = pre_collection.copy()
samples["evaluation"] = collection["evaluation"]
samples["task"] = samples["test_idx"].apply(lambda idx: dev_df.loc[idx, 'task'])

In [44]:
samples["task"].value_counts()

task
active_to_passive          165
antonyms                   165
diff                       165
first_word_letter          165
larger_animal              165
letters_list               165
negation                   165
num_to_verbal              165
orthography_starts_with    165
rhymes                     165
second_word_letter         165
sentence_similarity        165
sentiment                  165
singular_to_plural         165
sum                        165
synonyms                   165
taxonomy_animal            165
translation_en-de          165
translation_en-es          165
translation_en-fr          165
word_in_context            165
Name: count, dtype: int64

In [45]:
task_results = samples.groupby('task').agg(
    mean=('evaluation', 'mean'),
    q25=('evaluation', lambda x: x.quantile(0.25)),
    q50=('evaluation', lambda x: x.quantile(0.50)),  # Same as median
    q75=('evaluation', lambda x: x.quantile(0.75)),
    q100=('evaluation', lambda x: x.quantile(1.00))
).reset_index()

len(task_results)

21

In [50]:
task_results

Unnamed: 0,task,mean,q25,q50,q75,q100
0,active_to_passive,0.498462,0.0,0.5,0.823529,1.0
1,antonyms,0.074574,0.0,0.0,0.0,1.0
2,diff,0.331053,0.0,0.285714,0.4,1.0
3,first_word_letter,0.136107,0.0,0.0,0.166667,1.0
4,larger_animal,0.3495,0.0,0.285714,0.666667,1.0
5,letters_list,0.241986,0.0,0.0,0.571429,1.0
6,negation,0.521,0.266667,0.588235,0.8,1.0
7,num_to_verbal,0.428086,0.0,0.5,0.8,1.0
8,orthography_starts_with,0.164283,0.0,0.181818,0.222222,1.0
9,rhymes,0.0,0.0,0.0,0.0,0.0
