# Datamodels Retriever Experiment

This document has the goal to show the implementation of a context retriever using Datamodels and its comparison against classical approaches

# 1. Run Classical Approaches

First it's needed to run the comparison subject, to do so we will use the script present in this folder, it's just necessary to run

```
python run_classical_retriever.py
```

This will run the retriever for each sample from the test dataset, saving the data at every 50 samples as checkpoint

# 2. Split Data for Datamodeling

Here we will be spliting the data to achieve a dev dataset containing a representative numbe of samples to each subtask
The "k" used here is 15

In [8]:
from src.utils import split_dev_set, subset_df
from src.retriever import NaiveDatamodelsRetriever
from src.datamodels.pipeline import DatamodelPipeline
from src.datamodels.config import DatamodelConfig, MemMapConfig
from src.evaluator import Rouge_L_evaluator
from src.llms import Llama3_1
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import os

# Limit available GPUs to GPU 0 and 1
os.environ["CUDA_VISIBLE_DEVICES"] = "2"


In [None]:
# train = pd.read_csv("../../data/instruction-induction-data/processed/induce_tasks_examples.csv")
# train_subset = subset_df(train, 200, "task")
# train_subset.to_csv("../../data/instruction-induction-data/processed/train.csv")

# split_dev_set(
#     path="../../data/instruction-induction-data/processed/train.csv",
#     saving_path="../../data/instruction-induction-data/datamodels",    
#     k_samples=15,
#     task_column="task",
# )

## 3. Split the Collections to be trained

In [None]:
#### First time create collection #####
# retriever = DatamodelsRetriever(k=8)
# retriever.create_collections_index(
#     "../../data/instruction-induction-data/datamodels_15_10_2024/train_set.csv",
#     "../../data/instruction-induction-data/datamodels_15_10_2024",
#     n_samples=500,
#     test_per=0.2,

# )

In [None]:




def extract_output(text):
    # Split the string and strip leading/trailing spaces and newlines
    return text.split(" Model Output:\n ", 1)[-1].strip()

df = pd.read_pickle( "../../data/instruction-induction-data/datamodels_15_10_2024/pre_collections/15-10-2024/prcollection_0.pickle")


result = Rouge_L_evaluator().evaluate(df["true_output"].to_numpy(),  df["predicted_output"].apply(extract_output).to_numpy(), datamodel.llm.tokenizer)


In [15]:
df_collection = pd.read_feather("../../data/instruction-induction-data/datamodels/proportion_study/210_5/pre_collections/__pre_collection_0.feather")
df_collection

Unnamed: 0,collection_idx,test_idx,input,predicted_output,true_output,optinal_output
0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",The presidents did not recognize the scientist.,The scientist was recognized by the presidents.,
1,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",The senators mentioned the artists.,The artists were mentioned by the senators.,
2,0,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,The secretaries were avoided by the lawyer.,
3,0,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",The athletes were introduced by the presidents...,The athletes were introduced by the presidents.,
4,0,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",The student was helped by the scientists.\n\n ...,The student was helped by the scientists.,
...,...,...,...,...,...,...
100,0,100,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",know\n User Input:\n Sen...,same,"['same', 'yes', 'true']"
101,0,101,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Please inspect your father's will carefully. W...,same,"['same', 'yes', 'true']"
102,0,102,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",o,not the same,"['not the same', 'no', 'false']"
103,0,103,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",h\n\n User Input:\n Sent...,same,"['same', 'yes', 'true']"


## Result Analysis

In [None]:
traind_df = pd.read_csv("../../data/instruction-induction-data/datamodels_15_10_2024/train_set.csv")
traind_df.head()

Unnamed: 0,task,input,output,possible_outputs
0,active_to_passive,The lawyer avoided the secretaries.,The secretaries were avoided by the lawyer.,
1,active_to_passive,The judges supported the scientists.,The scientists were supported by the judges.,
2,active_to_passive,The secretary recommended the professor.,The professor was recommended by the secretary.,
3,active_to_passive,The artists recommended the secretary.,The secretary was recommended by the artists.,
4,active_to_passive,The secretary avoided the senator.,The senator was avoided by the secretary.,


In [None]:
dev_df = pd.read_csv("../../data/instruction-induction-data/datamodels_15_10_2024/dev_set.csv")
dev_df.head()

Unnamed: 0,task,input,output,possible_outputs
0,active_to_passive,The judge mentioned the manager.,The manager was mentioned by the judge.,
1,active_to_passive,The professors encouraged the presidents.,The presidents were encouraged by the professors.,
2,active_to_passive,The banker recommended the secretary.,The secretary was recommended by the banker.,
3,active_to_passive,The secretaries thanked the presidents.,The presidents were thanked by the secretaries.,
4,active_to_passive,The bankers recognized the doctor.,The doctor was recognized by the bankers.,


In [None]:
pre_collection = pd.read_pickle( "../../data/instruction-induction-data/datamodels_15_10_2024/pre_collections/15-10-2024/pre_collection_10.pickle")
pre_collection.head()

Unnamed: 0,collection_idx,test_idx,input,predicted_output,true_output,optinal_output
0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",\n Fill the expected Output accordi...,The manager was mentioned by the judge.,
1,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",\n Fill the expected Output accordi...,The presidents were encouraged by the professors.,
2,0,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",\n Fill the expected Output accordi...,The secretary was recommended by the banker.,
3,0,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",\n Fill the expected Output accordi...,The presidents were thanked by the secretaries.,
4,0,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",\n Fill the expected Output accordi...,The doctor was recognized by the bankers.,


In [None]:
collection = pd.read_pickle( "../../data/instruction-induction-data/datamodels_15_10_2024/collections/15-10-2024/pre_collection_10.pickle")
collection.head()

Unnamed: 0,collection_idx,test_idx,input,evaluation
0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.0
1,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.714286
2,0,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.0
3,0,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.823529
4,0,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.777778


## Single Sample Demonstration

In [7]:
### Anlyse a sing sample
sample = pre_collection.loc[0]
sample["evaluation"] = collection.copy().loc[0]["evaluation"]
sample["task"] = dev_df.loc[sample["test_idx"]]["task"]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample["evaluation"] = collection.copy().loc[0]["evaluation"]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample["evaluation"] = collection.copy().loc[0]["evaluation"]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample["task"] = dev_df.loc[sample["test_idx"]]["task"]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-vers

In [8]:
print(sample["input"])

[0 0 0 ... 0 0 0]


In [9]:
print(sample["predicted_output"])


            Fill the expected Output according to the instruction
            Intruction: Write the input sentence in passive form.

            Examples:
            Input: 5040 
Output: five thousand and forty
Input: 1 64 
Output: 65
Input: grenade 
Output: r
Input: Sentence 1: A school bus is driving uphill on a rural road. Sentence 2: A race care driving along a dirt road. 
Output: 1 - probably not
Input: souvenir 
Output: near
Input: The student recognized the professors. 
Output: The professors were recognized by the student.
Input: Sentence 1: White House in damage control over Obama Supreme Court remarks Sentence 2: Fact check: Obama's Supreme Court remarks 
Output: 4 - almost perfectly
Input: camouflage 
Output: c


            User Input:
            The judge mentioned the manager.

            Model Output:
         The manager was mentioned by the judge.




In [10]:
print(sample["true_output"])

The manager was mentioned by the judge.


In [11]:
print(sample["evaluation"])

1.0


## Evaluation Results by Task

In [12]:
samples = pre_collection.copy()
samples["evaluation"] = collection["evaluation"]
samples["task"] = samples["test_idx"].apply(lambda idx: dev_df.loc[idx, 'task'])

In [13]:
samples["task"].value_counts()

task
active_to_passive          165
antonyms                   165
diff                       165
first_word_letter          165
larger_animal              165
letters_list               165
negation                   165
num_to_verbal              165
orthography_starts_with    165
rhymes                     165
second_word_letter         165
sentence_similarity        165
sentiment                  165
singular_to_plural         165
sum                        165
synonyms                   165
taxonomy_animal            165
translation_en-de          165
translation_en-es          165
translation_en-fr          165
word_in_context            165
Name: count, dtype: int64

In [15]:
task_results = samples.groupby('task').agg(
    mean=('evaluation', 'mean'),
    q25=('evaluation', lambda x: x.quantile(0.25)),
    q50=('evaluation', lambda x: x.quantile(0.50)),  # Same as median
    q75=('evaluation', lambda x: x.quantile(0.75)),
    q100=('evaluation', lambda x: x.quantile(1.00))
).reset_index()

len(task_results)

21

In [16]:
task_results

Unnamed: 0,task,mean,q25,q50,q75,q100
0,active_to_passive,0.498462,0.0,0.5,0.823529,1.0
1,antonyms,0.074574,0.0,0.0,0.0,1.0
2,diff,0.331053,0.0,0.285714,0.4,1.0
3,first_word_letter,0.136107,0.0,0.0,0.166667,1.0
4,larger_animal,0.3495,0.0,0.285714,0.666667,1.0
5,letters_list,0.241986,0.0,0.0,0.571429,1.0
6,negation,0.521,0.266667,0.588235,0.8,1.0
7,num_to_verbal,0.428086,0.0,0.5,0.8,1.0
8,orthography_starts_with,0.164283,0.0,0.181818,0.222222,1.0
9,rhymes,0.2548,0.0,0.0,0.4,1.0
