# Impact Context Reduction

This notebook server to select the tasks and prepare the dataset to be used for the experiment

In [10]:
import pandas as pd
import plotly.express as px
import random
from sentence_transformers import SentenceTransformer
import torch

random.seed(42)


## Load and Select 

The data from Instruction Induction is loaded and then it's selected the tasks with unique response value

In [11]:
folder = "../../data/instruction-induction-data/raw"
tasks_df = pd.read_csv(f"{folder}/induce_tasks_examples.csv")

In [12]:
tasks_df

Unnamed: 0,task,input,output,possible_outputs
0,active_to_passive,The tourist supported the authors.,The authors were supported by the tourist.,
1,active_to_passive,The athlete contacted the tourists.,The tourists were contacted by the athlete.,
2,active_to_passive,The judges believed the bankers.,The bankers were believed by the judges.,
3,active_to_passive,The president encouraged the actor.,The actor was encouraged by the president.,
4,active_to_passive,The lawyers believed the authors.,The authors were believed by the lawyers.,
...,...,...,...,...
67640,word_in_context,Sentence 1: I know the feeling! Sentence 2: Ha...,same,"['same', 'yes', 'true']"
67641,word_in_context,"Sentence 1: Confidence is always borrowed, nev...",not the same,"['not the same', 'no', 'false']"
67642,word_in_context,Sentence 1: Messages must go through diplomati...,not the same,"['not the same', 'no', 'false']"
67643,word_in_context,Sentence 1: The end of the year. Sentence 2: O...,not the same,"['not the same', 'no', 'false']"


In [13]:
tasks_df["task"].unique()

array(['active_to_passive', 'antonyms', 'cause_and_effect', 'diff',
       'first_word_letter', 'informal_to_formal', 'larger_animal',
       'letters_list', 'negation', 'num_to_verbal',
       'orthography_starts_with', 'rhymes', 'second_word_letter',
       'sentence_similarity', 'sentiment', 'singular_to_plural', 'sum',
       'synonyms', 'taxonomy_animal', 'translation_en-de',
       'translation_en-es', 'translation_en-fr', 'word_in_context'],
      dtype=object)

In [14]:
selected_tasks_list = ['active_to_passive', 'antonyms', 'diff', 'first_word_letter',
        'larger_animal', 'letters_list', 'negation',
       'num_to_verbal', 'orthography_starts_with', 'rhymes',
       'second_word_letter', 'sentence_similarity', 'sentiment',
       'singular_to_plural', 'sum', 'synonyms', 'taxonomy_animal',
       'translation_en-de', 'translation_en-es', 'translation_en-fr',
       'word_in_context']

selected_tasks = tasks_df[tasks_df["task"].isin(selected_tasks_list)]


In [15]:
## Select tasks for test set
ex_tasks_df = pd.read_csv(f"{folder}/execute_tasks_examples.csv")
test = ex_tasks_df[ex_tasks_df["task"].isin(selected_tasks_list)]


## Plot distribuition of samples

In [16]:
fig = px.histogram(selected_tasks, x="task", title="Induce Tasks Count")
fig.show()

In [17]:
fig = px.histogram(test, x="task", title="Induce Tasks Count")
fig.show()

## Undersampling the pool of tasks

In order to avoid class unbalancement to select the examples we will create a upper limit of 900 examples per classs
They will be randomly selected

In [18]:
test

Unnamed: 0,task,input,output,possible_outputs
0,active_to_passive,The professor mentioned the artist.,The artist was mentioned by the professor.,
1,active_to_passive,The presidents recommended the lawyer.,The lawyer was recommended by the presidents.,
2,active_to_passive,The professors thanked the tourists.,The tourists were thanked by the professors.,
3,active_to_passive,The scientist contacted the judge.,The judge was contacted by the scientist.,
4,active_to_passive,The doctor stopped the managers.,The managers were stopped by the doctor.,
...,...,...,...,...
2135,word_in_context,Sentence 1: He put the cup back in the saucer....,same,"['same', 'yes', 'true']"
2136,word_in_context,Sentence 1: His treatment of the race question...,not the same,"['not the same', 'no', 'false']"
2137,word_in_context,Sentence 1: Please bracket this remark. Senten...,not the same,"['not the same', 'no', 'false']"
2138,word_in_context,Sentence 1: Violate the sanctity of the church...,not the same,"['not the same', 'no', 'false']"


In [19]:
def get_random_sample_per_category(df: pd.DataFrame, n_samples_per_category: int = 900) -> pd.DataFrame:
  
    """
    Returns a sampled DataFrame with a specified number of rows per category.

    Args:
    df (pd.DataFrame): The input DataFrame to be sampled.
    num_samples (int): The number of rows to sample per category.

    Returns:
    pd.DataFrame: The sampled DataFrame.

    Notes:
    - If a category has fewer rows than the specified number of samples, all rows in that category are included in the sample.
    - If a category has more rows than the specified number of samples, a random sample of the specified number of rows is returned.
    - If a category has exactly the specified number of rows, all rows in that category are included in the sample.
    - If a category has less than the specified number of rows, a warning message is printed.

    Example:
    >>> df = pd.DataFrame({'task': ['A', 'B', 'A', 'B', 'C', 'C'], 'value': [1, 2, 3, 4, 5, 6]})
    >>> get_random_sample_per_category(df, 2)
    task  value
    0     A      1
    2     A      3
    1     B      2
    3     B      4
    4     C      5
    5     C      6
    """

    # Group the DataFrame by the "task" column
    grouped_df = df.groupby('task')

    # Create an empty DataFrame to store the results
    sampled_df = pd.DataFrame()

    # Iterate through each group and randomly sample n_samples_per_category rows
    for name, group in grouped_df:
        if len(group) >= n_samples_per_category:  # Check if there are enough rows in the group
            sampled_df = pd.concat([sampled_df, group.sample(n_samples_per_category, random_state=42)])
        else:
            print(f"Warning: Category '{name}' has less than {n_samples_per_category} rows. All rows included.")
            sampled_df = pd.concat([sampled_df, group])

    return sampled_df


In [20]:
#undersampling the examples
sampled_df = get_random_sample_per_category(selected_tasks, 900)
fig = px.histogram(sampled_df, x="task", title="Induce Tasks Count")
fig.show()

## Creating Embeddings

In [21]:
model = SentenceTransformer('sentence-transformers/stsb-roberta-large')


`resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.



In [22]:
with torch.no_grad():
    test["input_encoding"] = list(model.encode(test["input"].tolist()))
    test["output_encoding"] = list(model.encode(test["output"].tolist()))
    sampled_df["input_encoding"] = list(model.encode(sampled_df["input"].tolist()))
    sampled_df["output_encoding"] = list(model.encode(sampled_df["output"].tolist()))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [23]:
test.head()

Unnamed: 0,task,input,output,possible_outputs,input_encoding,output_encoding
0,active_to_passive,The professor mentioned the artist.,The artist was mentioned by the professor.,,"[-0.20012294, 0.13508783, -0.2985632, -0.87351...","[-0.09183042, -0.04686316, -0.33048978, -1.268..."
1,active_to_passive,The presidents recommended the lawyer.,The lawyer was recommended by the presidents.,,"[-0.4970448, 0.4357985, -0.6567291, -0.3953907...","[-0.66726875, 0.34483844, -0.8113683, -1.00565..."
2,active_to_passive,The professors thanked the tourists.,The tourists were thanked by the professors.,,"[-0.3240893, -0.14898217, -1.352726, 0.3582022...","[-0.323816, -0.33156234, -1.244837, 0.34510475..."
3,active_to_passive,The scientist contacted the judge.,The judge was contacted by the scientist.,,"[0.102656126, 0.30964082, -0.1751958, -0.03980...","[0.2995165, 0.50604427, -0.22411016, -0.802559..."
4,active_to_passive,The doctor stopped the managers.,The managers were stopped by the doctor.,,"[-1.1253037, -0.44782987, -0.21806452, 0.75703...","[-0.73601687, -0.23098752, -0.51134634, 0.3905..."


In [24]:
sampled_df.head()

Unnamed: 0,task,input,output,possible_outputs,input_encoding,output_encoding
70,active_to_passive,The lawyer avoided the secretaries.,The secretaries were avoided by the lawyer.,,"[-0.24385086, -0.64486957, -0.64685297, -0.420...","[-0.28393745, -0.95630246, -0.746785, -0.78612..."
827,active_to_passive,The judges supported the scientists.,The scientists were supported by the judges.,,"[-0.5501171, 0.6825413, -0.9810991, -0.5485433...","[-0.56822073, 0.5160339, -1.1946347, -0.480039..."
231,active_to_passive,The secretary recommended the professor.,The professor was recommended by the secretary.,,"[-0.0900043, -0.7063172, -0.6908893, -0.936058...","[-0.015668165, -0.70485973, -0.8372537, -0.885..."
588,active_to_passive,The artists recommended the secretary.,The secretary was recommended by the artists.,,"[-0.68770576, -0.63871086, -0.1755478, -0.7951...","[-0.5098828, -0.47384205, -0.53428304, -1.1165..."
39,active_to_passive,The secretary avoided the senator.,The senator was avoided by the secretary.,,"[-0.15601312, 0.07417155, -0.52595055, -0.6627...","[-0.25937113, -0.28749138, -0.81921023, -0.821..."


## Save the .csv files to final dataframes

In [25]:
# save csvs
test.to_pickle("test.pickle")
sampled_df.to_pickle("pool.pickle")