# Data Exploration of Instruction Induction Datasets

**Objectives**:
- Visualize how the tasks are organized in the "execute" and "induce" folders
- Visualize the number of examples for each task in each folder
- Unify all examples of each folder into a single .csv file

In [8]:
import json
import glob
import pandas as pd
import plotly.express as px


## Visualize Number of Examples per Task

In [9]:
def get_all_tasks_specifications(folder):
    """
    Collect the number of examples declared by every task file in a folder.

    Each ``*.json`` file directly inside ``folder`` is treated as one task.
    The task name is the file name without its ``.json`` extension, and the
    count is read from the file's ``metadata.num_examples`` field.

    Args:
        folder (str): The path to the folder containing the JSON files.

    Returns:
        Pandas DataFrame: One row per task with the columns ``name`` and
        ``count``, ordered by file path.
    """
    # Local import keeps this cell runnable without touching the import cell.
    from pathlib import Path

    tasks = {"name": [], "count": []}

    # glob returns files in arbitrary order; sort for a reproducible table.
    for json_path in sorted(glob.glob(f"{folder}/*.json")):
        # Path.stem works for any folder and for both "/" and "\\" separators,
        # unlike splitting the raw string on "." and "\\".
        tasks["name"].append(Path(json_path).stem)

        with open(json_path, "r", encoding="utf-8") as json_file:
            task = json.load(json_file)
        tasks["count"].append(task["metadata"]["num_examples"])

    return pd.DataFrame.from_dict(tasks)




        
        


In [10]:
# Build the per-task example-count table for the "induce" split and plot it.
folder = "../data/instruction-induction-data/raw/induce"
induce_tasks_meta = get_all_tasks_specifications(folder)
# x holds the task names and y the declared example counts, so each task
# name gets its own bar.
fig = px.histogram(induce_tasks_meta, x="name", y="count", title="Induce Tasks Count")
fig.show()

In [11]:
# Same table and plot as the "induce" cell, but for the "execute" split.
# Note that this rebinds the notebook-level `folder` and `fig` variables.
folder = "../data/instruction-induction-data/raw/execute"
execute_tasks_meta = get_all_tasks_specifications(folder)
fig = px.histogram(execute_tasks_meta, x="name", y="count", title="Execute Tasks Count")
fig.show()

## Create Unified Dataset

In [12]:
def get_all_tasks_examples(folder, filename):
    """
    Gather the examples of every task in a folder into one table and save it.

    Each ``*.json`` file directly inside ``folder`` is treated as one task,
    named after the file (without the ``.json`` extension). Its examples are
    read from ``examples["1"]`` up to ``examples[str(num_examples)]``, where
    ``num_examples`` comes from the file's ``metadata`` section. Every example
    becomes one row holding the task name, input, output and possible outputs.

    Tasks whose file name starts with one of the prefixes below use different
    field names:

    - ``cause``: input is ``cause``, output is ``effect``
    - ``rhymes``: possible outputs come from ``other_rhymes``
    - ``translation``: possible outputs come from ``possible_translations``
    - ``word_in_context``: possible outputs come from ``possible_outputs``

    All other tasks use ``input`` / ``output`` and have no possible outputs
    (stored as ``None``).

    If an example of a task does not have the expected fields, a message is
    printed and the rest of that task is skipped; rows already collected for
    that task are kept.

    Args:
        folder (str): The path to the folder containing the JSON files.
        filename (str): The name of the CSV file to save the tasks DataFrame.

    Returns:
        Pandas DataFrame: The DataFrame containing the tasks information, with
        the columns ``task``, ``input``, ``output`` and ``possible_outputs``.

    Raises:
        KeyError: If a task file has no ``metadata.num_examples`` entry.
    """
    # Local import keeps this cell runnable without touching the import cell.
    from pathlib import Path

    # File-name prefix -> (input field, output field, possible-outputs field).
    # A possible-outputs field of None means the task has no alternatives.
    special_layouts = (
        ("cause", ("cause", "effect", None)),
        ("rhymes", ("input", "output", "other_rhymes")),
        ("translation", ("input", "output", "possible_translations")),
        ("word_in_context", ("input", "output", "possible_outputs")),
    )
    default_layout = ("input", "output", None)

    tasks = {"task": [], "input": [], "output": [], "possible_outputs": []}

    # glob returns files in arbitrary order; sort for a reproducible CSV.
    for json_path in sorted(glob.glob(f"{folder}/*.json")):
        # Path.stem works for any folder and for both "/" and "\\" separators.
        task_name = Path(json_path).stem

        with open(json_path, "r", encoding="utf-8") as json_file:
            task = json.load(json_file)
        num_examples = task["metadata"]["num_examples"]

        # Match on the file name itself rather than on a separator-dependent
        # full path, so the special cases are detected on every platform.
        input_key, output_key, extra_key = next(
            (
                layout
                for prefix, layout in special_layouts
                if task_name.startswith(prefix)
            ),
            default_layout,
        )

        try:
            for i in range(1, int(num_examples) + 1):
                example = task["examples"][str(i)]
                # Read every field before appending anything, so a missing
                # field can never leave the columns with different lengths.
                example_input = example[input_key]
                example_output = example[output_key]
                example_extra = None if extra_key is None else example[extra_key]

                tasks["task"].append(task_name)
                tasks["input"].append(example_input)
                tasks["output"].append(example_output)
                tasks["possible_outputs"].append(example_extra)
        except (KeyError, TypeError, ValueError):
            # Best effort: report the task and carry on with the next file.
            print(f"Task {task_name} has no input/output format examples")

    df = pd.DataFrame.from_dict(tasks)
    df.to_csv(filename, index=False)
    return df

In [13]:
# Build one unified table per split. Each call also writes the table to the
# given CSV file name and prints a message for every task it could not read
# in the expected input/output format.
induce_df = get_all_tasks_examples("../data/instruction-induction-data/raw/induce", "induce_tasks_examples.csv")
execute_df = get_all_tasks_examples("../data/instruction-induction-data/raw/execute", "execute_tasks_examples.csv")

Task common_concept has no input/output format examples
Task common_concept has no input/output format examples


In [14]:
# List the distinct task names that made it into the unified "induce" table.
induce_df["task"].unique()

array(['active_to_passive', 'antonyms', 'cause_and_effect', 'diff',
       'first_word_letter', 'informal_to_formal', 'larger_animal',
       'letters_list', 'negation', 'num_to_verbal',
       'orthography_starts_with', 'rhymes', 'second_word_letter',
       'sentence_similarity', 'sentiment', 'singular_to_plural', 'sum',
       'synonyms', 'taxonomy_animal', 'translation_en-de',
       'translation_en-es', 'translation_en-fr', 'word_in_context'],
      dtype=object)