# Data Exploration of Instruction Induction Datasets

**Objectives**:
- Visualize how the tasks are organized in the "execute" and "induce" folders
- Visualize the number of examples for each task in each folder
- Unify all the examples from each folder ("induce" and "execute") into a single .csv file

In [39]:
import json
import glob
import pandas as pd
import plotly.express as px


## Visualize the Number of Examples per Task

In [44]:
def get_all_tasks_specifications(folder):
    """Collect the declared number of examples of every task file in a folder.

    Every ``*.json`` file directly inside ``folder`` is treated as one task.
    The task name is the file name without its extension, and the count is
    read from the file's ``metadata.num_examples`` field.

    Args:
        folder: Path of the directory that holds the task JSON files.

    Returns:
        A ``pandas.DataFrame`` with one row per task file and the columns
        ``name`` and ``count``. The frame is empty (but still has both
        columns) when the folder contains no JSON files.

    Raises:
        KeyError: If a task file lacks ``metadata`` or ``num_examples``.
    """
    # Local import keeps this cell self-contained in the notebook.
    from pathlib import Path

    tasks = {"name": [], "count": []}
    # Sort so that the row order does not depend on the file system.
    json_files = sorted(glob.glob(f"{folder}/*.json"))
    for f in json_files:
        # Use the file stem instead of splitting on "." and "\\": the manual
        # split only worked for "../"-prefixed folders on Windows.
        task_name = Path(f).stem

        with open(f, "r", encoding="utf-8") as json_file:
            task = json.load(json_file)

        # Append name and count together so both columns stay aligned even
        # if reading the metadata fails.
        num_examples = task["metadata"]["num_examples"]
        tasks["name"].append(task_name)
        tasks["count"].append(num_examples)

    df = pd.DataFrame.from_dict(tasks)

    return df




        
        


In [47]:
# Plot the declared number of examples of every task in the "induce" folder.
folder = "../data/instruction-induction-data/raw/induce"
induce_tasks_meta = get_all_tasks_specifications(folder)
fig = px.histogram(
    data_frame=induce_tasks_meta,
    x="name",
    y="count",
    title="Induce Tasks Count",
)
fig.show()

In [45]:
# Plot the declared number of examples of every task in the "execute" folder.
folder = "../data/instruction-induction-data/raw/execute"
execute_tasks_meta = get_all_tasks_specifications(folder)
fig = px.histogram(
    data_frame=execute_tasks_meta,
    x="name",
    y="count",
    title="Execute Tasks Count",
)
fig.show()

## Create the Unified Dataset

In [57]:
def get_all_tasks_examples(folder, filename):
    """Gather the input/output examples of every task file into one CSV.

    Every ``*.json`` file directly inside ``folder`` is treated as one task
    whose name is the file name without its extension. Examples are looked
    up under ``examples["1"]`` .. ``examples[str(num_examples)]`` and must
    expose ``input`` and ``output`` keys. When an example of a task does not
    match that layout, a message is printed and the remaining examples of
    that task are skipped; examples already collected are kept.

    Args:
        folder: Path of the directory that holds the task JSON files.
        filename: Path of the CSV file to write. It receives the columns
            ``task``, ``input`` and ``output`` and no index column.

    Raises:
        KeyError: If a task file lacks ``metadata`` or ``num_examples``.
    """
    # Local import keeps this cell self-contained in the notebook.
    from pathlib import Path

    tasks = {"task": [], "input": [], "output": []}
    # Sort so that the row order does not depend on the file system.
    json_files = sorted(glob.glob(f"{folder}/*.json"))
    for f in json_files:
        # Use the file stem instead of splitting on "." and "\\": the manual
        # split only worked for "../"-prefixed folders on Windows.
        task_name = Path(f).stem

        with open(f, "r", encoding="utf-8") as json_file:
            task = json.load(json_file)

        num_examples = task["metadata"]["num_examples"]
        try:
            for i in range(1, int(num_examples) + 1):
                example = task["examples"][str(i)]
                # Read both fields before appending anything, so a
                # half-formed example cannot leave the columns with
                # different lengths.
                example_input = example["input"]
                example_output = example["output"]
                tasks["input"].append(example_input)
                tasks["output"].append(example_output)
                tasks["task"].append(task_name)
        except (KeyError, TypeError, ValueError):
            # Only data-layout problems are tolerated here; anything else
            # (e.g. KeyboardInterrupt) is allowed to propagate.
            print(f"Task {task_name} has no input/output format examples")

    df = pd.DataFrame.from_dict(tasks)
    df.to_csv(filename, index=False)

In [58]:
# Write one CSV of input/output examples for each of the two raw folders.
get_all_tasks_examples(
    "../data/instruction-induction-data/raw/induce",
    "induce_tasks_examples.csv",
)
get_all_tasks_examples(
    "../data/instruction-induction-data/raw/execute",
    "execute_tasks_examples.csv",
)

Task cause_and_effect has no input/output format examples
Task common_concept has no input/output format examples
Task cause_and_effect has no input/output format examples
Task common_concept has no input/output format examples
