In [1]:
import jsonlines
import json
import random
import pandas as pd
from datasets import load_dataset

# ARC: [https://huggingface.co/datasets/allenai/ai2_arc](https://huggingface.co/datasets/allenai/ai2_arc)

In [2]:
# Fetch the training split of both ARC subsets (Challenge first, then Easy)
# and wrap each one in a DataFrame for inspection.
arc_c, arc_e = (
    load_dataset("allenai/ai2_arc", subset, split="train")
    for subset in ("ARC-Challenge", "ARC-Easy")
)
df_arc_c, df_arc_e = pd.DataFrame(arc_c), pd.DataFrame(arc_e)

In [3]:
# Preview the raw ARC-Challenge training split (columns: id, question, choices, answerKey).
df_arc_c 

Unnamed: 0,id,question,choices,answerKey
0,Mercury_SC_415702,George wants to warm his hands quickly by rubb...,"{'text': ['dry palms', 'wet palms', 'palms cov...",A
1,MCAS_2009_5_6516,Which of the following statements best explain...,"{'text': ['The refrigerator door is smooth.', ...",B
2,Mercury_7233695,A fold observed in layers of sedimentary rock ...,"{'text': ['cooling of flowing magma.', 'conver...",B
3,Mercury_7041615,Which of these do scientists offer as the most...,"{'text': ['worldwide disease', 'global mountai...",D
4,Mercury_7041860,A boat is acted on by a river current flowing ...,"{'text': ['west', 'east', 'north', 'south'], '...",B
...,...,...,...,...
1114,Mercury_SC_415480,Which change would most likely increase the nu...,"{'text': ['flood', 'drought', 'fire', 'landsli...",A
1115,Mercury_7172795,The skin is the largest organ in the human bod...,"{'text': ['It is made of cells.', 'It acts as ...",C
1116,NCEOGA_2013_8_59,Which food provides the most energy for the bo...,"{'text': ['potato', 'meat', 'milk', 'fruit'], ...",D
1117,Mercury_7219643,Screech owls have two color variations-red and...,"{'text': ['nesting', 'feeding', 'reproduction'...",D


In [4]:
# Preview the raw ARC-Easy training split.
# NOTE: answerKey is not always a letter; the rendered preview shows a row keyed '3'.
df_arc_e

Unnamed: 0,id,question,choices,answerKey
0,Mercury_7220990,Which factor will most likely cause a person t...,{'text': ['a leg muscle relaxing after exercis...,B
1,MCAS_2007_8_5189,Lichens are symbiotic organisms made of green ...,"{'text': ['carbon dioxide', 'food', 'protectio...",B
2,Mercury_SC_401169,When a switch is used in an electrical circuit...,"{'text': ['cause the charge to build.', 'incre...",D
3,MCAS_2004_8_27,Which of the following is an example of an ass...,"{'text': ['contact lens', 'motorcycle', 'rainc...",A
4,NYSEDREGENTS_2006_8_10,"Rocks are classified as igneous, metamorphic, ...","{'text': ['their color', 'their shape', 'how t...",3
...,...,...,...,...
2246,CSZ_2009_8_CSZ20770,"Iron oxides, such as rust, form when iron meta...","{'text': ['I and O', 'Ir and O', 'Fe and O', '...",C
2247,Mercury_7181318,When water evaporates from Earth's surface int...,"{'text': ['The mass is reduced.', 'The volume ...",C
2248,Mercury_7010080,Which process directly adds carbon into the at...,"{'text': ['increasing plant populations', 'dec...",C
2249,NCEOGA_2013_8_37,Scientists think that dolphins and whales may ...,"{'text': ['They swim the same way.', 'They eat...",D


In [5]:
# Inspect one raw record: 'choices' holds parallel 'text' and 'label' lists.
arc_c[0]

{'id': 'Mercury_SC_415702',
 'question': 'George wants to warm his hands quickly by rubbing them. Which skin surface will produce the most heat?',
 'choices': {'text': ['dry palms',
   'wet palms',
   'palms covered with oil',
   'palms covered with lotion'],
  'label': ['A', 'B', 'C', 'D']},
 'answerKey': 'A'}

In [6]:
# Render the first ARC-Challenge record the way the prompt should look:
# the question, an "Options:" header, then its first four "<label>) <text>" lines.
print(
    arc_c[0]['question'] + "\n" + "Options:\n" + "\n".join(
        arc_c[0]['choices']['label'][k] + ") " + arc_c[0]['choices']['text'][k]
        for k in range(4)
    )
)

George wants to warm his hands quickly by rubbing them. Which skin surface will produce the most heat?
Options:
A) dry palms
B) wet palms
C) palms covered with oil
D) palms covered with lotion


In [7]:
def process_arc(dataset):
    """Turn ARC records into prompt/answer pairs.

    Each record must provide 'question', 'answerKey' and a 'choices' mapping
    with parallel 'label' and 'text' lists.

    Returns a list of dicts with two keys:
      - "question": the question text, an "Options:" header, then one
        "<label>) <text>" line per choice (every line newline-terminated).
      - "answer": the record's 'answerKey', copied unchanged.
    """
    records = []
    for row in dataset:
        header = row['question'] + "\n" + "Options:\n"
        # One line per label; the matching text is looked up by position.
        option_lines = [
            label + ") " + row['choices']['text'][pos] + "\n"
            for pos, label in enumerate(row['choices']['label'])
        ]
        records.append({
            "question": header + "".join(option_lines),
            "answer": row['answerKey'],
        })
    return records

In [8]:
# Challenge subset: build the prompt/answer records, then rebind df_arc_c to
# the processed view (it previously held the raw split).
arc_c_data = process_arc(arc_c)
df_arc_c = pd.DataFrame(arc_c_data)

# Easy subset: same treatment.
arc_e_data = process_arc(arc_e)
df_arc_e = pd.DataFrame(arc_e_data)

In [9]:
# Preview the processed ARC-Challenge records (question / answer columns).
df_arc_c

Unnamed: 0,question,answer
0,George wants to warm his hands quickly by rubb...,A
1,Which of the following statements best explain...,B
2,A fold observed in layers of sedimentary rock ...,B
3,Which of these do scientists offer as the most...,D
4,A boat is acted on by a river current flowing ...,B
...,...,...
1114,Which change would most likely increase the nu...,A
1115,The skin is the largest organ in the human bod...,C
1116,Which food provides the most energy for the bo...,D
1117,Screech owls have two color variations-red and...,D


In [10]:
# Preview the processed ARC-Easy records (question / answer columns).
df_arc_e

Unnamed: 0,question,answer
0,Which factor will most likely cause a person t...,B
1,Lichens are symbiotic organisms made of green ...,B
2,When a switch is used in an electrical circuit...,D
3,Which of the following is an example of an ass...,A
4,"Rocks are classified as igneous, metamorphic, ...",3
...,...,...
2246,"Iron oxides, such as rust, form when iron meta...",C
2247,When water evaporates from Earth's surface int...,C
2248,Which process directly adds carbon into the at...,C
2249,Scientists think that dolphins and whales may ...,D


In [11]:
# Persist each processed ARC subset to its own JSON Lines file
# (Challenge first, then Easy).
for out_path, records in (
    ("arc_challenge.jsonl", arc_c_data),
    ("arc_easy.jsonl", arc_e_data),
):
    with jsonlines.open(out_path, "w") as f:
        f.write_all(records)

In [12]:
# Combine both ARC subsets and shuffle them before writing arc.jsonl.
# The shuffle uses a dedicated, fixed-seed generator so that a fresh
# "Restart & Run All" produces the same file every time; the list is rebuilt
# from the unshuffled inputs above, so re-running this cell is idempotent too.
full_arc = arc_c_data + arc_e_data
random.Random(42).shuffle(full_arc)  # local RNG: the global random state is not consumed
with jsonlines.open("arc.jsonl", "w") as f:
    f.write_all(full_arc)

# MMLU: [https://huggingface.co/datasets/cais/mmlu](https://huggingface.co/datasets/cais/mmlu)

In [13]:
# Load the "train" split of MMLU's auxiliary_train configuration.
mmlu = load_dataset("cais/mmlu", 'auxiliary_train', split="train")

In [14]:
# Wrap the MMLU split in a DataFrame and preview the first rows to check its layout.
# In the rendered preview every record is nested inside a single 'train' column.
df_mmlu = pd.DataFrame(mmlu)
df_mmlu.head()

Unnamed: 0,train
0,"{'answer': 1, 'choices': ['Adams only.', 'Broo..."
1,"{'answer': 3, 'choices': ['guilty, because thi..."
2,"{'answer': 2, 'choices': ['Yes, because Mann t..."
3,"{'answer': 1, 'choices': ['must permit Don to ..."
4,"{'answer': 2, 'choices': ['partial breach of c..."


In [15]:
def process_mmlu(dataset):
    """Turn MMLU records into prompt/answer pairs.

    Each record must provide 'question', a 'choices' list of strings and an
    integer 'answer' that is the 0-based index of the correct choice.

    Returns a list of dicts with two keys:
      - "question": the question text, an "Options:" header, then one
        "<n>) <choice>" line per choice, numbered from 1 (every line
        newline-terminated).
      - "answer": the label of the correct option exactly as it appears in
        the prompt, i.e. the 1-based option number as a string.
    """
    data = []
    for d in dataset:
        option_lines = [
            f"{number}) " + choice + "\n"
            for number, choice in enumerate(d['choices'], start=1)
        ]
        data.append({
            "question": d['question'] + "\n" + "Options:\n" + "".join(option_lines),
            # The dataset stores a 0-based index while the options above are
            # numbered from 1, so shift by one to point at the right label.
            # Emit a string so the field has the same type as the ARC answers
            # it is merged with.
            "answer": str(d['answer'] + 1),
        })
    return data

In [16]:
# Pass the 'train' column rather than the dataset itself: the preview of this
# split shows each record nested under a single 'train' key.
mmlu_data = process_mmlu(mmlu['train'])

# Merge all datasets

In [17]:
# Concatenate the shuffled ARC records with the MMLU records; the trailing
# expression displays the total number of examples.
full_data = full_arc + mmlu_data
len(full_data)

103212

In [19]:
# Shuffle the merged records with a dedicated, fixed-seed generator so that a
# fresh "Restart & Run All" writes the same full_data.jsonl every time.
# NOTE: the shuffle is in place; re-running only this cell permutes the
# already-shuffled list, so re-run the merge cell first when iterating.
random.Random(42).shuffle(full_data)
with jsonlines.open("full_data.jsonl", "w") as f:
    f.write_all(full_data)