In [1]:
from datasets import load_dataset
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


# Preprocess Dataset

## Finetune Datasets

In [2]:
# Load the 'ARC-Challenge' configuration of the "ai2_arc" dataset.
# The result is a DatasetDict keyed by split name (train / test / validation).
dataset = load_dataset("ai2_arc", 'ARC-Challenge')

In [3]:
# Show the available splits together with their feature names and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'choices', 'answerKey'],
        num_rows: 1119
    })
    test: Dataset({
        features: ['id', 'question', 'choices', 'answerKey'],
        num_rows: 1172
    })
    validation: Dataset({
        features: ['id', 'question', 'choices', 'answerKey'],
        num_rows: 299
    })
})


In [4]:
# Pull the three splits out of the DatasetDict in one step; the order of the
# names on the left matches the order of the split keys on the right.
train_ds, valid_ds, test_ds = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)
print(train_ds)

Dataset({
    features: ['id', 'question', 'choices', 'answerKey'],
    num_rows: 1119
})


In [5]:
def transform_arc_dataset(dataset, instruction_prompt):
    """
    Convert ARC multiple-choice records into a two-column prompt/answer DataFrame.

    Parameters:
    dataset (iterable): Records exposing 'question', 'answerKey' and a 'choices'
        mapping that holds parallel 'text' and 'label' lists.
    instruction_prompt (str): Text placed directly in front of every question
        (no separator is inserted, so include a trailing newline if one is wanted).

    Returns:
    pd.DataFrame: One row per record with the columns
        'Question and Choices' - instruction, question, then one "<label>. <choice>"
            line per option, joined with newlines;
        'Answer' - "<answerKey>. <text of the matching choice>".

    Raises:
    ValueError: If a record's 'answerKey' is not one of its choice labels.
    """
    def _to_row(record):
        # Build the [prompt, answer] pair for a single record.
        stem = record['question']
        option_texts = record['choices']['text']
        option_labels = record['choices']['label']
        key = record['answerKey']

        option_lines = [f"{lbl}. {txt}" for lbl, txt in zip(option_labels, option_texts)]
        prompt = instruction_prompt + stem + "\n" + "\n".join(option_lines)

        # The answer text is the option stored at the position of the key's label.
        answer = f"{key}. {option_texts[option_labels.index(key)]}"
        return [prompt, answer]

    rows = [_to_row(record) for record in dataset]
    return pd.DataFrame(rows, columns=['Question and Choices', 'Answer'])

In [6]:
# Instruction text prepended to every question; it ends with a newline so the
# question starts on its own line.
instruction = (
    "Based on the question, analyze each option and select the most appropriate answer. "
    "Please provide a final answer to the question.\n"
)

# Apply the same formatting to each split, keeping one DataFrame per split.
train_df, valid_df, test_df = [
    transform_arc_dataset(split, instruction)
    for split in (train_ds, valid_ds, test_ds)
]

In [8]:
# Inspect one fully formatted prompt: row 0, column 0 ('Question and Choices').
print(train_df.iloc[0,0])

Based on the question, analyze each option and select the most appropriate answer. Please provide a final answer to the question.
George wants to warm his hands quickly by rubbing them. Which skin surface will produce the most heat?
A. dry palms
B. wet palms
C. palms covered with oil
D. palms covered with lotion


In [9]:
# Preview the first two rows of each split, in train / validation / test order.
for split_frame in (train_df, valid_df, test_df):
    print(split_frame.head(2))

                                Question and Choices  \
0  Based on the question, analyze each option and...   
1  Based on the question, analyze each option and...   

                                    Answer  
0                             A. dry palms  
1  B. The refrigerator door contains iron.  
                                Question and Choices  \
0  Based on the question, analyze each option and...   
1  Based on the question, analyze each option and...   

                                        Answer  
0  D. Record the details of the investigation.  
1                                   C. drought  
                                Question and Choices  \
0  Based on the question, analyze each option and...   
1  Based on the question, analyze each option and...   

                                   Answer  
0  C. Planetary days will become shorter.  
1         B. buildings will be made safer  


In [10]:
# Persist each split as a CSV file under ../data, without the row index.
# NOTE(review): the train file name contains a single "arc_" while the valid and
# test names contain "arc_arc_" - confirm which spelling downstream code expects
# before normalising them.
csv_targets = [
    (train_df, '../data/train_df-arc_challenge.csv'),
    (valid_df, '../data/valid_df-arc_arc_challenge.csv'),
    (test_df, '../data/test_df-arc_arc_challenge.csv'),
]
for split_frame, destination in csv_targets:
    split_frame.to_csv(destination, index=False)

## Pretrain Data

In [28]:
# Location of the raw ARC text corpus, relative to this notebook.
# NOTE(review): "courpus" is a misspelling of "corpus"; the call to
# process_large_file uses the same name, so rename both together.
courpus_path = r"../data/ARC-V1-Feb2018-2/ARC_Corpus.txt"

In [29]:
def process_large_file(file_path, output_path=r"../data/ARC_corpus.csv"):
    """
    Convert a plain-text corpus (one entry per line) into a single-column CSV.

    Parameters:
    file_path (str): Path of the UTF-8 text file to read.
    output_path (str): Destination CSV path. Defaults to the location this
        function previously always wrote to, so existing calls are unaffected.

    Returns:
    None. The CSV is written with no header row and no index column; each
    input line becomes one row with its newline characters removed.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        # Iterate the handle directly: readlines() would first build a complete
        # list of raw lines and only then the stripped copy, doubling peak memory
        # for a large corpus.
        corpus = [line.strip("\n") for line in file]
    pd.DataFrame(corpus).to_csv(output_path, header=False, index=False)


# Run the conversion; as a side effect this writes ../data/ARC_corpus.csv.
process_large_file(courpus_path)

In [2]:
# Reload the corpus from the CSV that process_large_file writes.
# header=None: the file has no header row, so the single column is labelled 0.
corpus_csv_path = r"../data/ARC_corpus.csv"
corpus_df = pd.read_csv(corpus_csv_path, header = None)

In [3]:
# Disable column-width truncation so whole corpus entries are printed.
# set_option changes a global pandas setting, so it also affects later cells.
pd.set_option('display.max_colwidth', None)
print(corpus_df.head(20))

                                                                                                                                                                                                                                                                                                                                                               0
0                                                                                                                                                                                                                                                          Large international companies are involved in bauxite, iron ore, diamond, and gold mining operations.
1                                                                                                                                                                                                                                                                                                     

In [13]:
# Find every corpus entry that contains a given snippet and print it together
# with its row position in the corpus.
text_to_check = "will use magnets to explore how gravity"

# Flatten the single-column frame once; squeeze(1) drops the length-1 column
# axis so each element is one corpus entry.
corpus_data = corpus_df.values.squeeze(1)

# A single pass collects (position, entry) pairs instead of scanning the corpus
# twice. Non-string cells are skipped: read_csv turns empty fields into NaN by
# default, and `in` on a float would raise TypeError.
matches = [
    (idx, entry)
    for idx, entry in enumerate(corpus_data)
    if isinstance(entry, str) and text_to_check in entry
]
results = [entry for _, entry in matches]
for idx, entry in matches:
    print(idx, entry)

28957 Then the super cool part- children will use magnets to explore how gravity can easily be overcome by other forces- almost like defying gravity!


In [12]:
# Spot-check a single corpus entry by position.
# NOTE(review): corpus_data is assigned in the substring-search cell, so that
# cell must have been run first on a fresh kernel.
print(corpus_data[1000])

Thus, when we see such strata inclined instead of horizontal, evidence of an episode of deformation.


In [17]:
# Show the first ten corpus entries as a raw array (repr form, so escape
# sequences inside each string are visible).
# NOTE(review): the captured output shows entries ending in "\n", whereas
# process_large_file strips newlines - the output may come from an older CSV;
# confirm by re-running from a fresh kernel.
print(corpus_df.values.squeeze(1)[:10])

['Large international companies are involved in bauxite, iron ore, diamond, and gold mining operations.\n'
 'Paleoceanography, 8(2): 193-208.\n'
 'Of course, for many in the media, “hydrogen sulphide delivery helps prevent disease damage in cells in certain disease models” will always be trumped by “farts cure cancer” when it comes to headlines.\n'
 'The same problems apply with wolf-domestic dog hybrids.\n'
 'taking stock of delightful days\n'
 'The an- tlu-opologist and the ethnologist find in trop- ical America some of the most complicated and interesting problems of research.\n'
 'ORDER ODONATA (Damselflies and Dragonflies) Diagnosis: large, to over three inches long; four wings, transparent and membranous, held vertically (damselflies) or laterally (dragonflies) at rest; chew- ing mouth parts, tooth-like; nymphs aquatic, feeding on mosquito larvae to small fish; adults terrestrial, feeding on other insects (Figure 14.27).\n'
 'until they institute such safeguards and assurances of