In [7]:
from datasets import load_dataset
import pandas as pd

# Preprocess Dataset

## Finetune Datasets - ARC challenge

In [26]:
dataset = load_dataset("ai2_arc", 'ARC-Easy')

In [27]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'choices', 'answerKey'],
        num_rows: 2251
    })
    test: Dataset({
        features: ['id', 'question', 'choices', 'answerKey'],
        num_rows: 2376
    })
    validation: Dataset({
        features: ['id', 'question', 'choices', 'answerKey'],
        num_rows: 570
    })
})


In [28]:
train_ds = dataset["train"]
valid_ds = dataset["validation"]
test_ds = dataset["test"]
print(train_ds)

Dataset({
    features: ['id', 'question', 'choices', 'answerKey'],
    num_rows: 2251
})


In [29]:
def transform_arc_dataset(dataset, instruction_prompt):
    """
    Transforms the ARC dataset into the specified Pandas DataFrame format.

    Parameters:
    dataset (list): A list of dictionaries, each representing a question in the ARC dataset.

    Returns:
    pd.DataFrame: A DataFrame with two columns as specified.
    """
    transformed_data = []

    for entry in dataset:
        # Extracting question and choices
        question = entry['question']
        choices = entry['choices']['text']
        labels = entry['choices']['label']
        answer_key = entry['answerKey']

        # Formatting Column 1: Question and Choices
        formatted_question = instruction_prompt + question + "\n" + "\n".join([f"{label}. {choice}" for label, choice in zip(labels, choices)])

        # Formatting Column 2: Answer Key and Corresponding Choice
        answer_index = labels.index(answer_key)
        formatted_answer = f"{answer_key}. {choices[answer_index]}"

        # Appending the formatted data
        transformed_data.append([formatted_question, formatted_answer])

    # Creating a DataFrame
    df = pd.DataFrame(transformed_data, columns=['Question and Choices', 'Answer'])

    return df

In [30]:
instruction = """Based on the question, analyze each option and select the most appropriate answer. Please provide a final answer to the question.\n"""
train_df = transform_arc_dataset(train_ds, instruction)
valid_df = transform_arc_dataset(valid_ds, instruction)
test_df = transform_arc_dataset(test_ds, instruction)

In [31]:
print(train_df.iloc[0,0])

Based on the question, analyze each option and select the most appropriate answer. Please provide a final answer to the question.
Which factor will most likely cause a person to develop a fever?
A. a leg muscle relaxing after exercise
B. a bacterial population in the bloodstream
C. several viral particles on the skin
D. carbohydrates being digested in the stomach


In [32]:
print(train_df.head(2))
print(valid_df.head(2))
print(test_df.head(2))

                                                                                                                                                                                                                                                                                                                                                                 Question and Choices  \
0  Based on the question, analyze each option and select the most appropriate answer. Please provide a final answer to the question.\nWhich factor will most likely cause a person to develop a fever?\nA. a leg muscle relaxing after exercise\nB. a bacterial population in the bloodstream\nC. several viral particles on the skin\nD. carbohydrates being digested in the stomach   
1                                                  Based on the question, analyze each option and select the most appropriate answer. Please provide a final answer to the question.\nLichens are symbiotic organisms made of green algae and fungi. W

In [33]:
train_df.to_csv('../data/train_df-ARC-Easy.csv', index=False)
valid_df.to_csv('../data/valid_df-ARC-Easy.csv', index=False)
test_df.to_csv('../data/test_df-ARC-Easy.csv', index=False)

## Pretrain Data - ARC corpus

In [3]:
courpus_path = r"../data/ARC-V1-Feb2018-2/ARC_Corpus.txt"

In [4]:
def process_large_file(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        corpus = [line.strip("\n") for line in file.readlines()]
    print(len(corpus))
    return pd.DataFrame(corpus)
    # pd.DataFrame(corpus).to_csv(r"../data/ARC_corpus.csv", header=False, index = False)


corpus = process_large_file(courpus_path)

14621856


In [2]:
corpus_csv_path = r"../data/ARC_corpus.csv"
corpus_df = pd.read_csv(corpus_csv_path, header = None)

In [3]:
pd.set_option('display.max_colwidth', None)
print(corpus_df.head(20))

                                                                                                                                                                                                                                                                                                                                                               0
0                                                                                                                                                                                                                                                          Large international companies are involved in bauxite, iron ore, diamond, and gold mining operations.
1                                                                                                                                                                                                                                                                                                     

In [4]:
corpus_df['text_length'] = corpus_df.iloc[:,0].str.len()

In [5]:
sorted_df = corpus_df.sort_values(by='text_length', ascending=False)

In [6]:
pd.set_option('display.max_colwidth', None)
print(sorted_df.head(20))

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [None]:
# Assuming your DataFrame is named df and the column is named 'column_name'
# text_to_check = "The force of gravity overcomes the"
text_to_check = "will use magnets to explore how gravity"
# exists = text_to_check in corpus_df.values.squeeze(1)
corpus_data = corpus_df.values.squeeze(1)
results = [entry for idx, entry in enumerate(corpus_data) if text_to_check in entry]
for idx, entry in enumerate(corpus_df.values.squeeze(1)):
    if text_to_check in entry:
        print(idx, entry)

28957 Then the super cool part- children will use magnets to explore how gravity can easily be overcome by other forces- almost like defying gravity!


In [None]:
print(corpus_data[1000])

Thus, when we see such strata inclined instead of horizontal, evidence of an episode of deformation.


In [None]:
print(corpus_df.values.squeeze(1)[:10])

['Large international companies are involved in bauxite, iron ore, diamond, and gold mining operations.\n'
 'Paleoceanography, 8(2): 193-208.\n'
 'Of course, for many in the media, “hydrogen sulphide delivery helps prevent disease damage in cells in certain disease models” will always be trumped by “farts cure cancer” when it comes to headlines.\n'
 'The same problems apply with wolf-domestic dog hybrids.\n'
 'taking stock of delightful days\n'
 'The an- tlu-opologist and the ethnologist find in trop- ical America some of the most complicated and interesting problems of research.\n'
 'ORDER ODONATA (Damselflies and Dragonflies) Diagnosis: large, to over three inches long; four wings, transparent and membranous, held vertically (damselflies) or laterally (dragonflies) at rest; chew- ing mouth parts, tooth-like; nymphs aquatic, feeding on mosquito larvae to small fish; adults terrestrial, feeding on other insects (Figure 14.27).\n'
 'until they institute such safeguards and assurances of

## Conversational entailment

In [None]:
dataset = load_dataset("sled-umich/Conversation-Entailment")

Downloading readme: 100%|██████████| 2.93k/2.93k [00:00<?, ?B/s]
Downloading data: 100%|██████████| 197k/197k [00:00<00:00, 1.84MB/s]
Downloading data: 100%|██████████| 387k/387k [00:00<00:00, 4.28MB/s]]
Downloading data files: 100%|██████████| 2/2 [00:00<00:00,  9.48it/s]
Extracting data files: 100%|██████████| 2/2 [00:00<00:00, 166.08it/s]
Generating validation split: 291 examples [00:00, 19497.17 examples/s]
Generating test split: 584 examples [00:00, 77039.58 examples/s]


In [None]:
print(dataset)

DatasetDict({
    validation: Dataset({
        features: ['dialog_speaker_list', 'h', 'dialog_num_list', 'dialog_source', 'dialog_text_list', 'entailment', 'id', 'type'],
        num_rows: 291
    })
    test: Dataset({
        features: ['dialog_speaker_list', 'h', 'dialog_num_list', 'dialog_source', 'dialog_text_list', 'entailment', 'id', 'type'],
        num_rows: 584
    })
})


In [None]:
test_ds = dataset["test"]
valid_ds = dataset["validation"]

In [None]:
def transform_to_dataframe(dataset, custom_instruction):
    """
    Transforms the dataset into a DataFrame with formatted conversation and entailment info.
    
    :param dataset: List of dictionaries containing the dataset information.
    :param custom_instruction: String containing a customized instruction for the model.
    :return: A pandas DataFrame with the transformed data.
    """
    data = []

    for entry in dataset:
        # Formatting the conversation
        conversation = 'Conversation:\n' + '\n'.join(
            f"{speaker}: '{text}'" for speaker, text in zip(entry['dialog_speaker_list'], entry['dialog_text_list'])
        )
        conversation += f"\nHypothesis: '{entry['h']}'\n{custom_instruction}"

        # Formatting the entailment and type
        entailment_info = f"Entailment: {entry['entailment']}. Type: {entry['type']}."

        data.append([conversation, entailment_info])

    return pd.DataFrame(data, columns=['Input', 'Label'])

In [None]:
custom_instruction = "Please analyze the conversation and check whether the hypothesis is true and identify the type of entailment."
test_df = transform_to_dataframe(test_ds,custom_instruction)
valid_df = transform_to_dataframe(valid_ds,custom_instruction)

In [None]:
pd.set_option('display.max_colwidth', None)
print(test_df.head(2))

                                                                                                                                                                                                                                                                                                                                                                                                                         Input  \
0                                                                                                                                                                                                                                Conversation:\nB: 'Hi, Archie.  I'm Sharon.'\nHypothesis: 'SpeakerB is Archie'\nPlease analyze the conversation and check whether the hypothesis is true and identify the type of entailment.   
1  Conversation:\nB: 'Yeah.  I've seen that, that's, uh, that was a really good movie.  Probably one of the best things about it was the scenery and, uh, I thought 

In [None]:
valid_df.to_csv('../data/valid_df-CE.csv', index=False)
test_df.to_csv('../data/test_df-CE.csv', index=False)

## TRIP

In [8]:
dataset = load_dataset("sled-umich/TRIP")

In [9]:
print(dataset)

DatasetDict({
    ClozeDev: Dataset({
        features: ['example_id', 'length', 'label', 'breakpoint', 'confl_sents', 'confl_pairs', 'stories'],
        num_rows: 322
    })
    ClozeTest: Dataset({
        features: ['example_id', 'length', 'label', 'breakpoint', 'confl_sents', 'confl_pairs', 'stories'],
        num_rows: 351
    })
    ClozeTrain: Dataset({
        features: ['example_id', 'length', 'label', 'breakpoint', 'confl_sents', 'confl_pairs', 'stories'],
        num_rows: 799
    })
    OrderDev: Dataset({
        features: ['example_id', 'length', 'label', 'breakpoint', 'confl_sents', 'confl_pairs', 'stories'],
        num_rows: 393
    })
    OrderTest: Dataset({
        features: ['example_id', 'length', 'label', 'breakpoint', 'confl_sents', 'confl_pairs', 'stories'],
        num_rows: 408
    })
    OrderTrain: Dataset({
        features: ['example_id', 'length', 'label', 'breakpoint', 'confl_sents', 'confl_pairs', 'stories'],
        num_rows: 2330
    })
})


In [22]:
train_order_ds = dataset["OrderTest"]
train_cloze_ds = dataset["ClozeTest"]

In [23]:
# print(train_ds[1])

In [24]:
def transform_trip_to_dataframe_order(dataset, custom_instruction):
    """
    Transforms the TRIP dataset into a DataFrame with numbered stories and their detailed attributes.
    
    :param dataset: List of dictionaries, each containing multiple stories.
    :return: A pandas DataFrame with the transformed data.
    """
    data = []

    for entry in dataset:
        for story in entry['stories']:
            # Numbering and concatenating the sentences to form the story
            story_text = f"Actor: {story['actor']}, Location: {story['location']}, Story: "
            story_text += ' '.join(f"[{i}] {sentence}" for i, sentence in enumerate(story['sentences']))
            story_text += "\n" + custom_instruction

            # Compiling story details
            details = f"Plausible: {'True' if story['plausible'] else 'False'}, Breakpoint: {story['breakpoint']}, Conflict Sentences: {story['confl_sents']}, Type: {story.get('type', 'N/A')}"

            data.append([story_text, details])

    return pd.DataFrame(data, columns=['Story', 'Details'])

In [25]:
def transform_cloze_to_dataframe(dataset, cloze_ci):
    """
    Transforms the ClozeTrain part of the TRIP dataset into a DataFrame.
    
    :param dataset: List of dictionaries, each containing multiple cloze stories.
    :return: A pandas DataFrame with the transformed data.
    """
    data = []

    for entry in dataset:
        for story in entry['stories']:
            # Numbering and concatenating the sentences to form the story
            story_text = f"Actor: {story['actor']}, Location: {story['location']}, Story: "
            story_text += ' '.join(f"[{i}] {sentence}" for i, sentence in enumerate(story['sentences']))
            story_text += "\n" + cloze_ci

            # Compiling story details
            details = (f"Plausible: {'True' if story['plausible'] else 'False'}, "
                       f"Breakpoint: {story['breakpoint']}, "
                       f"Conflict Sentences: {story['confl_sents']}, "
                       f"Conflict Pairs: {story['confl_pairs']}, "
                       f"Type: {story.get('type', 'N/A')}")

            data.append([story_text, details])

    return pd.DataFrame(data, columns=['Story', 'Details'])

In [26]:
pd.set_option('display.max_colwidth', None)
ci = "Instruction: Please review each story, paying special attention to the numbered sentences. Identify the key sentence or sentences that disrupt the logical consistency or narrative coherence. Evaluate whether the overall sequence of events aligns with real-world physics, and determine the plausibility of the story based on these observations. Specifically highlight any sentences that contribute to making the story implausible."
order_df = transform_trip_to_dataframe_order(train_order_ds, ci)
print(order_df.shape)


cloze_ci = "Instruction: Examine each story, focusing on numbered and conflicting sentences. Analyze their interplay and impact on narrative coherence and plausibility. Determine the story's alignment with intuitive physics and logical consistency, highlighting key elements that affect its plausibility."
cloze_df = transform_cloze_to_dataframe(train_cloze_ds, cloze_ci)
print(cloze_df.shape)

train_df = combined_df = pd.concat([order_df, cloze_df], ignore_index=True)
print(train_df.shape)

(816, 2)
(702, 2)
(1518, 2)


In [27]:
train_df.to_csv('../data/test_df-TRIP.csv', index=False)

## CommonSenseQA

In [28]:
import pandas as pd
import requests
import json

# Function to download and load the dataset
def download_commonsenseqa(url):
    response = requests.get(url)
    data = response.json()
    return pd.json_normalize(data, record_path=['data'])

# URL for the CommonsenseQA dataset
url = "https://s3.amazonaws.com/commensenseqa/train_rand_split.json"

# Downloading and loading the dataset
commonsenseqa_df = download_commonsenseqa(url)

# Displaying the first few rows of the DataFrame
print(commonsenseqa_df.head())

JSONDecodeError: Expecting value: line 1 column 1 (char 0)