In [136]:
import pandas as pd
import ast
from datasets import load_dataset

In [59]:
csv_QA = [
    r"../data/train_df-arc_challenge.csv",
    r"../data/test_df-arc_challenge.csv",
    r"../data/train_df-ARC-Easy.csv",
    r"../data/test_df-ARC-Easy.csv",
    r"../data/train_df_common.csv",
    r"../data/valid_df-arc_challenge.csv",
    r"../data/valid_df-ARC-Easy.csv",
    r"../data/dev_df_common.csv"
]

In [61]:
csv_ce =[
     r"../data/train_df-CE.csv",
     r"../data/test_df-CE.csv",
]

In [62]:
def reformat(ans):
    choice = ans[0]
    short = ans[3:]
    return "the correct choice is {} which is {}.".format(choice,short.strip("."))

In [63]:
def batch_reformat(file_path):
    df = pd.read_csv(file_path)
    df["Answer"] = df["Answer"].apply(reformat)
    df.to_csv(file_path, index=False)

In [64]:
for path in csv_QA:
    batch_reformat(path)

In [132]:
def reformat_ce(ans):
    ans=ans.split(". ")
    entail = ans[0].split(": ")
    Type = ans[1].split(": ")
    return "the Entailment is {} and its type is {}".format(entail[1],Type[1])

In [133]:

def ce_format(file_path):
    df=pd.read_csv(r"../data/test_df-CE.csv")
    df["Label"] = df["Label"].apply(reformat_ce)
    df.to_csv(file_path, index=False)



In [134]:
for path in csv_ce:
    ce_format(path)

In [60]:
csv_Trip=[
    r"../data/train_df-TRIP.csv",
    r"../data/test_df-TRIP.csv",
    r"../data/valid_df-TRIP.csv"
]

In [188]:
dataset = load_dataset("sled-umich/TRIP")
train_order_ds = dataset["OrderDev"]
train_cloze_ds = dataset["ClozeDev"]

Found cached dataset trip (C:/Users/dry19/.cache/huggingface/datasets/sled-umich___trip/default/1.0.1/6e4c49afb825381fbdd640ee9352cca3e083d46acf090aa4a804872ef65dcea8)
100%|██████████| 6/6 [00:00<00:00, 249.77it/s]


In [189]:
def transform_trip_to_dataframe_order(dataset, custom_instruction):
    """
    Transforms the TRIP dataset into a DataFrame with numbered stories and their detailed attributes.
    
    :param dataset: List of dictionaries, each containing multiple stories.
    :return: A pandas DataFrame with the transformed data.
    """
    data = []

    for entry in dataset:
        for story in entry['stories']:
            # Numbering and concatenating the sentences to form the story
            story_text = f"Actor: {story['actor']}, Location: {story['location']}, Story: "
            story_text += ' '.join(f"[{i}] {sentence}" for i, sentence in enumerate(story['sentences']))
            story_text += "\n" + custom_instruction

            # Compiling story details
            pla= True if story['plausible'] else False
            
            breakp= story['breakpoint']
            conflict = story['confl_sents']
            Type = story.get('type', 'N/A')
            detail=''
            if pla:
                detail = "The sentences are plausible without any conflict."
            else:
                detail = "The sentences are not plausible. The breakpoint sentence is sentence {} as it is conflicting with sentence {}. The conflict type is {} ".format(
                    breakp,conflict,Type
                )
            data.append([story_text, detail])

    return pd.DataFrame(data, columns=['Story', 'Details'])

In [190]:
def transform_cloze_to_dataframe(dataset, cloze_ci):
    """
    Transforms the ClozeTrain part of the TRIP dataset into a DataFrame.
    
    :param dataset: List of dictionaries, each containing multiple cloze stories.
    :return: A pandas DataFrame with the transformed data.
    """
    data = []

    for entry in dataset:
        for story in entry['stories']:
            # Numbering and concatenating the sentences to form the story
            story_text = f"Actor: {story['actor']}, Location: {story['location']}, Story: "
            story_text += ' '.join(f"[{i}] {sentence}" for i, sentence in enumerate(story['sentences']))
            story_text += "\n" + cloze_ci

            # Compiling story details
            pla= True if story['plausible'] else False
            
            breakp= story['breakpoint']
            conflict = story['confl_sents']
            Type = story.get('type', 'N/A')
            pair = story['confl_pairs']
            detail=''
            if pla:
                detail = "The sentences are plausible without any conflict."
            else:
                detail = "The sentences are not plausible. The breakpoint sentence is sentence {} as it is conflicting with sentence {}.The conflict pair is {}. The conflict type is {} ".format(
                    breakp,conflict,pair,Type
                )
            data.append([story_text, detail])

            # details = (f"Plausible: {'True' if story['plausible'] else 'False'}, "
            #            f"Breakpoint: {story['breakpoint']}, "
            #            f"Conflict Sentences: {story['confl_sents']}, "
            #            f"Conflict Pairs: {story['confl_pairs']}, "
            #            f"Type: {story.get('type', 'N/A')}")

            data.append([story_text, detail])

    return pd.DataFrame(data, columns=['Story', 'Details'])

In [191]:
pd.set_option('display.max_colwidth', None)
ci = "Instruction: Please review each story, paying special attention to the numbered sentences. Identify the key sentence or sentences that disrupt the logical consistency or narrative coherence. Evaluate whether the overall sequence of events aligns with real-world physics, and determine the plausibility of the story based on these observations. Specifically highlight any sentences that contribute to making the story implausible."
order_df = transform_trip_to_dataframe_order(train_order_ds, ci)
print(order_df.shape)


cloze_ci = "Instruction: Examine each story, focusing on numbered and conflicting sentences. Analyze their interplay and impact on narrative coherence and plausibility. Determine the story's alignment with intuitive physics and logical consistency, highlighting key elements that affect its plausibility."
cloze_df = transform_cloze_to_dataframe(train_cloze_ds, cloze_ci)
print(cloze_df.shape)

train_df = combined_df = pd.concat([order_df, cloze_df], ignore_index=True)
print(train_df.shape)

(786, 2)
(1288, 2)
(2074, 2)


In [192]:
train_df.to_csv('../data/valid_df-TRIP.csv', index=False)