In [1]:
from datasets import load_dataset, concatenate_datasets

In [2]:
ds1 = load_dataset("CLUTRR/v1",'rob_train_clean_23_test_all_23')  # Blood relations

Using the latest cached version of the dataset since CLUTRR/v1 couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'rob_train_clean_23_test_all_23' at /root/.cache/huggingface/datasets/CLUTRR___v1/rob_train_clean_23_test_all_23/1.0.0/a8158d1fac10864c3424d53662fe63bf7d82dd87 (last modified on Sat Jul 19 10:20:48 2025).


In [3]:
ds1

DatasetDict({
    train: Dataset({
        features: ['id', 'story', 'query', 'target', 'target_text', 'clean_story', 'proof_state', 'f_comb', 'task_name', 'story_edges', 'edge_types', 'query_edge', 'genders', 'task_split'],
        num_rows: 8098
    })
    validation: Dataset({
        features: ['id', 'story', 'query', 'target', 'target_text', 'clean_story', 'proof_state', 'f_comb', 'task_name', 'story_edges', 'edge_types', 'query_edge', 'genders', 'task_split'],
        num_rows: 2026
    })
    test: Dataset({
        features: ['id', 'story', 'query', 'target', 'target_text', 'clean_story', 'proof_state', 'f_comb', 'task_name', 'story_edges', 'edge_types', 'query_edge', 'genders', 'task_split'],
        num_rows: 447
    })
})

In [4]:
ds1['train']

Dataset({
    features: ['id', 'story', 'query', 'target', 'target_text', 'clean_story', 'proof_state', 'f_comb', 'task_name', 'story_edges', 'edge_types', 'query_edge', 'genders', 'task_split'],
    num_rows: 8098
})

In [5]:
print(type(ds1))

<class 'datasets.dataset_dict.DatasetDict'>


In [6]:
ds1['test']

Dataset({
    features: ['id', 'story', 'query', 'target', 'target_text', 'clean_story', 'proof_state', 'f_comb', 'task_name', 'story_edges', 'edge_types', 'query_edge', 'genders', 'task_split'],
    num_rows: 447
})

In [7]:
train=ds1['train']

In [8]:
data_lines=[]

In [9]:
for st, que, tar in zip(train['story'],train['query'],train['target_text']):
    line='|'.join([st,que,tar])
    data_lines.append(line)

In [10]:
dummy_choices=set(ds1['train']['target_text'])

In [11]:
print(len(dummy_choices))

18


In [12]:
train=ds1['validation']
for st, que, tar in zip(train['story'],train['query'],train['target_text']):
    line='|'.join([st,que,tar])
    data_lines.append(line)


In [13]:
train=ds1['test']
for st, que, tar in zip(train['story'],train['query'],train['target_text']):
    line='|'.join([st,que,tar])
    data_lines.append(line)


In [14]:
import json
import random
import ast

def convert_data_to_json_format(data_lines,dummy_choices_pool=dummy_choices):
    """
    Converts a list of strings in a specific pipe-separated format into a JSON
    object structured like the provided example.

    This function parses each input line, generates multiple-choice options,
    and constructs a detailed JSON object for each entry.

    Args:
        data_lines (list): A list of strings, where each string is in the format:
                           '[Story text...]|[('Person1', 'Person2')]|[answer]'

    Returns:
        str: A JSON formatted string representing the list of converted puzzle objects.
    """
    output_array = []
    
    for line in data_lines:
        try:
            # 1. Parse the Input String
            # The line is split by the '|' delimiter into its three core components.
            line=line.replace('[','')
            line=line.replace(']','')
            story, query_str, correct_answer_text = line.strip().split('|')
            correct_answer_text = correct_answer_text.strip()

            # 2. Extract Names from the Query
            # ast.literal_eval is used to safely parse the string tuple "('Name1', 'Name2')"
            # into an actual Python tuple.
            names_tuple = ast.literal_eval(query_str.strip())
            person1, person2 = names_tuple

            # 3. Construct the 'question' Field
            # The question is formed by combining the story and a direct query about the
            # relationship between the two individuals.
            question_text = f"{story} How is {person1} related to {person2}?"

            # 4. Generate Multiple-Choice Options
            # Since the source data only has the answer, we must generate other options.
            choices = [correct_answer_text]
            
            # Create a temporary pool of incorrect choices, excluding the correct one.
            temp_pool = [c for c in dummy_choices_pool if c.lower() != correct_answer_text.lower()]
            
            # Add 3 unique incorrect choices from the pool.
            choices.extend(random.sample(temp_pool, 3))
            
            # Shuffle the choices to randomize the position of the correct answer.
            random.shuffle(choices)

            # 5. Determine the Correct Answer's Letter ('A', 'B', 'C', or 'D')
            correct_answer_index = choices.index(correct_answer_text)
            correct_answer_letter = chr(ord('A') + correct_answer_index)
            choice_index=['A)','B)','C)','D)']
            new_choices=[]
            for idx,choice in enumerate(choices):
                n_c=choice_index[idx]+' '+choice.capitalize()
                new_choices.append(n_c)
            choices=new_choices
            # 6. Create a Simple Explanation
            # A generic explanation is created based on the known information.
            explanation_text = (f"Based on the provided story, the relationship between "
                                f"{person1} and {person2} is '{correct_answer_text}'.")

            # 7. Assemble the Final JSON Object
            # All the processed parts are combined into a dictionary that matches the target format.
            json_object = {
                "topic": "Blood Relations and Family Tree",
                "question": question_text,
                "choices": choices,
                "answer": correct_answer_letter,
                "explanation": explanation_text
            }
            
            output_array.append(json_object)

        except (ValueError, SyntaxError) as e:
            # Skip any lines that don't fit the expected format.
            print(f"Skipping malformed line: '{line}'. Error: {e}")
            continue

    # 8. Return the Final JSON String
    # The list of dictionaries is converted into a nicely indented JSON string.
    return json.dumps(output_array, indent=2)

In [15]:
train_array=convert_data_to_json_format(data_lines)

In [16]:
def save_json_to_file(json_string, filename):
    """
    Saves a JSON string to a specified file.

    Args:
        json_string (str): The JSON formatted string to save.
        filename (str): The name of the file to save the JSON to (e.g., 'output.json').
    """
    try:
        # The 'with' statement ensures the file is properly closed after writing.
        # 'w' mode means we are writing to the file (and will overwrite it if it exists).
        # 'encoding='utf-8'' is good practice to prevent issues with special characters.
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(json_string)
        print(f"Successfully saved JSON data to '{filename}'")
    except IOError as e:
        print(f"Error: Could not write to file '{filename}'. Reason: {e}")

In [17]:
save_json_to_file(train_array, 'blood_relation_final.json')

Successfully saved JSON data to 'blood_relation_final.json'


In [18]:
import random

def shuffle_and_split_puzzles(json_data, train_ratio = 0.8):
    """
    Shuffles a list of puzzle examples and splits it into a training and testing set.

    Args:
        json_data (list): A list of dictionaries, where each dictionary represents a puzzle.

    Returns:
        tuple: A tuple containing the training set (list) and the testing set (list).
    """
    random.seed(42)
    json_data=eval(json_data)

    print(f"Original number of examples: {len(json_data)}\n")
    random.shuffle(json_data)
    
    print("--- Data has been shuffled ---\n")

    # Define the ratio for the training set (e.g., 80% for training).
    
    # Calculate the index at which to split the data.
    total_size = len(json_data)
    split_index = int(total_size * train_ratio)

    # Split the shuffled data into training and testing sets using list slicing.
    train_set = json_data[:split_index]
    test_set = json_data[split_index:]
    train_set=json.dumps(train_set, indent=2)
    test_set=json.dumps(test_set,indent=2)
    return train_set, test_set

In [19]:
train_array,validation_array=shuffle_and_split_puzzles(train_array)

Original number of examples: 10571

--- Data has been shuffled ---



In [20]:
save_json_to_file(train_array, 'blood_relation_final_train.json')

Successfully saved JSON data to 'blood_relation_final_train.json'


In [21]:
save_json_to_file(validation_array,'blood_relation_final_val.json')

Successfully saved JSON data to 'blood_relation_final_val.json'


In [22]:
train_array=eval(train_array)
validation_array=eval(validation_array)
train_array[:5]

[{'topic': 'Blood Relations and Family Tree',
  'question': "Martha and her husband John went to church. They were religious, and tried to go every Sunday. Renee's father, John just got out of a court hearing. How is Martha related to Renee?",
  'choices': ['A) Niece', 'B) Nephew', 'C) Brother', 'D) Daughter'],
  'answer': 'D',
  'explanation': "Based on the provided story, the relationship between Martha and Renee is 'daughter'."},
 {'topic': 'Blood Relations and Family Tree',
  'question': "Helen's son, Joseph, went with her to the store to buy candy for Halloween. James is proud to be married to his loving wife Helen. Latisha takes her son James to the park every Saturday. How is Latisha related to Joseph?",
  'choices': ['A) Aunt', 'B) Brother', 'C) Daughter-in-law', 'D) Grandson'],
  'answer': 'D',
  'explanation': "Based on the provided story, the relationship between Latisha and Joseph is 'grandson'."},
 {'topic': 'Blood Relations and Family Tree',
  'question': "Dorothea is tak

# Truth and Liar

In [23]:
with open('truth_liar_full.json','r',encoding='utf-8') as f:
    t_string=f.read()

In [24]:
print(type(t_string))

<class 'str'>


In [25]:
truth_n_liar_array=eval(t_string)

In [26]:
t_train_array,t_validation_array=shuffle_and_split_puzzles(t_string)

Original number of examples: 10000

--- Data has been shuffled ---



In [27]:
print(len(train_array))

8456


In [28]:
t_train_array=eval(t_train_array)
t_validation_array=eval(t_validation_array)

In [29]:
print(len(t_train_array))

8000


In [30]:
train_array.extend(t_train_array)
validation_array.extend(t_validation_array)

In [31]:
print(len(train_array))

16456


In [32]:
train_set=json.dumps(train_array, indent=2)
test_set=json.dumps(validation_array,indent=2)

In [33]:
save_json_to_file(train_set, 'final_blood_truth_train.json')

Successfully saved JSON data to 'final_blood_truth_train.json'


In [34]:
save_json_to_file(test_set,'final_blood_truth_val.json')

Successfully saved JSON data to 'final_blood_truth_val.json'


# Seating Arrangements

In [35]:
with open('amd_seating_10000_COMPREHENSIVE_CHAMPION.json','r',encoding='utf-8') as f:
    t_string=f.read()

In [36]:
seating_array=eval(t_string)

In [37]:
s_train_array, s_validation_array=shuffle_and_split_puzzles(t_string)

Original number of examples: 10000

--- Data has been shuffled ---



In [38]:
s_train_array=eval(s_train_array)
s_validation_array=eval(s_validation_array)

In [39]:
s_train_array[:5]

[{'topic': 'Seating Arrangement',
  'question': 'In a linear arrangement: Emilio, Paki, Qing, Vega, Gina. Who sits immediately to the right of Emilio?',
  'choices': ['A) Emilio', 'B) Qing', 'C) Vega', 'D) Paki'],
  'answer': 'D',
  'explanation': 'Arrangement: Emilio, Paki, Qing, Vega, Gina. Looking at the linear arrangement, Paki is the adjacent person.',
  'metadata': {'arrangement_type': 'linear',
   'num_people': 5,
   'question_type': 'adjacent_right',
   'token_count': 41}},
 {'topic': 'Seating Arrangement',
  'question': 'In a linear arrangement: Luis, Victor, Dean, Bruno, Feng. Who sits at position 4 from the left?',
  'choices': ['A) Dean', 'B) Feng', 'C) Luis', 'D) Bruno'],
  'answer': 'D',
  'explanation': 'Arrangement: Luis, Victor, Dean, Bruno, Feng. Based on the spatial relationships, Bruno is the correct answer.',
  'metadata': {'arrangement_type': 'linear',
   'num_people': 5,
   'question_type': 'nth_position',
   'token_count': 37}},
 {'topic': 'Seating Arrangement',

In [40]:
print(len(s_train_array))

8000


In [41]:
train_array.extend(t_train_array)
validation_array.extend(t_validation_array)

In [42]:
print(len(train_array))
print(len(t_validation_array))

24456
2000


In [43]:
train_set=json.dumps(train_array, indent=2)
test_set=json.dumps(validation_array,indent=2)

In [44]:
save_json_to_file(train_set, 'final_blood_truth_seating_train.json')
save_json_to_file(test_set,'final_blood_truth_seating_val.json')

Successfully saved JSON data to 'final_blood_truth_seating_train.json'
Successfully saved JSON data to 'final_blood_truth_seating_val.json'
