# Data Set Creation

### Load Data

In [None]:
# !huggingface-cli login

In [None]:
import json
import re

import pandas as pd
from datasets import load_dataset

# Load the "en" configuration of the hivaze/LOGIC-701 dataset.
ds = load_dataset("hivaze/LOGIC-701", "en")

In [None]:
# Materialise the "train" split as a DataFrame and keep a CSV snapshot of it.
df = pd.DataFrame(ds["train"])
# df.columns
# NOTE(review): the target directory must already exist; to_csv does not create it.
df.to_csv("../data/real_data/logic_701.csv", index=False)

### Transform Data

In [None]:
# System prompt for the transformation step: asks the model to solve each puzzle
# and return it in the target JSON layout, with one worked example embedded.
# NOTE(review): the embedded example has "answer": "1" although the instructions
# ask for the answer as text rather than a number — confirm which form is wanted.
system_prompt=("""I have data from a online dataset and I want you to help me to solve the logical problem and transform the data into a target format. Try to find the most optimal solution. I want you to output the question parsing, answer, cot, cot_parsing and sel_idx in json format. Please make sure that the output is valid JSON format. Formulate the answer as text. Do not just give the number or letter as answer.

Target format:
[
    {
        "question": "There are 7 outstanding students G, H, L, M, U, W and Z in a school.During the summer vacation, the school will send them to the United Kingdom and the United States for inspection.The school has only 7 students participating in this activity, and each person happens to go to one of these two countries.Considering the specialty of each student, this activity must meet the following conditions? (1) If G goes to the UK, then H To the United States.(2) If L goes to the UK, both M and U go to the US.(3) The country W went to was different from the country Z went to.(4) The country where U goes is different from the country where G goes.(5) If Z goes to the UK, then H also goes to the UK.\nIf G goes to the United States, which of the following must be true?\nA.H go to the UK\nB.L go to America\nC.M go to the UK\nD.W go to America",
        "question_parsing": [
            "There are 7 outstanding students G, H, L, M, U, W and Z in a school.During the summer vacation, the school will send them to the United Kingdom and the United States for inspection.",
            "each person happens to go to one of these two countries",
            "If G goes to the UK, then H To the United States",
            "If L goes to the UK, both M and U go to the US",
            "The country W went to was different from the country Z went to",
            "The country where U goes is different from the country where G goes",
            "If Z goes to the UK, then H also goes to the UK",
            "G goes to the United States"
        ],
        "answer": "1",
        "id": 162,
        "cot": "Since G goes to the United States, we need to analyze the conditions that follow. Condition (1) is not applicable since G is going to the US. Condition (2) is also not applicable since L's destination is not specified. Condition (3) does not provide any information about H, M, U, or W. Condition (4) states that U's destination is different from G's, which is the US, so U must go to the UK. Condition (5) is not applicable since Z's destination is not specified.",
        "cot_parsing": [
            {
                "statement": "Condition (1) is not applicable",
                "evidence": "Condition (1): If G goes to the UK, then H To the United States. | G is going to the US",
                "Verification": "false"
            },
            {
                "statement": "Condition (2) is also not applicable",
                "evidence": "Condition (2): If L goes to the UK, both M and U go to the US. | L's destination is not specified",
                "Verification": "false"
            },
            {
                "statement": "Condition (3) does not provide any information about H, M, U, or W",
                "evidence": "Condition (3): The country W went to was different from the country Z went to.",
                "Verification": "false"
            },
            {
                "statement": "U must go to the UK",
                "evidence": "Condition (4): The country where U goes is different from the country where G goes. | Condition (4) states that U's destination is different from G's, which is the US",
                "Verification": "true"
            },
            {
                "statement": "Condition (5) is not applicable",
                "evidence": "Condition (5): If Z goes to the UK, then H also goes to the UK. | Z's destination is not specified",
                "Verification": "true"
            }
        ],
        "sel_idx": 92
    }
]


    given data:
    """)

In [None]:
import os
from utils.AzureAdapter import AzureAdapter
from dotenv import load_dotenv

# Load variables from a .env file into the process environment before reading them.
load_dotenv()

# Azure connection settings are read from the environment; os.getenv returns
# None for any name that is not set.
api_key = os.getenv("AZURE_API_KEY")
api_endpoint = os.getenv("AZURE_API_ENDPOINT")
api_version = os.getenv("AZURE_API_VERSION")
# Deployment name passed to every llm.call_model invocation in generate_cot.
deployment_name = "gpt-4o"

# Project-local adapter (utils.AzureAdapter) used for the model calls.
llm = AzureAdapter(api_key=api_key, api_endpoint=api_endpoint, api_version=api_version)

In [None]:
def save_to_json(output_folder, data, idx):
    """Write ``data`` as 4-space-indented JSON to ``<output_folder>/response_<idx>.json``.

    The folder is created when it does not exist yet; an existing file with
    the same name is overwritten.

    Args:
        output_folder: Directory that receives the file.
        data: JSON-serialisable object to store.
        idx: Value interpolated into the file name.
    """
    os.makedirs(output_folder, exist_ok=True)
    target = os.path.join(output_folder, f"response_{idx}.json")
    with open(target, 'w') as handle:
        handle.write(json.dumps(data, indent=4))

In [None]:
# Merge the puzzle text and its five answer options into one
# "column: value, column: value, ..." string per row; missing values become ''.
# The result overwrites df["problem_statement"], so running this cell a second
# time nests the already-merged text inside a new merge.
df["problem_statement"] = (
    df[['problem_statement', 'answer_option_1', 'answer_option_2', 'answer_option_3', 'answer_option_4', 'answer_option_5']]
    .fillna('')
    .apply(lambda record: ', '.join(f"{label}: {value}" for label, value in record.items()), axis=1)
)

In [None]:
# Show the merged prompt text of the first row as a spot check.
df["problem_statement"][0]

In [None]:
from tqdm import tqdm

def _parse_model_json(response):
    """Decode a model reply as JSON, tolerating a surrounding Markdown code fence."""
    text = response.strip()
    if text.startswith("```"):
        # Drop the opening fence line (``` or ```json) and the closing fence.
        text = text.split("\n", 1)[1] if "\n" in text else ""
        text = text.rsplit("```", 1)[0]
    return json.loads(text)


def generate_cot(n_rows, start_idx=42):
    """Ask the model to transform dataset rows and store each reply as JSON.

    Rows of the global ``df`` whose index label lies in ``[start_idx, n_rows)``
    are serialised with ``row.to_json()``, sent to ``llm.call_model`` together
    with ``system_prompt``, and the parsed reply is written through
    ``save_to_json`` to ``./data/response_<idx>.json``.

    Args:
        n_rows: Exclusive upper bound on the index labels to process.
        start_idx: First index label to process. The default of 42 keeps the
            resume point that was previously hard-coded in the loop.

    A row whose reply cannot be parsed or saved is reported on stdout and
    skipped; the loop continues with the next row.
    """
    # Select the work up front so the progress bar total matches the number of
    # model calls actually made.
    batch = df[(df.index >= start_idx) & (df.index < n_rows)]
    for idx, row in tqdm(batch.iterrows(), total=len(batch), desc="Processing rows"):
        processed_row = row.to_json()
        response = llm.call_model(prompt=str(processed_row) + f"sel_idx: {idx}", system_prompt=system_prompt, deployment_name=deployment_name)
        try:
            # SECURITY: the reply is untrusted model output. It is decoded with
            # json.loads instead of eval(), which would execute arbitrary
            # Python. Replies that are not valid JSON (e.g. Python-style single
            # quotes) are reported as errors below.
            parsed_response = _parse_model_json(response)
            save_to_json("./data", data=parsed_response, idx=idx)
        except Exception as e:
            # Deliberate best-effort: one bad reply must not abort the batch.
            print(f"Error processing row with id {idx}: {str(e)}")


In [None]:
# Process dataset rows up to index 49; rows with an index below 42 are skipped
# inside generate_cot.
generate_cot(50)

In [None]:
def combine_json_to_jsonl(input_folder, output_file):
    """Merge every ``*.json`` file in ``input_folder`` into one output file.

    Files are read in natural order of their names (``response_2.json`` before
    ``response_10.json``), so the combined list has a stable order that later
    position-based comparisons can rely on.

    Note that, despite the ``.jsonl`` naming used by callers, the output is a
    single JSON array holding all loaded documents; the cells that read it
    back parse the whole file as one JSON value, so that format is kept.

    Args:
        input_folder: Directory containing the per-row ``.json`` files.
        output_file: Path of the combined file to write (overwritten).

    Returns:
        list: The loaded documents, in the order they were written.
    """
    def _natural_key(name):
        # re.split with a capturing group alternates text / digit runs, so
        # odd positions are always digit runs and can be compared as ints.
        return [int(part) if pos % 2 else part
                for pos, part in enumerate(re.split(r"(\d+)", name))]

    json_names = sorted(
        (name for name in os.listdir(input_folder) if name.endswith('.json')),
        key=_natural_key,
    )

    data_combined = []
    for file_name in json_names:
        with open(os.path.join(input_folder, file_name), 'r') as json_file:
            data_combined.append(json.load(json_file))

    # Write only after every input was read, so an unreadable input cannot
    # leave a truncated output file behind.
    with open(output_file, 'w') as jsonl_file:
        jsonl_file.write(json.dumps(data_combined))
    print(f"Combined JSONL file saved to {output_file}")
    return data_combined

# Example usage: merge the per-row response files under ./data into one file.
combine_json_to_jsonl('./data', 'combined_data_answer_complete.jsonl')

In [None]:
# Load the original dataset snapshot and the combined generation output.
# NOTE(review): the CSV was written to "../data/real_data/logic_701.csv" earlier
# in this notebook but is read from "./logic_701.csv" here — confirm the path.
df_original = pd.read_csv('./logic_701.csv')
df_generated = pd.read_json('combined_data_answer_complete.jsonl')

df_generated

### Data Verification

In [None]:
import json

# Parse the whole file as a single JSON document (not line by line).
# NOTE(review): this file name differs from 'combined_data_answer_complete.jsonl'
# written earlier in the notebook — presumably produced by a separate run; confirm.
# df_generated is rebound here to the parsed JSON value instead of a DataFrame.
with open("combined_data_full_generation.jsonl", "r") as file:
    df_generated = json.loads(file.read())

type(df_generated)

In [None]:
# Collect the "answer" field of every generated item, in file order.
df_generated_answers = [item["answer"] for item in df_generated]
df_generated_answers

In [None]:
def _answers_match(generated_item, original_answer):
    """Return True when the generated answer is covered by the reference answer.

    A collection reference is tested by membership. Any other reference
    (string, or a scalar such as an option number) is compared by substring
    containment on the string forms, so a numeric reference no longer raises
    ``TypeError`` on ``in``.
    """
    if isinstance(original_answer, (list, tuple, set, frozenset)):
        return generated_item in original_answer
    return str(generated_item) in str(original_answer)


def compare(original_answers, generated_answers):
    """Compare generated answers with reference answers position by position.

    Args:
        original_answers: Iterable of reference answers.
        generated_answers: Sized iterable of generated answers.

    Returns:
        dict with the keys
        ``total`` (``len(generated_answers)``),
        ``correct`` / ``incorrect`` (pair counts), and
        ``matched_answers`` / ``mismatched_answers`` (lists of
        ``(original, generated)`` tuples for the individual pairs).
        Pairs are formed with ``zip``, so comparison stops at the shorter input.
    """
    comparison_results = {
        "total": len(generated_answers),
        "correct": 0,
        "incorrect": 0,
        "mismatched_answers": [],
        "matched_answers": []
    }

    # A distinct loop name keeps the ``original_answers`` argument intact.
    for generated_item, original_item in zip(generated_answers, original_answers):
        pair = (original_item, generated_item)
        if _answers_match(generated_item, original_item):
            comparison_results["correct"] += 1
            comparison_results["matched_answers"].append(pair)
        else:
            comparison_results["incorrect"] += 1
            comparison_results["mismatched_answers"].append(pair)
    return comparison_results

# Reference answers: the "correct_option_number" column of the dataset CSV.
df_original_answers = pd.read_csv("./logic_701.csv")["correct_option_number"]
# Example usage: pair reference and generated answers by position and print the tally.
results = compare(df_original_answers, df_generated_answers)
print(results)

In [None]:
# Inspect the pairs that compare() classified as mismatches.
results["mismatched_answers"]

In [None]:
# Pair each reference answer with the generated answer at the same position.
# (Despite its name, `questions` holds answer pairs.)
questions = list(zip(df_original_answers, df_generated_answers))

In [None]:
# Display the (reference, generated) pairs built in the previous cell.
questions

### Benchmarking with OpenPipe

In [None]:
import json

# Re-read the generated data so this section can run on its own; the whole
# file is parsed as one JSON document.
with open("combined_data_full_generation.jsonl", "r") as file:
    df_generated = json.loads(file.read())

In [None]:
# Display the parsed generation data.
df_generated

In [None]:
# Preview the first item serialised the way format_data embeds it in the assistant message.
json.dumps(df_generated[0])

In [None]:
# System message shared by every fine-tuning example produced by format_data.
_LOGIC_PARSER_SYSTEM_PROMPT = "You are a reasoning assistant that transforms logic-based multiple-choice questions into structured JSON outputs. For each question:\n\n1. Parse the question into distinct logical or contextual statements and list them as `question_parsing`.\n2. Solve the problem using a clear, step-by-step chain-of-thought (`cot`).\n3. Parse the chain-of-thought into structured steps under `cot_parsing`, where each step includes:\n   - `statement`: the logical inference made\n   - `evidence`: the rule or fact supporting it\n   - `Verification`: whether the statement is logically verified as `true` or `false`\n4. Determine and return the **correct answer choice as a fully formulated sentence** under `answer`, not just a letter.\n5. Include the original question and use a valid JSON structure:\n\n{\n  \"question\": \"...\",\n  \"question_parsing\": [ \"...\", \"...\" ],\n  \"answer\": \"<full_text_of_correct_answer>\",\n  \"id\": <integer>,\n  \"cot\": \"...\",\n  \"cot_parsing\": [\n    { \"statement\": \"...\", \"evidence\": \"...\", \"Verification\": \"true\" | \"false\" },\n    ...\n  ],\n  \"sel_idx\": <integer>\n}\n\nBe logical, step-by-step, and do not make unsupported assumptions. Output only the JSON."


def format_data(data: list) -> list:
    """Turn generated puzzles into chat-style fine-tuning records.

    Args:
        data: Iterable of puzzle dicts; each must contain a ``"question"`` key
            and be JSON-serialisable.

    Returns:
        One dict per puzzle with
        ``"messages"`` — a system message (the shared parser prompt), a user
        message holding ``puzzle["question"]``, and an assistant message
        holding the whole puzzle serialised with ``json.dumps`` — and a fixed
        ``"metadata"`` dict.

    Raises:
        KeyError: If a puzzle has no ``"question"`` entry.
    """
    return [
        {
            "messages": [
                {"role": "system", "content": _LOGIC_PARSER_SYSTEM_PROMPT},
                {"role": "user", "content": puzzle["question"]},
                {"role": "assistant", "content": json.dumps(puzzle)},
            ],
            "metadata": {"prompt_id": "logic_question_parser", "source": "hivaze/LOGIC-701"},
        }
        for puzzle in data
    ]

In [None]:
# Build one chat-format training record per generated puzzle.
dataset_fine_tuning = format_data(df_generated)

In [None]:
# Spot-check the first training record.
dataset_fine_tuning[0]

In [None]:
def save_to_jsonl(data, output_file):
    """Write each entry of ``data`` as one JSON document per line.

    Args:
        data: Iterable of JSON-serialisable entries.
        output_file: Destination path; an existing file is overwritten.
    """
    with open(output_file, 'w') as sink:
        sink.writelines(f"{json.dumps(record)}\n" for record in data)

# Persist the training records, one JSON object per line.
save_to_jsonl(dataset_fine_tuning, "dataset_fine_tuning.jsonl")

In [None]:
from dotenv import load_dotenv

load_dotenv()

# Value later passed to the OpenPipe client as its API key.
# NOTE(review): "structural-reasoning" is used as the environment-variable name;
# os.getenv returns None when it is unset — confirm this is the intended name.
open_pipe = os.getenv("structural-reasoning")

In [None]:
# pip install openpipe

from openpipe import OpenAI

# NOTE(review): f"{open_pipe}" stringifies the value, so an unset key is passed
# on as the literal text "None" instead of failing here.
client = OpenAI(
  openpipe={"api_key": f"{open_pipe}"}
)

# Send one sample puzzle to the "openpipe:long-taxis-show" model. The system
# message appears to duplicate the prompt text embedded in format_data.
completion = client.chat.completions.create(
    model="openpipe:long-taxis-show",
    messages=[
        {
            "role": "system",
            "content": "You are a reasoning assistant that transforms logic-based multiple-choice questions into structured JSON outputs. For each question:\n\n1. Parse the question into distinct logical or contextual statements and list them as `question_parsing`.\n2. Solve the problem using a clear, step-by-step chain-of-thought (`cot`).\n3. Parse the chain-of-thought into structured steps under `cot_parsing`, where each step includes:\n   - `statement`: the logical inference made\n   - `evidence`: the rule or fact supporting it\n   - `Verification`: whether the statement is logically verified as `true` or `false`\n4. Determine and return the **correct answer choice as a fully formulated sentence** under `answer`, not just a letter.\n5. Include the original question and use a valid JSON structure:\n\n{\n  \"question\": \"...\",\n  \"question_parsing\": [ \"...\", \"...\" ],\n  \"answer\": \"<full_text_of_correct_answer>\",\n  \"id\": <integer>,\n  \"cot\": \"...\",\n  \"cot_parsing\": [\n    { \"statement\": \"...\", \"evidence\": \"...\", \"Verification\": \"true\" | \"false\" },\n    ...\n  ],\n  \"sel_idx\": <integer>\n}\n\nBe logical, step-by-step, and do not make unsupported assumptions. Output only the JSON."
        },
        {
            "role": "user",
            "content": "There are 7 outstanding students G, H, L, M, U, W and Z in a school.During the summer vacation, the school will send them to the United Kingdom and the United States for inspection.The school has only 7 students participating in this activity, and each person happens to go to one of these two countries.Considering the specialty of each student, this activity must meet the following conditions? (1) If G goes to the UK, then H To the United States.(2) If L goes to the UK, both M and U go to the US.(3) The country W went to was different from the country Z went to.(4) The country where U goes is different from the country where G goes.(5) If Z goes to the UK, then H also goes to the UK. If G goes to the United States, which of the following must be true? A.H go to the UK B.L go to America C.M go to the UK D.W go to America"
        }
    ],
    temperature=0,
    # NOTE(review): prompt_id "counting" differs from the "logic_question_parser"
    # id used in format_data's metadata — confirm which tag is intended.
    openpipe={
        "tags": {
            "prompt_id": "counting",
            "any_key": "any_value"
        }
    },
)

# Print the message object of the first returned choice.
print(completion.choices[0].message)