
# Get responses from GPT-4o using the OpenAI API

In [None]:
!pip install openai

Collecting openai
  Downloading openai-1.47.1-py3-none-any.whl.metadata (24 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading openai-1.47.1-py3-none-any.whl (375 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m375.6/375.6 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx-0.27.2-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K   [90m━

In [None]:
import os
import random
import time

import openai
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
# The key: read it from the OPENAI_API_KEY environment variable so the secret is
# never stored in (or committed with) the notebook. Falls back to an empty string.
openai.api_key = os.environ.get('OPENAI_API_KEY', '')

In [None]:
def analyze_sentences_gpt_4o(prompts, model="gpt-4o", max_tokens=500,
                             temperature=0.6, delay=1, client=None):
    """Send each prompt to an OpenAI chat model and collect the replies.

    Args:
        prompts: Iterable of prompt strings; each one is stripped before sending.
        model: Name of the chat-completions model to query.
        max_tokens: Upper bound on the tokens generated per reply.
        temperature: Sampling temperature passed to the API.
        delay: Seconds to sleep after every request (crude rate limiting);
            a falsy value disables the pause.
        client: Optional object exposing ``chat.completions.create``. When
            omitted, a client is built from ``openai.api_key``.

    Returns:
        A list with one ``{'prompt': ..., 'response': ...}`` dict per prompt, in
        input order. If a request raises, ``'response'`` holds the exception
        text instead of a model answer.
    """
    if client is None:
        client = openai.OpenAI(api_key=openai.api_key)
    responses = []

    for prompt in prompts:
        text = prompt.strip()
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": text}],
                max_tokens=max_tokens,
                temperature=temperature,
            )
            result = response.choices[0].message.content.strip()
            print(result)
        except Exception as e:
            # Best effort: keep exactly one record per prompt so results stay
            # aligned with the input. NOTE: the error text is stored as the
            # response, so downstream scoring counts it as a wrong answer.
            result = str(e)
        # Success and failure records now share the same keys.
        responses.append({'prompt': text, 'response': result})
        if delay:
            time.sleep(delay)  # Add delay to avoid hitting rate limits

    return responses

# Connecting to GitHub

In [None]:
import os

!git clone https://github.com/.../Research.git
!git config --global user.email "...."
!git config --global user.name "...."
%cd Research/dataset
#Token
os.environ['GITHUB_TOKEN'] = ''

Cloning into 'Research'...
remote: Enumerating objects: 590, done.[K
remote: Counting objects: 100% (164/164), done.[K
remote: Compressing objects: 100% (133/133), done.[K
remote: Total 590 (delta 89), reused 81 (delta 27), pack-reused 426 (from 1)[K
Receiving objects: 100% (590/590), 744.15 KiB | 9.79 MiB/s, done.
Resolving deltas: 100% (317/317), done.
/content/Research/dataset


# Evaluating gpt4o

## Direct pairs

In [None]:
# Load the direct-pair prompts and send each one to the base GPT-4o model.
prompt_column = pd.read_csv('basic_knowledge_prompt.csv')['Prompt']
prompts = prompt_column.tolist()
responses = analyze_sentences_gpt_4o(prompts)

In [None]:
# Pull the answer text out of each response record.
clean_responses = [record['response'] for record in responses]

# Map True/False in the `Result` column to the strings 'True'/'False' so they
# can be compared with the model's text answers.
ground_truth_df = pd.read_csv('basic_knowledge_prompt.csv')['Result'].map({True: 'True', False: 'False'})
comparison_df = pd.DataFrame({
    'Prompt': prompts,
    'Response': clean_responses,
    'Correct Answer': ground_truth_df,
})

# Score the answers, treating "True" as the positive class.
expected = comparison_df['Correct Answer']
predicted = comparison_df['Response']
precision = precision_score(expected, predicted, pos_label="True")
recall = recall_score(expected, predicted, pos_label="True")
f1 = f1_score(expected, predicted, pos_label="True")

# Report each metric to two decimals.
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

Precision: 0.89
Recall: 0.90
F1-Score: 0.89


In [None]:
# Define the target directory relative to the repository root
target_directory = '../model/GPT4o'

# Check if the directory exists, and create it if not
if not os.path.exists(target_directory):
    os.makedirs(target_directory)  # Create the directory and all intermediate-level directories

# Define the full path where you want to save the CSV file
save_path = os.path.join(target_directory, 'basic_knowledge_response.csv')

# Save the DataFrame to the specified path
comparison_df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
!git remote set-url origin https://$GITHUB_TOKEN@github.com/.../Research.git
!git add {save_path}
!git commit -m "Added basic_knowledge_response to the model folder"
!git push origin main

## Mediator graph

In [None]:
# Load the mediator-graph prompts and send each one to the base GPT-4o model.
prompt_column = pd.read_csv('mediator_prompt.csv')['Prompt']
prompts = prompt_column.tolist()
responses = analyze_sentences_gpt_4o(prompts)

In [None]:
# Pull the answer text out of each response record.
clean_responses = [item['response'] for item in responses]

def clean_response(response):
    """Remove every 'mediator: ' occurrence and '<eos>' marker, then strip surrounding whitespace."""
    return response.replace('mediator: ', '').replace('<eos>', '').strip()

cleaned_responses = [clean_response(response) for response in clean_responses]
ground_truth_df = pd.read_csv('mediator_prompt.csv')['Correct Answer']

comparison_df = pd.DataFrame({
    'Prompt': prompts,
    'Response': cleaned_responses,
    'Correct Answer': ground_truth_df,  # Corresponding correct answers
})

# Exact, case-sensitive string match between the cleaned response and the
# correct answer; no further normalization is applied to either side.
comparison_df['Is Correct'] = comparison_df['Response'] == comparison_df['Correct Answer']

# Accuracy is the fraction of exact matches.
accuracy = comparison_df['Is Correct'].mean()
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 38.83%


In [None]:
# Define the target directory relative to the repository root
target_directory = '../model/GPT4o'

# Check if the directory exists, and create it if not
if not os.path.exists(target_directory):
    os.makedirs(target_directory)  # Create the directory and all intermediate-level directories

# Define the full path where you want to save the CSV file
save_path = os.path.join(target_directory, 'Mediator_response.csv')

# Save the DataFrame to the specified path
comparison_df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
!git remote set-url origin https://$GITHUB_TOKEN@github.com/.../Research.git
!git add {save_path}
!git commit -m "Added Mediator_response to the model folder"
!git push origin main

## Confounder graph

In [None]:
# Load the confounder-graph prompts and send each one to the base GPT-4o model.
prompt_column = pd.read_csv('confunder_prompt.csv')['Prompt']
prompts = prompt_column.tolist()
responses = analyze_sentences_gpt_4o(prompts)

In [None]:
# Pull the answer text out of each response record.
clean_responses = [item['response'] for item in responses]

def clean_response(response):
    """Remove every 'confounder:' occurrence, then strip surrounding whitespace."""
    return response.replace('confounder:', '').strip()

cleaned_responses = [clean_response(response) for response in clean_responses]
ground_truth_df = pd.read_csv('confunder_prompt.csv')['Correct Answer']

comparison_df = pd.DataFrame({
    'Prompt': prompts,
    'Response': cleaned_responses,
    'Correct Answer': ground_truth_df,  # Corresponding correct answers
})

# Exact, case-sensitive string match between the cleaned response and the
# correct answer; no further normalization is applied to either side.
comparison_df['Is Correct'] = comparison_df['Response'] == comparison_df['Correct Answer']

# Accuracy is the fraction of exact matches.
accuracy = comparison_df['Is Correct'].mean()
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 52.00%


In [None]:
# Define the target directory relative to the repository root
target_directory = '../model/GPT4o'

# Check if the directory exists, and create it if not
if not os.path.exists(target_directory):
    os.makedirs(target_directory)  # Create the directory and all intermediate-level directories

# Define the full path where you want to save the CSV file
save_path = os.path.join(target_directory, 'confunder_response.csv')

# Save the DataFrame to the specified path
comparison_df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
!git remote set-url origin https://$GITHUB_TOKEN@github.com/.../Research.git
!git add {save_path}
!git commit -m "Added confunder_response to the model folder"
!git push origin main

## Collider graph

In [None]:
# Load the collider-graph prompts and send each one to the base GPT-4o model.
prompt_column = pd.read_csv('collider_prompt.csv')['Prompt']
prompts = prompt_column.tolist()
responses = analyze_sentences_gpt_4o(prompts)

In [None]:
# Pull the answer text out of each response record.
clean_responses = [item['response'] for item in responses]

def clean_response(response):
    """Remove every 'collider:' occurrence, then strip surrounding whitespace."""
    return response.replace('collider:', '').strip()

cleaned_responses = [clean_response(response) for response in clean_responses]
ground_truth_df = pd.read_csv('collider_prompt.csv')['Correct Answer']

comparison_df = pd.DataFrame({
    'Prompt': prompts,
    'Response': cleaned_responses,
    'Correct Answer': ground_truth_df,  # Corresponding correct answers
})

# Exact, case-sensitive string match between the cleaned response and the
# correct answer; no further normalization is applied to either side.
comparison_df['Is Correct'] = comparison_df['Response'] == comparison_df['Correct Answer']

# Accuracy is the fraction of exact matches.
accuracy = comparison_df['Is Correct'].mean()
print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 90.80%


In [None]:
# Define the target directory relative to the repository root
target_directory = '../model/GPT4o'

# Check if the directory exists, and create it if not
if not os.path.exists(target_directory):
    os.makedirs(target_directory)  # Create the directory and all intermediate-level directories

# Define the full path where you want to save the CSV file
save_path = os.path.join(target_directory, 'collider_response.csv')

# Save the DataFrame to the specified path
comparison_df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
!git remote set-url origin https://$GITHUB_TOKEN@github.com/.../Research.git
!git add {save_path}
!git commit -m "Added collider_response to the model folder"
!git push origin main

# Fine tuning

## Prepare the data

In [None]:
import pandas as pd
import json

# Load the direct-pair dataset; the formatter defined below reads its
# Source, Target and Result columns.
ground_truth_df = pd.read_csv('basic_knowledge_prompt.csv')

# Build one chat-format fine-tuning example from a dataset row
def format_for_finetuning(row):
    """Turn one dataset row into a chat-format fine-tuning example.

    Reads ``row['Source']``, ``row['Target']`` and ``row['Result']`` and returns
    a dict with a single ``"messages"`` list holding, in order: a fixed system
    prompt, a true/false question built from Source and Target, and
    ``str(row['Result'])`` as the assistant's answer.
    """
    system_message = {
        "role": "system",
        "content": "You are an expert in diagnosing neuropathic pain, using your causal reasoning ability to find relationships between different symptoms, conditions, and potential causes.",
    }
    user_message = {
        "role": "user",
        "content": f"Is the following statement true or false? In the context of diagnosing neuropathic pain, {row['Source']} causes discomfort related to {row['Target']} in the feet.",
    }
    assistant_message = {"role": "assistant", "content": str(row['Result'])}
    return {"messages": [system_message, user_message, assistant_message]}

# Format every row (axis=1 passes each row to the formatter) and collect the
# resulting examples in a plain list.
formatted_rows = ground_truth_df.apply(format_for_finetuning, axis=1)
formatted_data_list = formatted_rows.tolist()



In [None]:
# Define the target directory relative to the repository root
target_directory = '../model/GPT4o_fine_tuned_with_direct_pair'

# Check if the directory exists, and create it if not
if not os.path.exists(target_directory):
    os.makedirs(target_directory)  # Create the directory and all intermediate-level directories

# Define the full path where you want to save the CSV file
save_path = os.path.join(target_directory, 'fine_tune_data_chat_format.jsonl')

with open(save_path, 'w') as outfile:
    for entry in formatted_data_list:
        json.dump(entry, outfile)
        outfile.write('\n')  # Write each formatted dictionary as a separate line

print(f"File saved to: {save_path}")


print(f"File saved to: {save_path}")
!git remote set-url origin https://$GITHUB_TOKEN@github.com/.../Research.git
!git add {save_path}
!git commit -m "Added collider_response to the model folder"
!git push origin main

In [None]:
# Build an OpenAI client from the module-level API key; the upload, job-creation
# and status cells below call it.
client = openai.OpenAI(api_key=openai.api_key)

## Upload the training data file

In [None]:
# Training file written by the data-preparation cell; this must be the same
# 'GPT4o_fine_tuned_with_direct_pair' folder that cell saves into.
file_path = '../model/GPT4o_fine_tuned_with_direct_pair/fine_tune_data_chat_format.jsonl'

# Upload the file for fine-tuning; the context manager closes the handle once
# the upload call returns.
with open(file_path, "rb") as training_file:
    uploaded_file = client.files.create(
        file=training_file,
        purpose='fine-tune'
    )

# Display the upload result as the cell output.
uploaded_file


## Create a fine-tuning job

In [None]:
# Start a fine-tuning job on top of gpt-4o-2024-08-06.
# NOTE(review): training_file is a blank placeholder here - presumably it must be
# the ID of the uploaded training file; confirm before running.
client.fine_tuning.jobs.create(
  training_file=" ",
  model="gpt-4o-2024-08-06"
)

## Check the status


In [None]:
# Retrieve the fine-tuning job and print what the API returns for it.
# NOTE(review): the job ID is an empty placeholder - presumably the ID returned
# when the job was created; confirm before running.
job_status = client.fine_tuning.jobs.retrieve("")
print(job_status)

# Evaluating the performance of the model fine-tuned on direct pairs

In [None]:
def analyze_sentences_gpt4o_tuned_with_directPair(prompts, model='ft:', max_tokens=100,
                                                  temperature=0.5, delay=1, client=None):
    """Send each prompt to the fine-tuned chat model and collect the replies.

    Args:
        prompts: Iterable of prompt strings; each one is stripped before sending.
        model: Model identifier to query. The default ``'ft:'`` is the
            placeholder left in the notebook; pass the real fine-tuned model id.
        max_tokens: Upper bound on the tokens generated per reply.
        temperature: Sampling temperature passed to the API.
        delay: Seconds to sleep after every request (crude rate limiting);
            a falsy value disables the pause.
        client: Optional object exposing ``chat.completions.create``. When
            omitted, a client is built from ``openai.api_key``.

    Returns:
        A list with one ``{'prompt': ..., 'response': ...}`` dict per prompt, in
        input order. If a request raises, ``'response'`` holds the exception
        text instead of a model answer.
    """
    if client is None:
        client = openai.OpenAI(api_key=openai.api_key)
    responses = []

    for prompt in prompts:
        text = prompt.strip()
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": text}],
                max_tokens=max_tokens,
                temperature=temperature,
            )
            result = response.choices[0].message.content.strip()
        except Exception as e:
            # Best effort: keep exactly one record per prompt so results stay
            # aligned with the input. NOTE: the error text is stored as the
            # response, so downstream scoring counts it as a wrong answer.
            result = str(e)
        responses.append({'prompt': text, 'response': result})
        if delay:
            time.sleep(delay)  # Add delay to avoid hitting rate limits

    return responses

## Direct pairs

In [None]:
# Load the direct-pair prompts and send each one to the direct-pair fine-tuned model.
prompt_column = pd.read_csv('basic_knowledge_prompt.csv')['Prompt']
prompts = prompt_column.tolist()
responses = analyze_sentences_gpt4o_tuned_with_directPair(prompts)

In [None]:
# Pull the answer text out of each response record.
clean_responses = [record['response'] for record in responses]

# Map True/False in the `Result` column to the strings 'True'/'False' so they
# can be compared with the model's text answers.
ground_truth_df = pd.read_csv('basic_knowledge_prompt.csv')['Result'].map({True: 'True', False: 'False'})
comparison_df = pd.DataFrame({
    'Prompt': prompts,
    'Response': clean_responses,
    'Correct Answer': ground_truth_df,
})

# Score the answers, treating "True" as the positive class.
expected = comparison_df['Correct Answer']
predicted = comparison_df['Response']
precision = precision_score(expected, predicted, pos_label="True")
recall = recall_score(expected, predicted, pos_label="True")
f1 = f1_score(expected, predicted, pos_label="True")

# Report each metric to two decimals.
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

Precision: 0.99
Recall: 0.99
F1-Score: 0.99


In [None]:
# Define the target directory relative to the repository root
target_directory = '../model/GPT4o_fine_tuned_with_direct_pair'

# Check if the directory exists, and create it if not
if not os.path.exists(target_directory):
    os.makedirs(target_directory)  # Create the directory and all intermediate-level directories

# Define the full path where you want to save the CSV file
save_path = os.path.join(target_directory, 'basic_knowledge_response.csv')

# Save the DataFrame to the specified path
comparison_df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
!git remote set-url origin https://$GITHUB_TOKEN@github.com/.../Research.git
!git add {save_path}
!git commit -m "Added basic_knowledge_response to the model folder"
!git push origin main

## Mediator graph

In [None]:
# Load the mediator-graph prompts and send each one to the direct-pair fine-tuned model.
prompt_column = pd.read_csv('mediator_prompt.csv')['Prompt']
prompts = prompt_column.tolist()
responses = analyze_sentences_gpt4o_tuned_with_directPair(prompts)

In [None]:
# Pull the answer text out of each response record.
clean_responses = [item['response'] for item in responses]

def clean_response(response):
    """Remove every 'mediator: ' occurrence and '<eos>' marker, then strip surrounding whitespace."""
    return response.replace('mediator: ', '').replace('<eos>', '').strip()

cleaned_responses = [clean_response(response) for response in clean_responses]
ground_truth_df = pd.read_csv('mediator_prompt.csv')['Correct Answer']

comparison_df = pd.DataFrame({
    'Prompt': prompts,
    'Response': cleaned_responses,
    'Correct Answer': ground_truth_df,  # Corresponding correct answers
})

# Exact, case-sensitive string match between the cleaned response and the
# correct answer; no further normalization is applied to either side.
comparison_df['Is Correct'] = comparison_df['Response'] == comparison_df['Correct Answer']

# Accuracy is the fraction of exact matches.
accuracy = comparison_df['Is Correct'].mean()
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 30.73%


In [None]:
# Define the target directory relative to the repository root
target_directory = '../model/GPT4o_fine_tuned_with_direct_pair'

# Check if the directory exists, and create it if not
if not os.path.exists(target_directory):
    os.makedirs(target_directory)  # Create the directory and all intermediate-level directories

# Define the full path where you want to save the CSV file
save_path = os.path.join(target_directory, 'mediator_response.csv')

# Save the DataFrame to the specified path
comparison_df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
!git remote set-url origin https://$GITHUB_TOKEN@github.com/.../Research.git
!git add {save_path}
!git commit -m "Added basic_knowledge_response to the model folder"
!git push origin main

## Confounder graph

In [None]:
# Load the confounder-graph prompts and send each one to the direct-pair fine-tuned model.
prompt_column = pd.read_csv('confunder_prompt.csv')['Prompt']
prompts = prompt_column.tolist()
responses = analyze_sentences_gpt4o_tuned_with_directPair(prompts)

In [None]:
# Pull the answer text out of each response record.
clean_responses = [item['response'] for item in responses]

def clean_response(response):
    """Remove every 'confounder:' occurrence, then strip surrounding whitespace."""
    return response.replace('confounder:', '').strip()

cleaned_responses = [clean_response(response) for response in clean_responses]
ground_truth_df = pd.read_csv('confunder_prompt.csv')['Correct Answer']

comparison_df = pd.DataFrame({
    'Prompt': prompts,
    'Response': cleaned_responses,
    'Correct Answer': ground_truth_df,  # Corresponding correct answers
})

# Exact, case-sensitive string match between the cleaned response and the
# correct answer; no further normalization is applied to either side.
comparison_df['Is Correct'] = comparison_df['Response'] == comparison_df['Correct Answer']

# Accuracy is the fraction of exact matches.
accuracy = comparison_df['Is Correct'].mean()
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 42.20%


In [None]:
# Define the target directory relative to the repository root
target_directory = '../model/GPT4o_fine_tuned_with_direct_pair'

# Check if the directory exists, and create it if not
if not os.path.exists(target_directory):
    os.makedirs(target_directory)  # Create the directory and all intermediate-level directories

# Define the full path where you want to save the CSV file
save_path = os.path.join(target_directory, 'confunder_response.csv')

# Save the DataFrame to the specified path
comparison_df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
!git remote set-url origin https://$GITHUB_TOKEN@github.com/.../Research.git
!git add {save_path}
!git commit -m "Added confunder_response to the model folder"
!git push origin main

## Collider graph

In [None]:
# Load the collider-graph prompts and send each one to the direct-pair fine-tuned model.
prompt_column = pd.read_csv('collider_prompt.csv')['Prompt']
prompts = prompt_column.tolist()
responses = analyze_sentences_gpt4o_tuned_with_directPair(prompts)

In [None]:
# Pull the answer text out of each response record.
clean_responses = [item['response'] for item in responses]

def clean_response(response):
    """Remove every 'collider:' occurrence, then strip surrounding whitespace."""
    return response.replace('collider:', '').strip()

cleaned_responses = [clean_response(response) for response in clean_responses]
ground_truth_df = pd.read_csv('collider_prompt.csv')['Correct Answer']

comparison_df = pd.DataFrame({
    'Prompt': prompts,
    'Response': cleaned_responses,
    'Correct Answer': ground_truth_df,  # Corresponding correct answers
})

# Exact, case-sensitive string match between the cleaned response and the
# correct answer; no further normalization is applied to either side.
comparison_df['Is Correct'] = comparison_df['Response'] == comparison_df['Correct Answer']

# Accuracy is the fraction of exact matches.
accuracy = comparison_df['Is Correct'].mean()
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 91.20%


In [None]:
# Define the target directory relative to the repository root
target_directory = '../model/GPT4o_fine_tuned_with_direct_pair'

# Check if the directory exists, and create it if not
if not os.path.exists(target_directory):
    os.makedirs(target_directory)  # Create the directory and all intermediate-level directories

# Define the full path where you want to save the CSV file
save_path = os.path.join(target_directory, 'collider_response.csv')

# Save the DataFrame to the specified path
comparison_df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
!git remote set-url origin https://$GITHUB_TOKEN@github.com/.../Research.git
!git add {save_path}
!git commit -m "Added collider_response to the model folder"
!git push origin main

# Reflection fine tuning

### Create the fine-tuning dataset manually

In [None]:
import pandas as pd

# Creating the dataset with the six examples formatted as requested
all_formatted_data = [
    {
        "text": """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags.

If you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags.
<|eot_id|><|start_header_id|>user<|end_header_id|>

Identify the mediator variable in the causal pathway A -> B -> C. This means if A causes B is true and B causes C is true, then B is the mediator. Only provide the mediator variable (B) without any additional explanation.
In diagnosing neuropathic pain, consider the variables DLS C6-C7, R C7 Radikulopati, and R Under armsbesvär. Identify the mediator variable.
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
<thinking>
The causal chain here is: DLS C6-C7 causes R C7 Radikulopati discomfort is true.
R C7 Radikulopati causes R Under armsbesvär is true.
The causal pathway is DLS C6-C7 -> R C7 Radikulopati -> R Under armsbesvär,
so the mediator is R C7 Radikulopati.
</thinking>
<reflection>No corrections needed</reflection>
<output>R C7 Radikulopati</output><eos>"""
    },
    {
      "text": """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags.

If you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags.
<|eot_id|><|start_header_id|>user<|end_header_id|>

Identify the mediator variable in the causal pathway A -> B -> C. This means if A causes B is true and B causes C is true, then B is the mediator. Only provide the mediator variable (B) without any additional explanation.
In diagnosing neuropathic pain, consider the variables R C7 Radikulopati, DLS C6-C7, and R Under armsbesvär. Identify the mediator variable.
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
<thinking>
The causal chain here is: DLS C6-C7 causes R C7 Radikulopati discomfort is true.
R C7 Radikulopati causes R Under armsbesvär is true.
The causal pathway is DLS C6-C7 -> R C7 Radikulopati -> R Under armsbesvär,
so the mediator is R C7 Radikulopati.
</thinking>
<reflection>No corrections needed</reflection>
<output>R C7 Radikulopati</output><eos>"""
    },
    {
        "text": """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags.

If you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags.
<|eot_id|><|start_header_id|>user<|end_header_id|>

Identify the mediator variable in the causal pathway A -> B -> C. This means if A causes B is true and B causes C is true, then B is the mediator. Only provide the mediator variable (B) without any additional explanation.
In diagnosing neuropathic pain, consider the variables R C7 Radikulopati, DLS C6-C7, and R Under armsbesvär. Identify the mediator variable.
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
<thinking>
The causal chain here is: R C7 Radikulopati causes DLS C6-C7 discomfort is true.
DLS C6-C7 causes R Under armsbesvär is true.
The causal pathway is R C7 Radikulopati  -> DLS C6-C7 -> R Under armsbesvär,
so the mediator is DLS C6-C7.
</thinking>
<reflection>The correct causal chain is DLS C6-C7 -> R C7 Radikulopati -> R Under armsbesvär, where DLS C6-C7 causes discomfort related to R C7 Radikulopati, and R C7 Radikulopati causes discomfort related to R Under armsbesvär. My mistake was misidentifying the causal relationships within the chain.</reflection>
<output>R C7 Radikulopati</output><eos>"""
    },
    {
        "text": """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags.

If you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags.
<|eot_id|><|start_header_id|>user<|end_header_id|>

Identify the mediator variable in the causal pathway A -> B -> C. This means if A causes B is true and B causes C is true, then B is the mediator. Only provide the mediator variable (B) without any additional explanation.
In diagnosing neuropathic pain, consider the variables R C7 Radikulopati, R Under armsbesvär and  DLS C6-C7. Identify the mediator variable.
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
<thinking>
The causal chain here is: R C7 Radikulopati causes R Under armsbesvär is true.
DLS C6-C7 causes R C7 Radikulopati discomfort is true.
The causal pathway is DLS C6-C7 -> R C7 Radikulopati -> R Under armsbesvär,
so the mediator is R C7 Radikulopati.
</thinking>
<reflection>No corrections needed</reflection>
<output>R C7 Radikulopati</output><eos>"""
    },
    {
        "text": """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags.

If you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags.
<|eot_id|><|start_header_id|>user<|end_header_id|>

Identify the mediator variable in the causal pathway A -> B -> C. This means if A causes B is true and B causes C is true, then B is the mediator. Only provide the mediator variable (B) without any additional explanation.
In diagnosing neuropathic pain, consider the variables R Under armsbesvär, R C7 Radikulopati and DLS C6-C7. Identify the mediator variable.
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
<thinking>
The causal chain here is: R C7 Radikulopati causes R Under armsbesvär is true.
DLS C6-C7 causes R C7 Radikulopati discomfort is true.
The causal pathway is DLS C6-C7 -> R C7 Radikulopati -> R Under armsbesvär,
so the mediator is R C7 Radikulopati.
</thinking>
<reflection>No corrections needed</reflection>
<output>R C7 Radikulopati</output><eos>"""
    },
    {
        "text": """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags.

If you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags.
<|eot_id|><|start_header_id|>user<|end_header_id|>

Identify the mediator variable in the causal pathway A -> B -> C. This means if A causes B is true and B causes C is true, then B is the mediator. Only provide the mediator variable (B) without any additional explanation. In diagnosing neuropathic pain, consider the variables R Under armsbesvär, R C7 Radikulopati and DLS C6-C7. Identify the mediator variable.
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
<thinking>
The causal chain here is: R C7 Radikulopati causes R Under armsbesvär is true. DLS C6-C7 causes R C7 Radikulopati discomfort is true. The causal pathway is DLS C6-C7 -> R C7 Radikulopati -> R Under armsbesvär,
so the mediator is R Under armsbesvär.
</thinking>
<reflection>The mediator should be the variable that connects the causal chain, which is R C7 Radikulopati, not R Under armsbesvär. My mistake was in identifying the effect as the mediator instead of the connecting variable.</reflection>
<output>R C7 Radikulopati</output><eos>"""
    },
    {
        "text":""" <|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags. If you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags. <|eot_id|><|start_header_id|>user<|end_header_id|> Identify the confounder variable in the causal pathway A -> B and A -> C. This means if A causes B is true, and A causes C is true, and B and C have no causal relationship, then A is the confounder. Only provide the confounder variable (A) without any additional explanation. In diagnosing neuropathic pain, consider the variables R Vadbesvär, R Lårbesvär, and R L5 Radikulopati. Identify the confounder variable.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>
<thinking>
The causal chain here is: R L5 Radikulopati causes R Vadbesvär is true, and R L5 Radikulopati causes R Lårbesvär is true. R Vadbesvär and R Lårbesvär have no causal relationship. Therefore, R L5 Radikulopati is the confounder.
</thinking>
<reflection>No corrections needed</reflection>
<output>R L5 Radikulopati</output><eos>"""
    },
    {
        "text":"""<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags. If you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags. <|eot_id|><|start_header_id|>user<|end_header_id|> Identify the confounder variable in the causal pathway A -> B and A -> C. This means if A causes B is true, and A causes C is true, and B and C have no causal relationship, then A is the confounder. Only provide the confounder variable (A) without any additional explanation. In diagnosing neuropathic pain, consider the variables R Lårbesvär, R L5 Radikulopati, and R Vadbesvär. Identify the confounder variable.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>
<thinking>
The causal chain here is: R L5 Radikulopati causes R Lårbesvär is true, and R L5 Radikulopati causes R Vadbesvär is true. R Lårbesvär and R Vadbesvär have no causal relationship. Therefore, R L5 Radikulopati is the confounder.
</thinking>
<reflection>No corrections needed</reflection>
<output>R L5 Radikulopati</output><eos>"""
    },
    {
        "text":"""<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags. If you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags. <|eot_id|><|start_header_id|>user<|end_header_id|> Identify the confounder variable in the causal pathway A -> B and A -> C. This means if A causes B is true, and A causes C is true, and B and C have no causal relationship, then A is the confounder. Only provide the confounder variable (A) without any additional explanation. In diagnosing neuropathic pain, consider the variables R Lårbesvär, R Vadbesvär, and R L5 Radikulopati. Identify the confounder variable.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>
<thinking>
The causal chain here is: R Lårbesvär causes R L5 Radikulopati is true, and R L5 Radikulopati causes R Vadbesvär is true. Therefore, R Lårbesvär is the confounder.
</thinking>
<reflection>Correction: The correct chain should recognize R L5 Radikulopati as the common cause of R Lårbesvär and R Vadbesvär with no direct relationship between the effects. Thus, R L5 Radikulopati is the confounder, not R Lårbesvär.</reflection>
<output>R L5 Radikulopati</output><eos>"""
    },
    {
        "text":"""<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags. If you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags. <|eot_id|><|start_header_id|>user<|end_header_id|> Identify the confounder variable in the causal pathway A -> B and A -> C. This means if A causes B is true, and A causes C is true, and B and C have no causal relationship, then A is the confounder. Only provide the confounder variable (A) without any additional explanation. In diagnosing neuropathic pain, consider the variables R Vadbesvär, R Lårbesvär, and R L5 Radikulopati. Identify the confounder variable.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>
<thinking>
The causal chain here is: R L5 Radikulopati causes R Lårbesvär discomfort is true, and R L5 Radikulopati causes R Lårbesvär is true. Therefore, R Vadbesvär is the confounder.
</thinking>
<reflection>Correction: The error was in misidentifying R Vadbesvär as the confounder. The correct confounder is R L5 Radikulopati, as it influences both R Vadbesvär and R Lårbesvär without them influencing each other.</reflection>
<output>R L5 Radikulopati</output><eos>"""
    },
    {
        "text":"""<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags. If you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags. <|eot_id|><|start_header_id|>user<|end_header_id|> Identify the collider variable in the causal pathway A -> C and B -> C. This means if A causes C is true, and B causes C is true, and A and B have no causal relationship, then C is the collider. Only provide the collider variable (C) without any additional explanation. In diagnosing neuropathic pain, consider the variables R T12 Radikulopati, Nedre bukbesvär, and R T10 Radikulopati. Identify the collider variable.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>
<thinking>
The causal chain here is: R T12 Radikulopati causes Nedre bukbesvär is true, and R T10 Radikulopati causes Nedre bukbesvär is true. R T12 Radikulopati and R T10 Radikulopati have no causal relationship. Therefore, Nedre bukbesvär is the collider.
</thinking>
<reflection>No corrections needed</reflection>
<output>Nedre bukbesvär</output><eos>"""
    },
    {
        "text":"""<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags. If you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags. <|eot_id|><|start_header_id|>user<|end_header_id|> Identify the collider variable in the causal pathway A -> C and B -> C. This means if A causes C is true, and B causes C is true, and A and B have no causal relationship, then C is the collider. Only provide the collider variable (C) without any additional explanation. In diagnosing neuropathic pain, consider the variables Nedre bukbesvär, R T10 Radikulopati, and R T12 Radikulopati. Identify the collider variable.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>
<thinking>
The causal chain here is: R T10 Radikulopati causes Nedre bukbesvär is true, and R T12 Radikulopati causes Nedre bukbesvär is true. R T10 Radikulopati and R T12 Radikulopati have no causal relationship. Therefore, Nedre bukbesvär is the collider.
</thinking>
<reflection>No corrections needed</reflection>
<output>Nedre bukbesvär</output><eos>"""
    },
    {
        "text":"""<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags. If you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags. <|eot_id|><|start_header_id|>user<|end_header_id|> Identify the collider variable in the causal pathway A -> C and B -> C. This means if A causes C is true, and B causes C is true, and A and B have no causal relationship, then C is the collider. Only provide the collider variable (C) without any additional explanation. In diagnosing neuropathic pain, consider the variables R T12 Radikulopati, R T10 Radikulopati and Nedre bukbesvär. Identify the collider variable.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>
<thinking>
The causal chain here is: R T12 Radikulopati causes R T10 Radikulopati is true, and R T10 Radikulopati causes Nedre bukbesvär is true. Therefore, R T12 Radikulopati is the collider.
</thinking>
<reflection>Correction: The correct identification should recognize that Nedre bukbesvär is influenced by both R T12 Radikulopati and R T10 Radikulopati, making it the collider, not R T12 Radikulopati.</reflection>
<output>Nedre bukbesvär</output><eos>"""
    },
    {
        "text":"""<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags. If you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags. <|eot_id|><|start_header_id|>user<|end_header_id|> Identify the collider variable in the causal pathway A -> C and B -> C. This means if A causes C is true, and B causes C is true, and A and B have no causal relationship, then C is the collider. Only provide the collider variable (C) without any additional explanation. In diagnosing neuropathic pain, consider the variables R T10 Radikulopati, R T12 Radikulopati, and Nedre bukbesvär. Identify the collider variable.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>
<thinking>
The causal chain here is: R T10 Radikulopati causes Nedre bukbesvär discomfort is true, and R T12 Radikulopati causes Nedre bukbesvär is true. Therefore, R T10 Radikulopati is the collider.
</thinking>
<reflection>Correction: The error was in misidentifying R T10 Radikulopati as the collider. The correct collider is Nedre bukbesvär, as it is the common effect of both R T10 Radikulopati and R T12 Radikulopati.</reflection>
<output>Nedre bukbesvär</output><eos>"""

    }

]


In [None]:
# Convert every header-token formatted training string in `all_formatted_data`
# into an OpenAI chat fine-tuning record: {"messages": [system, user, assistant]}.
formatted_data_for_gpt = []

for example in all_formatted_data:
    # Each turn is terminated by <|eot_id|>, so the split yields the system,
    # user and assistant segments in that order.
    text = example["text"]
    parts = text.split("<|eot_id|>")

    # Keep only the text that follows each role header
    system_message = parts[0].split("<|start_header_id|>system<|end_header_id|>")[-1].strip() if "<|start_header_id|>system<|end_header_id|>" in parts[0] else ""
    user_message = parts[1].split("<|start_header_id|>user<|end_header_id|>")[-1].strip()
    assistant_message = parts[2].split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()

    # The source strings end with a literal "<eos>" marker. In the chat format it
    # is ordinary text, so leaving it in teaches the model to print it.
    if assistant_message.endswith("<eos>"):
        assistant_message = assistant_message[:-len("<eos>")].rstrip()

    # Create the GPT format message dictionary
    formatted_example = {
        "messages": [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message},
            {"role": "assistant", "content": assistant_message}
        ]
    }

    # Add the formatted example to the list
    formatted_data_for_gpt.append(formatted_example)



In [None]:
import os
import json
# Define the target directory relative to the current script
target_directory = '../model/GPT4o_reflection_fine_tuning_model'

# Check if the directory exists, and create it if not
if not os.path.exists(target_directory):
    os.makedirs(target_directory)  # Create the directory and all intermediate-level directories

# Define the full path where you want to save the JSONL file
save_path = os.path.join(target_directory, 'reflection_fine_tune_data_chat_format.jsonl')

with open(save_path, 'w') as outfile:
    for entry in formatted_data_for_gpt:
        json.dump(entry, outfile)
        outfile.write('\n')  # Write each formatted dictionary as a separate line

!git remote set-url origin https://$GITHUB_TOKEN@github.com/.../Research.git
!git add {save_path}
!git commit -m "Added reflection fine tuning dataset to the model folder"
!git push origin main

In [None]:
file_path = '../model/GPT4o_reflection_fine_tuning_model/reflection_fine_tune_data_chat_format.jsonl'

# Upload the JSONL training file for fine-tuning. The context manager closes the
# file handle as soon as the upload call returns.
# NOTE(review): `client` must already exist at notebook scope; the helper functions
# in this notebook only create their own local client — confirm it is defined earlier.
with open(file_path, "rb") as training_file:
    uploaded_file = client.files.create(
        file=training_file,
        purpose='fine-tune'
    )

# Display the returned file object as the cell output
uploaded_file


In [None]:
# Start a fine-tuning job. Both arguments are empty placeholders in this notebook:
# `training_file` takes the id of the uploaded file and `model` the base model name.
client.fine_tuning.jobs.create(training_file="", model="")

In [None]:
# Fetch the fine-tuning job by id (the id is an empty placeholder here) and print the returned object
job_status = client.fine_tuning.jobs.retrieve("")
print(job_status)

## Evaluating

In [None]:
def analyze_sentences_gpt4o_tuned_with_CausalRules(prompts):
    """Query the fine-tuned chat model once per prompt and collect the replies.

    Args:
        prompts: Iterable of prompt strings. Each one is stripped and sent as a
            single user message.

    Returns:
        A list with one ``{'prompt': ..., 'response': ...}`` dict per prompt, in
        input order. ``'response'`` holds the stripped reply text, or the
        exception message when the request failed.
    """
    api = openai.OpenAI(api_key=openai.api_key)
    collected = []

    for prompt in prompts:
        try:
            completion = api.chat.completions.create(
                model='ft:',
                messages=[{"role": "user", "content": f"{prompt.strip()}"}],
                max_tokens=200,
                temperature=0.5,
            )
            reply = completion.choices[0].message.content.strip()
            print(reply)
            collected.append({'prompt': prompt.strip(), 'response': reply})
        except Exception as err:
            # Record the error text in place of the reply so results stay aligned with prompts
            collected.append({'prompt': prompt.strip(), 'response': str(err)})
        # Pause one second between requests
        time.sleep(1)

    return collected

## Mediator graph

In [None]:
# Load the mediator prompts (column 'Prompt') and query the fine-tuned model once per prompt;
# the reply records are kept in `responses1`.
prompts = pd.read_csv('mediator_prompt1.csv')['Prompt'].tolist()
responses1 = analyze_sentences_gpt4o_tuned_with_CausalRules(prompts)

<thinking>
The causal chain here is: DLS L4-L5 causes R L5 Radikulopati discomfort is true.
R L5 Radikulopati causes R Stortårbesvär is true.
The causal pathway is DLS L4-L5 -> R L5 Radikulopati -> R Stortårbesvär,
so the mediator is R L5 Radikulopati.
</thinking>
<reflection>No corrections needed</reflection>
<output>R L5 Radikulopati</output><eos>
<thinking>
The causal chain here is: DLS T12-L1 causes L L1 Radikulopati is true.
L L1 Radikulopati causes Laterala bukbesvär is true.
The causal pathway is DLS T12-L1 -> L L1 Radikulopati -> Laterala bukbesvär,
so the mediator is L L1 Radikulopati.
</thinking>
<reflection>No corrections needed</reflection>
<output>L L1 Radikulopati</output><eos>
<thinking>
The causal chain here is: DLS L1-L2 causes L L2 Radikulopati is true.
L L2 Radikulopati causes Laterala bukbesvär is true.
The causal pathway is DLS L1-L2 -> L L2 Radikulopati -> Laterala bukbesvär,
so the mediator is L L2 Radikulopati.
</thinking>
<reflection>No corrections needed</refl

In [None]:
import re

def clean_response(response):
    """Extract the final answer from a model reply.

    Args:
        response: Raw reply text, expected to contain ``<output>...</output>``.

    Returns:
        The whitespace-stripped content of the last ``<output>...</output>``
        pair, or ``None`` when ``response`` is not a non-empty string or
        contains no complete pair (for example a reply cut off before
        ``</output>``).
    """
    # Check the type first: truth-testing an arbitrary object (e.g. a NumPy
    # array or pandas Series) can raise instead of evaluating to False.
    if not isinstance(response, str) or not response:
        return None

    # Non-greedy with DOTALL: each tag pair is matched separately and may span lines
    matches = re.findall(r'<output>(.*?)</output>', response, re.DOTALL)

    # The last pair is the final answer
    return matches[-1].strip() if matches else None



# Extract the final <output> answer from every mediator reply record
cleaned_responses = [
    clean_response(record['response'])
    for record in responses1
]



In [None]:
# Ground-truth answers, one per prompt.
# NOTE(review): the prompts were read from 'mediator_prompt1.csv' but the answers come
# from 'mediator_prompt.csv' — confirm the two files are row-aligned.
ground_truth_df = pd.read_csv('mediator_prompt.csv')['Correct Answer']
comparison_df = pd.DataFrame({
    'Prompt': prompts,
    'Response': cleaned_responses,
    'Correct Answer': ground_truth_df # Corresponding correct answers
})

# Exact string comparison (no normalisation is applied); a missing response
# (None) never equals the correct answer.
comparison_df['Is Correct'] = comparison_df['Response'] == comparison_df['Correct Answer']

# Calculate the accuracy
accuracy = comparison_df['Is Correct'].mean()
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 91.62%


In [None]:
# Define the target directory relative to the repository root
target_directory = '../model/GPT4o_reflection_fine_tuning_model'

# Check if the directory exists, and create it if not
if not os.path.exists(target_directory):
    os.makedirs(target_directory)  # Create the directory and all intermediate-level directories

# Define the full path where you want to save the CSV file
save_path = os.path.join(target_directory, 'Mediator_response.csv')

# Save the DataFrame to the specified path
comparison_df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
!git remote set-url origin https://$GITHUB_TOKEN@github.com/.../Research.git
!git add {save_path}
!git commit -m "Added Mediator_response to the model folder"
!git push origin main

## Confounder graph

In [None]:
# Load the confounder prompts (column 'Prompt') and query the fine-tuned model once per prompt;
# the reply records are kept in `responses2`.
prompts = pd.read_csv('confunder_prompt1.csv')['Prompt'].tolist()
responses2 = analyze_sentences_gpt4o_tuned_with_CausalRules(prompts)

<thinking>
The causal chain here is: R S1 Radikulopati causes Hälbesvär is true, and R S1 Radikulopati causes R Lårbesvär is true. Hälbesvär and R Lårbesvär have no causal relationship. Therefore, R S1 Radikulopati is the confounder.
</thinking>
<reflection>No corrections needed</reflection>
<output>R S1 Radikulopati</output><eot>
<thinking>
The causal chain here is: R L5 Radikulopati causes R Höftkamsbesvär is true, and R L5 Radikulopati causes R Lårbesvär is true. R Höftkamsbesvär and R Lårbesvär have no causal relationship. Therefore, R L5 Radikulopati is the confounder.
</thinking>
<reflection>No corrections needed</reflection>
<output>R L5 Radikulopati</output><eot>
<thinking>
The causal chain here is: R S1 Radikulopati causes R Dorsala knäledsbesvär is true, and R S1 Radikulopati causes R Benbesvär is true. R Dorsala knäledsbesvär and R Benbesvär have no causal relationship. Therefore, R S1 Radikulopati is the confounder.
</thinking>
<reflection>No corrections needed</reflection>

In [None]:
import re

def clean_response(response):
    """Return the content of the last ``<output>...</output>`` pair in a reply.

    Args:
        response: Raw reply text from the model.

    Returns:
        The stripped text inside the last complete ``<output>`` pair, or
        ``None`` if ``response`` is not a non-empty string or has no complete
        pair.
    """
    # Test the type before the truth value so that objects whose truthiness is
    # ambiguous (NumPy arrays, pandas Series) yield None instead of raising.
    if not isinstance(response, str) or not response:
        return None

    # Non-greedy with DOTALL: pairs are matched one by one and may span lines
    matches = re.findall(r'<output>(.*?)</output>', response, re.DOTALL)

    return matches[-1].strip() if matches else None



# Extract the final <output> answer from every confounder reply record
cleaned_responses = [
    clean_response(record['response'])
    for record in responses2
]



In [None]:
# Ground-truth answers, read from the same CSV the confounder prompts came from
ground_truth_df = pd.read_csv('confunder_prompt1.csv')['Correct Answer']
comparison_df = pd.DataFrame({
    'Prompt': prompts,
    'Response': cleaned_responses,
    'Correct Answer': ground_truth_df # Corresponding correct answers
})

# Exact string comparison (no normalisation is applied); a missing response
# (None) never equals the correct answer.
comparison_df['Is Correct'] = comparison_df['Response'] == comparison_df['Correct Answer']

# Calculate the accuracy
accuracy = comparison_df['Is Correct'].mean()
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 92.40%


In [None]:
# Define the target directory relative to the repository root
target_directory = '../model/GPT4o_reflection_fine_tuning_model'

# Check if the directory exists, and create it if not
if not os.path.exists(target_directory):
    os.makedirs(target_directory)  # Create the directory and all intermediate-level directories

# Define the full path where you want to save the CSV file
save_path = os.path.join(target_directory, 'Confunder_response.csv')

# Save the DataFrame to the specified path
comparison_df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
!git remote set-url origin https://$GITHUB_TOKEN@github.com/.../Research.git
!git add {save_path}
!git commit -m "Added confunder_response to the model folder"
!git push origin main

## Collider graph

In [None]:
# Load the collider prompts (column 'Prompt') and query the fine-tuned model once per prompt;
# the reply records are kept in `responses3`.
prompts = pd.read_csv('collider_prompt1.csv')['Prompt'].tolist()
responses3 = analyze_sentences_gpt4o_tuned_with_CausalRules(prompts)

<thinking>
The causal chain here is: L S1 Radikulopati causes L Knäbesvär is true, and L L5 Radikulopati causes L Knäbesvär is true. L S1 Radikulopati and L L5 Radikulopati have no causal relationship. Therefore, L Knäbesvär is the collider.
</thinking>
<reflection>No corrections needed</reflection>
<output>L Knäbesvär</output><eot>
<thinking>
The causal chain here is: L T12 Radikulopati causes L Nedre bukbesvär is true, and L L1 Radikulopati causes L Nedre bukbesvär is true. L T12 Radikulopati and L L1 Radikulopati have no causal relationship. Therefore, L Nedre bukbesvär is the collider.
</thinking>
<reflection>No corrections needed</reflection>
<output>L Nedre bukbesvär</output><eos>
<thinking>
The causal chain here is: R T9 Radikulopati causes Nedre bukbesvär is true, and L T12 Radikulopati causes Nedre bukbesvär is true. R T9 Radikulopati and L T12 Radikulopati have no causal relationship. Therefore, Nedre bukbesvär is the collider.
</thinking>
<reflection>No corrections needed</r

In [None]:
import re

def clean_response(response):
    """Pull the final answer out of a model reply.

    Args:
        response: Raw reply text from the model.

    Returns:
        The stripped text of the last complete ``<output>...</output>`` pair,
        or ``None`` when ``response`` is not a non-empty string or no complete
        pair is present.
    """
    # isinstance comes first: truth-testing objects such as NumPy arrays or
    # pandas Series can raise rather than evaluate to False.
    if not isinstance(response, str) or not response:
        return None

    # Non-greedy with DOTALL: pairs are matched one by one and may span lines
    matches = re.findall(r'<output>(.*?)</output>', response, re.DOTALL)

    return matches[-1].strip() if matches else None



# Extract the final <output> answer from every collider reply record
cleaned_responses = [
    clean_response(record['response'])
    for record in responses3
]



In [None]:
# Ground-truth answers, one per prompt.
# NOTE(review): the prompts were read from 'collider_prompt1.csv' but the answers come
# from 'collider_prompt.csv' — confirm the two files are row-aligned.
ground_truth_df = pd.read_csv('collider_prompt.csv')['Correct Answer']
comparison_df = pd.DataFrame({
    'Prompt': prompts,
    'Response': cleaned_responses,
    'Correct Answer': ground_truth_df # Corresponding correct answers
})

# Exact string comparison (no normalisation is applied); a missing response
# (None) never equals the correct answer.
comparison_df['Is Correct'] = comparison_df['Response'] == comparison_df['Correct Answer']

# Calculate the accuracy
accuracy = comparison_df['Is Correct'].mean()
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 92.40%


In [None]:
# Display the collider comparison table (prompt, extracted response, correct answer, match flag)
comparison_df

Unnamed: 0,Prompt,Response,Correct Answer,Is Correct
0,<|begin_of_text|><|start_header_id|>system<|en...,L Knäbesvär,L Knäbesvär,True
1,<|begin_of_text|><|start_header_id|>system<|en...,L Nedre bukbesvär,L Nedre bukbesvär,True
2,<|begin_of_text|><|start_header_id|>system<|en...,Nedre bukbesvär,Nedre bukbesvär,True
3,<|begin_of_text|><|start_header_id|>system<|en...,Bröstryggsbesvär,Bröstryggsbesvär,True
4,<|begin_of_text|><|start_header_id|>system<|en...,L Benbesvär,L Benbesvär,True
...,...,...,...,...
495,<|begin_of_text|><|start_header_id|>system<|en...,Bröstryggsbesvär,Bröstryggsbesvär,True
496,<|begin_of_text|><|start_header_id|>system<|en...,Bröstbesvär,Bröstbesvär,True
497,<|begin_of_text|><|start_header_id|>system<|en...,R Lumbago,R Lumbago,True
498,<|begin_of_text|><|start_header_id|>system<|en...,Nedre bukbesvär,Nedre bukbesvär,True


In [None]:
# Define the target directory relative to the repository root
target_directory = '../model/GPT4o_reflection_fine_tuning_model'

# Check if the directory exists, and create it if not
if not os.path.exists(target_directory):
    os.makedirs(target_directory)  # Create the directory and all intermediate-level directories

# Define the full path where you want to save the CSV file
save_path = os.path.join(target_directory, 'Collider_response.csv')

# Save the DataFrame to the specified path
comparison_df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
!git remote set-url origin https://$GITHUB_TOKEN@github.com/.../Research.git
!git add {save_path}
!git commit -m "Added Collider_response to the model folder"
!git push origin main

# Testing on the CLadder benchmark

In [None]:
def analyze_sentences_gpt_4o(prompts):
    """Query the base ``gpt-4o`` chat model once per prompt and collect the replies.

    NOTE: this redefines the helper of the same name from the top of the
    notebook; this version limits replies to 200 tokens instead of 500.

    Args:
        prompts: Iterable of prompt strings. Each one is stripped and sent as a
            single user message.

    Returns:
        A list with one ``{'prompt': ..., 'response': ...}`` dict per prompt, in
        input order. ``'response'`` holds the stripped reply text, or the
        exception message when the request failed. Every record carries the
        same two keys, whether the request succeeded or failed.
    """
    client = openai.OpenAI(api_key=openai.api_key)
    responses = []

    for prompt in prompts:
        try:
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "user", "content": f"{prompt.strip()}"}
                ],
                max_tokens=200,
                temperature=0.6
            )
            result = response.choices[0].message.content.strip()
            print(result)
            # Include the prompt on success too, matching the error branch
            responses.append({'prompt': prompt.strip(), 'response': result})
        except Exception as e:
            # Record the error text in place of the reply so results stay aligned with prompts
            responses.append({'prompt': prompt.strip(), 'response': str(e)})
        time.sleep(1)  # Add delay to avoid hitting rate limits

    return responses

## Rung1

In [None]:
import json
import openai
import time

# Read the rung-1 question records from rung1.json
with open('rung1.json', 'r') as file:
    data = json.load(file)


# Build one yes/no prompt per record and keep its reference answer alongside
prompts, answers = [], []
for entry in data:
    given_info, question, answer = entry['given_info'], entry['question'], entry['answer']
    prompt = f"Use your causal reasoning ability to only answer yes or no for the following question based on the given information.\n\nGiven Info: {given_info}\nQuestion: {question}"
    prompts.append(prompt)
    answers.append(answer)






In [None]:
# Query gpt-4o once per rung-1 prompt; each returned record holds the reply under 'response'
responses = analyze_sentences_gpt_4o(prompts)

In [None]:
# Reduce each reply to a bare lower-case token ("Yes." -> "yes") before comparing with `answers`
cleaned_responses = [
    record['response'].strip().lower().replace('.', '')
    for record in responses
]

# One row per question: prompt, reference answer, normalised model reply
comparison_df = pd.DataFrame(
    {'prompt': prompts, 'answer': answers, 'response': cleaned_responses}
)

# A row is correct when the normalised reply equals the reference answer exactly
comparison_df['correct'] = comparison_df['answer'].eq(comparison_df['response'])
accuracy = comparison_df['correct'].mean()
accuracy_percentage = accuracy * 100

print(f"Accuracy: {accuracy_percentage:.2f}%")

Accuracy: 68.00%


In [None]:
# Define the target directory relative to the repository root
target_directory = '../model/GPT4o_Cladder'

# Check if the directory exists, and create it if not
if not os.path.exists(target_directory):
    os.makedirs(target_directory)  # Create the directory and all intermediate-level directories

# Define the full path where you want to save the CSV file
save_path = os.path.join(target_directory, 'Rung1_responses.csv')

# Save the DataFrame to the specified path
comparison_df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
!git remote set-url origin https://$GITHUB_TOKEN@github.com/.../Research.git
!git add {save_path}
!git commit -m "Added Rung1_responses to the model folder"
!git push origin main

## Rung2

In [None]:
import json
import openai
import time

# Read the rung-2 question records from rung2.json
with open('rung2.json', 'r') as file:
    data = json.load(file)


# Build one yes/no prompt per record and keep its reference answer alongside
prompts, answers = [], []
for entry in data:
    given_info, question, answer = entry['given_info'], entry['question'], entry['answer']
    prompt = f"Use your causal reasoning ability to only answer yes or no for the following question based on the given information.\n\nGiven Info: {given_info}\nQuestion: {question}"
    prompts.append(prompt)
    answers.append(answer)

# Query gpt-4o once per rung-2 prompt; the reply records are kept in `responses1`
responses1 = analyze_sentences_gpt_4o(prompts)

Yes.
Yes.
No.
No.
Yes.
No
Yes.
No.
Yes.
Yes.
No.
Yes.
No.
Yes.
No.
No.
Yes
No.
Yes.
Yes.
Yes.
No.
Yes.
Yes.
Yes.
Yes.
Yes.
Yes.
No.
Yes.
Yes.
Yes.
Yes.
No.
No.
Yes.
No.
No
No.
No.
No
Yes.
Yes.
Yes.
No.
No.
No
Yes.
No.
Yes.
Yes.
No
No.
No.
Yes.
No.
No.
Yes.
Yes.
No.
Yes.
Yes.
Yes.
No.
No.
No.
Yes.
No.
No.
No.
Yes.
Yes.
No.
Yes.
Yes.
Yes.
Yes.
No.
Yes.
No.
No.
No.
Yes.
Yes.
No.
Yes.
Yes
No.
No
No.
Yes.
Yes.
Yes.
No.
No.
Yes.
No.
Yes.
Yes
Yes.
No.
No
Yes.
Yes.
Yes.
No.
Yes.
No.
Yes.
No
No.
Yes.
No.
Yes.
Yes.
No.
Yes
No.
Yes.
Yes.
No.
Yes.
No.
Yes
Yes.
Yes
Yes.
No.
No.
Yes.
Yes.
Yes.
No.
No.
No.
Yes.
Yes
Yes.
Yes.
No
Yes.
No.
No.
No
No.
No.
No
No.
No.
No
Yes.
No
Yes
No.
No.
Yes.
No.
Yes.
No.
Yes.
Yes.
No.
Yes.
Yes.
No
No.
Yes.
Yes
Yes.
No.
No.
Yes.
No
Yes.
No.
Yes.
Yes.
Yes.
No.
No.
No.
Yes
Yes.
No.
No.
Yes.
No.
No.
No.
No
No.
Yes.
No.
Yes.
Yes.
No.
Yes.
No.
Yes.
No


In [None]:
# Reduce each rung-2 reply to a bare lower-case token ("Yes." -> "yes") before comparing with `answers`
cleaned_responses = [
    record['response'].strip().lower().replace('.', '')
    for record in responses1
]

# One row per question: prompt, reference answer, normalised model reply
comparison_df = pd.DataFrame(
    {'prompt': prompts, 'answer': answers, 'response': cleaned_responses}
)

# A row is correct when the normalised reply equals the reference answer exactly
comparison_df['correct'] = comparison_df['answer'].eq(comparison_df['response'])
accuracy = comparison_df['correct'].mean()
accuracy_percentage = accuracy * 100

print(f"Accuracy: {accuracy_percentage:.2f}%")

Accuracy: 59.00%


In [None]:
# Define the target directory relative to the repository root
target_directory = '../model/GPT4o_Cladder'

# Check if the directory exists, and create it if not
if not os.path.exists(target_directory):
    os.makedirs(target_directory)  # Create the directory and all intermediate-level directories

# Define the full path where you want to save the CSV file
save_path = os.path.join(target_directory, 'Rung2_responses.csv')

# Save the DataFrame to the specified path
comparison_df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
!git remote set-url origin https://$GITHUB_TOKEN@github.com/.../Research.git
!git add {save_path}
!git commit -m "Added Rung2_responses to the model folder"
!git push origin main

## Rung3

In [None]:
import json
import openai
import time

# Read the Rung3 benchmark entries from rung3.json
with open('rung3.json', 'r') as file:
    data = json.load(file)

# Build one prompt per entry and collect the gold answers in the same order
prompts, answers = [], []
for record in data:
    given_info, question, answer = record['given_info'], record['question'], record['answer']
    prompts.append(
        f"Use your causal reasoning ability to only answer yes or no for the following question based on the given information.\n\nGiven Info: {given_info}\nQuestion: {question}"
    )
    answers.append(answer)

# Send every prompt to GPT-4o and keep the replies for scoring
responses2 = analyze_sentences_gpt_4o(prompts)

In [None]:
# Normalise the Rung3 replies for comparison: trim whitespace, lowercase, and
# remove periods so that "Yes." compares equal to "yes".
# The Rung3 replies were stored in `responses2` by the cell that queried the
# model; `responses1` holds replies from an earlier run and must not be scored here.
cleaned_responses = [response['response'].strip().lower().replace('.', '') for response in responses2]

# Create a DataFrame for comparison: one row per prompt with gold answer and reply
comparison_df = pd.DataFrame({
    'prompt': prompts,
    'answer': answers,
    'response': cleaned_responses
})

# Calculate accuracy: a reply is correct only when it equals the gold answer exactly,
# and the mean of the boolean column is the fraction of correct replies
comparison_df['correct'] = comparison_df['answer'] == comparison_df['response']
accuracy = comparison_df['correct'].mean()
accuracy_percentage = accuracy * 100

print(f"Accuracy: {accuracy_percentage:.2f}%")

Accuracy: 47.50%


# Fine-tuning for the CLadder benchmark

## Create the fine-tuning dataset manually

In [None]:
data = [
  { #ett
      "messages": [
      {
        "role": "system",
        "content": "You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags.\n\nIf you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags."
      },
      {
        "role": "user",
        "content": "For husbands that don't set the alarm, the probability of ringing alarm is 56%. For husbands that set the alarm, the probability of ringing alarm is 65%.\n\nWill alarm set by husband increase the chance of ringing alarm?"
      },
      {
        "role": "assistant",
        "content": "<thinking>\nLet X = husband; V2 = wife; Y = alarm clock.\nThe causal structure is X -> V2, X -> Y, V2 -> Y.\nWe need to compute the difference: E[Y | do(X = 1)] - E[Y | do(X = 0)].\nFrom the given information:\nP(Y | X=0) = 0.56 and P(Y | X=1) = 0.65.\nThe difference is: 0.65 - 0.56 = 0.09.\nSince 0.09 > 0, setting the alarm increases the chance of ringing the alarm.\n</thinking>\n<reflection>There was an error in the final output. The difference shows that setting the alarm increases the chance of ringing the alarm, which means the correct answer is \"Yes,\" not \"No\" as initially stated.</reflection>\n<output>Yes</output>"
      }
    ]
  },
  { #ett

    "messages": [
      {
        "role": "system",
        "content": "You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags.\n\nIf you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags."
      },
      {
        "role": "user",
        "content": "For husbands that don't set the alarm, the probability of ringing alarm is 74%. For husbands that set the alarm, the probability of ringing alarm is 22%.\n\nFor husbands that set the alarm, would it be more likely to see ringing alarm if the husband had not set the alarm?"
      },
      {
        "role": "assistant",
        "content": "<thinking>\nLet X = husband; Y = alarm clock.\nThe causal structure is X -> Y.\nWe need to compute the difference: E[Y_{X = 1} - Y_{X = 0} | X = 1].\nFrom the given information:\nP(Y | X=0) = 0.74 and P(Y | X=1) = 0.22.\nThe difference is: 0.22 - 0.74 = -0.52.\nSince -0.52 < 0, it would be more likely to see ringing alarm if the husband had not set the alarm.\n</thinking>\n<reflection>The reasoning is correct, and the computed difference supports the conclusion. Since the difference is negative, the correct output is 'Yes'.</reflection>\n<output>Yes</output>"
      }
    ]


  },
  { #marginal

    "messages": [
      {
        "role": "system",
        "content": "You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags.\n\nIf you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags."
      },
      {
        "role": "user",
        "content": "The overall probability of alarm set by husband is 10%. For husbands that don't set the alarm, the probability of ringing alarm is 78%. For husbands that set the alarm, the probability of ringing alarm is 32%.\n\nIs ringing alarm more likely than silent alarm overall?"
      },
      {
        "role": "assistant",
        "content": "<thinking>\nLet X = husband; V2 = wife; Y = alarm clock.\nThe causal structure is X -> V2, X -> Y, V2 -> Y.\nWe need to compute the marginal probability of ringing the alarm, P(Y).\nFrom the given information:\nP(Y | X=1) = 0.32, P(X=1) = 0.10\nP(Y | X=0) = 0.78, P(X=0) = 0.90.\nUsing the formula: P(Y) = P(Y | X=1)*P(X=1) + P(Y | X=0)*P(X=0), we get:\nP(Y) = 0.10*0.32 + 0.90*0.78 = 0.73.\nSince 0.73 > 0.5, ringing alarm is more likely than silent alarm overall.\n</thinking>\n<reflection>No corrections needed. The reasoning and the computed marginal probability are correct, and ringing alarm is indeed more likely than silent alarm.</reflection>\n<output>Yes</output>"
      }
    ]
  },
  {#marginal
    "messages": [
      {
        "role": "system",
        "content": "You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags.\n\nIf you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags."
      },
      {
        "role": "user",
        "content": "The overall probability of alarm set by husband is 10%. For husbands that don't set the alarm, the probability of ringing alarm is 78%. For husbands that set the alarm, the probability of ringing alarm is 32%.\n\nIs ringing alarm more likely than silent alarm overall?"
      },
      {
        "role": "assistant",
        "content": "<thinking>\nLet X = husband; V2 = wife; Y = alarm clock.\nThe causal structure is X -> V2, X -> Y, V2 -> Y.\nWe need to compute the marginal probability of ringing the alarm, P(Y).\nFrom the given information:\nP(Y | X=1) = 0.32, P(X=1) = 0.10\nP(Y | X=0) = 0.78, P(X=0) = 0.90.\nUsing the formula: P(Y) = P(Y | X=1)*P(X=1) + P(Y | X=0)*P(X=0), we get:\nP(Y) = 0.10*0.32 + 0.90*0.78 = 0.54.\nSince 0.54 is close to 0.5, ringing alarm is equally likely as silent alarm.\n</thinking>\n<reflection>The mistake here is in calculating the marginal probability. The correct calculation is P(Y) = 0.10*0.32 + 0.90*0.78 = 0.73, not 0.54. Therefore, ringing alarm is more likely than silent alarm overall. The correct output should be 'Yes.'</reflection>\n<output>Yes</output>"
      }
    ]
  },



  { # backadj
    "messages": [
      {
        "role": "system",
        "content": "You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags.\n\nIf you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags."
      },
      {
        "role": "user",
        "content": "Method 1: We look at how husband correlates with alarm clock case by case according to wife. Method 2: We look directly at how husband correlates with alarm clock in general.\n\nTo understand how husband affects alarm clock, is it more correct to use Method 1 than Method 2?"
      },
      {
        "role": "assistant",
        "content": "<thinking>\nLet X = husband, Y = alarm clock, Z = wife.\nThe structure is assumed to be X -> Z -> Y, and we want to find the causal effect of X on Y.\nMethod 1 computes the difference: E[Y | X=1, Z] - E[Y | X=0, Z], which captures the correct causal effect through the path X -> Z -> Y.\nMethod 2 looks at the correlation: E[Y | X=1] - E[Y | X=0], which does not account for the mediator Z and may include spurious correlations.\nTherefore, Method 1 is more correct than Method 2.\n</thinking>\n<reflection>The reasoning in the thinking step is flawed. The structure is not necessarily X -> Z -> Y. The problem only asks for the relationship between X and Y, so conditioning on Z (wife) in Method 1 is unnecessary and may complicate the analysis. Method 2, which looks directly at the correlation between X and Y, is the correct approach for this case. Therefore, the correct output should be 'no.'\n</reflection>\n<output>No</output>"
      }
    ]
  },
  { # backadj
    "messages": [
      {
        "role": "system",
        "content": "You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags.\n\nIf you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags."
      },
      {
        "role": "user",
        "content": "Method 1: We look at how husband correlates with alarm clock case by case according to wife. Method 2: We look directly at how husband correlates with alarm clock in general.\n\nTo understand how husband affects alarm clock, is it more correct to use Method 1 than Method 2?"
      },
      {
        "role": "assistant",
        "content": "<thinking>\nLet X = husband, Y = alarm clock, Z = wife.\nThe structure assumed in the given information is not fixed, and we want to understand the direct relationship between X (husband) and Y (alarm clock).\nMethod 1 introduces an additional step by conditioning on Z (wife), which may obscure the direct relationship between X and Y.\nMethod 2, however, directly assesses the correlation between X and Y without unnecessary mediation, making it a more appropriate method for understanding this specific relationship.\nTherefore, Method 2 is more correct than Method 1.\n</thinking>\n<reflection>No corrections needed. The correct output is 'No,' as Method 2 is more suitable for this case.\n</reflection>\n<output>No</output>"
      }
    ]
  },
  {#ate
    "messages": [
      {
        "role": "system",
        "content": "You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags.\n\nIf you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags."
      },
      {
        "role": "user",
        "content": "For people not blowing out candles, the probability of dark room is 81%. For people who blow out candles, the probability of dark room is 46%.\n\nWill blowing out the candle decrease the chance of dark room?"
      },
      {
        "role": "assistant",
        "content": "<thinking>\nLet V2 = the candle; X = the person blowing out the candle; Y = dark room.\nThe causal structure is X -> Y, V2 -> Y.\nWe need to compute the effect: E[Y | do(X = 1)] - E[Y | do(X = 0)], which tells us how blowing out the candle affects the dark room.\nFrom the given information:\nP(Y=1 | X=0) = 0.81 (probability of dark room without blowing out the candle)\nP(Y=1 | X=1) = 0.46 (probability of dark room with blowing out the candle)\nThe difference is: 0.46 - 0.81 = -0.35.\nSince -0.35 < 0, blowing out the candle decreases the chance of the dark room.\n</thinking>\n<reflection>No corrections needed. The calculation and reasoning are correct, and blowing out the candle does decrease the chance of the dark room.</reflection>\n<output>Yes</output>"
      }
    ]
  },
  {#ate - reflection variant; the final <output> must agree with the answer the reflection states ('Yes')
    "messages": [
      {
        "role": "system",
        "content": "You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags.\n\nIf you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags."
      },
      {
        "role": "user",
        "content": "For people not blowing out candles, the probability of dark room is 81%. For people who blow out candles, the probability of dark room is 46%.\n\nWill blowing out the candle decrease the chance of dark room?"
      },
      {
        "role": "assistant",
        "content": "<thinking>\nLet V2 = the candle; X = the person blowing out the candle; Y = dark room.\nThe causal structure is X -> Y, V2 -> Y.\nWe need to compute the effect: E[Y | do(X = 1)] - E[Y | do(X = 0)], which tells us how blowing out the candle affects the dark room.\nFrom the given information:\nP(Y=1 | X=0) = 0.81 (probability of dark room without blowing out the candle)\nP(Y=1 | X=1) = 0.46 (probability of dark room with blowing out the candle)\nThe difference is: 0.46 - 0.81 = -0.35.\nSince -0.35 < 0, blowing out the candle decreases the chance of the dark room.\n</thinking>\n<reflection>The reasoning is correct, but the conclusion in the output is wrong. The difference (-0.35) shows a decrease, so the correct answer should be 'Yes,' as blowing out the candle does decrease the chance of the dark room.</reflection>\n<output>Yes</output>"
      }
    ]
  },
  {#nde
    "messages": [
      {
        "role": "system",
        "content": "You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags.\n\nIf you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags."
      },
      {
        "role": "user",
        "content": "For people not taking any medication and with low blood pressure, the probability of healthy heart is 82%. For people not taking any medication and with high blood pressure, the probability of healthy heart is 52%. For people taking medication and with low blood pressure, the probability of healthy heart is 89%. For people taking medication and with high blood pressure, the probability of healthy heart is 45%. For people not taking any medication, the probability of high blood pressure is 61%. For people taking medication, the probability of high blood pressure is 10%.\n\nIf we disregard the mediation effect through blood pressure, would medication negatively affect heart condition?"
      },
      {
        "role": "assistant",
        "content": "<thinking>\nLet X = medication, V2 = blood pressure, Y = heart condition.\nThe causal structure is X -> V2 -> Y. We want to evaluate the effect of medication (X) on heart condition (Y), disregarding the mediation effect of blood pressure (V2).\nWe need to compute the natural direct effect: E[Y_{X=1, V2=0} - Y_{X=0, V2=0}].\nFrom the given information:\nP(Y | X=0, V2=0) = 0.82 (no medication, low blood pressure)\nP(Y | X=1, V2=0) = 0.89 (medication, low blood pressure)\nP(Y | X=0, V2=1) = 0.52 (no medication, high blood pressure)\nP(Y | X=1, V2=1) = 0.45 (medication, high blood pressure).\nDisregarding the mediation effect through blood pressure, we compare the probabilities directly: P(Y | X=0) vs. P(Y | X=1).\nSince P(Y | X=1, V2=1) = 0.45 and P(Y | X=0, V2=1) = 0.52, medication negatively affects heart condition when ignoring blood pressure.\n</thinking>\n<reflection>No corrections needed. The reasoning and comparison are correct, and medication negatively affects heart condition when disregarding the mediation effect through blood pressure.</reflection>\n<output>Yes</output>"
      }
    ]
  },
  {#nde
    "messages": [
      {
        "role": "system",
        "content": "You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags.\n\nIf you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags."
      },
      {
        "role": "user",
        "content": "For people not taking any medication and with low blood pressure, the probability of healthy heart is 82%. For people not taking any medication and with high blood pressure, the probability of healthy heart is 52%. For people taking medication and with low blood pressure, the probability of healthy heart is 89%. For people taking medication and with high blood pressure, the probability of healthy heart is 45%. For people not taking any medication, the probability of high blood pressure is 61%. For people taking medication, the probability of high blood pressure is 10%.\n\nIf we disregard the mediation effect through blood pressure, would medication negatively affect heart condition?"
      },
      {
        "role": "assistant",
        "content": "<thinking>\nLet X = medication; V2 = blood pressure; Y = heart condition.\nWe need to compute the total effect of medication on heart condition.\nFrom the given information:\nP(healthy | X=0, V2=low) = 0.82, P(healthy | X=0, V2=high) = 0.52, P(V2=low | X=0) = 0.39, P(V2=high | X=0) = 0.61, P(healthy | X=1, V2=low) = 0.89, P(healthy | X=1, V2=high) = 0.45, P(V2=low | X=1) = 0.90, P(V2=high | X=1) = 0.10.\nUsing the formula:\nP(healthy | X=1) = 0.89 * 0.90 + 0.45 * 0.10 = 0.85.\nP(healthy | X=0) = 0.82 * 0.39 + 0.52 * 0.61 = 0.64.\nThe difference is 0.85 - 0.64 = 0.21.\nSince 0.21 > 0, medication improves heart condition.\n</thinking>\n<reflection>The mistake here is that the reasoning calculated the total effect, including the mediation effect through blood pressure. The question specifically asks for the direct effect, disregarding the mediation through blood pressure. To compute the natural direct effect (NDE), we should focus on the effect of medication while holding blood pressure constant. Therefore, the correct output should be 'Yes,' indicating that medication negatively affects heart condition when the mediation effect through blood pressure is disregarded.</reflection>\n<output>Yes</output>"
      }
    ]
  },
  {#Nie
    "messages": [
      {
        "role": "system",
        "content": "You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags.\n\nIf you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags."
      },
      {
        "role": "user",
        "content": "For individuals who are not male and blue-collar workers, the probability of high salary is 7%. For individuals who are not male and white-collar workers, the probability of high salary is 59%. For individuals who are male and blue-collar workers, the probability of high salary is 20%. For individuals who are male and white-collar workers, the probability of high salary is 58%. For individuals who are not male and with low skill levels, the probability of white-collar job is 24%. For individuals who are not male and with high skill levels, the probability of white-collar job is 45%. For individuals who are male and with low skill levels, the probability of white-collar job is 52%. For individuals who are male and with high skill levels, the probability of white-collar job is 81%. The overall probability of high skill level is 18%.\n\nDoes gender negatively affect salary through occupation?"
      },
      {
        "role": "assistant",
        "content": "<thinking>\nLet X = gender, V3 = occupation, V2 = skill level, and Y = salary.\nThe question asks whether gender negatively affects salary through occupation, which means we are focusing on the natural indirect effect (NIE).\n\nTo compute the NIE, we calculate the difference between salary for males and non-males through the mediator, occupation (V3), while holding skill level (V2) constant.\n\nUsing the provided information:\n- P(high salary | non-male, blue-collar) = 7%, P(high salary | non-male, white-collar) = 59%\n- P(high salary | male, blue-collar) = 20%, P(high salary | male, white-collar) = 58%\n- P(white-collar | non-male, low skill) = 24%, P(white-collar | non-male, high skill) = 45%\n- P(white-collar | male, low skill) = 52%, P(white-collar | male, high skill) = 81%\n- P(high skill) = 18%\n\nTo assess the effect of gender on salary through occupation, we observe that males tend to have higher probabilities of obtaining white-collar jobs than non-males across skill levels, and white-collar jobs are associated with higher salaries.\n\nHowever, the actual salary outcomes for males and non-males in both white-collar and blue-collar jobs are fairly similar (59% for non-male white-collar, 58% for male white-collar).\n\nTherefore, gender does not seem to negatively affect salary through occupation, as males and non-males in the same occupation type have similar salary outcomes.\n</thinking>\n<reflection>No corrections needed. The reasoning correctly identifies that gender does not negatively affect salary through occupation, based on similar salary outcomes across genders within the same occupation.</reflection>\n<output>No</output>"
      }
    ]
  },
  { # nie - flawed-conclusion variant; 0.76*0.13 + 0.24*(-0.01) = 0.0988 - 0.0024 = 0.0964
    "messages": [
      {
        "role": "system",
        "content": "You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags. If you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags."
      },
      {
        "role": "user",
        "content": "For individuals who are not male and blue-collar workers, the probability of high salary is 7%. For individuals who are not male and white-collar workers, the probability of high salary is 59%. For individuals who are male and blue-collar workers, the probability of high salary is 20%. For individuals who are male and white-collar workers, the probability of high salary is 58%. For individuals who are not male and with low skill levels, the probability of white-collar job is 24%. For individuals who are not male and with high skill levels, the probability of white-collar job is 45%. For individuals who are male and with low skill levels, the probability of white-collar job is 52%. For individuals who are male and with high skill levels, the probability of white-collar job is 81%. The overall probability of high skill level is 18%. Does gender negatively affect salary through occupation?"
      },
      {
        "role": "assistant",
        "content": "<thinking>\nLet X = gender; V2 = occupation; Y = salary.\nWe need to compute the difference: E[Y_{X = male} - Y_{X = not male}].\nThis can be decomposed into: [P(Y=1 | V2, X=male) - P(Y=1 | V2, X=not male)] * P(V2 | X=not male).\nFrom the given information:\nFor V2 = blue-collar:\nP(Y=1 | X=not male, V2=blue-collar) = 0.07\nP(Y=1 | X=male, V2=blue-collar) = 0.20.\nFor V2 = white-collar:\nP(Y=1 | X=not male, V2=white-collar) = 0.59\nP(Y=1 | X=male, V2=white-collar) = 0.58.\nThe proportion of V2 given X=not male is:\nP(V2=blue-collar | X=not male) = 0.76\nP(V2=white-collar | X=not male) = 0.24.\nThe difference is:\n0.76 * (0.20 - 0.07) + 0.24 * (0.58 - 0.59) = 0.0964.\nSince 0.0964 > 0, gender negatively affects salary through occupation.\n</thinking>\n<reflection>There was an error in my reasoning. The computed difference (0.0964) is positive but small. This means that the effect of gender through occupation does not negatively affect salary. The small positive difference shows no significant negative effect, meaning the correct output should be 'No.'</reflection>\n<output>No</output>"
      }
    ]
  },
  {#correlation
    "messages": [
      {
        "role": "system",
        "content": "You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags. If you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags."
      },
      {
        "role": "user",
        "content": "The overall probability of alarm set by husband is 3%. The probability of alarm not set by husband and ringing alarm is 72%. The probability of alarm set by husband and ringing alarm is 1%.\n\nIs the chance of ringing alarm smaller when observing alarm set by husband?"
      },
      {
        "role": "assistant",
        "content": "<thinking>\nLet X = alarm set by husband and Y = ringing alarm.\nWe need to assess the correlation between setting the alarm and ringing the alarm.\nFrom the given information:\nP(Y=1, X=1) = 0.01 and P(Y=1, X=0) = 0.72. The probability of ringing the alarm when X=1 is much lower than when X=0. Therefore, observing that the husband sets the alarm decreases the chance of the alarm ringing.\n</thinking>\n<reflection>No corrections needed. The probabilities clearly show that ringing the alarm is less likely when the husband sets the alarm, so the correct answer is 'Yes.'</reflection>\n<output>Yes</output>"
      }
    ]
  },
  {#collision
    "messages": [
      {
        "role": "system",
        "content": "You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags.\n\nIf you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags."
      },
      {
        "role": "user",
        "content": "For people who are famous, the correlation between attractive appearance and talent is -0.23.\n\nIf we look at people who are famous, does it mean that attractive appearance does not affect talent?"
      },
      {
        "role": "assistant",
        "content": "<thinking>\nLet Y = talent, X = appearance, and V3 = fame. The given information indicates that P(Y = 1 | X = 1, V3 = 1) - P(Y = 1 | X = 0, V3 = 1) = -0.23, meaning there is a negative correlation between attractive appearance and talent when conditioned on fame. This negative correlation suggests that appearance and talent are related when we control for fame, although the effect is negative.\nThus, attractive appearance does affect talent in this context, but negatively. The fact that the correlation is non-zero indicates there is still an effect, even though it is not positive.\n</thinking>\n<reflection>The initial reasoning was incorrect. The negative correlation is due to collider bias, which means that appearance and talent do not have a direct effect on each other. The observed correlation is a result of both appearance and talent influencing fame, not because they affect each other. Therefore, the correct answer is that attractive appearance does not affect talent.</reflection>\n<output>Yes</output>"
      }
    ]
  },
  {#collision
    "messages": [
      {
        "role": "system",
        "content": "You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags.\n\nIf you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags."
      },
      {
        "role": "user",
        "content": "For people who are famous, the correlation between attractive appearance and talent is -0.23.\n\nIf we look at people who are famous, does it mean that attractive appearance does not affect talent?"
      },
      {
        "role": "assistant",
        "content": "<thinking>\nLet Y = talent, X = appearance, and V3 = fame. The correlation between appearance and talent when conditioned on fame is negative (-0.23). This negative correlation typically arises due to collider bias, meaning both appearance and talent influence fame, but they do not directly affect each other.\nTherefore, once fame is conditioned, there is no direct causal effect between appearance and talent. The negative correlation does not imply a causal relationship between appearance and talent.\n</thinking>\n<reflection>No reflection needed as the reasoning is correct.</reflection>\n<output>Yes</output>"
      }
    ]
  },
  {#bias
    "messages": [
      {
        "role": "system",
        "content": "You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags. If you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags."
      },
      {
        "role": "user",
        "content": "For people who are famous, the correlation between attractive appearance and talent is -0.09. If we look at people who are famous, does it mean that attractive appearance does not affect talent?"
      },
      {
        "role": "assistant",
        "content": "<thinking>\nLet Y = talent, X = appearance, and V3 = fame. The structure is X -> V3 and Y -> V3, indicating that fame (V3) is influenced by both appearance (X) and talent (Y), making fame a collider. The given correlation between appearance and talent when conditioned on fame is negative (-0.09), which suggests that there is an observed association between appearance and talent due to collider bias. However, this does not imply a direct causal relationship between X and Y.\nTherefore, attractive appearance does not directly affect talent when conditioned on fame.\n</thinking>\n<reflection>No corrections needed. The negative correlation is due to collider bias, confirming that attractive appearance does not affect talent.</reflection>\n<output>Yes</output>"
      }
    ]
  },
  { #exp_away
    "messages": [
      {
        "role": "system",
        "content": "You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags. If you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags."
      },
      {
        "role": "user",
        "content": "The overall probability of talent is 41%. For students who are not talented and rejected from elite institutions, the probability of being hard-working is 39%. For students who are not talented and accepted to elite institutions, the probability of being hard-working is 69%. For students who are talented and rejected from elite institutions, the probability of being hard-working is 29%. For students who are talented and accepted to elite institutions, the probability of being hard-working is 61%.\n\nIf we look at students accepted to elite institutions, does the chance of being hard-working decrease when talent?"
      },
      {
        "role": "assistant",
        "content": "<thinking>\nLet Y = effort (hard-working); X = talent; V3 = elite institution admission status. The causal structure is X -> V3 and Y -> V3, indicating a collider structure.\n\nWe want to evaluate if being talented (X=1) decreases the probability of being hard-working (Y=1) among students accepted to elite institutions (V3=1).\n\nFrom the given information:\nP(Y=1 | X=1, V3=1) = 0.61\nP(Y=1 | X=0, V3=1) = 0.69.\n\nSince 0.61 < 0.69, being talented (X=1) indeed decreases the likelihood of being hard-working (Y=1) for students accepted to elite institutions (V3=1).\n</thinking>\n<reflection>No corrections needed. The reasoning shows that talent decreases the likelihood of being hard-working in this specific case of students admitted to elite institutions.</reflection>\n<output>Yes</output>"
      }
    ]
  }
]


In [None]:
import os
import json
target_directory = '../model/GPT4o_reflection_fine_tuning_model'

# Check if the directory exists, and create it if not
if not os.path.exists(target_directory):
    os.makedirs(target_directory)  # Create the directory and all intermediate-level directories

# Define the full path where you want to save the JSONL file
save_path = os.path.join(target_directory, 'reflection_cladder_finetuned.jsonl')

# Save the data in JSONL format
with open(save_path, 'w') as outfile:
    for entry in data:
        json.dump(entry, outfile)
        outfile.write('\n')  # Write each formatted dictionary as a separate line
!git remote set-url origin https://$GITHUB_TOKEN@github.com/.../Research.git
!git add {save_path}
!git commit -m "Added reflection fine tuning cladder dataset to the model folder"
!git push origin main

In [None]:
import re
def clean_response(response):
    """Extract the final answer from a model reply.

    Args:
        response: Raw reply text expected to contain one or more
            ``<output>...</output>`` sections.

    Returns:
        The whitespace-stripped content of the last ``<output>`` section, or
        ``None`` when ``response`` is empty, is not a string, or contains no
        complete ``<output>...</output>`` pair.
    """
    # Anything that is falsy or not a string cannot hold an answer
    if not (response and isinstance(response, str)):
        return None

    # DOTALL lets the answer span several lines; findall keeps document order,
    # so the last element is the final <output> section.
    found = re.findall(r'<output>(.*?)</output>', response, re.DOTALL)
    return found[-1].strip() if found else None

In [None]:
# Location of the JSONL training set written by the earlier save cell
file_path = '../model/GPT4o_reflection_fine_tuning_model/reflection_cladder_finetuned.jsonl'
client = openai.OpenAI(api_key=openai.api_key)

# Upload the file for fine-tuning. The context manager guarantees the file
# handle is closed once the upload returns (or fails).
with open(file_path, "rb") as training_file:
    uploaded_file = client.files.create(
        file=training_file,
        purpose='fine-tune'
    )

# Keep the API's reply as the cell output, as before; it is kept in
# `uploaded_file` as well.
uploaded_file


In [None]:
# Start a fine-tuning job on top of the gpt-4o-2024-08-06 base model.
# NOTE(review): `training_file` is an empty string here -- presumably the ID of
# the uploaded training file was removed before sharing; fill it in before running.
client.fine_tuning.jobs.create(
  training_file="",
  model="gpt-4o-2024-08-06"
)

In [None]:
# Look up the fine-tuning job and print what the API reports for it.
# NOTE(review): the job ID argument is blank -- presumably redacted; supply the
# ID of the job created above before running.
job_status = client.fine_tuning.jobs.retrieve("")
print(job_status)

In [None]:
def Cladder_gpt_4o(prompts, model="ft:", max_tokens=1000, temperature=0.6, delay=1):
    """Send each prompt to a chat model and collect the raw replies.

    Args:
        prompts: Iterable of prompt strings. Each one is stripped and sent as
            a single user message in its own request.
        model: Model identifier passed to the API. The default ``"ft:"`` is the
            placeholder left in this notebook; pass the real fine-tuned model
            name.
        max_tokens: Upper bound on the length of each completion.
        temperature: Sampling temperature for each request.
        delay: Seconds to sleep after every request to stay under rate limits.

    Returns:
        A list with one dict per prompt, in input order. On success the dict
        is ``{'response': <reply text>}``; on failure it is
        ``{'prompt': <prompt text>, 'response': <error message>}``.
    """
    client = openai.OpenAI(api_key=openai.api_key)
    responses = []

    # enumerate keeps the printed index aligned with the prompt's position
    # even when some requests fail.
    for num, prompt in enumerate(prompts):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "user", "content": prompt.strip()}
                ],
                max_tokens=max_tokens,
                temperature=temperature
            )
            result = response.choices[0].message.content.strip()
            print(num)
            print(result)
            responses.append({'response': result})
        except Exception as e:
            # Surface the failure instead of recording it silently. str(prompt)
            # keeps this handler from raising again when the prompt itself is
            # not a string (which is one way to end up here).
            print(f"{num}: request failed: {e}")
            responses.append({'prompt': str(prompt).strip(), 'response': str(e)})
        time.sleep(delay)  # Add delay to avoid hitting rate limits

    return responses

## Rung1

In [None]:
import json
import openai
import time

# Read the Rung 1 questions from rung1.json
with open('rung1.json', 'r') as file:
    data = json.load(file)

# Instruction text placed in front of every question
instruction_preamble = (
    "You are a world-class AI system, capable of complex causal reasoning and reflection. "
    "Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags.\n\n"
    "If you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags.\n\n"
)

# One prompt and one gold answer per entry, in file order
prompts = [
    instruction_preamble + f"Given Info: {item['given_info']}\nQuestion: {item['question']}"
    for item in data
]
answers = [item['answer'] for item in data]
# Use the function to get responses from GPT


In [None]:
# Query the model once per Rung 1 prompt; the result is a list of dicts whose
# 'response' entry holds the raw reply (or the error message on failure).
responses1 = Cladder_gpt_4o(prompts)

In [None]:
# Pull the final <output> answer out of every raw Rung 1 reply
cleaned_responses = list(map(clean_response, (entry['response'] for entry in responses1)))

# Line up each prompt with its gold answer and the model's parsed answer
comparison_df = pd.DataFrame(
    {'prompt': prompts, 'answer': answers, 'response': cleaned_responses}
)

# A row is correct when both answers match exactly, ignoring case
gold = comparison_df['answer'].str.lower()
predicted = comparison_df['response'].str.lower()
comparison_df['correct'] = gold == predicted

# Share of correct rows, reported as a percentage
accuracy = comparison_df['correct'].mean()
accuracy_percentage = accuracy * 100
print(f"Accuracy: {accuracy_percentage:.2f}%")

Accuracy: 87.34%


In [None]:
# Define the target directory relative to the repository root
target_directory = '../model/GPT4o_Cladder_fine_tuned'

# Check if the directory exists, and create it if not
if not os.path.exists(target_directory):
    os.makedirs(target_directory)  # Create the directory and all intermediate-level directories

# Define the full path where you want to save the CSV file
save_path = os.path.join(target_directory, 'Rung1_responses.csv')

# Save the DataFrame to the specified path
comparison_df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
!git remote set-url origin https://$GITHUB_TOKEN@github.com/.../Research.git
!git add {save_path}
!git commit -m "Added Rung1_responses to the model folder"
!git push origin main

## Rung2

In [None]:
import json
import openai
import time

# Read the Rung 2 questions from rung2.json
with open('rung2.json', 'r') as file:
    data = json.load(file)

# Instruction text placed in front of every question
instruction_preamble = (
    "You are a world-class AI system, capable of complex causal reasoning and reflection. "
    "Use your causal reasoning ability to deeply analyze the relationship between these factors. Reason through the query inside <thinking> tags and provide your final response(yes or no) inside <output> tags.\n\n"
    "If you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags.\n\n"
)

# One prompt and one gold answer per entry, in file order
prompts = [
    instruction_preamble + f"Given Info: {item['given_info']}\nQuestion: {item['question']}"
    for item in data
]
answers = [item['answer'] for item in data]
# Use the function to get responses from GPT


In [None]:
# Query the model once per Rung 2 prompt; the result is a list of dicts whose
# 'response' entry holds the raw reply (or the error message on failure).
responses2 = Cladder_gpt_4o(prompts)

In [None]:
import re

def clean_response(response):
    """Return the content of the last ``<output>...</output>`` section.

    NOTE: this repeats the ``clean_response`` defined earlier in the notebook;
    the redefinition is redundant but harmless.

    Args:
        response: Raw reply text from the model.

    Returns:
        The whitespace-stripped text of the final ``<output>`` section, or
        ``None`` when ``response`` is empty, is not a string, or has no
        complete ``<output>...</output>`` pair.
    """
    # Anything that is falsy or not a string cannot hold an answer
    if not (response and isinstance(response, str)):
        return None

    # DOTALL lets the answer span several lines; the last match is the final one
    found = re.findall(r'<output>(.*?)</output>', response, re.DOTALL)
    if not found:
        return None
    return found[-1].strip()



# Apply clean_response to every Rung 2 reply; entries without a parsable
# <output> section become None.
cleaned_responses = [clean_response(entry['response']) for entry in responses2]

In [None]:

# Line up each prompt with its gold answer and the model's parsed answer
comparison_df = pd.DataFrame(
    {'prompt': prompts, 'answer': answers, 'response': cleaned_responses}
)

# A row is correct when both answers match exactly, ignoring case
gold = comparison_df['answer'].str.lower()
predicted = comparison_df['response'].str.lower()
comparison_df['correct'] = gold == predicted

# Share of correct rows, reported as a percentage
accuracy = comparison_df['correct'].mean()
accuracy_percentage = accuracy * 100
print(f"Accuracy: {accuracy_percentage:.2f}%")

Accuracy: 72.30%


In [None]:
# Define the target directory relative to the repository root
target_directory = '../model/GPT4o_Cladder_fine_tuned'

# Check if the directory exists, and create it if not
if not os.path.exists(target_directory):
    os.makedirs(target_directory)  # Create the directory and all intermediate-level directories

# Define the full path where you want to save the CSV file
save_path = os.path.join(target_directory, 'Rung2_responses.csv')

# Save the DataFrame to the specified path
comparison_df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
!git remote set-url origin https://$GITHUB_TOKEN@github.com/.../Research.git
!git add {save_path}
!git commit -m "Added Rung2_responses to the model folder"
!git push origin main

## Rung3

In [None]:
import json
import openai
import time

# Read the Rung 3 questions from rung3.json
with open('rung3.json', 'r') as file:
    data = json.load(file)

# Instruction text placed in front of every question
instruction_preamble = (
    "You are a world-class AI system, capable of complex causal reasoning and reflection. "
    "Use your causal reasoning ability to deeply analyze the relationship between these factors. Reason through the query inside <thinking> tags and provide your final response(yes or no) inside <output> tags.\n\n"
    "If you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags.\n\n"
)

# One prompt and one gold answer per entry, in file order
prompts = [
    instruction_preamble + f"Given Info: {item['given_info']}\nQuestion: {item['question']}"
    for item in data
]
answers = [item['answer'] for item in data]
# Use the function to get responses from GPT


In [None]:
# Query the model once per Rung 3 prompt; the result is a list of dicts whose
# 'response' entry holds the raw reply (or the error message on failure).
responses3 = Cladder_gpt_4o(prompts)

In [None]:
import re

def clean_response(response):
    """Return the content of the last ``<output>...</output>`` section.

    NOTE: this repeats the ``clean_response`` defined earlier in the notebook;
    the redefinition is redundant but harmless.

    Args:
        response: Raw reply text from the model.

    Returns:
        The whitespace-stripped text of the final ``<output>`` section, or
        ``None`` when ``response`` is empty, is not a string, or has no
        complete ``<output>...</output>`` pair.
    """
    # Anything that is falsy or not a string cannot hold an answer
    if not (response and isinstance(response, str)):
        return None

    # DOTALL lets the answer span several lines; the last match is the final one
    found = re.findall(r'<output>(.*?)</output>', response, re.DOTALL)
    if not found:
        return None
    return found[-1].strip()



# Apply clean_response to every Rung 3 reply; entries without a parsable
# <output> section become None.
cleaned_responses = [clean_response(entry['response']) for entry in responses3]

In [None]:

# Line up each prompt with its gold answer and the model's parsed answer
comparison_df = pd.DataFrame(
    {'prompt': prompts, 'answer': answers, 'response': cleaned_responses}
)

# A row is correct when both answers match exactly, ignoring case
gold = comparison_df['answer'].str.lower()
predicted = comparison_df['response'].str.lower()
comparison_df['correct'] = gold == predicted

# Share of correct rows, reported as a percentage
accuracy = comparison_df['correct'].mean()
accuracy_percentage = accuracy * 100
print(f"Accuracy: {accuracy_percentage:.2f}%")



Accuracy: 68.64%


In [None]:
# Define the target directory relative to the repository root
target_directory = '../model/GPT4o_Cladder_fine_tuned'

# Check if the directory exists, and create it if not
if not os.path.exists(target_directory):
    os.makedirs(target_directory)  # Create the directory and all intermediate-level directories

# Define the full path where you want to save the CSV file
save_path = os.path.join(target_directory, 'Rung3_responses.csv')

# Save the DataFrame to the specified path
comparison_df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
!git remote set-url origin https://$GITHUB_TOKEN@github.com/.../Research.git
!git add {save_path}
!git commit -m "Added Rung3_responses to the model folder"
!git push origin main