# Cancer 
1. Pollution: exposure to pollutants, 
2. Smoker: smoking habit, 
3. Cancer: Cancer, 
4. Dyspnoea: Dyspnoea, 
5. Xray: getting positive xray result


# Asia
Domain description: pneumonology, a medical specialty that deals with diseases involving the respiratory tract 
1. asia,visited Asia/visiting Asian countries with high exposure to pollutants
2. tub,tuberculosis 
3. smoke,smoking cigarettes habit
4. lung,lung cancer 
5. bronc,bronchitis 
6. either,individual has either tuberculosis or lung cancer 
7. xray, getting positve xray result
8. dysp,"dyspnoae, laboured breathing


# Medical prediction



In [None]:
node_list_asia = [
    ("Visited Asia", "yes/no"),
    ("Smoker", "yes/no"),
    ("Tuberculosis", "yes/no"),
    ("Lung Cancer", "yes/no"),
    ("Bronchitis", "yes/no"),
    ("individual has either Tuberculosis or Lung Cancer", "yes/no"),
    ("X-ray Result", "positive/negative"),
    ("Dyspnea", "yes/no")
]

node_list_cancer = [
    ("Exposure to pollutants", "low/high"),
    ("Smoker", "yes/no"),
    ("individual has Cancer", "yes/no"),
    ("Dyspnea", "yes/no"),
    ("X-ray Result", "positive/negative"),
]


node_list_medical = [
    ("Eat Fatty Food", "Consumes fatty foods) / Does not consume fatty foods"),
    ("Arteriosclerosis", "Present/Absent"),
    ("Right Heart Syndrome", "Present/Absent"),
    ("Left Heart Syndrome", "Present/Absent"),
    ("Lungs Sound Funny", "Abnormal sounds/Normal sounds"),
    ("Difficulty Breathing", "Experiences difficulty/No difficulty"),
    ("Smoking", "Smoker/Non-smoker"),
    ("Radon Exposure", "Exposed/Not exposed"),
    ("Lung Cancer", "Present/Absent"),
    ("Cough Up Blood", "Yes/No"),
]

In [None]:
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

import pandas as pd
import itertools
import math
import json
import os

In [None]:
with open('creds.json') as file:
  creds = json.load(file)
# os.environ["OPENAI_API_KEY"] = creds["OPENAI_API_KEY"]
# llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini").bind(logprobs=True)

In [None]:
prompt = ChatPromptTemplate.from_messages([
    ("system",
     "You are a reasoning assistant tasked with identifying whether a *causal* relationship exists "
     "between two medical-related variables. Each variable has discrete states, listed in parentheses. "
     "Assume that variables represent conditions, risk factors, symptoms, or diagnostic test results related to a single patient. "
     "Causality should reflect real-world biomedical reasoning: for example, a disease might cause a symptom, or a risk factor might cause a disease. "
     "Only return 'True' if changing the first variable is likely to lead to a change in the second. "
     "Return only 'True' or 'False'. Do not confuse correlation with causation."),
    
    ("human",
    "Consider the following two variables:\n"
    "  - {cause}: ({cause_states})\n"
    "  - {effect}: ({effect_states})\n\n"
    "Is it plausible that changes in '{cause}' can {causal_verb} '{effect}'?"
    "Answer only with 'True' or 'False'."),
])    

In [None]:
def get_logprob_for_answer(logprobs, answer):
    for entry in logprobs:
        if entry['token'].strip() == answer:
            return entry['logprob']
    return None  

def message_func (messages):
    msg = llm.invoke(messages)
    answer = msg.content.strip()
    logprobs = msg.response_metadata["logprobs"]["content"]
    logprob = get_logprob_for_answer(logprobs, answer)
    return answer, logprob


def calculate_probability(logprob):
    if logprob is not None:
        return math.exp(logprob)
    return None  # Return None if logprob is None

def new_row(node1, node2, verb, answer, probability):
    return pd.DataFrame([{"var1": node1, "var2": node2, "verb": verb, "answer": answer, "probability": probability}])

def process_input(node1, node2, verb):
    
    cause, cause_states = node1
    effect, effect_states = node2
    
    messages = prompt.format_messages(
        cause=cause,
        effect=effect,
        cause_states=cause_states,
        effect_states=effect_states,
        causal_verb=verb
    )
    
    answer, logprob = message_func(messages)
    probability = calculate_probability(logprob)
    return new_row(node1, node2, verb, answer, probability)

In [25]:
node_list_medical

[('Eat Fatty Food', 'Consumes fatty foods) / Does not consume fatty foods'),
 ('Arteriosclerosis', 'Present/Absent'),
 ('Right Heart Syndrome', 'Present/Absent'),
 ('Left Heart Syndrome', 'Present/Absent'),
 ('Lungs Sound Funny', 'Abnormal sounds/Normal sounds'),
 ('Difficulty Breathing', 'Experiences difficulty/No difficulty'),
 ('Smoking', 'Smoker/Non-smoker'),
 ('Radon Exposure', 'Exposed/Not exposed'),
 ('Lung Cancer', 'Present/Absent'),
 ('Cough Up Blood', 'Yes/No')]

In [None]:

# path ="results/cancer/"
# path = "results/asia/"
path = "results/medicine/"


causal_verbs = [
    "cause",
    "lead to",
    "result in",
    "affect",
    "influence",
    "increase the chance of",
    "raise the risk of"
]

df_all = pd.DataFrame(columns=["var1", "var2", "verb","answer", "probability"])

for verb in causal_verbs:
    df_results_var3 = pd.DataFrame(columns=["var1", "var2", "verb","answer", "probability"])
  
    for node1, node2 in itertools.combinations(node_list_medical, 2):
            if node1 != node2:
                print(f"Processing {node1} and {node2} with verb '{verb}'")
                
                new_row_data = process_input(node1, node2, verb)
                df_results_var3 = pd.concat([df_results_var3, new_row_data], ignore_index=True)
                df_all = pd.concat([df_all, new_row_data], ignore_index=True)                

                new_row_data = process_input(node2, node1, verb)
                df_results_var3 = pd.concat([df_results_var3, new_row_data], ignore_index=True)
                df_all = pd.concat([df_all, new_row_data], ignore_index=True)

                
    df_results_var3.to_csv(f"{path}causal_relationships_{verb}.csv", index=False)            

df_all.to_csv(f"{path}medicine_all.csv", index=False)   

In [28]:
df_results_var3.head(3)

Unnamed: 0,var1,var2,verb,answer,probability
0,"(Eat Fatty Food, Consumes fatty foods) / Does ...","(Arteriosclerosis, Present/Absent)",raise the risk of,True,1.0
1,"(Arteriosclerosis, Present/Absent)","(Eat Fatty Food, Consumes fatty foods) / Does ...",raise the risk of,False,1.0
2,"(Eat Fatty Food, Consumes fatty foods) / Does ...","(Right Heart Syndrome, Present/Absent)",raise the risk of,False,0.562176


In [11]:
df_results_var3.to_csv(f"{path}causal_relationships_{verb}.csv", index=False)  