In [10]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
import os
import glob
import pandas as pd
import sys  

from scipy.stats import friedmanchisquare
from scipy.stats import wilcoxon

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)


In [12]:
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

from src import metrics

In [13]:
path = "results/asia/"
files = glob.glob(os.path.join(path, "*.csv"))

df_combined = pd.DataFrame()

for file in files:
    print(file)
    df = pd.read_csv(file)
    df = metrics.clean_dataframe(df)
    df_combined = pd.concat([df_combined, df], ignore_index=True)
    

results/asia\causal_relationships_affect.csv
results/asia\causal_relationships_cause.csv
results/asia\causal_relationships_increase_the_chance_.csv
results/asia\causal_relationships_influence.csv
results/asia\causal_relationships_lead_to.csv
results/asia\causal_relationships_raise_the_risk.csv
results/asia\causal_relationships_result_in.csv


In [14]:
df_combined

Unnamed: 0,var1,var2,verb,answer,probability,answer_binary
0,Visited_Asia,Smoker,affect,False,0.999999,0
1,Smoker,Visited_Asia,affect,False,0.999999,0
2,Visited_Asia,Tuberculosis,affect,True,1.000000,1
3,Tuberculosis,Visited_Asia,affect,False,0.999999,0
4,Visited_Asia,Lung_Cancer,affect,False,1.000000,0
...,...,...,...,...,...,...
387,X-ray_Result,individual_has_either_Tuberculosis_or_Lung_Cancer,result_in,False,1.000000,0
388,individual_has_either_Tuberculosis_or_Lung_Cancer,Dyspnea,result_in,True,1.000000,1
389,Dyspnea,individual_has_either_Tuberculosis_or_Lung_Cancer,result_in,False,0.999999,0
390,X-ray_Result,Dyspnea,result_in,False,1.000000,0


In [15]:
verb_list = list(df_combined["verb"].unique())
print("Unique verbs:" , (verb_list))

Unique verbs: ['affect', 'cause', 'increase_the_chance_of', 'influence', 'lead_to', 'raise_the_risk_of', 'result_in']


In [20]:
path_list = ["results/asia/", "results/cancer/", "results/medicine/"]

df_combined = pd.DataFrame()
df_results = pd.DataFrame()

for path in path_list:
    print(path)
    files = glob.glob(os.path.join(path, "*.csv"))
    for file in files:
        # print(file)
        df = pd.read_csv(file)
        df = metrics.clean_dataframe(df)
        df_combined = pd.concat([df_combined, df], ignore_index=True)

    verb_list = list(df_combined["verb"].unique())

    groups = [df_combined[df_combined["verb"] == verb]["probability"] for verb in verb_list]
    # Run the Friedman test
    friedman_stat, p_value = friedmanchisquare(*groups)

    print("Friedman test statistic:", friedman_stat)
    print("Friedman test p-value:", p_value)

    # Perform post-hoc or significance check
    df_new  = metrics.significant_check(df_combined, "probability", p_value, path, verb_list)
    df_results = pd.concat([df_results, df_new], ignore_index=True)


results/asia/
Friedman test statistic: 20.623880597014924
Friedman test p-value: 0.0021429213755423697
Friedman test is significant, performing post-hoc tests
Significant difference between cause and influence
Significant difference between cause and raise_the_risk_of
Significant difference between influence and lead_to
Significant difference between influence and result_in
results/cancer/
Friedman test statistic: 29.00959692898247
Friedman test p-value: 6.0583049865279704e-05
Friedman test is significant, performing post-hoc tests
Significant difference between cause and influence
Significant difference between cause and raise_the_risk_of
Significant difference between cause and result_in
Significant difference between influence and lead_to
Significant difference between influence and result_in
results/medicine/
Friedman test statistic: 51.6858982451716
Friedman test p-value: 2.1566644240983773e-09
Friedman test is significant, performing post-hoc tests
Significant difference between 

In [48]:
display(df_results[df_results["data"] == "results/asia/"],
df_results[df_results["data"] == "results/cancer/"],
df_results[df_results["data"] == "results/medicine/"])

Unnamed: 0,verb1,verb2,statistic,p_value,significant,data
0,affect,cause,441.0,0.276817,False,results/asia/
1,affect,increase_the_chance_of,446.0,0.744298,False,results/asia/
2,affect,influence,385.5,0.741266,False,results/asia/
3,affect,lead_to,332.5,0.136557,False,results/asia/
4,affect,raise_the_risk_of,489.0,0.573534,False,results/asia/
5,affect,result_in,361.5,0.118948,False,results/asia/
6,cause,increase_the_chance_of,455.5,0.483958,False,results/asia/
7,cause,influence,362.0,0.032365,True,results/asia/
8,cause,lead_to,403.0,0.397569,False,results/asia/
9,cause,raise_the_risk_of,384.5,0.036837,True,results/asia/


Unnamed: 0,verb1,verb2,statistic,p_value,significant,data
21,affect,cause,763.5,0.13528,False,results/cancer/
22,affect,increase_the_chance_of,836.5,0.714259,False,results/cancer/
23,affect,influence,801.5,0.842403,False,results/cancer/
24,affect,lead_to,689.0,0.274522,False,results/cancer/
25,affect,raise_the_risk_of,891.5,0.862631,False,results/cancer/
26,affect,result_in,747.5,0.299174,False,results/cancer/
27,cause,increase_the_chance_of,780.5,0.32207,False,results/cancer/
28,cause,influence,673.0,0.014069,True,results/cancer/
29,cause,lead_to,681.5,0.249119,False,results/cancer/
30,cause,raise_the_risk_of,693.5,0.013251,True,results/cancer/


Unnamed: 0,verb1,verb2,statistic,p_value,significant,data
42,affect,cause,3467.5,0.036286,True,results/medicine/
43,affect,increase_the_chance_of,3632.0,0.760039,False,results/medicine/
44,affect,influence,3111.0,0.531397,False,results/medicine/
45,affect,lead_to,3362.0,0.200563,False,results/medicine/
46,affect,raise_the_risk_of,4131.0,0.768673,False,results/medicine/
47,affect,result_in,3955.5,0.912699,False,results/medicine/
48,cause,increase_the_chance_of,3567.5,0.062015,False,results/medicine/
49,cause,influence,2980.0,0.001358,True,results/medicine/
50,cause,lead_to,3035.0,0.049335,True,results/medicine/
51,cause,raise_the_risk_of,3721.5,0.005778,True,results/medicine/


In [51]:

df_filter_true = df_results[df_results["significant"] == True]
print(df_filter_true.verb1.unique())
df_filter_true

['cause' 'influence' 'affect']


Unnamed: 0,verb1,verb2,statistic,p_value,significant,data
7,cause,influence,362.0,0.032365,True,results/asia/
9,cause,raise_the_risk_of,384.5,0.036837,True,results/asia/
15,influence,lead_to,247.0,0.006316,True,results/asia/
17,influence,result_in,270.5,0.014396,True,results/asia/
28,cause,influence,673.0,0.014069,True,results/cancer/
30,cause,raise_the_risk_of,693.5,0.013251,True,results/cancer/
31,cause,result_in,576.0,0.046511,True,results/cancer/
36,influence,lead_to,511.5,0.007717,True,results/cancer/
38,influence,result_in,549.5,0.027691,True,results/cancer/
42,affect,cause,3467.5,0.036286,True,results/medicine/


In [None]:
# df_results.to_csv("results/sensitivity_verb.csv", index=False)

In [None]:
path_list = ["results/asia/", "results/cancer/", "results/medicine/"]

df_combined_2 = pd.DataFrame()
df_results_2 = pd.DataFrame()

for path in path_list:
    print(path)
    files = glob.glob(os.path.join(path, "*.csv"))
    for file in files:
        # print(file)
        df = pd.read_csv(file)
        df = metrics.clean_dataframe(df)
        df_combined_2 = pd.concat([df_combined_2, df], ignore_index=True)
        
    #group by the verb and calculate the mean probability and std
    df_combined_2 = df_combined_2.groupby("verb").agg({"probability": ["mean", "std"]}).reset_index()
    df_combined_2.columns = ["verb", "mean_probability", "std_probability"]
    df_combined_2["data"] = path.split("/")[-2]  # Extract the folder name as data


    df_results_2 = pd.concat([df_results_2, df_combined_2], ignore_index=True)


results/asia/
results/cancer/
results/medicine/


In [63]:
display(df_results_2[df_results_2["data"] == "asia"],
df_results_2[df_results_2["data"] == "cancer"],
df_results_2[df_results_2["data"] == "medicine"])

Unnamed: 0,verb,mean_probability,std_probability,data
0,affect,0.994768,0.021891,asia
1,cause,0.991857,0.043287,asia
2,increase_the_chance_of,0.970307,0.107637,asia
3,influence,0.996214,0.020654,asia
4,lead_to,0.994172,0.0276,asia
5,raise_the_risk_of,0.996012,0.015194,asia
6,result_in,0.992021,0.030638,asia


Unnamed: 0,verb,mean_probability,std_probability,data
7,affect,0.997565,0.010592,cancer
8,cause,0.997849,0.008361,cancer
9,increase_the_chance_of,0.962147,0.108026,cancer
10,influence,0.999788,0.00059,cancer
11,lead_to,0.999675,0.001171,cancer
12,raise_the_risk_of,0.995934,0.013791,cancer
13,result_in,0.983407,0.07165,cancer


Unnamed: 0,verb,mean_probability,std_probability,data
14,affect,0.996561,0.023388,medicine
15,cause,0.995238,0.032632,medicine
16,increase_the_chance_of,0.990724,0.040776,medicine
17,influence,0.99892,0.005729,medicine
18,lead_to,0.990238,0.054789,medicine
19,raise_the_risk_of,0.983231,0.076229,medicine
20,result_in,0.997544,0.019288,medicine
