In [None]:
import os
import sys
import pandas as pd
pd.options.mode.copy_on_write = True 

from pathlib import Path
import cdata_utils
import numpy as np
import matplotlib.pyplot as plt
import cdata_utils.utils
import datetime


import json

#import cdata_utils.preprocess.read_and_clean_tabular
from cdata_utils.project_specific.psvd import (
    read_and_clean_PSVD_data__BL_consensus,
    read_and_clean_PSVD_data__BL_consensus_NEW,
    categorize_PSVD_data,
    exclude_patients,
    reorder_some_categorical_values, 
    table1_psvd, 
    table1_psvd_spleen, 
    descriptive_df_from_masks, 
    masks_for_endpoint_1__decompensation, 
    masks_for_endpoint_2__death,
    make_y_delta,
    drop_non_numeric_columns,
    table_of_valid_entries, 
    univariate_cox_ph_summary, 
    normalize_df, 
    load_EP1_EP2_data,
    relevant_column_names,
    relevant_column_names_clinical,
    categorize_PSVD_clinical_data,
    load_clinical_data,
)




import cdata_utils.preprocess
import cdata_utils.project_specific
import cdata_utils.project_specific.psvd


import lifelines
from lifelines import CoxPHFitter
from lifelines.datasets import load_rossi

from sklearn.preprocessing import StandardScaler

# Path info: choose the input and output directories for the machine the
# notebook is running on, detected from the current working directory.
if "cwatzenboeck" in os.getcwd():  # desktop
    data_path = Path("/home/cwatzenboeck/Dropbox/work/data/livermodel/PSVD/")
    data_path_output = Path("/home/cwatzenboeck/data/psvd/output_coxph/")
else:  # laptop
    data_path = Path("/home/clemens/Dropbox/work/data/livermodel/PSVD/")
    # data_path = Path("/home/clemens/projects/project_liver_model/data/PSVD")
    # The output directory used to be defined only on the desktop branch, so
    # every later `to_excel(data_path_output / ...)` raised NameError here.
    # NOTE(review): this location mirrors the desktop layout - adjust if the
    # laptop keeps its results somewhere else.
    data_path_output = Path("/home/clemens/data/psvd/output_coxph/")
    



In [None]:
# Load the two endpoint frames (bound to df1 and df2) from the unified PSVD workbook.
endpoint_frames = load_EP1_EP2_data(data_path, file_name="data_PSVD_unified_3.xlsx")
df1, df2 = endpoint_frames

In [None]:
# Inspect the columns holding categorical values (which might be encoded differently):
#    BL1_Ascites (0=none, 1=little, 2=moderate, 3=severe)
#    BL PV overall extent (no PVT=0, <50%=1, ≥50%=2)
#    BL segment IV MW (-1 = atrophy, 0 = normal,  1 = hypertrophy)
#    BL segment 1 consensus (-1 = atrophy, 0 = normal,  1 = hypertrophy)
#
# NOTE(review): `cdata_utils.descriptive` is never imported explicitly in this
# notebook; this call presumably relies on it being loaded as a side effect of
# the other cdata_utils imports - confirm on a fresh kernel.
categorical_candidates = df1.filter(
    regex="Ascites|PV overall extent|segment IV MW|segment 1 consensus"
)
categorical_stats = cdata_utils.descriptive.basic_stats.describe(categorical_candidates)

# Show the rows of the description from position 8 onwards, one input column per row.
categorical_stats.iloc[8:, :].transpose()


In [None]:
# Names of the df1 columns matching the categorical-candidate pattern.
category_candidates = df1.filter(
    regex="Ascites|PV overall extent|segment IV MW|segment 1 consensus"
)
potentially_category_columns = list(category_candidates.columns)
#potentially_category_columns = list(df1.filter(regex="PV overall extent").columns)

# Two covariates plus the status / time-to-event columns for a quick check.
quick_check_columns = [
    "BL Location consensus binary cat.",
    "BL Ascites mean",
    "status",
    "event",
]
df1_ = df1[quick_check_columns]

# CoxPH for EP1 (displayed as the cell output):
univariate_cox_ph_summary(
    XY=df1_,
    duration_col="event",
    event_col="status",
)

In [None]:
# Univariate CoxPH summary for EP1, computed from the full df1 frame.
ep1_cox_columns = {"duration_col": "event", "event_col": "status"}
df1_ph = univariate_cox_ph_summary(XY=df1, **ep1_cox_columns)



In [None]:

# One-hot encode the atrophy / normal / hypertrophy style columns for EP1,
# keeping every level (drop_first=False), and rerun the univariate CoxPH on them.
ep1_subset_columns = [*potentially_category_columns, "status", "event"]
df1_ = df1[ep1_subset_columns]
df1_cat = cdata_utils.project_specific.psvd.categorize(
    df1_,
    category_columns=potentially_category_columns,
    drop_first=False,
)
df1_ph_add = univariate_cox_ph_summary(
    XY=df1_cat,
    duration_col="event",
    event_col="status",
)
# print(df1_ph_add.sort_values("p", ascending=True))

# Write both EP1 summaries to the output directory (original encoding first).
ep1_path_original = data_path_output / "EP1_outcome_univariate.xlsx"
ep1_path_one_hot = data_path_output / "EP1_outcome_univariate_different_endcoding.xlsx"
df1_ph.to_excel(ep1_path_original, index=False)
df1_ph_add.to_excel(ep1_path_one_hot, index=False)

# Stack both summaries row-wise into one table.
df1_ph_all = pd.concat([df1_ph, df1_ph_add], axis=0)

#df1_ph_all[df1_ph_all["p"] <= 0.157][["covariate", "p"]].sort_values("p", ascending=True)

In [None]:
# Univariate CoxPH summary for EP2 on the full df2 frame, exported to Excel.
df2_ph = univariate_cox_ph_summary(
    XY=df2,
    duration_col="event",
    event_col="status",
)
ep2_univariate_path = data_path_output / "EP2_outcome_univariate.xlsx"
df2_ph.to_excel(ep2_univariate_path, index=False)
# df2_ph.sort_values("p", ascending=True)



In [None]:
# For a hand-picked list of EP2 covariates, print the sum and the variance
# within the event group and within the non-event group.
# NOTE(review): presumably these covariates caused trouble in the Cox fits -
# the list itself is not derived in this notebook.
df = df2
events = df['status'].astype(bool)

problematic_covariates = [
    'BL Atrophy/hypertrophy complex consensus',
    'BL intrahepatic shunts consensus',
    'BL Splanchnic thrombosis consensus binary cat. 2',
    'BL Intrahepatic portal abnormalities consensus binary cat. 1',
    'BL Intrahepatic portal abnormalities consensus binary cat. 3',
]

for c in problematic_covariates:
    print("covariate = ", c)
    with_event = df.loc[events, c]
    without_event = df.loc[~events, c]
    print( sum(with_event), sum(without_event)  )
    print(with_event.var())
    print(without_event.var())
    print()


In [None]:

# One-hot encode the atrophy / normal / hypertrophy style columns for EP2,
# keeping every level (drop_first=False), then rerun the univariate CoxPH.
ep2_subset_columns = [*potentially_category_columns, "status", "event"]
df2_ = df2[ep2_subset_columns]
df2_cat = cdata_utils.project_specific.psvd.categorize(
    df2_,
    category_columns=potentially_category_columns,
    drop_first=False,
)
df2_ph_add = univariate_cox_ph_summary(
    XY=df2_cat,
    duration_col="event",
    event_col="status",
)

# Export the summary for the one-hot encoded columns.
ep2_path_one_hot = data_path_output / "EP2_outcome_univariate_different_endcoding.xlsx"
df2_ph_add.to_excel(ep2_path_one_hot, index=False)

# df2_ph_all = pd.concat([df2_ph, df2_ph_add], axis=0)
# df2_ph_all[df2_ph_all["p"] <= 0.157][["covariate", "p"]]


In [None]:
# Same per-group check as for the raw EP2 covariates, now on selected one-hot
# encoded columns: print sum and variance for event and non-event rows.
df = df2_cat
events = df['status'].astype(bool)

problematic_covariates = [
    'BL PV overall extent (no PVT=0, <50%=1, ≥50%=2)_1',
    'BL segment 1 consensus_-1',
    'BL segment IV MW_-1',
]

for c in problematic_covariates:
    print("covariate = ", c)
    with_event = df.loc[events, c]
    without_event = df.loc[~events, c]
    print( sum(with_event), sum(without_event)  )
    print(with_event.var())
    print(without_event.var())
    print()

# Clinical parameters 


In [None]:


# Flag for dropping cases with non-positive time-to-event; currently only
# referenced by the commented-out filtering code further down in this cell.
drop_negative_times_to_event_cases = True

# Load the clinical parameters and attach them to both endpoint frames
# (index-based join), leaving out the clinical sex column.
df_c = load_clinical_data(
    data_path,
    file_name="data_PSVD_unified_3.xlsx",
    drop_modfied_colums=True,
)
clinical_without_sex = df_c.drop(columns=['Sex (1=male, 2=female)'])
df1c = df1.join(clinical_without_sex)
df2c = df2.join(clinical_without_sex)

# cols_clinical = relevant_column_names(dfo, chill=False)

# NOTE(review): this uses `df1_` as left behind by an earlier cell, not the
# clinical frame `df1c` built above - confirm this is intended.
univariate_cox_ph_summary(
    XY=df1_,
    duration_col="event",
    event_col="status",
)

# c1 = "1. Decompensation date"
# c2 = "Death"
# df1c = make_y_delta(df_c, c1)
# df2c = make_y_delta(df_c, c2)



# # drop negative event cases maybe: 
# m = df1c["event"] <= 0
# if drop_negative_times_to_event_cases and sum(m>0):
#     df1c = df1c[~m]
#     print(f"Drop cases with negative time-to-event: ", list(df1c[m]["ID"]), " for EP1")
# elif not drop_negative_times_to_event_cases and sum(m>0):
#     print(f"WARNING negative time-to-event: ", list(df1c[m]["ID"]), " for EP1 were NOT dropped")
    

# m = df2c["event"] <= 0
# if drop_negative_times_to_event_cases and sum(m>0):
#     df2c = df2c[~m]
#     print(f"Drop cases with negative time-to-event: ", list(df2c[m]["ID"]), " for EP2")
# elif not drop_negative_times_to_event_cases and sum(m>0):
#     print(f"WARNING negative time-to-event: ", list(df2c[m]["ID"]), " for EP2 were NOT dropped")

In [None]:
# Show every column name of the joined EP1 + clinical frame.
df1c.columns.tolist()

In [None]:
# NOTE(review): bare expression in the middle of the cell; with IPython's
# default settings only the last expression of a cell is displayed, so this
# line presumably shows nothing - confirm and remove if so.
df1c



# Regex patterns used to pick the clinical covariates by column name.
# The patterns containing backslashes are raw strings: "\(" in a normal string
# literal is an invalid escape sequence (a warning today, an error in future
# Python versions). The resulting string values are unchanged.
regex_patterns_clinical = [
    "age|Age",
    "sex|Sex",
    # "BL decompensated", # This makes no sense for EP1
    "Child-Pugh-Score",
    "MELD", "Crea", "BL_Na", "Plt|PLT", "Alb", "WBC",
    "PSVD cause.*1",
    "PSVD cause.*2",
    "PSVD cause.*3",
    "PSVD cause.*4",
    r"HVPG \(mmHg\)", r"LSM \(kPa\)",
]
    
# Resolve the clinical regex patterns to column names of df1c.
columns = relevant_column_names(
    df1c,
    regex_patterns=regex_patterns_clinical,
    chill=False,
)

# Univariate CoxPH for EP1 on the selected clinical columns only.
ep1_clinical_frame = df1c[columns + ["status", "event"]]
df1c_ph = univariate_cox_ph_summary(
    XY=ep1_clinical_frame,
    duration_col="event",
    event_col="status",
)

# Export the clinical summary.
ep1_clinical_path = data_path_output / "EP1_clinical_parameters_outcome_univariate.xlsx"
df1c_ph.to_excel(ep1_clinical_path, index=False)
# df2_ph.sort_values("p", ascending=True)



In [None]:
# Display the clinical EP1 summary ordered by ascending "p".
df1c_ph.sort_values(by="p", ascending=True)