In [None]:
import pandas as pd
import os
import re
from datetime import datetime
from dateutil import relativedelta
import numpy as np
import math
import statsmodels.formula.api as smf
import statsmodels.api as sm
import scipy
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

###  Process demographic data

Utility Functions

In [None]:

def read_csv(file_name, delimiter = ',', chunksize = 10000, na_filter = False) :
    """Lazily read a delimited text file, yielding one DataFrame per chunk.

    Parameters
    ----------
    file_name : path or file-like object accepted by ``pd.read_csv``.
    delimiter : field separator.
    chunksize : number of rows per yielded DataFrame.
    na_filter : forwarded to ``pd.read_csv``; ``False`` keeps empty cells as
        empty strings instead of converting them to NaN.

    Yields
    ------
    pandas.DataFrame
        Consecutive chunks of at most ``chunksize`` rows.
    """
    chunk_reader = pd.read_csv(
        file_name,
        delimiter=delimiter,
        chunksize=chunksize,
        na_filter=na_filter,
    )
    yield from chunk_reader

def createDate(str, fmt="%Y-%m-%d"):
    """Parse a date string into a ``datetime``; return ``None`` if it cannot be parsed.

    Parameters
    ----------
    str : the text to parse (the parameter name shadows the builtin but is
        kept for backward compatibility with keyword callers).
    fmt : ``strptime`` format, ISO ``YYYY-MM-DD`` by default.

    Returns
    -------
    datetime.datetime or None
    """
    try:
        return datetime.strptime(str, fmt)
    except (TypeError, ValueError):
        # ValueError: text does not match ``fmt``.
        # TypeError: input is not a string (e.g. None, NaN or the 0 placeholder).
        # Anything else (KeyboardInterrupt, SystemExit, ...) is allowed to propagate.
        return None

def num_to_range(x, thresholds):
  """Map a number onto a textual range label defined by ascending thresholds.

  For consecutive thresholds ``lo`` and ``hi`` the label is ``"<lo+1>-<hi>"``
  for the first ``hi`` with ``int(x) <= hi``. Values above the last threshold
  get ``"<last>+"``. NaN yields ``None``.
  """
  if math.isnan(x):
    return None
  for pos in range(1, len(thresholds)):
    upper = thresholds[pos]
    if int(x) <= upper:
      lower = thresholds[pos - 1]
      return f"{lower+1}-{upper}"
  # Beyond every upper bound: open-ended label on the last threshold.
  return f"{thresholds[len(thresholds)-1]}+"

def isPrevalent(exposure_date, index_date):
  """Return 1 if the exposure was recorded on or before the index date, else 0.

  Parameters
  ----------
  exposure_date : date-like value, or the string ``"0"`` meaning
      "no record for this patient".
  index_date : timestamp the exposure is compared against.

  Returns
  -------
  int
      1 when ``exposure_date <= index_date``; 0 when the exposure is later,
      is the ``"0"`` placeholder, or when either date is missing (NaT).
  """
  if exposure_date=="0" :
    return 0
  exposure_date = pd.to_datetime(exposure_date)
  # Comparisons against NaT are always False, so without this guard a missing
  # exposure or index date would be counted as prevalent.
  if pd.isna(exposure_date) or pd.isna(index_date):
    return 0
  return 1 if exposure_date <= index_date else 0

def isPostIncident(exposure_date, index_date):
  """Return 1 if the exposure was recorded strictly after the index date, else 0.

  The string ``"0"`` is the placeholder for "no record" and always gives 0.
  Any other value is converted with ``pd.to_datetime`` before comparison.
  """
  if exposure_date == "0":
    return 0
  onset = pd.to_datetime(exposure_date)
  return 1 if onset > index_date else 0


Set input and output directories

In [None]:
# Root folder with one sub-folder per custom phenotype; each sub-folder is
# read as "<pheno>/<pheno>_summary_report.csv" further down.
pheno_path = "/genesandhealth/red/AshithaJoby/custom_phenotypes/"
# Folder holding the per-cancer case lists ("<name>.Cases.csv").
case_path = "/genesandhealth/red/AshithaJoby/Cases/"
# Output root; result files are written under out_path + mode + "/".
out_path = "/genesandhealth/red/AshithaJoby/results/comorbidity/"
# Full path of the cohort demographics CSV (a file, not a directory).
demo_path = "/genesandhealth/red/AshithaJoby/results/07052024/cohort_demo.csv"

#### Build case and control groups
1. Derive the core cancer cohort from patients having at least one primary cancer diagnosis. Based on the specific cancer of interest in the case-control analysis, the case's age at the diagnosis date will be assigned as the Index age.
2. Exclude the core cancer cohort from the overall population to get the core control cohort. Also exclude patients with secondary malignancies. Age at recruitment to G&H will be considered as the Index age.
3. Exclude patients from both case and control groups with Index age < 18.

Choose one of the two cancer phenotype groups and follow rest of the pipeline

In [None]:
# Load the cohort demographics table and parse the date of birth.
# `demo_path` already points at the CSV file itself.
df_cohort = pd.read_csv(demo_path, header=0)
df_cohort["DOB"] = df_cohort["DOB"].apply(createDate)  # unparseable values become missing
population = len(df_cohort.index)
print(f"Population size:{population}")

In [None]:
df_cohort

1. GNH cancer phenotypes

In [None]:
mode = 'GNH'
# Maps the phenotype token used in the case file name
# ("Primary_Malignancy_<token>.Cases.csv") to the short display label.
cancer_dict = {
    'Oro-pharyngeal': 'Oro-pharyngeal',
    'Other_Skin_and_subcutaneous_tissue': 'Skin (other)',
    'biliary_tract': 'Biliary tract',
    'bladder': 'Bladder',
    'bone_and_articular_cartilage': 'Bone',
    'brain_other_CNS_and_intracranial': 'CNS',
    'breast': 'Breast',
    'cervical': 'Cervical',
    'pancreatic': 'Pancreatic',
    'prostate': 'Prostate',
    'colorectal_and_anus': 'Colorectal',
    'kidney_and_ureter': 'Kidney',
    'liver': 'Liver',
    'stomach': 'Stomach',
    'testicular': 'Testicular',
    'thyroid': 'Thyroid',
    'uterine': 'Uterine',
    'ovarian': 'Ovarian',
    'lung_and_trachea': 'Lung',
    'malignant_melanoma': 'Melanoma',
    'mesothelioma': 'Mesothelioma',
    'oesophageal': 'Oesophageal',
    'Hodgkin_Lymphoma': 'Hodgkin Lymphoma',
    'Leukaemia': 'Leukaemia',
    'Multiple_myeloma_and_malignant_plasma_cell_neoplasms': 'Myeloma',
    'Non-Hodgkin_Lymphoma': 'Non-Hodgkin Lymphoma',
    'Other_GNH': 'Other',
}
# Other_GNH cases are manually formatted from GNH curated phenotype case directory.
cancer_group = cancer_dict.keys()
pheno_list = os.listdir(pheno_path)
case_list = os.listdir(case_path)

# Sex-specific cancers: records are kept only for patients whose recorded
# Gender matches. One lookup table replaces six copy-pasted branches.
sex_specific_cancers = {
    'prostate': 'male',
    'testicular': 'male',
    'breast': 'female',
    'cervical': 'female',
    'uterine': 'female',
    'ovarian': 'female',
}

cancer_frames = []
for cancer_ph in cancer_group:
    filename = 'Primary_Malignancy_'+cancer_ph+'.Cases.csv'
    cancer = pd.read_csv(case_path+filename, na_filter=False)
    # Accept both the raw header (nhs_number/date/code) and the header written
    # back below (PseudoNHSnumber/DiagDt/Code) on a previous run.
    cancer = cancer.rename(columns={'nhs_number': 'PseudoNHSnumber', 'date': 'DiagDt', 'code': 'Code'})

    required_gender = sex_specific_cancers.get(cancer_ph)
    if required_gender is not None:
        cancer_demo = cancer.merge(df_cohort, how='left', on='PseudoNHSnumber')
        cancer_demo = cancer_demo[cancer_demo["Gender"]==required_gender]
        # .copy() so the column assignments below do not write to a slice.
        cancer = cancer_demo.loc[:, ["PseudoNHSnumber", "DiagDt", "Code"]].copy()
        # The filtered list overwrites the case file in place: the step that
        # attaches cancer dates to the analysis cohort re-reads these files
        # and therefore relies on the filtered version.
        cancer.to_csv(case_path+filename, index=False)

    cancer.columns = ["PseudoNHSnumber", "date", "code"]
    cancer["cancer"] = cancer_dict[cancer_ph]
    cancer["date"] = cancer["date"].apply(createDate)
    cancer_frames.append(cancer)

# Concatenate once instead of growing the frame inside the loop.
cancer_cohort = pd.concat(cancer_frames, ignore_index=True)

In [None]:
cancer_cohort

In [None]:
# Sanity checks on the pooled case table:
# (rows, columns), number of distinct patients, and records per cancer label.
print(cancer_cohort.shape)
print(cancer_cohort["PseudoNHSnumber"].nunique())
print(cancer_cohort["cancer"].value_counts())

Build analysis cohort

In [None]:
# Identify core case and control cohort: a patient is a case when they appear
# at least once in the pooled primary-cancer table.
is_case = df_cohort["PseudoNHSnumber"].isin(cancer_cohort["PseudoNHSnumber"])
case_cohort = df_cohort[is_case]
control_cohort = df_cohort[~is_case]

# Exclude controls with age < 18
# NOTE(review): no equivalent filter on the cases' index age is applied in
# this cell — confirm it is enforced elsewhere, as the section text requires.
control_cohort = control_cohort[control_cohort["Age_at_recruitment"]>=18]

# Exclude controls with secondary cancer, i.e., metastasis or recurrence
filename = 'Secondary_Malignancy_and_metastasis.Cases.csv'
secondary_cases = pd.read_csv(case_path+filename, na_filter=False)
secondary_cases.columns = ["PseudoNHSnumber","date","code"]
has_secondary = control_cohort["PseudoNHSnumber"].isin(secondary_cases["PseudoNHSnumber"])
control_cohort = control_cohort[~has_secondary]

# .assign returns new frames, so the label is never written onto a filtered
# slice of df_cohort (which triggered SettingWithCopyWarning).
case_cohort = case_cohort.assign(Group="Case")
control_cohort = control_cohort.assign(Group="Control")
analysis_cohort = pd.concat([case_cohort,control_cohort])

# Print stats
print(f"case cohort:{case_cohort.shape[0]}")
print(f"control cohort:{control_cohort.shape[0]}")
print(f"analysis cohort:{analysis_cohort.shape[0]}")

Add phenotype incidence date to the analysis cohort

In [None]:
def _attach_date_column(cohort, dates, column_name):
    """Left-join a (PseudoNHSnumber, date) table onto ``cohort``.

    The joined ``date`` column is renamed to ``column_name``; patients without
    a record get the placeholder 0.
    NOTE(review): if ``dates`` holds several rows for one patient, the left
    join repeats that patient's cohort row — confirm the inputs are one row
    per patient.
    """
    merged = cohort.merge(dates, how='left', on='PseudoNHSnumber')
    merged['date'] = merged['date'].fillna(0)
    return merged.rename(columns={'date': column_name})


# Phenotype folders whose name matches this pattern are skipped here.
excluded_pheno = re.compile("Primary_(M|m)alignancy|GNH|MGH|PRIMED|PADGEN|Secondary_(M|m)alignancy|Lymphoma|myeloma|metastasis|carcinoma|Leukaemia")

morbidity_list = []
# os.listdir returns entries in arbitrary order; sorting makes the column
# order of the exported cohort reproducible between runs.
pheno_list = sorted(os.listdir(pheno_path))
for pheno in pheno_list:
    if excluded_pheno.search(pheno):
        continue
    filename = pheno+'_summary_report.csv'
    morbid = pd.read_csv(pheno_path+pheno+'/'+filename, na_filter=False)
    morbid = morbid[["nhs_number","date"]].rename(columns={"nhs_number": "PseudoNHSnumber"})
    analysis_cohort = _attach_date_column(analysis_cohort, morbid, pheno)
    morbidity_list.append(pheno)


# One "<label>_Cancer" date column per cancer.
for cancer_ph in cancer_group:
    filename = 'Primary_Malignancy_'+cancer_ph+'.Cases.csv'
    cancer = pd.read_csv(case_path+filename, na_filter=False)
    cancer = cancer.rename(columns={'nhs_number': 'PseudoNHSnumber', 'date': 'DiagDt', 'code':'Code'})
    cancer = cancer[["PseudoNHSnumber","DiagDt"]].rename(columns={"DiagDt": "date"})
    analysis_cohort = _attach_date_column(analysis_cohort, cancer, cancer_dict[cancer_ph]+'_Cancer')

In [None]:
# Persist the full analysis cohort (demographics + phenotype/cancer dates).
analysis_cohort_file = f"{out_path}{mode}/analysis_cohort.csv"
analysis_cohort.to_csv(analysis_cohort_file, index=False)

In [None]:
# Show the column names of the analysis cohort as a quick check.
print(analysis_cohort.columns)

In [None]:
analysis_cohort

PROPENSITY SCORE MATCHING

In [None]:
cancer_list = list(cancer_dict.values())
df_analysis_demo = analysis_cohort.copy()

# Encode the outcome: Case -> 1, Control -> 0.
# An explicit map gives an integer column directly; replace() with lists on an
# object column relies on silent downcasting, which pandas has deprecated.
# (Any label other than 'Case'/'Control' would become NaN.)
df_analysis_demo["Group"] = df_analysis_demo["Group"].map({'Case': 1, 'Control': 0})

# Collapse ethnicity into Bangladeshi / Pakistani / Other / Unknown.
# Every category not listed here (including the small Indian group) is pooled
# into 'Other'. Extend the mapping to keep a category separate.
ethnicity_labels = {
    'Asian or Asian British - Bangladeshi': 'Bangladeshi',
    'Asian or Asian British - Pakistani': 'Pakistani',
    'Not known': 'Unknown',
    'Not stated': 'Unknown',
}
df_analysis_demo["Ethnicity"] = df_analysis_demo["Ethnicity"].map(ethnicity_labels).fillna('Other')

# Keep only patients with a known gender (male/female) and a known ethnicity.
# .copy() so later column assignments do not target a filtered view.
has_known_gender = df_analysis_demo["Gender"].isin(['male','female'])
has_known_ethnicity = df_analysis_demo['Ethnicity']!='Unknown'
df_analysis_demo = df_analysis_demo[has_known_gender & has_known_ethnicity].copy()


In [None]:
df_analysis_demo

In [None]:
df_analysis = df_analysis_demo.copy()

def compute_index_date(row):
    """Return the index date for one cohort row.

    Controls (Group == 0): the parsed ``DOR`` value.
    Cases (Group == 1): the earliest parseable date among the
    ``<label>_Cancer`` columns, where 0 marks "no record".
    ``pd.NaT`` is returned when a case has no usable date or the group is
    neither 0 nor 1.
    """
    if row['Group'] == 0:
        return createDate(row['DOR'])
    if row['Group'] == 1:
        parsed = (
            createDate(row[label + "_Cancer"])
            for label in cancer_list
            if row[label + "_Cancer"] != 0
        )
        # createDate returns None for unparseable values (e.g. an empty
        # string); drop those so min() never compares None with a datetime.
        diagnosis_dates = [d for d in parsed if d is not None]
        if diagnosis_dates:
            return min(diagnosis_dates)
    return pd.NaT

df_analysis['Index_date'] = df_analysis.apply(compute_index_date, axis=1)


In [None]:
df_analysis

In [None]:
# Convert columns to datetime
# Convert columns to datetime
df_analysis['DOB'] = pd.to_datetime(df_analysis['DOB'])
df_analysis['Index_date'] = pd.to_datetime(df_analysis['Index_date'])

def calculate_age(row):
    """Return the age in completed years at ``Index_date``, or None if unknown.

    The age is the ``years`` component of the relativedelta between
    ``Index_date`` and ``DOB``. A missing value in either column gives None,
    since no meaningful age can be derived from NaT.
    """
    if pd.isnull(row['Index_date']) or pd.isnull(row['DOB']):
        return None
    return relativedelta.relativedelta(row['Index_date'], row['DOB']).years

# Apply the function
df_analysis['Index_age'] = df_analysis.apply(calculate_age, axis=1)



In [None]:
# Bin the index age into six bands coded 0..5:
# [0,40], (40,50], (50,60], (60,70], (70,80], (80,120].
age_bin_edges = [0, 40, 50, 60, 70, 80, 120]
age_bin_codes = list(range(len(age_bin_edges) - 1))
df_analysis["Index_age_group"] = pd.cut(
    df_analysis["Index_age"],
    bins=age_bin_edges,
    labels=age_bin_codes,
    include_lowest=True,
)
   

In [None]:
df_analysis

In [None]:
# Columns exported for matching: identifiers and covariates, followed by one
# "<label>_Cancer" date column per cancer.
columns = ["PseudoNHSnumber","Group","Gender","Ethnicity","Index_age"] + [
    cancer_dict[cancer_ph]+'_Cancer' for cancer_ph in cancer_group
]

In [None]:
columns

In [None]:
# Table handed to the matching step: only the columns selected above.
before_matching = df_analysis[columns]

In [None]:
before_matching

In [None]:
# Folder for the matching input; the pre-matching table is written there.
matching_path = "/genesandhealth/red/AshithaJoby/results/Matched_data_files/GNH"
before_matching_file = f"{matching_path}/before_matching.csv"
before_matching.to_csv(before_matching_file, index=False)

In [None]:
# Snapshot of the prepared analysis table (including Index_date / Index_age).
# `df_analysis_main` is the frame the association step below reads from.
df_analysis_main =df_analysis.copy()

AFTER PROPENSITY SCORE MATCHING IN R

#### Cancer-Morbidity pairwise association study 

Transform predictor variables from categorical to numeric as appropriate

Run logistic regression: include gender, age group, and ethnic group as covariates

Accumulate odds ratio, confidence interval, p-value

In [None]:
# One entry per cancer; each name is also the stem of a matched-data file
# ("<name>.csv") read in the association loop.
# The names correspond to the "<label>_Cancer" columns with '-', ' ', '(' and
# ')' replaced by '.', presumably the column names as sanitised by R during
# matching; verify against the matched files.
cancer_set = ['Oro.pharyngeal_Cancer',
                 'Skin..other._Cancer',
                 'Biliary.tract_Cancer',
                 'Bladder_Cancer',
                 'Bone_Cancer',
                 'CNS_Cancer',
                 'Breast_Cancer',
                 'Cervical_Cancer',
                 'Pancreatic_Cancer',
                 'Prostate_Cancer',
                 'Colorectal_Cancer',
                 'Kidney_Cancer',
                 'Liver_Cancer',
                 'Stomach_Cancer',
                 'Testicular_Cancer',
                 'Thyroid_Cancer',
                 'Uterine_Cancer',
                 'Ovarian_Cancer',
                 'Lung_Cancer',
                 'Melanoma_Cancer',
                 'Mesothelioma_Cancer',
                 'Oesophageal_Cancer',
                 'Hodgkin.Lymphoma_Cancer',
                 'Leukaemia_Cancer',
                 'Myeloma_Cancer',
                 'Non.Hodgkin.Lymphoma_Cancer',
                 'Other_Cancer']

In [None]:
# Cancers for which Gender is left out of the selected columns and of the
# regression adjustors in the association loop.
gender_specific_cancers=["Breast_Cancer",
                           'Cervical_Cancer',
                           'Prostate_Cancer',
                           'Testicular_Cancer',
                           'Uterine_Cancer',
                           'Ovarian_Cancer']

In [None]:

# Fixed list of morbidities tested against every cancer. Each name must be a
# date column of `df_analysis_main`. This rebinds `morbidity_list`, replacing
# any list built earlier in the notebook.
morbidity_list = ['Dermatitis_atopc_contact_other_unspecified', 'Iron_deficiency_with_and_without_anaemia', 'Anxiety_and_phobia',
                   'Unspecified_or_Rare_Diabetes', 'Migraine', 'Urinary_Incontinence', 'Allergic_and_chronic_rhinitis', 
                   'Enthesopathies__synovial_disorders', 'Sjogrens_disease', 'Asthma', 'Obesity', 'Type_2_Diabetes', 
                   'Hypertension', 'Gastritis_and_duodenitis', 'Other_psychoactive_substance_misuse', 'Coronary_heart_disease',
                    'Cervical_dysplasia', 'Somatoform_and_dissociative_disorders', 'Erectile_dysfunction', 'Thyroid_disease',
                    'Chronic_fatigue', 'Chronic_sinusitis', 'Hyperplasia_of_prostate', 'Neuromuscular_dysfunction_of_bladder',
                    'Menorrhagia_and_polymenorrhoea', 'Chronic_Kidney_Disease', 'Gastro-oesophageal_reflux_disease', 
                    'Portal_hypertension_and_oesophageal_varices', 'Other_anaemias', 'Diabetic_eye_disease', 
                    'Liver_fibrosis_sclerosis_and_cirrhosis', 'Constipation', 'Depression', 'Osteoarthritis_excl_spine']

In [None]:
# Render floats with two decimals in displayed tables. This only affects
# display; the values stored in the result frames keep full precision.
pd.options.display.float_format = '{:.2f}'.format

# A cancer/morbidity pair is skipped when any Group x exposure cell is smaller.
MIN_CELL_COUNT = 5
# Name of the binary exposure column / model term. The flag is currently
# derived with isPostIncident (morbidity recorded after the index date); the
# historical column name is kept.
EXPOSURE_TERM = "morbidity_prevalent"


def _fit_adjusted_or(pair_data, formula, term=EXPOSURE_TERM):
    """Fit a logistic regression and return OR, CI bounds and p-value of ``term``.

    The odds ratio and interval are the exponentiated coefficient and the
    bounds of ``conf_int()`` (statsmodels default level). The p-value is
    returned unrounded.
    """
    log_reg = smf.logit(formula, data=pair_data).fit(disp=False)
    conf_int = log_reg.conf_int()
    return {
        "OR": np.exp(log_reg.params[term]),
        "Lower_CI": np.exp(conf_int.loc[term, 0]),
        "Upper_CI": np.exp(conf_int.loc[term, 1]),
        # Rounding to two decimals here would let p-values up to ~0.055 pass
        # a later "pvalue <= 0.05" filter.
        "pvalue": log_reg.pvalues[term],
    }


result_rows = []
group_size_rows = []

for cancer_ph in cancer_set:
    filename = "/genesandhealth/red/AshithaJoby/results/Matched_data_files/"+cancer_ph+".csv"
    matched_data = pd.read_csv(filename, na_filter=False)
    n_cases = int((matched_data['Group']==1).sum())
    n_controls = int((matched_data['Group']==0).sum())
    print(f"Cancer: {cancer_ph}")
    print(f"Case Group size:{n_cases}")
    print(f"Control Group size:{n_controls}")
    group_size_rows.append({"Cancer":cancer_ph,"Case_Group_Size":n_cases, "Control_Group_Size":n_controls})

    # Gender is not adjusted for in gender-specific cancers.
    gender_specific = cancer_ph in gender_specific_cancers
    if gender_specific:
        demo_columns = ["PseudoNHSnumber","Group","Ethnicity","Index_age"]
    else:
        demo_columns = ["PseudoNHSnumber","Group","Gender","Ethnicity","Index_age"]
    # .copy() so the code columns added below are not written onto a slice.
    matched_data_demo = matched_data.loc[:, demo_columns].copy()

    # Rows of the prepared analysis table belonging to matched patients.
    # Local names are used so the notebook-level `analysis_cohort`,
    # `df_analysis` and `columns` are not rebound by this loop.
    in_matched_set = df_analysis_main["PseudoNHSnumber"].isin(matched_data_demo["PseudoNHSnumber"])
    matched_cohort = df_analysis_main[in_matched_set].reset_index(drop=True)

    # Integer-code the categorical covariates and build the model formula.
    model_columns = ["PseudoNHSnumber","Group"]
    adjustors = []
    if not gender_specific:
        matched_data_demo['Gender_Codes'], gender_categories = pd.factorize(matched_data_demo['Gender'])
        model_columns.append("Gender_Codes")
        adjustors.append("C(Gender_Codes)")
    matched_data_demo['Ethnicity_Codes'], ethnicity_categories = pd.factorize(matched_data_demo['Ethnicity'])
    model_columns += ["Ethnicity_Codes","Index_age"]
    adjustors += ["C(Ethnicity_Codes)", "Index_age"]
    matched_data_analysis = matched_data_demo.loc[:, model_columns]
    formula = "Group"+"~"+"+".join(adjustors)+"+"+EXPOSURE_TERM

    for morbidity in morbidity_list:
        print(morbidity+":"+cancer_ph)
        pair_data = pd.merge(
            matched_data_analysis,
            matched_cohort[["PseudoNHSnumber",morbidity,"Index_date"]],
            on='PseudoNHSnumber', how='left',
        )
        # Dates are compared as strings; "0" marks "no record".
        pair_data[morbidity] = pair_data[morbidity].astype(str)
        # Use isPrevalent instead for the pre-diagnosis analysis.
        pair_data[EXPOSURE_TERM] = pair_data.apply(
            lambda r: isPostIncident(r[morbidity], r["Index_date"]), axis=1
        )
        pair_data = pair_data.drop(columns=["PseudoNHSnumber",morbidity,"Index_date"]).reset_index(drop=True)

        c_tab = pd.crosstab(pair_data["Group"], pair_data[EXPOSURE_TERM])
        # Require a full 2x2 table with adequate counts. A missing row or
        # column means one group or exposure level is absent, so the model
        # is not estimable.
        if c_tab.shape != (2, 2) or (c_tab < MIN_CELL_COUNT).values.any():
            continue

        try:
            estimate = _fit_adjusted_or(pair_data, formula)
        except Exception as e:
            # Best effort: report the failure and move on to the next pair.
            print(f"An error occurred: {e}")
            continue
        result_rows.append({"Cancer":cancer_ph, "Morbidity":morbidity, **estimate})

# Build the result frames once instead of concatenating row by row.
df_result = pd.DataFrame(result_rows, columns=["Cancer","Morbidity","OR","Lower_CI","Upper_CI","pvalue"])
df_case_control = pd.DataFrame(group_size_rows, columns=["Cancer","Case_Group_Size","Control_Group_Size"])


  

  
  

In [None]:
# Save the matched group sizes, then display them. A notebook renders only
# the last expression of a cell, so the frame must come last.
df_case_control.to_csv(out_path+mode+"/case_control_matched.csv",index=False)
df_case_control

In [None]:
# Export the pairwise results, most significant first (ties: larger OR first).
# The association loop currently flags post-diagnosis morbidity, hence the
# "postdiag" file. For a pre-diagnosis run, write
# "cancer_prediag_morbidity_OR.csv" instead.
ranked_result = df_result.sort_values(by=["pvalue","OR"], ascending=[True,False])
ranked_result.to_csv(out_path+mode+"/"+"cancer_postdiag_morbidity_OR.csv", index=False)



Heatmap for adjusted odds ratios

In [None]:
# Pre-diagnosis results: keep significant pairs (p <= 0.05), drop the
# catch-all "Other_Cancer" group, and save.
# NOTE(review): the export cell above only writes the postdiag file as the
# notebook stands; this prediag file must already exist on disk.
df_result = pd.read_csv(out_path+mode+"/"+"cancer_prediag_morbidity_OR.csv", header=0)
is_significant = df_result["pvalue"]<=0.05
is_named_cancer = df_result["Cancer"]!="Other_Cancer"
df_result = df_result[is_significant & is_named_cancer]
df_result.to_csv(out_path+mode+"/adjusted_OR_prediag_significant_table.csv",index=False)

In [None]:
# Arrange the rows cancer by cancer, keeping the existing order within each
# cancer. A stable sort gives the same table as the former
# groupby('Cancer').apply(...) round trip without relying on apply()
# receiving the grouping column, which pandas has deprecated.
df_result_export_prediag = df_result.sort_values("Cancer", kind="stable").reset_index(drop=True)
df_result_export_prediag.to_csv(out_path+mode+"/adjusted_OR_prediag_significant_table_grouped.csv",index=False)

In [None]:
df_result_export_prediag

In [None]:
# Post-diagnosis results: keep significant pairs (p <= 0.05), drop the
# catch-all "Other_Cancer" group, and save.
df_result = pd.read_csv(out_path+mode+"/"+"cancer_postdiag_morbidity_OR.csv", header=0)
is_significant = df_result["pvalue"]<=0.05
is_named_cancer = df_result["Cancer"]!="Other_Cancer"
df_result = df_result[is_significant & is_named_cancer]
df_result.to_csv(out_path+mode+"/adjusted_OR_postdiag_significant_table.csv",index=False)

In [None]:
# Arrange the rows cancer by cancer, keeping the existing order within each
# cancer. A stable sort gives the same table as the former
# groupby('Cancer').apply(...) round trip without relying on apply()
# receiving the grouping column, which pandas has deprecated.
df_result_export_postdiag = df_result.sort_values("Cancer", kind="stable").reset_index(drop=True)
df_result_export_postdiag.to_csv(out_path+mode+"/adjusted_OR_postdiag_significant_table_grouped.csv",index=False)

In [None]:
df_result_export_postdiag

In [None]:
# Reload the post-diagnosis results and show the significant pairs
# (excluding the "Other_Cancer" group).
df_result = pd.read_csv(out_path+mode+"/"+"cancer_postdiag_morbidity_OR.csv", header=0)
keep_rows = (df_result["pvalue"]<=0.05) & (df_result["Cancer"]!="Other_Cancer")
df_result = df_result[keep_rows]
df_result

In [None]:
# Clustered heatmap of significant pre-diagnosis odds ratios.
# Cancer x morbidity cells without a significant estimate are filled with the
# neutral OR of 1, which is also the centre of the colour map.
df_result = pd.read_csv(out_path+mode+"/"+"cancer_prediag_morbidity_OR.csv", header=0)
keep_rows = (df_result["pvalue"]<=0.05) & (df_result["Cancer"]!="Other_Cancer")
df_result = df_result[keep_rows]
df_result_pivot = df_result.pivot(index="Cancer",columns="Morbidity",values="OR")
df_result_pivot = df_result_pivot.fillna(1)
sns.clustermap(df_result_pivot,center=1, cmap="BrBG",cbar_pos=(0.02, 0.9, 0.02, 0.12))
plt.savefig(out_path+mode+"/"+"prediag_OR_heatmap.png", bbox_inches='tight')

In [None]:
# Clustered heatmap of significant post-diagnosis odds ratios.
# Cancer x morbidity cells without a significant estimate are filled with the
# neutral OR of 1, which is also the centre of the colour map.
df_result = pd.read_csv(out_path+mode+"/"+"cancer_postdiag_morbidity_OR.csv", header=0)
keep_rows = (df_result["pvalue"]<=0.05) & (df_result["Cancer"]!="Other_Cancer")
df_result = df_result[keep_rows]
df_result_pivot = df_result.pivot(index="Cancer",columns="Morbidity",values="OR")
df_result_pivot = df_result_pivot.fillna(1)
sns.clustermap(df_result_pivot,center=1, cmap="BrBG",cbar_pos=(0.02, 0.9, 0.02, 0.12))
plt.savefig(out_path+mode+"/"+"postdiag_OR_heatmap.png", bbox_inches='tight')