In [None]:
import pandas as pd
import os
import re
from datetime import datetime
from dateutil import relativedelta
import numpy as np
import math
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from scipy.stats import kruskal

###  Process demographic data

Utility Functions

In [None]:

def read_csv(file_name, delimiter = ',', chunksize = 10000, na_filter = False) :
    """Lazily read a delimited text file in chunks.

    Parameters
    ----------
    file_name : path of the file to read.
    delimiter : field separator passed to ``pd.read_csv``.
    chunksize : number of rows per yielded chunk.
    na_filter : passed to ``pd.read_csv``; ``False`` keeps empty fields as
        empty strings instead of converting them to NaN.

    Yields
    ------
    pandas.DataFrame
        Successive chunks of at most ``chunksize`` rows.
    """
    yield from pd.read_csv(
        file_name,
        chunksize=chunksize,
        delimiter=delimiter,
        na_filter=na_filter,
    )

def createDate(str, fmt="%Y-%m-%d"):
    """Parse a date string, returning ``None`` when it cannot be parsed.

    Parameters
    ----------
    str : the text to parse. The parameter name shadows the builtin ``str``
        inside this function; it is kept so existing keyword callers still work.
    fmt : ``datetime.strptime`` format string.

    Returns
    -------
    datetime.datetime or None
        ``None`` when the text does not match ``fmt`` (ValueError) or the
        input is not a string at all, e.g. ``None`` or NaN (TypeError).
    """
    try:
        return datetime.strptime(str, fmt)
    except (ValueError, TypeError):
        # Only the two failures strptime raises for bad input are treated as
        # "no date"; KeyboardInterrupt and genuine bugs are no longer swallowed.
        return None

def findBirthCountry(x):
    """Extract the country from a 'Born in <country> (finding)' term.

    Returns the captured country text, or an empty string when the pattern
    does not occur in ``x``.
    """
    found = re.search(r'Born in ([\w\s]+) \(finding\)', x)
    return found.group(1) if found else ""

Define the code-mapping vocabulary. It is not exhaustive and will need to be updated regularly.

In [None]:
# Numeric gender code -> label.
gender_dict = {
    0: "not known",
    1: "male",
    2: "female",
    9: "not specified",
}

# Numeric questionnaire ethnicity code -> letter code used as key in ethnic_dict.
gnh_ethnic_dict = {
    0: "99",
    1: "K",
    2: "J",
    3: "Z",
}

# Numeric "parents related" code -> label.
parents_dict = {
    0: "not known",
    1: "Yes",
    2: "No",
    3: "not known",
}

# Three-letter country code -> country name.
country_dict = {
    'PAK': 'Pakistan',
    'BGD': 'Bangladesh',
    'ARE': 'United Arab Emirates',
    'AFG': 'Afghanistan',
    'IND': 'India',
    'MMR': 'Myanmar',
    'SOM': 'Somalia',
    'GBR': 'Great Britain',
    'ITA': 'Italy',
}

# Ethnicity letter/number code -> descriptive label.
ethnic_dict = {
    'J': 'Asian or Asian British - Pakistani',
    'K': 'Asian or Asian British - Bangladeshi',
    'L': 'Asian or Asian British - Any other',
    'H': 'Asian or Asian British - Indian',
    '99': 'Not known',
    'Z': 'Not stated',
    'S': 'Other Ethnic Groups - Any other',
    'F': 'Mixed - White and Asian',
    'G': 'Mixed - Any other',
    'A': 'White - British',
    'N': 'Black or Black British - African',
    'P': 'Black or Black British - Any other',
    'X': 'Not known',
    '5': 'Not known',
}

####  G&H Stage 1 Questionnaire data

Use the new/updated input files, which change from time to time. Combine the S1QST date of birth with the S1LabCo date of recruitment to derive age at recruitment.

In [None]:
qst_file = "/genesandhealth/library-red/genesandhealth/phenotypes_rawdata/QMUL__Stage1Questionnaire/2024_04_30__S1QST_redacted.tab"
labco_file = "/genesandhealth/library-red/genesandhealth/phenotypes_rawdata/QMUL__Stage1LabCollector/Stage1LabCollector_2024-04-26.tab"
mappingfile = "/genesandhealth/library-red/genesandhealth/2024_05_03_OrageneID_PseudoNHS_Gender_withmissing_forTRE.tab"


def _decode(code, mapping):
    """Translate a questionnaire code via `mapping`; a missing code maps to mapping[0]."""
    return mapping[0] if pd.isnull(code) else mapping[code]


def _age_at_recruitment(row):
    """Whole years between DOB and DOR, or None when either date is missing."""
    if pd.isnull(row.DOR) or pd.isnull(row.DOB):
        # The original guarded only DOR, so a missing DOB was passed to
        # relativedelta and produced an error or a meaningless age.
        return None
    return relativedelta.relativedelta(row.DOR, row.DOB).years


# Oragene ID -> pseudonymised NHS number; keep only rows whose pseudo NHS
# number is exactly 64 characters long.
df_mapping = pd.read_csv(mappingfile, sep="\t",header = 0)
df_mapping = df_mapping.iloc[:,[0,2]]
df_mapping.columns = ["Oragene_id","PseudoNHSnumber"]
mask = df_mapping["PseudoNHSnumber"].str.len() == 64
df_mapping = df_mapping.loc[mask]

# Questionnaire: date of birth, gender, ethnicity, parental relatedness.
df_data = pd.read_csv(qst_file, sep="\t",header=0)
df_data = df_data.iloc[:,[0,1,2,3,9]]
df_data.columns = ["Oragene_id","DOB","Gender","Ethnicity","Parents_related"]
df_data = df_mapping.merge(df_data, how='outer', on='Oragene_id')
# DOB holds no day component; the 15th is prepended and the result parsed day-first.
df_data["DOB"] = pd.to_datetime("15-"+df_data["DOB"], dayfirst=True)
df_data["Gender"] = df_data["Gender"].apply(_decode, mapping=gender_dict)
df_data["Ethnicity"] = df_data["Ethnicity"].apply(_decode, mapping=gnh_ethnic_dict)
df_data["Parents_related"] = df_data["Parents_related"].apply(_decode, mapping=parents_dict)
# One row per pseudo NHS number: the row with the smallest Oragene ID.
df_qst = df_data.loc[df_data.groupby('PseudoNHSnumber').Oragene_id.idxmin()].reset_index(drop=True)

# Lab collector: date of recruitment and recruitment region.
df_data = pd.read_csv(labco_file, sep="\t",header=0)
df_data = df_data.iloc[:,[0,1,3]]
df_data.columns = ["Oragene_id","DOR","Region"]
df_data = df_mapping.merge(df_data, how='outer', on='Oragene_id')
# DOR holds no day component; "-15" is appended before parsing.
df_data["DOR"] = pd.to_datetime(df_data["DOR"]+"-15")
# One row per pseudo NHS number: the row with the smallest Oragene ID.
df_labco = df_data.loc[df_data.groupby('PseudoNHSnumber').Oragene_id.idxmin()].reset_index(drop=True)

df_merged = df_qst.merge(df_labco, on = ["Oragene_id","PseudoNHSnumber"], how="outer")
# axis=1 applies the function to each row.
df_merged["Age_at_recruitment"] = df_merged.apply(_age_at_recruitment, axis=1)

####  Discovery, Barts Health and NHS Digital data

In [None]:

# --- Discovery: list of pseudo NHS numbers ----------------------------------
# NOTE(review): df_cohort is not referenced again in the visible part of the
# notebook before a later cell overwrites it - confirm whether it is needed.
filename = "/genesandhealth/library-red/genesandhealth/phenotypes_rawdata/DSA__Discovery_7CCGs/2023_11_Discovery/pseudonhs_uniq_2023-11-08.csv"
df_cohort = pd.read_csv(filename, header=0)
df_cohort.columns = ["PseudoNHSnumber"]

# --- Discovery demographics --------------------------------------------------
filename = "/genesandhealth/library-red/genesandhealth/phenotypes_rawdata/DSA__Discovery_7CCGs/2023_11_Discovery/gh3_demographicsDataset_forTRE.csv"
df = pd.read_csv(filename,  header=0)
# .copy() makes the selection an independent frame, so the column assignments
# below do not trigger SettingWithCopyWarning.
df_demo_disc = df.iloc[:,[1,2,4,5,8,14,18]].copy()
# The f-string (instead of "15-"+x) lets a missing value reach createDate,
# which returns None, rather than raising TypeError on the concatenation.
df_demo_disc["DOB"] = df_demo_disc["DateOfBirth_mm-yyyy"].apply(lambda x: createDate(f"15-{x}","%d-%m-%Y"))
df_demo_disc["DOD"] = pd.to_datetime(df_demo_disc["YearOfDeath"])
# Positions 7 and 8 are the DOB and DOD columns appended above.
df_demo_disc = df_demo_disc.iloc[:,[0,1,7,3,8,5,4]].copy()
df_demo_disc.columns = ["PseudoNHSnumber","Gender","DOB","Ethnicity","DOD","LSOA2011","BirthCountry"]
df_demo_disc["Gender"] = df_demo_disc["Gender"].str.lower()
df_demo_disc["BirthCountry"] = df_demo_disc["BirthCountry"].apply(lambda x: "" if pd.isnull(x) else findBirthCountry(x))

# --- Barts Health demographics ----------------------------------------------
filename = "/genesandhealth/library-red/genesandhealth/phenotypes_rawdata/DSA__BartsHealth_NHS_Trust/2023_12_ResearchDatasetv1.6/GH_Patient_Demographics__20231211.ascii.redacted.tab"
df = pd.read_csv(filename, delimiter="\t",header=0)
df_demo_barts = df.iloc[:,[0,3,1,4,6]].copy()
df_demo_barts["DOB"] = df_demo_barts["Date_of_Birth"].apply(lambda x: createDate(x,"%Y-%m-%d"))
df_demo_barts["DOD"] = pd.to_datetime(df_demo_barts["Date_of_Death"])
# Positions 5 and 6 are the DOB and DOD columns appended above.
df_demo_barts = df_demo_barts.iloc[:,[0,1,5,3,6]].copy()
df_demo_barts.columns = ["PseudoNHSnumber","Gender","DOB","Ethnicity","DOD"]
df_demo_barts["Gender"] = df_demo_barts["Gender"].str.lower()

# --- NHS Digital demographics -----------------------------------------------
filename = "/genesandhealth/nhsdigital-sublicence-library-red/DSA__NHSDigitalNHSEngland/2023_07/FILE0179982_NIC338864_Demographics_.txt"
df = pd.read_csv(filename, delimiter="|",header=0,  na_filter=False)
df_demo_nhsd = df.iloc[:,[1,8,18,42,24]].copy()
# Year-month values get "15" appended and are parsed as %Y%m%d; anything that
# does not parse becomes None.
df_demo_nhsd["DOB"] = df_demo_nhsd["DOB_YEAR_MONTH"].apply(lambda x: createDate(str(x)+"15","%Y%m%d"))
df_demo_nhsd["DOD"] = df_demo_nhsd["DERIVED_INF_DODYM"].apply(lambda x: createDate(str(x)+"15","%Y%m%d"))
# Positions 5 and 6 are the DOB and DOD columns appended above.
df_demo_nhsd = df_demo_nhsd.iloc[:,[0,1,5,6,4]].copy()
df_demo_nhsd.columns = ["PseudoNHSnumber","Gender","DOB","DOD","BirthCountry"]
# Raises KeyError for a gender code that is not a key of gender_dict.
df_demo_nhsd["Gender"] = df_demo_nhsd["Gender"].apply(lambda x: gender_dict[x])
df_demo_nhsd["BirthCountry"] = df_demo_nhsd["BirthCountry"].apply(lambda x: country_dict[x] if x in country_dict else "")

# --- NHS Digital HES ethnicity (OP and APC extracts) ------------------------
# All four files are read in chunks and only STUDY_ID / ETHNOS are kept. The
# chunks are collected in a list and concatenated once, instead of growing a
# DataFrame with pd.concat inside the loop.
hes_files = [
    "/genesandhealth/nhsdigital-sublicence-library-red/DSA__NHSDigitalNHSEngland/2023_07/old_files_missing2022_data/NIC338864_HES_OP_2003to2023.txt",
    "/genesandhealth/nhsdigital-sublicence-library-red/DSA__NHSDigitalNHSEngland/2023_07/HES/nic338864_hes_op_202299.csv",
    "/genesandhealth/nhsdigital-sublicence-library-red/DSA__NHSDigitalNHSEngland/2023_07/old_files_missing2022_data/NIC338864_HES_APC_1997to2023.txt",
    "/genesandhealth/nhsdigital-sublicence-library-red/DSA__NHSDigitalNHSEngland/2023_07/HES/nic338864_hes_apc_202299.csv",
]
ethnos_chunks = []
for filename in hes_files:
    for df in read_csv(filename):
        ethnos_chunks.append(df.loc[:,["STUDY_ID","ETHNOS"]])
df_ethno_nhsd = pd.concat(ethnos_chunks, ignore_index=True)
df_ethno_nhsd.columns = ["PseudoNHSnumber","Ethnicity"]

# Displays the distinct (person, ethnicity) pairs as the cell output.
# NOTE(review): drop_duplicates() returns a new frame, so df_ethno_nhsd itself
# still holds every HES record. The downstream ethnicity vote counts repeated
# records, so assigning the result here would change it - confirm the intent.
df_ethno_nhsd.drop_duplicates()



####  Derive final cohort demographics
1. Merge datasets
2. Fields: Gender, Date of Birth, Date of Death, Ethnicity, Country of Birth, Age at recruitment, Recruitment region

In [None]:
def _majority_vote(values, preferred):
    """Return the most frequent value found in `preferred`, else the most frequent value overall."""
    ranked = Counter(values).most_common()
    for value, _count in ranked:
        if value in preferred:
            return value
    return ranked[0][0]


def _earliest_per_person(frame, date_col):
    """One row per PseudoNHSnumber holding the earliest non-null `date_col`.

    Nulls sort last, so a person only gets a null date when no source has one.
    This replaces groupby(...).apply(sort_values).groupby(level=0).nth(0),
    which relies on the grouping column being passed through apply().
    """
    return (frame.dropna(subset=["PseudoNHSnumber"])
                 .sort_values(["PseudoNHSnumber", date_col])
                 .drop_duplicates(subset="PseudoNHSnumber", keep="first")
                 .reset_index(drop=True))


# NOTE(review): the original called df.drop_duplicates() after each concat
# below without assigning the result, so duplicates were never removed. For
# gender, ethnicity and birth country the calls are not reinstated because the
# majority vote counts repeated records - confirm the intended behaviour.

# --- Gender -----------------------------------------------------------------
df = pd.concat([df_demo_disc.iloc[:,[0,1]], df_demo_barts.iloc[:,[0,1]], df_demo_nhsd.iloc[:,[0,1]], df_qst.iloc[:,[1,3]]], ignore_index=True)
gender_main_cx = ['male','female']
# Anything other than male/female is collapsed to 'not known'.
df['Gender'] = df['Gender'].where(df['Gender'].isin(gender_main_cx), 'not known')
# Every recorded gender value per person, across all sources.
df_demo_gender_flag = df.groupby('PseudoNHSnumber')['Gender'].agg(list)

# Questionnaire gender keyed by pseudo NHS number (df_qst has one row per person).
qst_gender = dict(zip(df_qst.iloc[:,1], df_qst.iloc[:,3]))
data = []
for pseudo_id, genders in df_demo_gender_flag.items():
    gender = _majority_vote(genders, set(gender_main_cx))
    conflicting = all(g in genders for g in gender_main_cx)
    # When the sources disagree (both male and female recorded), the person's
    # own questionnaire answer wins. The original looked this up with
    # df_qst.iloc[i, 3], where i was the row position in df_demo_gender, so it
    # read another person's answer (or ran past the end of df_qst).
    # A questionnaire answer that is not male/female keeps the majority vote.
    if conflicting and qst_gender.get(pseudo_id) in gender_main_cx:
        gender = qst_gender[pseudo_id]
    data.append([pseudo_id, gender])
df_demo_gender = pd.DataFrame(data, columns=['PseudoNHSnumber','Gender'])

# --- Ethnicity --------------------------------------------------------------
df = pd.concat([df_demo_disc.iloc[:,[0,3]], df_demo_barts.iloc[:,[0,3]], df_qst.iloc[:,[1,4]], df_ethno_nhsd], ignore_index=True)
ethnicity_votes = df.groupby('PseudoNHSnumber')['Ethnicity'].agg(list)
# Codes H, J, K, L take precedence over any other recorded code.
data = [[pseudo_id, _majority_vote(codes, {'H','J','K','L'})] for pseudo_id, codes in ethnicity_votes.items()]
df_demo_ethn = pd.DataFrame(data, columns=['PseudoNHSnumber','Ethnicity'])
# Codes missing from ethnic_dict become an empty string.
df_demo_ethn['Ethnicity'] = df_demo_ethn['Ethnicity'].apply(lambda x: ethnic_dict[x] if x in ethnic_dict else "")

# --- Country of birth -------------------------------------------------------
df = pd.concat([df_demo_disc.iloc[:,[0,6]], df_demo_nhsd.iloc[:,[0,4]]], ignore_index=True)
country_votes = df.groupby('PseudoNHSnumber')['BirthCountry'].agg(list)
# Bangladesh and Pakistan take precedence over any other recorded country.
data = [[pseudo_id, _majority_vote(countries, {'Bangladesh','Pakistan'})] for pseudo_id, countries in country_votes.items()]
df_birth_country = pd.DataFrame(data, columns=['PseudoNHSnumber','BirthCountry'])
df_birth_country['BirthCountry'] = df_birth_country['BirthCountry'].apply(lambda x: "Great Britain" if x in ['England','Scotland','Wales','Northern Ireland'] else x)

# --- Date of birth / date of death: earliest recorded value per person ------
df = pd.concat([df_demo_disc.iloc[:,[0,2]], df_demo_barts.iloc[:,[0,2]], df_demo_nhsd.iloc[:,[0,2]], df_qst.iloc[:,[1,2]]], ignore_index=True)
df_demo_dob = _earliest_per_person(df, 'DOB')

df = pd.concat([df_demo_disc.iloc[:,[0,4]], df_demo_barts.iloc[:,[0,4]], df_demo_nhsd.iloc[:,[0,3]]], ignore_index=True)
df_demo_dod = _earliest_per_person(df, 'DOD')

# --- Assemble and save ------------------------------------------------------
df_final = df_demo_gender.merge(df_demo_dob, how='outer', on='PseudoNHSnumber')
df_final = df_final.merge(df_demo_dod, how='outer', on='PseudoNHSnumber')
df_final = df_final.merge(df_demo_ethn, how='outer', on='PseudoNHSnumber')
df_final = df_final.merge(df_birth_country, how='outer', on='PseudoNHSnumber')
# Columns 1, 6, 7, 8 of df_merged: PseudoNHSnumber, DOR, Region, Age_at_recruitment.
df_final = df_final.merge(df_merged.iloc[:,[1,6,7,8]], how='outer', on='PseudoNHSnumber')
df_final.to_csv("/genesandhealth/red/AshithaJoby/results/07052024/cohort_demo.csv", index=False)

In [None]:
# Preview the first five rows of the merged demographics table.
df_final.head(5)

In [None]:
# Number of rows in the merged demographics table.
df_final.shape[0]

### Morbidity statistics on primary malignancy cohorts

Note: Remember to exclude female patients from Male_Genital cases and male patients from Gynaecological cases. The codelist contains "Z854 - personal history of cancer in genital organs" for both phenotypes, so care is required to exclude patients whose recorded gender does not match the phenotype.

Set input and output directories

In [None]:
# Directory listed for morbidity phenotypes; each entry is expected to hold a
# "<phenotype>_summary_report.csv" file.
pheno_path = "/genesandhealth/red/AshithaJoby/custom_phenotypes/"
# Directory holding the "Primary_Malignancy_<cancer>.Cases.csv" files.
case_path = "/genesandhealth/red/AshithaJoby/Cases/"
# Root directory for the statistics and figures written by the cells below.
out_path = "/genesandhealth/red/AshithaJoby/results/comorbidity/"
# Cohort demographics CSV written by the demographics section of this notebook.
demo_path = "/genesandhealth/red/AshithaJoby/results/07052024/cohort_demo.csv"

In [None]:
# Reload the saved cohort demographics and parse DOB (default %Y-%m-%d format;
# values that do not parse become None).
df_cohort = pd.read_csv(demo_path, header=0)
df_cohort["DOB"] = df_cohort["DOB"].map(createDate)
# Cohort size, used as the denominator for prevalence figures.
population = len(df_cohort)

In [None]:
# Number of rows in the cohort table.
df_cohort.shape[0]

In [None]:
# Preview the first rows of the cohort table.
df_cohort.head()


#### Build cancer-morbidity tally
Derive incidence counts for each (cancer, comorbidity) pair

Choose one of the two cancer phenotype groups and follow rest of the pipeline

1. GNH cancer phenotypes

In [None]:
mode = 'GNH'
# Case-file phenotype name -> display label.
cancer_dict = { 'Oro-pharyngeal': 'Oro-pharyngeal','Other_Skin_and_subcutaneous_tissue' : 'Skin (other)', 'biliary_tract': 'Biliary tract', 'bladder' : 'Bladder', 
              'bone_and_articular_cartilage' : 'Bone','brain_other_CNS_and_intracranial': 'CNS', 'breast' : 'Breast', 'cervical' : 'Cervical',
              'pancreatic': 'Pancreatic', 'prostate' : 'Prostate','colorectal_and_anus' : 'Colorectal', 'kidney_and_ureter' : 'Kidney', 'liver' : 'Liver',
              'stomach' : 'Stomach', 'testicular' : 'Testicular', 'thyroid' : 'Thyroid', 'uterine' : 'Uterine', 'ovarian': 'Ovarian',
              'lung_and_trachea' : 'Lung', 'malignant_melanoma' : 'Melanoma', 'mesothelioma' : 'Mesothelioma','oesophageal' : 'Oesophageal',
              'Hodgkin_Lymphoma':'Hodgkin Lymphoma','Leukaemia':'Leukaemia','Multiple_myeloma_and_malignant_plasma_cell_neoplasms':'Myeloma',
              'Non-Hodgkin_Lymphoma':'Non-Hodgkin Lymphoma'
              }
cancer_group = cancer_dict.keys()

# Cancers whose case list is restricted to one recorded gender. This replaces
# six copy-pasted if-blocks that differed only in these two values.
sex_specific_cancers = {
    'prostate': 'male',
    'testicular': 'male',
    'breast': 'female',
    'cervical': 'female',
    'uterine': 'female',
    'ovarian': 'female',
}

# Phenotype directories matching this pattern are not treated as morbidities.
skip_pheno = re.compile("Primary_(M|m)alignancy|GNH|MGH|PRIMED|PADGEN|Lymphoma|myeloma|metastasis|carcinoma|Leukaemia")

pheno_list = os.listdir(pheno_path)
case_list = os.listdir(case_path)

# Load every morbidity summary once. The original re-read and re-parsed each
# file inside the cancer loop, although the result does not depend on the cancer.
morbid_tables = []
for pheno in pheno_list:
    if skip_pheno.search(pheno):
        continue
    filename = pheno+'_summary_report.csv'
    morbid = pd.read_csv(pheno_path+pheno+'/'+filename, na_filter=False)
    morbid["date"] = morbid["date"].apply(createDate)
    morbid_tables.append((pheno, morbid.shape[0], morbid[["nhs_number","date"]]))

# Rows are collected in a list and turned into a DataFrame once, instead of
# calling pd.concat for every (cancer, morbidity) pair.
records = []
for cancer_ph in cancer_group:
    filename = 'Primary_Malignancy_'+cancer_ph+'.Cases.csv'
    cancer = pd.read_csv(case_path+filename, na_filter=False)
    cancer = cancer.rename(columns={'nhs_number': 'PseudoNHSnumber', 'date': 'DiagDt', 'code':'Code'})
    required_gender = sex_specific_cancers.get(cancer_ph)
    if required_gender is not None:
        cancer_demo = cancer.merge(df_cohort, how='left',  on='PseudoNHSnumber')
        cancer_demo = cancer_demo[cancer_demo["Gender"]==required_gender]
        # .copy() so the column edits below act on an independent frame.
        cancer = cancer_demo.loc[:,["PseudoNHSnumber","DiagDt","Code"]].copy()
        # NOTE(review): this overwrites the input case file with the filtered
        # rows and the renamed columns, as the original did.
        cancer.to_csv(case_path+filename, index=False)
    # Assumes the case table has exactly three columns in this order - TODO confirm.
    cancer.columns = ["nhs_number","date","code"]
    cancer["date"] = cancer["date"].apply(createDate)
    for pheno, morbid_size, morbid_dates in morbid_tables:
        # After the merge, date_x is the cancer date and date_y the morbidity date.
        cancer_morbid = cancer[["nhs_number","date"]].merge(morbid_dates, how='inner', on='nhs_number')
        # NOTE(review): a morbidity dated on the same day as the cancer is
        # counted as pre-diagnosis - confirm this is intended.
        pre_diag = cancer_morbid.query('date_y<=date_x').shape[0]
        post_diag = cancer_morbid.query('date_y>date_x').shape[0]
        total = pre_diag+post_diag
        records.append({"cancer":cancer_dict[cancer_ph], "morbidity": pheno, "cancer_size": cancer.shape[0], "morbid_size": morbid_size, "pre_diag": pre_diag, "post_diag": post_diag,"total": total})

incidence = pd.DataFrame(records)

In [None]:
# Display the (cancer, morbidity) tally table.
incidence

##### Identify highly prevalent morbidity subset for further analysis (GNH)

1. Find the 10 most prevalent morbidities in each cancer and merge the lists (this gives 32 morbidities)
2. Merge that list with the 20 most prevalent morbidities in the whole population (this gives 34 morbidities)

In [None]:
# Ten morbidities with the highest total count for each cancer.
morbidity_set = []
for c in set(incidence.cancer):
    df = incidence[incidence.cancer == c].sort_values(by="total", ascending=False)
    morbidity_set.extend(df.iloc[:10]["morbidity"].to_list())

print(len(set(morbidity_set)))

# Add the twenty morbidities with the largest overall size.
morbid = (
    incidence[["morbidity","morbid_size"]]
    .drop_duplicates()
    .sort_values(by="morbid_size", ascending=False)
)
morbidity_set.extend(morbid.iloc[:20]["morbidity"].to_list())

morbidity_set = set(morbidity_set)
print(len(set(morbidity_set)))
print(list(morbidity_set))


#### Derive stats and plots

In [None]:
# Counts as a fraction of the cancer case count. Column-wise division replaces
# the row-wise apply(), which raised ZeroDivisionError when a cancer had no
# cases; such rows now get NaN.
incidence["pre_diag_perc"] = incidence["pre_diag"] / incidence["cancer_size"]
incidence["post_diag_perc"] = incidence["post_diag"] / incidence["cancer_size"]
incidence["total_perc"] = incidence["total"] / incidence["cancer_size"]

In [None]:
# One row per morbidity: size and share of the cohort, largest first.
morbid = (
    incidence[["morbidity","morbid_size"]]
    .assign(prevalence=lambda t: t["morbid_size"] / population)
    .drop_duplicates()
    .sort_values(by="morbid_size", ascending=False)
)

# One row per cancer: size and share of the cohort, largest first.
cancer = (
    incidence[["cancer","cancer_size"]]
    .assign(prevalence=lambda t: t["cancer_size"] / population)
    .drop_duplicates()
    .sort_values(by="cancer_size", ascending=False)
)

In [None]:
# Renumber the rows 0..n-1 (the bar labels in the plotting cell use the row
# index as the y position), then save both tables.
cancer = cancer.reset_index(drop=True)
morbid = morbid.reset_index(drop=True)

stats_dir = out_path+mode+"/"
cancer.to_csv(stats_dir+"cancer_stat.csv", index=False)
morbid.to_csv(stats_dir+"morbidity_stat.csv", index=False)

In [None]:
# Display the per-cancer size/prevalence table.
cancer

In [None]:
# Display the per-morbidity size/prevalence table.
morbid

In [None]:
def _label_bars(ax, frame, size_col):
    """Write 'size(prevalence%)' to the right of each bar; y is the row index of `frame`."""
    for _, bar in frame.iterrows():
        size = bar[size_col]
        label = f"{size}({np.round(bar.prevalence * 100, 1)}%)"
        ax.text(x = size + 2, y = bar.name, s = label, color='black', ha='left')


# Cancer prevalence.
ax = sns.barplot(cancer, y="cancer", x="cancer_size", hue="cancer", legend=False)
_label_bars(ax, cancer, "cancer_size")
plt.tight_layout()
plt.savefig(out_path+mode+"/"+"cancer_prevalence.png")

# Prevalence of the 50 largest morbidities.
plt.figure(figsize=(10,12))
data = morbid.iloc[:50]
ax = sns.barplot(data, y="morbidity", x="morbid_size", hue="morbidity", legend=False)
_label_bars(ax, data, "morbid_size")
plt.tight_layout()
plt.savefig(out_path+mode+"/"+"morbidity_prevalence.png")

In [None]:
# One figure per cancer: its 20 morbidities with the highest total count.
for c in list(cancer.cancer):
    df = (
        incidence[incidence.cancer == c]
        .sort_values(by="total", ascending=False)
        .iloc[:20]
    )
    plt.figure(figsize=(8,4), dpi=96)
    # The pre-diagnosis bars are drawn over the total bars, so the light green
    # that stays visible is the post-diagnosis share.
    bar1 = sns.barplot(df, y="morbidity", x="total_perc", color="lightgreen")
    bar2 = sns.barplot(df, y="morbidity", x="pre_diag_perc",  color="darkgreen")
    pre_patch = mpatches.Patch(color='darkgreen', label='pre-diagnosis')
    post_patch = mpatches.Patch(color='lightgreen', label='post-diagnosis')
    plt.legend(handles=[pre_patch, post_patch])
    plt.title(c+" cancer morbidity prevalence")
    plt.tight_layout()
    plt.savefig(out_path+mode+"/"+c+".png")

### Demographic statistics on primary malignancy cohorts

In [None]:

# Case rows for every cancer, each joined to the cohort demographics.
# The per-cancer frames are collected and concatenated once, instead of
# growing `incidence` with pd.concat on every iteration.
cancer_frames = []
for cancer_ph in cancer_group:
    filename = "Primary_Malignancy_" + cancer_ph + ".Cases.csv"
    cancer = pd.read_csv(case_path+filename, na_filter=False)
    # A no-op for case files that already carry the renamed columns.
    cancer = cancer.rename(columns={'nhs_number': 'PseudoNHSnumber', 'date': 'DiagDt', 'code':'Code'})
    cancer["DiagDt"] = cancer["DiagDt"].apply(createDate)
    cancer["cancer"] = cancer_dict[cancer_ph]
    cancer_demo = cancer.merge(df_cohort, how='left',  on='PseudoNHSnumber')
    cancer_frames.append(cancer_demo)

# pd.concat raises on an empty list; fall back to an empty frame as before.
incidence = pd.concat(cancer_frames, ignore_index=True) if cancer_frames else pd.DataFrame()

In [None]:
# Display the combined case table with demographics attached.
incidence

In [None]:
# Replace every missing value, in every column, with the string "Unknown".
df_cohort = df_cohort.fillna("Unknown")
incidence = incidence.fillna("Unknown")

In [None]:
# Display the cohort table.
df_cohort

In [None]:
# Whole-cohort copy with Gender, Ethnicity and BirthCountry collapsed into the
# reporting categories.
analysis_cohort_1 = df_cohort.copy()

# Gender: male / female / Unknown.
gender = analysis_cohort_1["Gender"]
analysis_cohort_1["Gender"] = gender.where(gender.isin(['male','female']), 'Unknown')

# Ethnicity: labels outside the mapping become 'Other'; mapped labels are shortened.
ethnicity_labels = {
    'Asian or Asian British - Bangladeshi': 'Bangladeshi',
    'Asian or Asian British - Pakistani': 'Pakistani',
    'Asian or Asian British - Indian': 'Other Asian',
    'Asian or Asian British - Any other': 'Other Asian',
    'Not known': 'Unknown',
    'Not stated': 'Unknown',
}
ethnicity = analysis_cohort_1["Ethnicity"]
ethnicity = ethnicity.where(ethnicity.isin(list(ethnicity_labels)), 'Other')
analysis_cohort_1["Ethnicity"] = ethnicity.replace(ethnicity_labels)

# Birth country: Bangladesh / Pakistan / Great Britain / Unknown / Other.
birth_country = analysis_cohort_1["BirthCountry"]
birth_country = birth_country.where(birth_country.notna(), 'Unknown')
analysis_cohort_1["BirthCountry"] = birth_country.where(
    birth_country.isin(['Bangladesh','Pakistan', 'Great Britain', 'Unknown']), 'Other'
)

In [None]:
# Cancer-case copy with Gender, Ethnicity and BirthCountry collapsed into the
# reporting categories.
analysis_cohort_2 = incidence.copy()

# Gender: male / female / Unknown.
gender = analysis_cohort_2["Gender"]
analysis_cohort_2["Gender"] = gender.where(gender.isin(['male','female']), 'Unknown')

# Ethnicity: labels outside the mapping become 'Other'; mapped labels are shortened.
ethnicity_labels = {
    'Asian or Asian British - Bangladeshi': 'Bangladeshi',
    'Asian or Asian British - Pakistani': 'Pakistani',
    'Asian or Asian British - Indian': 'Other Asian',
    'Asian or Asian British - Any other': 'Other Asian',
    'Not known': 'Unknown',
    'Not stated': 'Unknown',
}
ethnicity = analysis_cohort_2["Ethnicity"]
ethnicity = ethnicity.where(ethnicity.isin(list(ethnicity_labels)), 'Other')
analysis_cohort_2["Ethnicity"] = ethnicity.replace(ethnicity_labels)

# Birth country: Bangladesh / Pakistan / Great Britain / Unknown / Other.
birth_country = analysis_cohort_2["BirthCountry"]
birth_country = birth_country.where(birth_country.notna(), 'Unknown')
analysis_cohort_2["BirthCountry"] = birth_country.where(
    birth_country.isin(['Bangladesh','Pakistan', 'Great Britain', 'Unknown']), 'Other'
)

In [None]:
# Derive the demographic distribution table for individual cancers in relation
# to the overall population.

categories = ["Gender","Ethnicity","BirthCountry","Region"]
cancer_list = list(cancer_dict.values())


def _value_counts_table(frame, count_name):
    """Stack value_counts() of every column in `categories` into Category / Value / <count_name> rows."""
    parts = []
    for cat in categories:
        part = pd.DataFrame(frame[cat].value_counts()).reset_index()
        part.columns = ["Value", count_name]
        part.insert(0, "Category", cat)
        parts.append(part)
    return pd.concat(parts, ignore_index=True)


# The category columns are compared as strings in both frames.
for cat in categories:
    analysis_cohort_1[cat] = analysis_cohort_1[cat].astype(str)
    analysis_cohort_2[cat] = analysis_cohort_2[cat].astype(str)

# Population count of every (category, value) pair.
df_all = _value_counts_table(analysis_cohort_1, "Population")
print(df_all)

per_cancer = []
for c in cancer_list:
    df_cancer = _value_counts_table(analysis_cohort_2[analysis_cohort_2["cancer"] == c], "Count")
    # Left merge keeps every population category, including those with no cases.
    df_c = df_all.merge(df_cancer, how='left', on=['Category','Value'])
    df_c.insert(3,"Cancer",c)
    # Categories with no cases get Count 0. The original called
    # df_c.fillna(0) without keeping the result, so these stayed NaN.
    df_c["Count"] = df_c["Count"].fillna(0)
    per_cancer.append(df_c)

# Concatenate once rather than growing the frame inside the loop.
incidence_x = pd.concat(per_cancer, ignore_index=True) if per_cancer else pd.DataFrame()

# Cases as a percentage of the population in the same category.
incidence_x["Prop"] = incidence_x["Count"] *100 / incidence_x["Population"]

In [None]:
# Display the per-cancer demographic distribution table.
incidence_x

In [None]:
# Keep only the rows whose Value is male or female.
gender_main_cx = ['male','female']
is_main_gender = incidence_x['Value'].isin(gender_main_cx)
gender_main_df = incidence_x[is_main_gender]
gender_main_df


In [None]:
# Restore matplotlib's rcParams to their defaults before the next figure.
plt.rcdefaults()

In [None]:
# Cancer x gender tables of case counts and proportions.
gender_rows = gender_main_df[gender_main_df["Category"]=="Gender"]
agg_gender = gender_rows.pivot(index="Cancer", columns="Value", values=["Count","Prop"])
agg_gender = agg_gender.dropna(axis=1, how='all').fillna(0)

# Each table is ordered by its own "male" column; the labels below are looked
# up by cancer name, so the two orders do not need to match.
agg_gender_prop = agg_gender.Prop.sort_values(by="male")
agg_gender_count = agg_gender.Count.sort_values(by="male")

custom_palette = {"male": "skyblue", "female": "salmon"}

ax = agg_gender_prop.plot(kind="barh", figsize=(10,12), width=0.7, color=custom_palette)

# Kept from the original; the value is not used in this cell.
box = ax.get_position()

# Put a legend below the current axis.
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
          fancybox=True, shadow=True, ncol=2)
plt.title("Cancer Prevalence by Gender")

# Label each bar with "count(proportion%)"; near-zero bars are left unlabelled.
for n, x in enumerate(agg_gender_prop.index.values):
    bar_values = zip(agg_gender_prop.loc[x], agg_gender_count.loc[x])
    for i, (proportion, count) in enumerate(bar_values):
        if(proportion < 0.0001):
            continue
        label = f'{int(count)}({np.round(proportion , 2)}%)'
        plt.text(x=proportion, y=n + i/4 - 0.25, s=label, color="black", fontsize=6)

plt.tight_layout()
plt.xlabel("Prevalence (%)")
plt.savefig(out_path+mode+"/"+"cancerbygender_pop.png", bbox_inches='tight')

In [None]:
#df_cohort_1=df_cohort.copy()
#df_cohort_1=df_cohort_1.dropna(subset=['Ethnicity'])

# Per-case ethnicity, collapsed to the groups reported in the figures.
# .copy() so the relabelling below never writes into a slice of `incidence`.
ethnicity_df = incidence.loc[:, ['PseudoNHSnumber', 'cancer', 'Ethnicity']].copy()

ethnicity_main_cx = ['Asian or Asian British - Pakistani','Asian or Asian British - Bangladeshi','Asian or Asian British - Any other', 'Unknown']


def _collapse_ethnicity(label):
    """Map a raw ethnicity label onto one of the reported groups.

    Indian is folded into "Any other", 'Not known'/'Not stated' into 'Unknown',
    and anything not listed in ``ethnicity_main_cx`` becomes 'Other'.
    """
    if label == 'Asian or Asian British - Indian':
        return 'Asian or Asian British - Any other'
    if label in ['Not known', 'Not stated']:
        return 'Unknown'
    return label if label in ethnicity_main_cx else 'Other'


ethnicity_df['Ethnicity'] = ethnicity_df['Ethnicity'].apply(_collapse_ethnicity)

# Raw cohort labels that do NOT belong to "Other" / that count as "Unknown".
x_set = ['Asian or Asian British - Pakistani','Asian or Asian British - Bangladeshi','Asian or Asian British - Any other', 'Asian or Asian British - Indian', 'Not known','Not stated']
y_set = ['Not known','Not stated', 'Unknown']

female_cancer = ['Breast', 'Cervical','Uterine','Ovarian']
male_cancer = ['Prostate','Testicular']

# Boolean cohort masks, keyed by the same group labels used in ethnicity_df.
_cohort_ethnicity = df_cohort['Ethnicity']
_ethnicity_cohort_masks = {
    'Asian or Asian British - Bangladeshi': _cohort_ethnicity == "Asian or Asian British - Bangladeshi",
    'Asian or Asian British - Pakistani': _cohort_ethnicity == "Asian or Asian British - Pakistani",
    'Asian or Asian British - Any other': _cohort_ethnicity.isin(
        ["Asian or Asian British - Any other", "Asian or Asian British - Indian"]),
    # 'Unknown' is excluded here as well: cases labelled 'Unknown' are reported
    # in the "Unknown" group, so they must not inflate the "Other" denominator.
    'Other': ~_cohort_ethnicity.isin(x_set) & ~_cohort_ethnicity.isin(y_set),
    'Unknown': _cohort_ethnicity.isin(y_set),
}


def _ethnicity_group_sizes(gender=None):
    """Cohort head-count per ethnicity group, optionally restricted to one gender."""
    if gender is None:
        return {group: int(mask.sum()) for group, mask in _ethnicity_cohort_masks.items()}
    of_gender = df_cohort['Gender'] == gender
    return {group: int((mask & of_gender).sum()) for group, mask in _ethnicity_cohort_masks.items()}


_ethnicity_gen_pop = _ethnicity_group_sizes()
_ethnicity_male_pop = _ethnicity_group_sizes("male")
_ethnicity_female_pop = _ethnicity_group_sizes("female")

# The individual denominators are kept under their original names.
Bangladeshi_gen_pop = _ethnicity_gen_pop['Asian or Asian British - Bangladeshi']
Pakistani_gen_pop = _ethnicity_gen_pop['Asian or Asian British - Pakistani']
Asian_other_gen_pop = _ethnicity_gen_pop['Asian or Asian British - Any other']
Other_gen_pop = _ethnicity_gen_pop['Other']
unknown_gen_pop = _ethnicity_gen_pop['Unknown']

Bangladeshi_male_pop = _ethnicity_male_pop['Asian or Asian British - Bangladeshi']
Pakistani_male_pop = _ethnicity_male_pop['Asian or Asian British - Pakistani']
Asian_other_male_pop = _ethnicity_male_pop['Asian or Asian British - Any other']
Other_male_pop = _ethnicity_male_pop['Other']
unknown_male_pop = _ethnicity_male_pop['Unknown']

Bangladeshi_female_pop = _ethnicity_female_pop['Asian or Asian British - Bangladeshi']
Pakistani_female_pop = _ethnicity_female_pop['Asian or Asian British - Pakistani']
Asian_other_female_pop = _ethnicity_female_pop['Asian or Asian British - Any other']
Other_female_pop = _ethnicity_female_pop['Other']
unknown_female_pop = _ethnicity_female_pop['Unknown']


def _ethnicity_denominator(cancer, ethnicity):
    """Cohort size to divide by: female-only / male-only for sex-specific cancers, whole cohort otherwise."""
    if cancer in female_cancer:
        sizes = _ethnicity_female_pop
    elif cancer in male_cancer:
        sizes = _ethnicity_male_pop
    else:
        # Fix: "Any other" previously used the male-only population in this branch.
        sizes = _ethnicity_gen_pop
    return sizes.get(ethnicity, np.nan)


# Number of cases per (cancer, ethnicity group) and the matching prevalence in percent.
agg_ethnicity = ethnicity_df.groupby(['cancer','Ethnicity']).size().reset_index(name='counts')
_ethnicity_denominators = pd.Series(
    [_ethnicity_denominator(c, e) for c, e in zip(agg_ethnicity['cancer'], agg_ethnicity['Ethnicity'])],
    index=agg_ethnicity.index,
    dtype=float,
)
agg_ethnicity['prevalence'] = agg_ethnicity['counts'] / _ethnicity_denominators * 100

agg_ethnicity

In [None]:
# Number of rows in ethnicity_df for each Ethnicity label.
ethnicity_df['Ethnicity'].value_counts()

In [None]:

# Wide table over every ethnicity group present in agg_ethnicity: one row per
# cancer, (measure, ethnicity) columns. All-empty columns are dropped and
# missing (cancer, ethnicity) combinations become 0.
agg_ethnicity_1 = (
    agg_ethnicity
    .pivot(index='cancer', columns='Ethnicity', values=['counts','prevalence'])
    .dropna(axis=1, how='all')
    .fillna(0)
)

# Each measure is ordered by its Bangladeshi column.
agg_ethnicity_prop = agg_ethnicity_1['prevalence'].sort_values(by="Asian or Asian British - Bangladeshi")
agg_ethnicity_count = agg_ethnicity_1['counts'].sort_values(by="Asian or Asian British - Bangladeshi")

# pandas looks up every plotted column in this dict, so each group that can
# appear as a column needs an entry. "Other" was missing, which raises KeyError
# as soon as agg_ethnicity contains an "Other" row.
custom_palette = {"Asian or Asian British - Bangladeshi": "lightblue", "Asian or Asian British - Pakistani": "lightgreen", "Asian or Asian British - Any other":"lightcoral","Unknown":"lightgrey", "Other": "wheat" }

ax = agg_ethnicity_prop.plot(kind="barh", figsize=(10,16), width=0.9, color=custom_palette)

box = ax.get_position()

# Put the legend below the axes.
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
          fancybox=True, shadow=True, ncol=2)
plt.title("Cancer Prevalence by Ethnicity")

# Annotate every bar with "count(percentage%)"; counts are looked up by cancer name.
for n, x in enumerate(agg_ethnicity_prop.index):
    for i, (proportion, count) in enumerate(zip(agg_ethnicity_prop.loc[x], agg_ethnicity_count.loc[x])):
        if proportion < 0.0001:
            continue  # skip bars that are effectively zero
        plt.text(x=proportion,
                 y=n + i/4 - 0.40,
                 s=f'{int(count)}({np.round(proportion , 2)}%)',
                 color="black",
                 fontsize=6)
plt.tight_layout()
plt.xlabel("Prevalence (%)")
#plt.savefig(out_path+mode+"/"+"cancerbyethnicity_pop.png", bbox_inches='tight')

In [None]:
# Rows of the per-cancer ethnicity summary that belong to the three main groups.
ethnicity_main_cx = ['Asian or Asian British - Bangladeshi', 'Asian or Asian British - Pakistani', 'Asian or Asian British - Any other']
in_main_ethnicity = agg_ethnicity['Ethnicity'].isin(ethnicity_main_cx)
ethnicity_main_df = agg_ethnicity.loc[in_main_ethnicity]


In [None]:

# Wide table for the selected groups: one row per cancer, (measure, ethnicity)
# columns; all-empty columns are dropped and remaining gaps become 0.
agg_ethnicity_2 = (
    ethnicity_main_df
    .pivot(index='cancer', columns='Ethnicity', values=['counts', 'prevalence'])
    .dropna(axis=1, how='all')
    .fillna(0)
)

# Each measure is ordered by its Bangladeshi column.
agg_ethnicity_prop = agg_ethnicity_2['prevalence'].sort_values(by="Asian or Asian British - Bangladeshi")
agg_ethnicity_count = agg_ethnicity_2['counts'].sort_values(by="Asian or Asian British - Bangladeshi")

custom_palette = {"Asian or Asian British - Bangladeshi": "lightblue", "Asian or Asian British - Pakistani": "lightgreen", "Asian or Asian British - Any other":"lightcoral","Unknown":"lightgrey" }

ax = agg_ethnicity_prop.plot(kind="barh", figsize=(10, 18), width=0.9, color=custom_palette)

box = ax.get_position()

# Legend sits underneath the axes.
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
          fancybox=True, shadow=True, ncol=2)
ax.set_title("Cancer Prevalence by Ethnicity")

# Annotate every bar with "count(percentage%)"; counts are looked up by cancer name.
for n, x in enumerate(agg_ethnicity_prop.index):
    row_pairs = zip(agg_ethnicity_prop.loc[x], agg_ethnicity_count.loc[x])
    for i, (proportion, count) in enumerate(row_pairs):
        if proportion < 0.0001:
            continue  # skip bars that are effectively zero
        ax.text(x=proportion,
                y=n + i/4 - 0.30,
                s=f'{int(count)}({np.round(proportion , 2)}%)',
                color="black",
                fontsize=6)

plt.tight_layout()
ax.set_xlabel("Prevalence (%)")
# Saving this figure is intentionally left disabled.

In [None]:
# Rows of the per-cancer ethnicity summary for the Bangladeshi and Pakistani groups only.
ethnicity_main_cx_1 = ['Asian or Asian British - Bangladeshi', 'Asian or Asian British - Pakistani']
in_main_ethnicity_1 = agg_ethnicity['Ethnicity'].isin(ethnicity_main_cx_1)
ethnicity_main_df_1 = agg_ethnicity.loc[in_main_ethnicity_1]


In [None]:

# Wide table for the Bangladeshi/Pakistani rows: one row per cancer,
# (measure, ethnicity) columns; all-empty columns dropped, gaps set to 0.
agg_ethnicity_3 = (
    ethnicity_main_df_1
    .pivot(index='cancer', columns='Ethnicity', values=['counts', 'prevalence'])
    .dropna(axis=1, how='all')
    .fillna(0)
)

# Each measure is ordered by its Bangladeshi column.
agg_ethnicity_prop = agg_ethnicity_3['prevalence'].sort_values(by="Asian or Asian British - Bangladeshi")
agg_ethnicity_count = agg_ethnicity_3['counts'].sort_values(by="Asian or Asian British - Bangladeshi")

custom_palette = {"Asian or Asian British - Bangladeshi": "lightblue", "Asian or Asian British - Pakistani": "lightgreen", "Asian or Asian British - Any other":"lightcoral","Unknown":"lightgrey" }

ax = agg_ethnicity_prop.plot(kind="barh", figsize=(10, 16), width=0.7, color=custom_palette)

box = ax.get_position()

# Legend sits underneath the axes.
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
          fancybox=True, shadow=True, ncol=2)
ax.set_title("Cancer Prevalence by Ethnicity")

# Annotate every bar with "count(percentage%)"; counts are looked up by cancer name.
for n, x in enumerate(agg_ethnicity_prop.index):
    row_pairs = zip(agg_ethnicity_prop.loc[x], agg_ethnicity_count.loc[x])
    for i, (proportion, count) in enumerate(row_pairs):
        if proportion < 0.0001:
            continue  # skip bars that are effectively zero
        ax.text(x=proportion,
                y=n + i/4 - 0.15,
                s=f'{int(count)}({np.round(proportion , 2)}%)',
                color="black",
                fontsize=6)

plt.tight_layout()
ax.set_xlabel("Prevalence (%)")
plt.savefig(out_path+mode+"/"+"cancerbyethnicity_pop.png", bbox_inches='tight')

In [None]:
# Per-case birth country, collapsed to the groups reported in the figures.
# .copy() so the relabelling below never writes into a slice of `incidence`.
birthcountry_df = incidence.loc[:, ['PseudoNHSnumber', 'cancer', 'BirthCountry']].copy()
birthcountry_main_cx = ['Pakistan', 'Bangladesh', 'Great Britain', 'Unknown', 'Not known', 'Not stated']
unknown_main_cx = ['Not known', 'Not stated']


def _collapse_birthcountry(label):
    """Map a raw birth-country label onto Pakistan / Bangladesh / Great Britain / Other / Unknown."""
    if label not in birthcountry_main_cx:
        return 'Other'
    return 'Unknown' if label in unknown_main_cx else label


birthcountry_df['BirthCountry'] = birthcountry_df['BirthCountry'].apply(_collapse_birthcountry)

female_cancer = ['Breast', 'Cervical','Uterine','Ovarian']
male_cancer = ['Prostate','Testicular']

# Boolean cohort masks, keyed by the same group labels used in birthcountry_df.
_cohort_birthcountry = df_cohort['BirthCountry']
_birthcountry_cohort_masks = {
    'Bangladesh': _cohort_birthcountry == "Bangladesh",
    'Pakistan': _cohort_birthcountry == "Pakistan",
    'Great Britain': _cohort_birthcountry == "Great Britain",
    'Other': ~_cohort_birthcountry.isin(birthcountry_main_cx),
    # Fix: cases labelled 'Not known'/'Not stated' are reported as "Unknown",
    # so the cohort denominator has to count those labels too, not only the
    # literal 'Unknown'.
    'Unknown': _cohort_birthcountry.isin(['Unknown'] + unknown_main_cx),
}


def _birthcountry_group_sizes(gender=None):
    """Cohort head-count per birth-country group, optionally restricted to one gender."""
    if gender is None:
        return {group: int(mask.sum()) for group, mask in _birthcountry_cohort_masks.items()}
    of_gender = df_cohort['Gender'] == gender
    return {group: int((mask & of_gender).sum()) for group, mask in _birthcountry_cohort_masks.items()}


_birthcountry_gen_pop = _birthcountry_group_sizes()
_birthcountry_male_pop = _birthcountry_group_sizes("male")
_birthcountry_female_pop = _birthcountry_group_sizes("female")

# The individual denominators are kept under their original names.
Bangladeshi_gen_pop = _birthcountry_gen_pop['Bangladesh']
Pakistani_gen_pop = _birthcountry_gen_pop['Pakistan']
Britain_gen_pop = _birthcountry_gen_pop['Great Britain']
Other_gen_pop = _birthcountry_gen_pop['Other']
Unknown_gen_pop = _birthcountry_gen_pop['Unknown']

Bangladeshi_male_pop = _birthcountry_male_pop['Bangladesh']
Pakistani_male_pop = _birthcountry_male_pop['Pakistan']
Britain_male_pop = _birthcountry_male_pop['Great Britain']
Other_male_pop = _birthcountry_male_pop['Other']
Unknown_male_pop = _birthcountry_male_pop['Unknown']

Bangladeshi_female_pop = _birthcountry_female_pop['Bangladesh']
Pakistani_female_pop = _birthcountry_female_pop['Pakistan']
Britain_female_pop = _birthcountry_female_pop['Great Britain']
Other_female_pop = _birthcountry_female_pop['Other']
Unknown_female_pop = _birthcountry_female_pop['Unknown']


def _birthcountry_denominator(cancer, country):
    """Cohort size to divide by: female-only / male-only for sex-specific cancers, whole cohort otherwise."""
    if cancer in female_cancer:
        sizes = _birthcountry_female_pop
    elif cancer in male_cancer:
        sizes = _birthcountry_male_pop
    else:
        sizes = _birthcountry_gen_pop
    return sizes.get(country, np.nan)


# Number of cases per (cancer, birth-country group) and the matching prevalence in percent.
agg_birthcountry = birthcountry_df.groupby(['cancer','BirthCountry']).size().reset_index(name='counts')
_birthcountry_denominators = pd.Series(
    [_birthcountry_denominator(c, b) for c, b in zip(agg_birthcountry['cancer'], agg_birthcountry['BirthCountry'])],
    index=agg_birthcountry.index,
    dtype=float,
)
agg_birthcountry['prevalence'] = agg_birthcountry['counts'] / _birthcountry_denominators * 100

agg_birthcountry

In [None]:

# Wide table over all birth-country groups: one row per cancer,
# (measure, birth country) columns; all-empty columns dropped, gaps set to 0.
agg_birthcountry_1 = (
    agg_birthcountry
    .pivot(index='cancer', columns='BirthCountry', values=['counts', 'prevalence'])
    .dropna(axis=1, how='all')
    .fillna(0)
)

# Each measure is ordered by its "Unknown" column.
agg_birthcountry_prop = agg_birthcountry_1['prevalence'].sort_values(by="Unknown")
agg_birthcountry_count = agg_birthcountry_1['counts'].sort_values(by="Unknown")

custom_palette = {"Bangladesh": "lightblue", "Pakistan": "lightgreen", "Other":"lightcoral","Great Britain":"pink","Unknown":"lightgrey" }

ax = agg_birthcountry_prop.plot(kind="barh", figsize=(10, 25), width=0.7, color=custom_palette)

box = ax.get_position()

# Legend sits underneath the axes.
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
          fancybox=True, shadow=True, ncol=2)
ax.set_title("Cancer Prevalence by Birth Country")

# Annotate every bar with "count(percentage%)"; counts are looked up by cancer name.
for n, x in enumerate(agg_birthcountry_prop.index):
    row_pairs = zip(agg_birthcountry_prop.loc[x], agg_birthcountry_count.loc[x])
    for i, (proportion, count) in enumerate(row_pairs):
        if proportion < 0.0001:
            continue  # skip bars that are effectively zero
        ax.text(x=proportion,
                y=n + i/5-0.40,
                s=f'{int(count)}({np.round(proportion , 2)}%)',
                color="black",
                fontsize=6)

plt.tight_layout()
ax.set_xlabel("Prevalence (%)")
# Saving this figure is intentionally left disabled.

In [None]:
# Drop the "Unknown" group: keep the three named countries plus "Other".
birthcountry_main_cx = ['Pakistan','Bangladesh','Great Britain','Other']
in_main_birthcountry = agg_birthcountry['BirthCountry'].isin(birthcountry_main_cx)
birthcountry_main_df1 = agg_birthcountry.loc[in_main_birthcountry]


In [None]:
# Display the birth-country summary restricted to Pakistan, Bangladesh, Great Britain and Other.
birthcountry_main_df1

In [None]:

# Wide table for the selected birth-country groups: one row per cancer,
# (measure, birth country) columns; all-empty columns dropped, gaps set to 0.
agg_birthcountry_2 = (
    birthcountry_main_df1
    .pivot(index='cancer', columns='BirthCountry', values=['counts', 'prevalence'])
    .dropna(axis=1, how='all')
    .fillna(0)
)

# Each measure is ordered by its "Bangladesh" column.
agg_birthcountry_prop = agg_birthcountry_2['prevalence'].sort_values(by="Bangladesh")
agg_birthcountry_count = agg_birthcountry_2['counts'].sort_values(by="Bangladesh")

custom_palette = {"Bangladesh": "lightblue", "Pakistan": "lightgreen", "Other":"lightcoral","Great Britain":"pink","Unknown":"lightgrey" }

ax = agg_birthcountry_prop.plot(kind="barh", figsize=(10, 25), width=0.7, color=custom_palette)

box = ax.get_position()

# Legend sits underneath the axes.
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
          fancybox=True, shadow=True, ncol=2)
ax.set_title("Cancer Prevalence by Birth Country")

# Annotate every bar with "count(percentage%)"; counts are looked up by cancer name.
for n, x in enumerate(agg_birthcountry_prop.index):
    row_pairs = zip(agg_birthcountry_prop.loc[x], agg_birthcountry_count.loc[x])
    for i, (proportion, count) in enumerate(row_pairs):
        if proportion < 0.01:
            continue  # skip bars below 0.01 %
        ax.text(x=proportion,
                y=n + i/5-0.35,
                s=f'{int(count)}({np.round(proportion , 2)}%)',
                color="black",
                fontsize=6)

plt.tight_layout()
ax.set_xlabel("Prevalence (%)")
# Saving this figure is intentionally left disabled.

In [None]:
# Keep only the three named countries (no "Other", no "Unknown").
birthcountry_main_cx = ['Pakistan','Bangladesh','Great Britain']
in_named_birthcountry = agg_birthcountry['BirthCountry'].isin(birthcountry_main_cx)
birthcountry_main_df2 = agg_birthcountry.loc[in_named_birthcountry]


In [None]:

# Wide table for the three named countries: one row per cancer,
# (measure, birth country) columns; all-empty columns dropped, gaps set to 0.
agg_birthcountry_3 = (
    birthcountry_main_df2
    .pivot(index='cancer', columns='BirthCountry', values=['counts', 'prevalence'])
    .dropna(axis=1, how='all')
    .fillna(0)
)

# Each measure is ordered by its "Bangladesh" column.
agg_birthcountry_prop = agg_birthcountry_3['prevalence'].sort_values(by="Bangladesh")
agg_birthcountry_count = agg_birthcountry_3['counts'].sort_values(by="Bangladesh")

custom_palette = {"Bangladesh": "lightblue", "Pakistan": "lightgreen", "Other":"lightcoral","Great Britain":"pink","Unknown":"lightgrey" }

ax = agg_birthcountry_prop.plot(kind="barh", figsize=(10, 25), width=0.7, color=custom_palette)

box = ax.get_position()

# Legend sits underneath the axes.
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
          fancybox=True, shadow=True, ncol=2)
ax.set_title("Cancer Prevalence by Birth Country")

# Annotate every bar with "count(percentage%)"; counts are looked up by cancer name.
for n, x in enumerate(agg_birthcountry_prop.index):
    row_pairs = zip(agg_birthcountry_prop.loc[x], agg_birthcountry_count.loc[x])
    for i, (proportion, count) in enumerate(row_pairs):
        if proportion < 0.01:
            continue  # skip bars below 0.01 %
        ax.text(x=proportion,
                y=n + i/5-0.25,
                s=f'{int(count)}({np.round(proportion , 2)}%)',
                color="black",
                fontsize=6)

plt.tight_layout()
ax.set_xlabel("Prevalence (%)")
plt.savefig(out_path+mode+"/"+"cancerbybirthcountry_pop_no_other.png", bbox_inches='tight')

In [None]:



# Great Britain vs. everywhere else, with unknown birth countries removed from
# both the cases and the cohort. NOTE: this cell rebinds `birthcountry_df` and
# `agg_birthcountry`, replacing the five-group versions built earlier.
unknown_main_cx = ['Not known', 'Not stated', 'Unknown']
birthcountry_main_cx = ['Great Britain']

birthcountry_df = incidence.loc[:,['PseudoNHSnumber','cancer','BirthCountry']]
# .copy() so the relabelling below does not assign into a filtered slice
# (avoids pandas' SettingWithCopyWarning).
birthcountry_df = birthcountry_df[~birthcountry_df['BirthCountry'].isin(unknown_main_cx)].copy()
df_cohort_1 = df_cohort[~df_cohort['BirthCountry'].isin(unknown_main_cx)]

# Everything that is not Great Britain becomes "Other".
birthcountry_df['BirthCountry'] = birthcountry_df['BirthCountry'].apply(lambda x: 'Other' if x not in birthcountry_main_cx else x)

female_cancer = ['Breast', 'Cervical','Uterine','Ovarian']
male_cancer = ['Prostate','Testicular']

# Cohort denominators (known birth country only), overall and per gender.
_cohort_is_british = df_cohort_1['BirthCountry'] == "Great Britain"
_cohort_is_other = ~df_cohort_1['BirthCountry'].isin(birthcountry_main_cx)
_cohort_is_male = df_cohort_1['Gender'] == "male"
_cohort_is_female = df_cohort_1['Gender'] == "female"

Britain_gen_pop = int(_cohort_is_british.sum())
Other_gen_pop = int(_cohort_is_other.sum())

Britain_male_pop = int((_cohort_is_british & _cohort_is_male).sum())
Other_male_pop = int((_cohort_is_other & _cohort_is_male).sum())

Britain_female_pop = int((_cohort_is_british & _cohort_is_female).sum())
Other_female_pop = int((_cohort_is_other & _cohort_is_female).sum())

_gb_other_pop = {
    'female': {'Great Britain': Britain_female_pop, 'Other': Other_female_pop},
    'male': {'Great Britain': Britain_male_pop, 'Other': Other_male_pop},
    'all': {'Great Britain': Britain_gen_pop, 'Other': Other_gen_pop},
}


def _gb_other_denominator(cancer, country):
    """Cohort size to divide by: female-only / male-only for sex-specific cancers, whole cohort otherwise."""
    if cancer in female_cancer:
        sizes = _gb_other_pop['female']
    elif cancer in male_cancer:
        sizes = _gb_other_pop['male']
    else:
        sizes = _gb_other_pop['all']
    return sizes.get(country, np.nan)


# Number of cases per (cancer, birth-country group) and the matching prevalence in percent.
agg_birthcountry = birthcountry_df.groupby(['cancer','BirthCountry']).size().reset_index(name='counts')
_gb_other_denominators = pd.Series(
    [_gb_other_denominator(c, b) for c, b in zip(agg_birthcountry['cancer'], agg_birthcountry['BirthCountry'])],
    index=agg_birthcountry.index,
    dtype=float,
)
agg_birthcountry['prevalence'] = agg_birthcountry['counts'] / _gb_other_denominators * 100

agg_birthcountry


In [None]:

# Wide table for the current agg_birthcountry: one row per cancer,
# (measure, birth country) columns; all-empty columns dropped, gaps set to 0.
agg_birthcountry_1 = (
    agg_birthcountry
    .pivot(index='cancer', columns='BirthCountry', values=['counts', 'prevalence'])
    .dropna(axis=1, how='all')
    .fillna(0)
)

# Each measure is ordered by its "Other" column.
agg_birthcountry_prop = agg_birthcountry_1['prevalence'].sort_values(by="Other")
agg_birthcountry_count = agg_birthcountry_1['counts'].sort_values(by="Other")

custom_palette = {"Bangladesh": "lightblue", "Pakistan": "lightgreen", "Other":"lightcoral","Great Britain":"pink","Unknown":"lightgrey" }

ax = agg_birthcountry_prop.plot(kind="barh", figsize=(10, 12), width=0.7, color=custom_palette)

box = ax.get_position()

# Legend sits underneath the axes.
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
          fancybox=True, shadow=True, ncol=2)
ax.set_title("Cancer Prevalence by Birth Country")

# Annotate every bar with "count(percentage%)", anchored at the bar end and
# vertically centred; counts are looked up by cancer name.
for n, x in enumerate(agg_birthcountry_prop.index):
    row_pairs = zip(agg_birthcountry_prop.loc[x], agg_birthcountry_count.loc[x])
    for i, (proportion, count) in enumerate(row_pairs):
        if proportion < 0.0001:
            continue  # skip bars that are effectively zero
        ax.text(x=proportion,
                y=n + i/4-0.15,
                s=f'{int(count)}({np.round(proportion , 2)}%)',
                color="black", horizontalalignment='left', verticalalignment='center',
                fontsize=6)

plt.tight_layout()
ax.set_xlabel("Prevalence (%)")
# Saving this figure is intentionally left disabled.

In [None]:
# Per-case recruitment region.
region_df = incidence.loc[:,['PseudoNHSnumber','cancer','Region']]

# Region labels are compared with surrounding whitespace stripped. The Bradford
# cohort was previously counted with the label "Bradford " (trailing space) but
# matched against "Bradford", so the count and the lookup could never agree.
_cohort_region = df_cohort['Region'].str.strip()
London_gen_pop = int((_cohort_region == "London").sum())
Manchester_gen_pop = int((_cohort_region == "Manchester").sum())
Bradford_gen_pop = int((_cohort_region == "Bradford").sum())

_region_pop = {
    "London": London_gen_pop,
    "Manchester": Manchester_gen_pop,
    "Bradford": Bradford_gen_pop,
}

# Cases per (cancer, region) divided by the cohort size of that region.
# Prevalence is stored as a fraction here (no x100 factor); regions other than
# the three above get NaN.
agg_region = region_df.groupby(['cancer','Region']).size().reset_index(name='counts')
agg_region['prevalence'] = agg_region['counts'] / agg_region['Region'].str.strip().map(_region_pop)

agg_region

In [None]:
# Distinct Region values present in the cohort (useful for spotting label variants).
set(df_cohort['Region'])

In [None]:
# Number of cohort rows per Region value.
df_cohort["Region"].value_counts()

In [None]:
# Highest prevalence first; the cancer order on the y axis follows this sort.
agg_region = agg_region.sort_values(by='prevalence', ascending=False)
cancer_order = agg_region['cancer'].unique()

plt.figure(figsize=(10,12))
sns.set_palette("pastel")
ax = sns.barplot(data=agg_region, x='prevalence', y='cancer', hue='Region',
                 order=cancer_order, dodge=True)

# Write the prevalence as a percentage at the end of each bar, vertically centred.
for bar in ax.patches:
    bar_length = bar.get_width()  # prevalence as a fraction
    if not bar_length > 0.00001:
        continue  # no label for empty bars
    bar_middle = bar.get_y() + bar.get_height() / 2
    ax.text(bar_length, bar_middle, f'{bar_length * 100:.2f}%',
            ha='left', va='center', fontsize=8)

# Titles, axis labels and legend.
ax.set_title('Prevalence of Cancer by Type and Recruitment Region')
ax.set_xlabel('Prevalence')
ax.set_ylabel('Cancer Type')
ax.legend(title='Recruitment Region')
plt.xticks(rotation=45)
plt.tight_layout()

plt.show()

In [None]:
# Cases with a recorded diagnosis date. .copy() so adding the age column does
# not assign into a filtered slice of `incidence` (SettingWithCopyWarning).
incidence_red = incidence[incidence["DiagDt"].notnull()].copy()
# Age at diagnosis in completed years between DOB and DiagDt.
incidence_red["diag_age"] = incidence_red.apply(lambda x: relativedelta.relativedelta(x.DiagDt, x.DOB).years, axis=1)
# Drop rows whose computed age is zero or negative.
incidence_red = incidence_red[incidence_red["diag_age"]>0]

# Analysis frame; copied so the ethnicity relabelling stays local to `data`.
data = incidence_red.loc[:,["cancer","diag_age","Gender","Ethnicity"]].copy()
ethnicity_main_cx = ['Asian or Asian British - Pakistani','Asian or Asian British - Bangladeshi','Asian or Asian British - Any other', 'Unknown']
# Collapse ethnicity: Indian -> "Any other", Not known/Not stated -> "Unknown",
# anything outside ethnicity_main_cx -> "Other".
data['Ethnicity']=data['Ethnicity'].apply(lambda x: 'Asian or Asian British - Any other' if x == 'Asian or Asian British - Indian' else x)
data['Ethnicity']=data['Ethnicity'].apply(lambda x: 'Unknown' if x in ['Not known','Not stated'] else x)
data['Ethnicity']=data['Ethnicity'].apply(lambda x: 'Other' if x not in ethnicity_main_cx else x)


In [None]:
# Display the analysis frame (cancer, diag_age, Gender, collapsed Ethnicity).
data

In [None]:
from scipy.stats import levene

# Ages at diagnosis for colorectal cancer, split by gender, as plain arrays.
colorectal = data[data['cancer'] == 'Colorectal']
df_male = colorectal.loc[colorectal['Gender'] == 'male', "diag_age"].values
df_female = colorectal.loc[colorectal['Gender'] == 'female', 'diag_age'].values



In [None]:
# Levene's test for equal variances of diagnosis age between the two gender samples.
stat, p_value = levene(df_male, df_female)

print(f"Levene's test statistic: {stat}")
print(f"P-value: {p_value}")

# 5 % significance level.
verdict = (
    "Fail to reject the null hypothesis: the variances are equal."
    if p_value > 0.05
    else "Reject the null hypothesis: the variances are not equal."
)
print(verdict)

In [None]:
from scipy.stats import kruskal

# Kruskal-Wallis test of diagnosis age, male vs. female, run once per cancer
# type listed in cancer_dict.
gender_ks_test_results = []
cancer_list = list(cancer_dict.values())

for cancer in cancer_list:
    of_cancer = data[data['cancer'] == cancer]
    df_male = of_cancer.loc[of_cancer['Gender'] == 'male', "diag_age"].values
    df_female = of_cancer.loc[of_cancer['Gender'] == 'female', "diag_age"].values
    stat, p = kruskal(df_male, df_female)
    symbol = 'significant' if p <= 0.05 else '.'  # 5 % level
    gender_ks_test_results.append([cancer, p, symbol])

# One row per cancer, smallest p-value first.
gender_ks_test_results = pd.DataFrame(gender_ks_test_results)
gender_ks_test_results.columns = ['cancer','p-value', 'significance']
gender_ks_test_results = gender_ks_test_results.sort_values(by='p-value', ascending=True)


In [None]:
# Display the per-cancer male-vs-female Kruskal-Wallis results, sorted by p-value.
gender_ks_test_results

In [None]:

# Kruskal-Wallis test of diagnosis age, Pakistani vs. Bangladeshi, run once per
# cancer type listed in cancer_dict.
ethnicity_ks_test_results = []
cancer_list = list(cancer_dict.values())

for cancer in cancer_list:
    of_cancer = data[data['cancer'] == cancer]
    df_pakistani = of_cancer.loc[of_cancer['Ethnicity'] == 'Asian or Asian British - Pakistani', "diag_age"].values
    df_bangladeshi = of_cancer.loc[of_cancer['Ethnicity'] == 'Asian or Asian British - Bangladeshi', "diag_age"].values
    stat, p = kruskal(df_pakistani, df_bangladeshi)
    symbol = 'significant' if p <= 0.05 else '.'  # 5 % level
    ethnicity_ks_test_results.append([cancer, p, symbol])

# One row per cancer, smallest p-value first.
ethnicity_ks_test_results = pd.DataFrame(ethnicity_ks_test_results)
ethnicity_ks_test_results.columns = ['cancer','p-value', 'significance']
ethnicity_ks_test_results = ethnicity_ks_test_results.sort_values(by='p-value', ascending=True)

    
    


In [None]:
# Display the per-cancer Pakistani-vs-Bangladeshi Kruskal-Wallis results, sorted by p-value.
ethnicity_ks_test_results


In [None]:

# Kruskal-Wallis test of diagnosis age, Pakistani vs. Bangladeshi, run
# separately for men and for women within each cancer type from cancer_dict.
gender_ethnicity_ks_test_results = []
cancer_list = list(cancer_dict.values())

for cancer in cancer_list:
    of_cancer = data[data['cancer'] == cancer]
    # Male row first, then female, for every cancer.
    for gender in ("male", "female"):
        of_gender = of_cancer[of_cancer['Gender'] == gender]
        pakistani_ages = of_gender.loc[of_gender['Ethnicity'] == 'Asian or Asian British - Pakistani', "diag_age"].values
        bangladeshi_ages = of_gender.loc[of_gender['Ethnicity'] == 'Asian or Asian British - Bangladeshi', "diag_age"].values
        stat, p = kruskal(pakistani_ages, bangladeshi_ages)
        symbol = 'significant' if p <= 0.05 else '.'  # 5 % level
        gender_ethnicity_ks_test_results.append([cancer, gender, p, symbol])

# One row per (cancer, gender), smallest p-value first.
gender_ethnicity_ks_test_results = pd.DataFrame(gender_ethnicity_ks_test_results)
gender_ethnicity_ks_test_results.columns = ['cancer','Gender','p-value', 'significance']
gender_ethnicity_ks_test_results = gender_ethnicity_ks_test_results.sort_values(by='p-value', ascending=True)

    
    


In [None]:
# Display the per-cancer, per-gender Pakistani-vs-Bangladeshi Kruskal-Wallis results, sorted by p-value.
gender_ethnicity_ks_test_results

In [None]:
# Box plot of age at diagnosis per cancer, ordered by median age (youngest first).
data2 = data[["cancer","diag_age"]]
median_age_by_cancer = data2.groupby(by="cancer").median()
index_sort = median_age_by_cancer.sort_values(by="diag_age").index

sns.boxplot(data=data2, x="diag_age", y="cancer", order=index_sort,
            hue="cancer", palette="muted")
plt.title("Cancer Prevalence by Age at Diagnosis")
plt.xlabel("Age at diagnosis")
plt.tight_layout()
plt.savefig(out_path+mode+"/"+"cancerbyage.png",bbox_inches='tight')

In [None]:
# KDE of age at diagnosis: one facet row per cancer, one curve per gender.

# Keep only the three South Asian ethnicity groups.
# NOTE: this rebinds `data`; every later cell works on the filtered frame.
data = data[data['Ethnicity'].isin(['Asian or Asian British - Pakistani','Asian or Asian British - Bangladeshi', 'Asian or Asian British - Any other'])]

custom_palette = {"male": "skyblue", "female": "salmon"}

sns.set(font_scale=2)
g = sns.displot(
    data,
    row='cancer',
    x='diag_age',
    hue='Gender',
    palette=custom_palette,
    fill=False,
    common_norm=True,
    kind="kde",
    height=4,
    aspect=1.5,
    warn_singular=False,
    clip=[0, 100],
)

# Ages marked with dashed vertical lines on the matching cancer's facet.
# NOTE(review): presumably screening-programme age bounds -- confirm the source.
age_points_by_cancer = {
    'Colorectal': [60, 74],
    'Breast': [50, 70],
    'Cervical': [25, 64],
}

# Map each cancer (facet row name) to its axes; the grid has a single column.
cancer_to_ax = dict(zip(g.row_names, g.axes.flat))

for cancer, age_points in age_points_by_cancer.items():
    if cancer in cancer_to_ax:
        ax = cancer_to_ax[cancer]
        for age in age_points:
            ax.axvline(age, color='black', linestyle='--', linewidth=1)

# Grid-level labels and titles go first: set_axis_labels blanks the x label on
# every non-bottom facet, so the per-facet labels must be applied afterwards.
g.set_axis_labels('Diagnosis Age', 'Density')
g.set_titles(row_template='{row_name}', col_template='{col_name}', size=16)

# Repeat the x label and tick labels on every facet, not just the bottom one.
for ax in g.axes.flat:
    ax.xaxis.label.set_visible(True)
    ax.xaxis.set_tick_params(labelbottom=True)
    ax.set_xlabel('Diagnosis Age')

# FacetGrid.tight_layout keeps the strip reserved for the outside legend;
# plt.tight_layout() would stretch the facets underneath it.
g.tight_layout()

# Save before showing: once plt.show() has run the figure is released and a
# later plt.savefig() writes an empty canvas.
g.savefig(out_path+mode+"/"+"cancerbyage_gender.png")
plt.show()

In [None]:
# KDE of age at diagnosis: one facet row per cancer, one curve per ethnicity.
custom_palette = {"Asian or Asian British - Pakistani": "green", "Asian or Asian British - Bangladeshi": "blue", "Asian or Asian British - Any other":"salmon"}

sns.set(font_scale=2)
g = sns.displot(
    data,
    row='cancer',
    hue='Ethnicity',
    x="diag_age",
    palette=custom_palette,
    fill=False,
    common_norm=True,
    kind="kde",
    height=4,
    aspect=1.5,
    warn_singular=False,
    clip=[0, 100],
    legend=True,
)

# Ages marked with dashed vertical lines on the matching cancer's facet.
# NOTE(review): presumably screening-programme age bounds -- confirm the source.
age_points_by_cancer = {
    'Colorectal': [60, 74],
    'Breast': [50, 70],
    'Cervical': [25, 64],
}

# Map each cancer (facet row name) to its axes; the grid has a single column.
cancer_to_ax = dict(zip(g.row_names, g.axes.flat))

for cancer, age_points in age_points_by_cancer.items():
    if cancer in cancer_to_ax:
        ax = cancer_to_ax[cancer]
        for age in age_points:
            ax.axvline(age, color='black', linestyle='--', linewidth=1)

# Grid-level labels and titles go first: set_axis_labels blanks the x label on
# every non-bottom facet, so the per-facet labels must be applied afterwards.
g.set_axis_labels('Diagnosis Age', 'Density')
g.set_titles(row_template='{row_name}', col_template='{col_name}', size=16)

# Repeat the x label and tick labels on every facet, not just the bottom one.
for ax in g.axes.flat:
    ax.xaxis.label.set_visible(True)
    ax.xaxis.set_tick_params(labelbottom=True)
    ax.set_xlabel('Diagnosis Age')

# FacetGrid.tight_layout keeps the strip reserved for the outside legend;
# plt.tight_layout() would stretch the facets underneath it.
g.tight_layout()

# Save before showing: once plt.show() has run the figure is released and a
# later plt.savefig() writes an empty canvas.
g.savefig(out_path+mode+"/"+"cancerbyage_ethnicity.png")
plt.show()

In [None]:


# One figure per cancer: KDE of age at diagnosis with one facet column per
# ethnicity and one curve per gender.
cancer_types = data['cancer'].unique()
custom_palette = {"male": "skyblue", "female": "salmon"}

# The font scale is the same for every figure, so set it once.
sns.set(font_scale=2)

for cancer in cancer_types:
    # Rows for the current cancer only.
    cancer_data = data[data['cancer'] == cancer]

    # NOTE(review): legend=False means the gender colours are not labelled on
    # the figure; see custom_palette for the mapping.
    g = sns.displot(
        cancer_data,
        col='Ethnicity',
        x='diag_age',
        hue='Gender',
        palette=custom_palette,
        fill=False,
        common_norm=True,
        kind="kde",
        height=4,
        aspect=1.5,
        warn_singular=False,
        clip=[0, 100],
        legend=False,
    )

    g.set_axis_labels('Diagnosis Age', 'Density')
    g.set_titles(col_template='{col_name}', size=12)
    g.figure.suptitle(f'Density Plot of Diagnosis Age for {cancer} cancer', y=1.02)
    g.tight_layout()

    # Save before showing: once plt.show() has run the figure is released and a
    # later plt.savefig() writes an empty canvas. FacetGrid.savefig uses a tight
    # bounding box, so the suptitle placed above the axes (y=1.02) is kept.
    g.savefig(out_path+mode+"/"+cancer+"cancerbyage_ethnicity_gender.png")
    plt.show()
    # Release the figure so a long list of cancers does not accumulate open figures.
    plt.close(g.figure)

In [None]:
# One figure per cancer: KDE of age at diagnosis with one facet column per
# gender and one curve per ethnicity.
cancer_types = data['cancer'].unique()

custom_palette = {"Asian or Asian British - Pakistani": "green", "Asian or Asian British - Bangladeshi": "blue", "Asian or Asian British - Any other":"salmon"}

# The font scale is the same for every figure, so set it once.
sns.set(font_scale=2)

for cancer in cancer_types:
    # Rows for the current cancer only.
    cancer_data = data[data['cancer'] == cancer]

    # NOTE(review): legend=False means the ethnicity colours are not labelled
    # on the figure; see custom_palette for the mapping.
    g = sns.displot(
        cancer_data,
        col='Gender',
        x='diag_age',
        hue='Ethnicity',
        palette=custom_palette,
        fill=False,
        common_norm=True,
        kind="kde",
        height=4,
        aspect=1.5,
        warn_singular=False,
        clip=[0, 100],
        legend=False,
    )

    g.set_axis_labels('Diagnosis Age', 'Density')
    g.set_titles(col_template='{col_name}', size=12)
    g.figure.suptitle(f'Density Plot of Diagnosis Age for {cancer} cancer', y=1.02)
    g.tight_layout()

    # Save before showing: once plt.show() has run the figure is released and a
    # later plt.savefig() writes an empty canvas. FacetGrid.savefig uses a tight
    # bounding box, so the suptitle placed above the axes (y=1.02) is kept.
    g.savefig(out_path+mode+"/"+cancer+"cancerbyage_gender_ethnicity.png")
    plt.show()
    # Release the figure so a long list of cancers does not accumulate open figures.
    plt.close(g.figure)