In [None]:
from datetime import datetime
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import scikit_posthocs as sp
from lifelines import CoxPHFitter
from sklearn.impute import KNNImputer

In [None]:
# Notebook display setup: widen the notebook container and relax pandas'
# truncation limits so wide frames can be inspected in full.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 200)

# Config

In [None]:
# Input files used by this notebook (paths are relative to the working directory).
file_redcap_data = 'Homebase_new_updated.csv'  # REDCap export, read with pd.read_csv
file_header_mappings = 'ClinicalGenomicCorre_header_mappings.xlsx'  # header mapping sheet, read with pd.read_excel
file_genetics_data = '1-s2.0-S0022202X18322942-mmc2_MAIN GENOMIC.xlsx'  # genomics workbook; sheets 'Table S2', 'Table S3' and 'Table S6' are read later

# Read REDCap data export file

In [None]:
# Load the header mapping sheet and turn its first data row into a dict:
# keys are the sheet's column names, values are the entries of row 0.
# Later cells use the keys as DataFrame column names and the values as
# human-readable labels.
df_header = pd.read_excel(file_header_mappings)
header_labels = df_header.loc[0, :].to_dict()

In [None]:
# Read the REDCap export. The first line of the file is skipped, so the second
# line supplies the column names; the first column becomes the row index.
# NOTE(review): presumably the skipped line holds the REDCap variable names and
# the second line the labels — confirm against the export file.
df = pd.read_csv(file_redcap_data, index_col=0, skiprows=1)

## Drop unnecessary columns

In [None]:
len(df)

In [None]:
df.head()

In [None]:
# Remove the "Age at ..." alternative-entry columns (each offered as a substitute
# for a date field). This happens before the columns are renamed, so the names
# must match the exported labels exactly, including trailing spaces.
df = df.drop(["Age at CBC prior to sampling (can answer instead of 'date of CBC prior to sampling')",
 'Age at Initial Diagnosis - can answer instead of Date of Initial Diagnosis  ',
 "Age at LDH collection (prior to sampling) - can answer instead of 'Date of LDH collection prior to sampling'",
 "Age at date of relapse/disease progression 1 (can answer instead of 'date of relapse/disease progression 1')",
 "Age at date of relapse/disease progression 2 (can answer instead of 'date of relapse/disease progression 2')",
 "Age at date of relapse/disease progression 3 (can answer instead of 'date of relapse/disease progression 3')",
 "Age at date of relapse/disease progression 4 (can answer instead of 'date of relapse/disease progression 4')",
 "Age at date of relapse/disease progression 5 (can answer instead of 'date of relapse/disease progression 5')",
 "Age at death (can answer instead of 'date of death')",
 "Age at large cell transformation (can answer instead of 'date of large cell transformation')",
 "Age at sampling used for genetic analysis (can answer instead of 'date of sampling for genetic analysis')",
 "Age during CBC with differential at time of sampling (can answer instead of 'date of CBC with differential at time of sampling')",
 'Current Age (At Time of Data Entry) - can answer instead of DOB'], axis=1)

In [None]:
# Swap out the column names (non-distinct labels) with the distinct REDCap variable names 
# The first mapping key is skipped; the remaining keys are assigned positionally,
# so their count must equal the number of columns left in df.
# NOTE(review): presumably the skipped key corresponds to the index column — confirm.
df.columns = list(header_labels.keys())[1:]

In [None]:
# drop rows where the subject_id is empty
df = df.dropna(axis=0, subset=['subject_id'])
print(df.shape)
df.head()

# Map IDs to genomics file

Subject ID entries in REDCap do not exactly match those in the genomics data file, and each group seems to enter their IDs a little differently. This function tries to figure out these differences. 

In [None]:
def subject_id_to_sample_id(x):
    """Translate a REDCap subject ID into the sample ID used in the genomics file.

    Each contributing group enters subject IDs in its own format; every known
    format is tried in turn and the first match decides the result.

    Parameters
    ----------
    x : object
        The REDCap ``subject_id`` value. Anything that is not a string
        (e.g. NaN for an empty cell) is treated as unmapped.

    Returns
    -------
    str
        The corresponding genomics sample ID, or ``''`` when the ID does not
        follow any known format.
    """
    if not isinstance(x, str):
        return ''

    # Wang (MD Anderson): "spz-#" or "spz #" --> SPZ-###__wang__SS
    # (the number is zero-padded to three digits)
    m = re.match(r'^spz[ -](\d{1,2})$', x)
    if m is not None:
        return f'SPZ-{int(m[1]):03d}__wang__SS'

    # Choi (Yale): CTCL# --> CTCL#__choi__SS
    m = re.match(r'^CTCL(\d{1,2})$', x)
    if m is not None:
        return f'CTCL{m[1]}__choi__SS'

    # Woollard (Kings College): WES-# --> WES_#__woollard__SS
    m = re.match(r'^WES-(\d{1,2})$', x)
    if m is not None:
        return f'WES_{m[1]}__woollard__SS'

    # Almeida (Leiden): SS_L# --> SS_L#__almeida__SS and SS_NU# --> SS_NU#__almeida__SS
    # The prefix is an alternation group: a character class such as [NU|L] would
    # match exactly one of the characters N, U, | or L, and so would reject
    # "SS_NU#" while accepting IDs like "SS_|#".
    m = re.match(r'^SS_((?:NU|L)\d{1,2})$', x)
    if m is not None:
        return f'SS_{m[1]}__almeida__SS'

    # Prasad (Barcelona): SS##__prasad__SS --> SS##__prasad__SS
    # note: subject ID in REDCap matches sample_id in genomics file exactly
    if re.match(r'^SS\d{2}__prasad__SS$', x) is not None:
        return x

    return ''
    
    
# Derive the genomics sample ID for every participant; IDs that match no known
# format become ''.
df['sample_id'] = df.subject_id.apply(subject_id_to_sample_id)
# Show both columns side by side so unmapped subject IDs can be spotted.
df[['subject_id', 'sample_id']]

In [None]:
df.shape

In [None]:
copy_df_first = df.copy()

# Data cleaning

## Remove rows and columns with high missingness

In [None]:
# Drop participants that are mostly empty

# axis = 0: along the rows (going down)
# axis = 1: along the columns (going right)
# x holds the number of missing fields for each participant (one count per row).
x = df.isnull().sum(axis=1)
x.hist()
# Keep only participants with fewer than 200 missing fields.
# NOTE(review): 200 is a hard-coded cutoff; a later cell evaluates 200/341, so it
# presumably relates to a 341-column table — confirm and consider a named constant.
df = df[x<200]

In [None]:
df.shape

In [None]:
# IDs present before the sparsity filter but absent afterwards, in original order.
retained_ids = df.index.to_list()
missing = [
    item
    for item in copy_df_first.index.to_list()
    if item not in retained_ids
]

In [None]:
200/341

In [None]:
len(copy_df_first.loc[missing, :].index.to_list())

In [None]:
copy_df_first.loc[missing, 'redcap_data_access_group']

In [None]:
# Drop columns that are completely empty

# x: number of missing values per column; a column is kept only if at least one
# row has a value (missing count < number of rows).
x = df.isnull().sum(axis=0)
df = df.loc[:, x < len(df)]
print(df.shape)


# Check out what other columns to drop
# Recompute the per-column missing counts on the reduced frame; this x is also
# read by the next cell.
x = df.isnull().sum(axis=0)
x.hist()
# List the columns with more than 45 missing values (hard-coded cutoff).
display(x[x > 45])

In [None]:
# Show columns with a lot of missing data
df.loc[:,x>45].iloc[:, 0:11] # don't show the treatment response columns

In [None]:
# Drop specified columns
df = df.drop(columns=['lesiongrade_type', 'exposure_description', 'rbc_2', 'hct_2', 'mcv_2', 'mchc_2', 'rdw_2', 'neutrophil_number_2', 'lymphocyte_number_2', 'monocytes_number_2', 'eosinophils_number_2', 'basophils_number_2', 'basophils_percentage_2'])
print(df.shape)

In [None]:
# Normalise free-text tbsa_diagnosis entries such as '90%' or '>80' to bare
# numeric strings so the column can be cast to float in the next step.
def _strip_tbsa_markup(value):
    """Return `value` without a leading '>' or a trailing '%'.

    Non-string values (NaN, numbers already parsed by pandas) are returned
    unchanged; testing `'%' in value` on them would raise a TypeError.
    """
    if not isinstance(value, str):
        return value
    return value.strip().lstrip('>').rstrip('%').strip()


# Assign the whole column at once. Writing through `df.tbsa_diagnosis[ind] = ...`
# is chained assignment, which pandas does not guarantee will reach `df`.
df['tbsa_diagnosis'] = df['tbsa_diagnosis'].map(_strip_tbsa_markup)

In [None]:
# Cast every tbsa_diagnosis entry to float (raises if an entry is not numeric).
df.tbsa_diagnosis = df.tbsa_diagnosis.apply(float)

In [None]:
df.shape

## Clean and correct data in certain columns

In [None]:
import math

In [None]:
# Values for which a one-hot indicator column is created further below.
tnmb_value_list = ['0', '1', '2', '3', '4', 'X']
# Columns that get one-hot encoded.
tnmb_list = ['t_sampling', 'n_sampling', 'm_sampling', 'b_sampling']

In [None]:
# Names of the one-hot columns, e.g. 't_sampling_0'; values vary slowest, columns fastest.
tnmb_label_list = [
    f'{column}_{value}'
    for value in tnmb_value_list
    for column in tnmb_list
]

In [None]:
tnmb_label_list

In [None]:
# result = df.n_sampling.apply(lambda x: math.nan if type(x) is float else tnmb_dict[x])
# result

In [None]:
# # Convert certain columns to numeric
# df.t_sampling = df.t_sampling.apply(lambda x: math.nan if type(x) is float else tnmb_dict[x])
# df.n_sampling = df.n_sampling.apply(lambda x: math.nan if type(x) is float else tnmb_dict[x])
# df.m_sampling = df.m_sampling.apply(lambda x: math.nan if type(x) is float else tnmb_dict[x])
# df.b_sampling = df.b_sampling.apply(lambda x: math.nan if type(x) is float else tnmb_dict[x])

# keep t, n, m, b as catagorical variables and apply one-hot encoding

for v in tnmb_value_list:
    for tnmb in tnmb_list:
        # Indicator is 1.0 where the source column equals v and 0.0 where it holds
        # a different value. Rows whose source value is missing stay NaN instead
        # of being encoded as 0: a whole-column identity test such as
        # `df[tnmb] is not math.nan` is always True and can never detect them.
        df[tnmb + '_' + v] = (df[tnmb] == v).astype(float).where(df[tnmb].notna())

In [None]:
df.head(5)

In [None]:
# race / ethnicity: replace empty with 'Unknown'
for demographic_col in ('race', 'ethnicity'):
    df[demographic_col] = df[demographic_col].fillna('Unknown')

# date of diagnosis
# patient ID 901-4 has erroneous date
# The correction is stored as an ISO string, like the other entries in this
# column: later cells parse date_of_diagnosis with
# datetime.strptime(..., '%Y-%m-%d'), which raises TypeError on a datetime object.
print(df.loc['901-4', 'date_of_diagnosis'])
df.loc['901-4', 'date_of_diagnosis'] = '2014-03-03'



# tbsa_diagnosis
# patient ID 900-9 has tbsa_diagnosis >80, so we changed to 80
# patient ID 848-1 has tbsa_diagnosis 90%, so we changed to 90
# Values below 1 are multiplied by 100, i.e. treated as fractions rather than percentages.
# NOTE(review): a genuine TBSA below 1% would be scaled too — confirm none exist.
x = df.tbsa_diagnosis < 1
df.loc[x, 'tbsa_diagnosis'] = df.tbsa_diagnosis[x] * 100

# tbsa_sampling
# patient ID 795-23 has erroneous value
# The raw value is printed for the record, then blanked.
print(df.loc['795-23', 'tbsa_sampling'])
df.loc['795-23', 'tbsa_sampling'] = np.nan
# errors='coerce' turns any remaining non-numeric entry into NaN without reporting it.
df.tbsa_sampling = pd.to_numeric(df.tbsa_sampling, errors='coerce')

# t, n, m, b columns contain values 'X' which were not part of Prof.Casey's data and thus new transformation method was developed.
# # Convert certain columns to numeric
# df.t_sampling = pd.to_numeric(df.t_sampling, errors='coerce', downcast='integer')
# df.n_sampling = pd.to_numeric(df.n_sampling, errors='coerce', downcast='integer')
# df.m_sampling = pd.to_numeric(df.m_sampling, errors='coerce', downcast='integer')
# df.b_sampling = pd.to_numeric(df.b_sampling, errors='coerce', downcast='integer')


# cd4cd8ratio
# for now, replace the entered cd4:cd8 with the calculated cd4:cd8 in divergent cases
# c: ratio recomputed from the absolute counts.
c = df.absolute_cd4 / df.absolute_cd8
# d: True where entered and computed ratios differ by more than 20% of their mean
# (symmetric relative difference). Rows where either ratio is NaN compare False
# and keep the entered value.
d = (2 * np.abs(df.cd4cd8ratio - c) / (df.cd4cd8ratio + c)) > 0.2
df.loc[d, 'cd4cd8ratio'] = c[d]

# absolute_cd4_cd26
# For patient 901-2 the absolute_cd4 value is copied into absolute_cd4_cd26.
df.loc['901-2', 'absolute_cd4_cd26'] = df.loc['901-2', 'absolute_cd4']

# cbc_date
# Two manual corrections, then the whole column is parsed to datetime;
# entries that cannot be parsed become NaT.
df.loc['795-11', 'cbc_date'] = datetime.fromisoformat('2012-10-12')
df.loc['795-36', 'cbc_date'] = datetime.fromisoformat('2014-01-15')
df.cbc_date = pd.to_datetime(df.cbc_date, errors='coerce')

# rbc: single manual correction for patient 901-4
df.loc['901-4', 'rbc'] = df.loc['901-4', 'rbc'] / 1000

# hct
# several values seem to be express percent in 0-1 range; for one person, hct in the 3.xx range, which is unreasonably low
hct_is_fraction = df.hct < 1
df.loc[hct_is_fraction, 'hct'] = df.loc[hct_is_fraction, 'hct'] * 100

# this is quite dubious, so we are leaving this out
# x = df.hct < 10
# df.loc[x, 'hct'] = df.loc[x, 'hct'] * 10

# Count columns: any value above the column's cutoff is divided by 1000.
# NOTE(review): presumably those entries were recorded on a 1000x larger scale
# than the rest of the column — confirm the units with the data owners.
rescale_cutoffs = {
    'wbc': 1000,
    'neutrophil_number': 1000,
    'lymphocyte_number': 1000,
    'monocytes_number': 100,
    'eosinophils_number': 100,
    'wbc_2': 1000,
}
for count_col, cutoff in rescale_cutoffs.items():
    above_cutoff = df[count_col] > cutoff
    df.loc[above_cutoff, count_col] = df.loc[above_cutoff, count_col] / 1000

##### The columns below have been dropped, so no longer need to do the below cleaning steps

# # rbc_2
# x = df.rbc_2 > 100
# df.loc[x, 'rbc_2'] = df.loc[x, 'rbc_2'] / 100

# # mcv_2
# df.loc['901-2', 'mcv_2'] = df.loc['901-2', 'mcv_2'] / 10

# # neutrophil number_2
# x = df.neutrophil_number_2 > 1000
# df.loc[x, 'neutrophil_number_2'] = df.loc[x, 'neutrophil_number_2'] / 1000

# # lymphocyte number_2
# x = df.lymphocyte_number_2 > 1000
# df.loc[x, 'lymphocyte_number_2'] = df.loc[x, 'lymphocyte_number_2'] / 1000

# # monocytes_number_2
# x = df.monocytes_number_2 > 100
# df.loc[x, 'monocytes_number_2'] = df.loc[x, 'monocytes_number_2'] / 1000

# # eosinophils_number_2
# x = df.eosinophils_number_2 > 100
# df.loc[x, 'eosinophils_number_2'] = df.loc[x, 'eosinophils_number_2'] / 1000

In [None]:
df.shape

# Data checks after cleaning

In [None]:
ax = df.hist(layout=[33, 4], figsize=[20, 120])
plt.show()

In [None]:
df

In [None]:
# reason: df.date_of_diagnosis['900-24'] is nan
df = df.drop(['900-24'], axis=0)

In [None]:
df.shape

In [None]:
# Store the corrected diagnosis date for patient 901-4 as an ISO string so that
# datetime.strptime(..., '%Y-%m-%d') in later cells can parse it. Written through
# df.loc: the chained form `df.date_of_diagnosis[...] = ...` assigns into an
# intermediate Series and is not guaranteed to update df.
df.loc['901-4', 'date_of_diagnosis'] = '2014-03-03'

# Feature transformations

In [None]:
# Calculate age at diagnosis
# patient ID 900-24 has date_of_diagnosis NaN. Because there is no entry for date of initial diagnosis, we dropped this patient and moved forward.
# patient ID 901-4 has date_of_diagnosis 2014-03-03 00:00:00
print(f'Empty DOB: {pd.isnull(df.date_of_birth).sum()}')
print(f'Empty date of dx: {pd.isnull(df.date_of_diagnosis).sum()}')

# Parse both date columns in one pass; entries that are missing or not in
# YYYY-MM-DD form become NaT, which leaves age_at_dx as NaN for that row.
dx_dates = pd.to_datetime(df.date_of_diagnosis, format='%Y-%m-%d', errors='coerce')
birth_dates = pd.to_datetime(df.date_of_birth, format='%Y-%m-%d', errors='coerce')

# Age in years (365.24 days per year). The column is always created, even when
# no row can be computed, so the histogram below cannot fail on a missing column.
df['age_at_dx'] = (dx_dates - birth_dates).dt.days / 365.24

# Report the rows that could not be computed, showing the raw values.
unparsed = dx_dates.isna() | birth_dates.isna()
if unparsed.any():
    print(f'Could not compute age_at_dx for {unparsed.sum()} row(s):')
    print(df.loc[unparsed, ['date_of_diagnosis', 'date_of_birth']])

df.age_at_dx.hist()

In [None]:
df.shape

In [None]:
# Simplify subtypes to MF vs SS
display(df.subtype_variant.value_counts(dropna=False))

def simplify_subtype(x):
    """Collapse a subtype label to 'Mycosis fungoides' vs 'Sezary syndrome'.

    Any string other than 'Sezary syndrome' becomes 'Mycosis fungoides';
    'Sezary syndrome' and non-string values (e.g. NaN) are returned unchanged.
    """
    if type(x) is str and x != 'Sezary syndrome':
        return 'Mycosis fungoides'
    return x

df['subtype_variant_simplified'] = df.subtype_variant.map(simplify_subtype)

In [None]:
df.shape

In [None]:
# Convert categoricals to numeric
# Questionable operation. Should keep them as categorical. Fixed below when generating data at diagnosis and data at sampling
# The survival analysis in this script only focuses on the CTCL stages. Stages were kept as categorical variables in the preprocessing steps for survival analysis.
# gender_n: 1 for 'Female', 0 for any other string, NaN for non-string (missing) values.
df['gender_n'] = df.gender.map(lambda x: int(x == 'Female') if type(x) is str else np.nan)
# subtype_variant_simplified_n: 1 for 'Sezary syndrome', 0 for any other string, NaN otherwise.
df['subtype_variant_simplified_n'] = df.subtype_variant_simplified.map(lambda x: int(x == 'Sezary syndrome') if type(x) is str else np.nan)
# Stage labels numbered 1-9 in the order listed; the key order matters because
# later cells take list(stage_to_numeric_dict.keys()).
stage_to_numeric_dict = dict(zip(
    ['IA', 'IB', 'IIA', 'IIB', 'IIIA', 'IIIB', 'IVA1', 'IVA2', 'IVB'],
    range(1, 10),
))


def stage_to_numeric(x):
    """Return the numeric code for stage label `x`, or `x` itself if it is not a known label."""
    if x in stage_to_numeric_dict:
        return stage_to_numeric_dict[x]
    return x
df['stage_at_diagnosis_n'] = df.stage_at_diagnosis.map(stage_to_numeric)
df['stage_at_sampling_n'] = df.stage_at_sampling.map(stage_to_numeric)

def lesion_type_to_numeric(x):
    """Return the numeric code for lesion type `x`, or `x` itself if it is not a known type."""
    lesion_codes = {'Patch': 1, 'Plaque': 2, 'Tumor': 3, 'Erythroderma': 4}
    try:
        return lesion_codes[x]
    except KeyError:
        return x
df['lesion_diagnosis_n'] = df.lesion_diagnosis.map(lesion_type_to_numeric)
df['lesion_sampling_n'] = df.predominant_sampling.map(lesion_type_to_numeric)

In [None]:
df.shape

# Analysis

In [None]:
df.alive.value_counts(dropna=False)

In [None]:
df.shape

In [None]:
# distribution of survival time since dx date among patients who died
x = ~pd.isnull(df.date_of_death)
deceased = df.loc[x, ['date_of_death', 'date_of_diagnosis']]
years_survived = [
    (datetime.strptime(death, '%Y-%m-%d') - datetime.strptime(dx, '%Y-%m-%d')).days / 365.24
    for death, dx in zip(deceased.date_of_death, deceased.date_of_diagnosis)
]
# Index the result by patient ID. `df.loc[x, ...] = <Series>` aligns on index
# labels, so a Series with a default 0..n-1 index would match no row and leave
# survival_years entirely NaN.
x2 = pd.Series(years_survived, index=deceased.index, dtype=float)
x2.hist(bins=20)
df.loc[x, 'survival_years'] = x2

# Treatment responses analysis

In [None]:
# How many participants had each treatment
for i in range(1, 24):
    c = f'prior_treatment___{i}'
    print(f'{header_labels[c]}: {(df[c] == "Checked").sum()}')

In [None]:
# Get treatment names
# Each label has the form "Treatment prior to sampling (choice=<name>)"; the
# capture group extracts <name>. Raw string so that `\(` reaches the regex
# engine without relying on Python's invalid-escape fallback.
r = re.compile(r'^Treatment prior to sampling \(choice=(.+)\)$')
treatment_names = [None] * 23
for i in range(1, 24):
    c_prior = f'prior_treatment___{i}'
    m = r.match(header_labels[c_prior])
    if m is None:
        # Fail with the offending label rather than an opaque
        # "'NoneType' object is not subscriptable".
        raise ValueError(f'Unexpected label for {c_prior}: {header_labels[c_prior]!r}')
    treatment_names[i-1] = m[1]
print(treatment_names)

In [None]:
# See if any participants are listed with the same treatment both before and after sampling
# Only treatments 2..16 are checked. For each one the column names of the three
# phases (prior / at sampling / after) are built from the treatment number.
for i in range(2, 17):
    c_prior = f'prior_treatment___{i}'
    c_prior_date = f'date_of_prior_treatment{i}'
    c_prior_response = f'prior_treatment_response{i}'
    c_prior_duration = f'duration_response_priortx{i}'
    c_sampling = f'treatment_sampling___{i}'
    c_after = f'after_sampling___{i}'
    c_after_date = f'date_of_after_treatment{i}'
    c_after_response = f'after_treatment_response{i}'
    c_after_duration = f'duration_response_aftertx{i}'
    
    # treatment_names is 0-based while treatment numbers start at 1.
    print(treatment_names[i-1])
    # s: True for participants with this treatment checked in more than one phase.
    s = ((df[c_prior] == 'Checked').astype(int) + (df[c_sampling] == 'Checked').astype(int) + (df[c_after] == 'Checked').astype(int)) > 1
    if s.sum() > 0:
        display(df.loc[s, [c_prior, c_prior_date, c_prior_response, c_prior_duration, c_sampling, 'date_sampling', c_after, c_after_date, c_after_response, c_after_duration]])
        

In [None]:
# NOTE(review): c_prior_date is whatever the loop in the previous cell left behind
# (its last iteration, treatment 16), so this expression only inspects that one
# treatment and depends on the previous cell having been run.
(~pd.isnull(df[c_prior_date]) & (df[c_prior_date] > df.date_sampling))

In [None]:
df.shape

In [None]:
# See if any participants have prior/after treatments with dates not matching the date regime
# For treatments 2..16: flag rows where a "prior" treatment date is later than the
# sampling date, or an "after" treatment date is earlier than it.
for i in range(2, 17):
    c_prior = f'prior_treatment___{i}'
    c_prior_date = f'date_of_prior_treatment{i}'
    c_prior_response = f'prior_treatment_response{i}'
    c_prior_duration = f'duration_response_priortx{i}'
    c_after = f'after_sampling___{i}'
    c_after_date = f'date_of_after_treatment{i}'
    c_after_response = f'after_treatment_response{i}'
    c_after_duration = f'duration_response_aftertx{i}'
        
    # Rows without a sampling date are never flagged.
    # NOTE(review): the date columns are compared as stored; if they are still
    # strings this is a lexicographic comparison, which is only chronological for
    # zero-padded YYYY-MM-DD values — confirm.
    s = ~pd.isnull(df.date_sampling) & ((~pd.isnull(df[c_prior_date]) & (df[c_prior_date] > df.date_sampling)) | (~pd.isnull(df[c_after_date]) & (df[c_after_date] < df.date_sampling)))
    if s.sum() > 0:
        print(treatment_names[i-1])
        display(df.loc[s, [c_prior, c_prior_date, c_prior_response, c_prior_duration, 'date_sampling', c_after, c_after_date, c_after_response, c_after_duration]])
        

In [None]:
# See if any participants are listed with the same treatment and date before and after sampling
# As written, the active condition does not compare dates (that variant is
# commented out below): it flags rows where the treatment is checked both prior
# and after and the two response durations differ.
for i in range(2, 17):
    c_prior = f'prior_treatment___{i}'
    c_prior_date = f'date_of_prior_treatment{i}'
    c_prior_response = f'prior_treatment_response{i}'
    c_prior_duration = f'duration_response_priortx{i}'
    c_after = f'after_sampling___{i}'
    c_after_date = f'date_of_after_treatment{i}'
    c_after_response = f'after_treatment_response{i}'
    c_after_duration = f'duration_response_aftertx{i}'
        
#     s = (df[c_prior] == 'Checked') & (df[c_after] == 'Checked') & (df[c_prior_date] == df[c_after_date]) & \
#             (df[c_prior_duration] != df[c_after_duration]) & ~(np.isnan(df[c_prior_duration]) & np.isnan(df[c_after_duration]))
    # NaN != NaN is True, so rows where both durations are missing are excluded
    # explicitly by the last term.
    s = (df[c_prior] == 'Checked') & (df[c_after] == 'Checked') & (df[c_prior_duration] != df[c_after_duration]) & \
            ~(np.isnan(df[c_prior_duration]) & np.isnan(df[c_after_duration]))
    if s.sum() > 0:
        print(treatment_names[i-1])
        display(df.loc[s, [c_prior, c_prior_date, c_prior_response, c_prior_duration, 'date_sampling', c_after, c_after_date, c_after_response, c_after_duration]])
        

In [None]:
# One output column per treatment 2..16, initialised to NaN.
max_duration_columns = [f'max_tx_duration_response{i}' for i in range(2, 17)]
df[max_duration_columns] = np.nan

# For every participant and treatment, record the longest response duration over
# the "prior" and "after" phases. A phase contributes only if the treatment is
# 'Checked': response 'No' counts as 0, response 'Yes' counts with its duration,
# and 'Yes' without a duration is ignored. No contribution leaves NaN.
for index, r in df.iterrows():
    for i in range(2, 17):
        phases = (
            (f'prior_treatment___{i}', f'prior_treatment_response{i}', f'duration_response_priortx{i}'),
            (f'after_sampling___{i}', f'after_treatment_response{i}', f'duration_response_aftertx{i}'),
        )

        durations = []
        for checked_col, response_col, duration_col in phases:
            if r[checked_col] != 'Checked':
                continue
            response = r[response_col]
            if response == 'No':
                durations.append(0)
            elif response == 'Yes' and not pd.isnull(r[duration_col]):
                durations.append(r[duration_col])

        df.loc[index, f'max_tx_duration_response{i}'] = np.max(durations) if durations else np.nan
                        

In [None]:
df.shape

In [None]:
df[max_duration_columns]

In [None]:
# One Series of non-missing durations per max-duration column (treatments 2..16).
non_nan_durations = [x[~pd.isnull(x)] for _, x in df[max_duration_columns].items()]
# Position i corresponds to treatment i+2, whose name sits at treatment_names[i+1]
# (treatment_names is 0-based).
xlabels = [treatment_names[i+1] + f' (n={len(non_nan_durations[i])})' for i in range(len(non_nan_durations))]

# Sort from longest to shortest duration
# Structured array so argsort can order by median first and mean as tie-breaker;
# np.flip turns the ascending order into descending.
non_nan_duration_sort_values = np.array([(np.median(x), np.mean(x)) for x in non_nan_durations], dtype=[('median', np.float64), ('mean', np.float64)])
argsort = np.flip(np.argsort(non_nan_duration_sort_values, order=('median', 'mean')))
non_nan_durations_sorted = [non_nan_durations[i] for i in argsort]
xlabels_sorted = [xlabels[i] for i in argsort]

plt.figure(figsize=[20, 8])
plt.boxplot(non_nan_durations_sorted, labels=xlabels_sorted)
plt.ylabel('Response duration (months)')
plt.xlabel('Treatment')
plt.xticks(rotation=90)
plt.show()

In [None]:
df.shape

In [None]:
# Kruskal-Wallis test
stats.kruskal(*non_nan_durations)

In [None]:
# Dunn test with Holm-Sidak correction
df_dunn = sp.posthoc_dunn(non_nan_durations_sorted, p_adjust='holm-sidak')
df_dunn.columns = xlabels_sorted
df_dunn.index = xlabels_sorted
df_dunn.style.applymap(lambda x: 'color: red' if x < 0.05 else None)

In [None]:
df.head(10)

In [None]:
# Write the preprocessed table without the row index.
# NOTE(review): the index comes from the first column of the REDCap export, so
# those IDs are not written to this file — confirm that is intended.
df.to_csv('preprocessed_121122.csv', index=None)

In [None]:
df.shape

In [None]:
copy_df = df.copy()
print(copy_df.shape)

In [None]:
df = copy_df.copy()

In [None]:
df.shape

# Survival

## stage at diagnosis

In [None]:
# Create dataframe for Cox regression
# Patients recorded as alive are censored at this fixed date.
# NOTE(review): hard-coded estimate of the data-entry date — confirm it matches the export.
estimated_data_entry_date = pd.Timestamp('2022-12-11')
df[['survival', 'event']] = np.nan
print(df.shape)
for ind in df.index:
    if df.alive[ind] == 'Yes':
        # Censored: follow-up in years from diagnosis to the estimated entry date.
        df.loc[ind, 'survival'] = (estimated_data_entry_date - datetime.strptime(df.date_of_diagnosis[ind], '%Y-%m-%d')).days/365.24
        df.loc[ind, 'event'] = 0 
#     elif df.alive[ind] == 'No' and not pd.isnull(df.date_of_death[ind]) and not pd.isnull(df.date_of_diagnosis[ind]):
    # Death observed: applies to any row whose `alive` is not 'Yes' (including
    # missing) as long as both dates are present.
    elif not pd.isnull(df.date_of_death[ind]) and not pd.isnull(df.date_of_diagnosis[ind]):
        df.loc[ind, 'survival'] = (datetime.strptime(df.date_of_death[ind], '%Y-%m-%d') - datetime.strptime(df.date_of_diagnosis[ind], '%Y-%m-%d')).days/365.24
        df.loc[ind, 'event'] = 1 
print(df.shape)   
# Rows that matched neither branch still have NaN survival/event and are removed here.
df.dropna(axis=0, how='any', subset=['survival', 'event'], inplace=True)
print(df.shape) 
df[['date_of_diagnosis', 'alive', 'date_of_death', 'survival', 'event']]

In [None]:
missing = [item for item in copy_df.index.to_list() if item not in df.index.to_list()]

In [None]:
len(missing)

In [None]:
missing

In [None]:
copy_df.loc[missing, ['date_of_diagnosis', 'date_of_death', 'alive', 'redcap_data_access_group']]

In [None]:
copy_df.loc[missing, 'redcap_data_access_group']

In [None]:
df.shape

In [None]:
stage_to_numeric_dict.keys()

In [None]:
# Check how many samples we have for stage_at_diagnosis

# -----------------------------------
# Bad coding... missing the ninth bar
# df.stage_at_diagnosis_n.hist(bins=np.arange(1,10), figsize=[12,6], width=0.95)
# plt.xticks(ticks=np.arange(1.5, 10.5, 1), labels=list(stage_to_numeric_dict.keys()))

df.stage_at_diagnosis_n.hist(bins=np.arange(1,11), figsize=[12,6], width=0.95)
plt.xticks(ticks=np.arange(1.5, 11.5, 1), labels=['IA', 'IB', 'IIA', 'IIB', 'IIIA', 'IIIB', 'IVA1', 'IVA2', 'IVB', ''])
plt.xlabel('Stage at diagnosis')
plt.ylabel('# Patients')
plt.show()

In [None]:
df.stage_at_diagnosis.value_counts()

In [None]:
df.shape

In [None]:
sum(df.stage_at_diagnosis.value_counts().to_list())

In [None]:
# get rid of entries where stage at diagnosis is missing
# notna() states the intent directly; negating a boolean mask with unary minus
# only works through a pandas special case (NumPy rejects `-` on boolean arrays).
df = df[df.stage_at_diagnosis.notna()]

In [None]:
df.shape

In [None]:
# all these entries have valid values under the column stage_at_sampling
df.stage_at_sampling.isna().sum()

In [None]:
# df_temp = df[['survival', 'event', 'stage_at_diagnosis_n']].copy().dropna()
# df_temp.rename(columns={'stage_at_diagnosis_n': 'stage_n'}, inplace=True)

In [None]:
# df_temp.stage_n.value_counts()

In [None]:
# Univariate Cox models of survival against stage at diagnosis, each plotted in
# one panel of a shared 3x4 grid.
plt.figure(figsize=[24, 12])

# Working frame: rows with survival, event and numeric stage all present.
df_temp = df[['survival', 'event', 'stage_at_diagnosis_n']].copy().dropna()
df_temp.rename(columns={'stage_at_diagnosis_n': 'stage_n'}, inplace=True)
stages = list(stage_to_numeric_dict.keys())

# Model 1: the stage code 1..9 used as a single numeric covariate.
print('===========\nstage numeric\n===========\n')  
cph = CoxPHFitter()
cph.fit(df_temp, duration_col='survival', event_col='event')
cph.print_summary()
# plt.subplots(figsize = (10, 6))
# cph.plot()

ax = plt.subplot(3, 4, 1)
cph.plot_partial_effects_on_outcome(covariates='stage_n', values=np.arange(1,10), cmap='coolwarm', ax=ax)

# Model 2: binary split — 0 for stage codes 1..3, 1 for every other code.
print('===========\nstage levels\n===========\n')  
def stage_levels(stage_numeric):
    if 1 <= stage_numeric <= 3:
        return 0
    else:
        return 1
df_temp['stage_level'] = df_temp.stage_n.map(stage_levels)
cph = CoxPHFitter()
cph.fit(df_temp[['survival', 'event', 'stage_level']], duration_col='survival', event_col='event')
cph.print_summary()

ax = plt.subplot(3, 4, 2)
cph.plot_partial_effects_on_outcome(covariates='stage_level', values=np.arange(2), cmap='coolwarm', ax=ax)

# Model 3: the same binary split expressed as a dummy column. 'stage_levels' is
# computed exactly like 'stage_level' above, and 'stage_late' is its "== 1"
# dummy, so this model uses the same predictor as model 2.
print('===========\nstage levels with dummy coding\n===========\n')  
df_temp['stage_levels'] = df_temp.stage_n.map(stage_levels)
df_temp_dummy = pd.get_dummies(df_temp.stage_levels, prefix='stage')
# df_temp['stage_early'] = df_temp_dummy['stage_0']
df_temp['stage_late'] = df_temp_dummy['stage_1']
# df_temp['stage_III+'] = df_temp_dummy['stage_2']
cph = CoxPHFitter()
cph.fit(df_temp[['survival', 'event', 'stage_late']], duration_col='survival', event_col='event')
cph.print_summary()

ax = plt.subplot(3, 4, 3)
cph.plot_partial_effects_on_outcome(covariates=['stage_late'], values=[0, 1], cmap='coolwarm', ax=ax)

# Not read again in this cell.
df_staged_cox = None

# there is no patient at stage IVB, which is the last stage, so we remove it.
# Threshold models: for i = 1..8 the column named stages[i] ('IB' .. 'IVB') is
# True where stage_n > i, i.e. the patient is at stage stages[i] or later.
# NOTE(review): the comment above says IVB is removed, yet range(1, 9) still
# includes i = 8 (column 'IVB', stage_n > 8). If no patient has stage code 9 that
# covariate is constant — confirm whether range(1, 8) was intended.
for i in range(1, 9):
    print(f'===========\n{i}\n===========\n')
    print(stages[i])
    df_temp[stages[i]] = df_temp.stage_n > i
    cph = CoxPHFitter()
    cph.fit(df_temp[['survival', 'event', stages[i]]], duration_col='survival', event_col='event')
    cph.print_summary()

    # Panels 4..11 of the grid.
    ax = plt.subplot(3, 4, i+3)
    cph.plot_partial_effects_on_outcome(covariates=stages[i], values=[0, 1], cmap='coolwarm', ax=ax)

# Overwrites the 'IVB' column written by the last loop iteration; stage_n > 9 is
# False for every code in 1..9.
df_temp[stages[8]] = df_temp.stage_n > 9
    
plt.show()

In [None]:
df_temp

## Prep table for regularized Cox regression to be performed in R

### extract and transform data at time of diagnosis

In [None]:
# Check if we should include individual malignancies
# Count, per malignancy checkbox column (1..26), how many participants have it checked.
cols_mal = [f'other_malignancies___{i}' for i in range(1, 27)]
(df[cols_mal] == 'Checked').sum()

# at most 2 cases per malignancy, so don't include the individual malignancies

In [None]:
# Check if we should include individual exposure types
# Count, per exposure checkbox column (1..7), how many participants have it checked.
cols = [f'exposure_type___{i}' for i in range(1, 8)]
(df[cols] == 'Checked').sum()

# at most 2 cases per exposure, except #7, which is "other", so don't include individual exposure types

In [None]:
df.tcr_clone.value_counts()

In [None]:
# straight extraction of some features
df_cox = df[['survival', 'event', 'age_at_dx', 'stage_at_diagnosis_n', 'lesion_diagnosis_n', 
             'tbsa_diagnosis', 'mswat_diagnosis', 'duration_before_dx', 
             ]].copy()
display(df_cox.head(5))

In [None]:
# gender
# 1 for 'Male', 0 for every other value.
# NOTE(review): a missing gender is therefore encoded as 0 as well, unlike
# gender_n, which keeps NaN for missing values — confirm this is intended.
df_cox['gender_male'] = df.gender.map(lambda x: 1 if x == 'Male' else 0)


# combine race/ethnicity
# race: group other, asian, AIAN, and unknown together because few samples
df.race.value_counts(dropna=False)
race_replace_dict = {
    'White': 'white',
    'Black or African American': 'black',
    'Asian': 'other',
    'American Indian or Alaska Native': 'other',
    'Other': 'other',
    'Unknown': 'other', 
    np.nan: 'other'
}
race_eth_simplified = df.race.replace(race_replace_dict)
# Hispanic ethnicity takes precedence over the race category.
race_eth_simplified[df.ethnicity == 'Hispanic or Latino or Spanish Origin'] = 'hispanic'
race_eth_dummies = pd.get_dummies(race_eth_simplified, prefix='race_eth')
# 'white' is the reference level, so only the other three indicators are kept.
# reindex (rather than plain column selection) yields an all-zero column when a
# category has no participants, instead of raising KeyError.
race_eth_columns = ['race_eth_black', 'race_eth_other', 'race_eth_hispanic']
df_cox = pd.concat([df_cox, race_eth_dummies.reindex(columns=race_eth_columns, fill_value=0)], axis=1)


# lymph node at diagnosis
def lymph_node_diagnosis(r):
    """Encode lymph-node status at diagnosis for one participant row.

    Returns 0 when `lymphnode_diagnosis` is missing or 'No'. When it is 'Yes',
    returns 1 if `lymphnode_diagnosis_2` is 'Yes' and 0 otherwise. Any other
    value of `lymphnode_diagnosis` yields None.
    """
    status = r.lymphnode_diagnosis
    if pd.isnull(status) or status == 'No':
        return 0
    if status == 'Yes':
        return int(r.lymphnode_diagnosis_2 == 'Yes')
    return None
    
df_cox['lymph_node_diagnosis'] = df.apply(lymph_node_diagnosis, axis=1)


# past biopsy
# The five past_biopsy2 checkbox columns start as NaN and are filled row by row.
df_cox[['past_biopsy2___1',  'past_biopsy2___2', 'past_biopsy2___3', 'past_biopsy2___4', 'past_biopsy2___5']] = np.nan
for index, r in df.iterrows():
    if pd.isnull(r.past_biopsy) or r.past_biopsy == 'No':
        # No (or unknown) past biopsy: the flags stay NaN (already NaN from the initialisation above).
        df_cox.loc[index, ['past_biopsy2___1',  'past_biopsy2___2', 'past_biopsy2___3', 'past_biopsy2___4', 'past_biopsy2___5']] = np.nan
    else:
        # Otherwise each flag is 1 if the checkbox is 'Checked' and 0 if not.
        df_cox.loc[index, ['past_biopsy2___1',  'past_biopsy2___2', 'past_biopsy2___3', 'past_biopsy2___4', 'past_biopsy2___5']] = \
            (df.loc[index, ['past_biopsy2___1',  'past_biopsy2___2', 'past_biopsy2___3', 'past_biopsy2___4', 'past_biopsy2___5']] == 'Checked').astype(np.int16)
        
        
# history
def simplify_yes_no_unknown(x):
    """Encode a Yes/No/Unknown answer as 1 / 0 / NaN.

    Missing values and 'Unknown' become NaN, 'Yes' becomes 1 and 'No' becomes 0.
    Any other value yields None.
    """
    if pd.isnull(x) or x == 'Unknown':
        return np.nan
    return {'Yes': 1, 'No': 0}.get(x)
history_columns = ['history_ad', 'history_psoriasis', 'history_rash', 'history_pruritus', 'failed_tx', 'dupixent', 'history_autoimmune', 'history_vitd', 
                   'history_ebv', 'history_cmv', 'history_staph', 'hx_malignancy', 'fhx_leukemia', 'hazardous_exposure', ]
df_cox[history_columns] = df[history_columns].applymap(simplify_yes_no_unknown)


In [None]:
display(df_cox)

In [None]:
df_cox.columns

In [None]:
stage_to_numeric_dict

In [None]:
inv_stage_to_numeric_dict = {v: k for k, v in stage_to_numeric_dict.items()}

In [None]:
inv_stage_to_numeric_dict

In [None]:
df_cox.columns

In [None]:
# convert stage at diagnosis to a categorical variable and apply one-hot encoding

# 91 valid records
#df_cox.stage_at_diagnosis_n.value_counts().to_list()
stages = ['IA', 'IB', 'IIA', 'IIB', 'IIIA', 'IIIB', 'IVA1', 'IVA2', 'IVB']
# Stage codes run 1..9 in the same order as the labels; one indicator column per label.
for stage_code, stage_label in enumerate(stages, start=1):
    df_cox[stage_label] = (df_cox.stage_at_diagnosis_n == stage_code).astype(float)



In [None]:
df_cox.loc[:, ['stage_at_diagnosis_n'] + stages]

In [None]:
# Replace stage_at_diagnosis_n with one-hot encoding
df_cox = df_cox.loc[:, ~df_cox.columns.isin(['stage_at_diagnosis_n'])]

In [None]:
# One indicator column per lesion type at diagnosis, named after the type.
lesion_diagnosis_dict = {'Patch': 1,'Plaque': 2,'Tumor': 3,'Erythroderma': 4}
lesion = list(lesion_diagnosis_dict.keys())
for lesion_code, lesion_label in enumerate(lesion, start=1):
    df_cox[lesion_label] = (df_cox.lesion_diagnosis_n == lesion_code).astype(float)

In [None]:
# Replace lesion_diagnosis_n with one-hot encoding
df_cox = df_cox.loc[:, ~df_cox.columns.isin(['lesion_diagnosis_n'])]

In [None]:
df_cox

In [None]:
df_cox.to_csv('cox_data_diagnosis.csv')

In [None]:
df_cox.columns

### add data from time of sampling

In [None]:
df.shape

In [None]:
copy_df_2 = df.copy()

In [None]:
df = df.dropna(subset=['date_sampling'])

In [None]:
df.shape

In [None]:
missing = [item for item in copy_df_2.index.to_list() if item not in df.index.to_list()]

In [None]:
copy_df_2.loc[missing, :]

In [None]:
# there are some abnormal values in date of sampling that we should exclude.
# there are 5 patients with no entry for date of sampling, so we excluded those and moved forward.
# d collects, per participant, the years between diagnosis and sampling. Rows whose
# dates cannot be parsed are printed and skipped; they are not removed from df here.
d = []
for ind in df.index:
    try:
        d.append((datetime.strptime(df.date_sampling[ind], '%Y-%m-%d') - datetime.strptime(df.date_of_diagnosis[ind], '%Y-%m-%d')).days/365.24)
    except Exception as e:
        print(e)
        print(df.date_sampling[ind])
        print(df.date_of_diagnosis[ind])
        
# d has a default 0..n-1 index and is used only for the histogram below.
d = pd.Series(d)
d.hist(bins=15)
plt.xlabel('years between dx and sample')
plt.show()

In [None]:
# straight extraction of some features
# These columns are added to df_cox alongside the diagnosis-time columns it already holds.
sampling_columns = ['stage_at_sampling_n', 'lesion_sampling_n', 'tbsa_sampling', 'mswat_sampling',
                    'cd4cd8ratio', 'absolute_cd4', 'absolute_cd8', 'ldh_sampling', 'wbc', 'hgb', 'pct', 
                    'segmented_neutrophils_percent', 'lymphocyte_percentage', 'monocytes_percentage', 'eosinophils_percentage'
                   ] + tnmb_label_list
# The assignment aligns on the row index. df has had the rows without
# date_sampling dropped, while df_cox was built before that, so those
# participants remain in df_cox with NaN in every sampling column.
# NOTE(review): confirm whether they should be removed from df_cox as well.
df_cox[sampling_columns] = df[sampling_columns]

In [None]:
df_cox.hgb['900-8']

In [None]:
df_cox

In [None]:
# lab values at time of sampling
labs_yes_no_unk = ['tumor_cell_cd30', 'large_cell_transformation']
df_cox[labs_yes_no_unk] = df[labs_yes_no_unk].applymap(simplify_yes_no_unknown)

# patient ID 900-7 has hgb 9.2mmol, so we removed the unit
# Written through df_cox.loc: the chained form `df_cox.hgb[...] = ...` assigns
# into an intermediate Series and is not guaranteed to update df_cox. The
# membership check keeps .loc from appending a new row if the patient is absent.
# NOTE(review): only the unit suffix is removed, the number is not converted —
# confirm 9.2 is on the same scale as the other hgb values.
if '900-7' in df_cox.index:
    df_cox.loc['900-7', 'hgb'] = '9.2'

# Treatments: just include a positive whenever any of prior/during/after sampling included
treatment_names_sanitized = ['tx_' + re.sub('[- ]', '_', treatment_names[i]) for i in range(0, 17)]
for i in range(2, 17):
    c_prior = f'prior_treatment___{i}'    
    c_sampling = f'treatment_sampling___{i}'
    c_after = f'after_sampling___{i}'    
    # treatment_names_sanitized is 0-based while treatment numbers start at 1.
    df_cox[treatment_names_sanitized[i-1]] = ((df[c_prior] == 'Checked') | (df[c_sampling] == 'Checked') | (df[c_after] == 'Checked')).astype(np.int16)    

In [None]:
display(df_cox.head(40))

In [None]:
# One indicator column per stage at sampling ('<stage>_at_sampling'), after which
# the numeric stage column is removed from df_cox.
stages = ['IA', 'IB', 'IIA', 'IIB', 'IIIA', 'IIIB', 'IVA1', 'IVA2', 'IVB']
for stage_code, stage_label in enumerate(stages, start=1):
    df_cox[f'{stage_label}_at_sampling'] = (df_cox.stage_at_sampling_n == stage_code).astype(float)
keep_columns = ~df_cox.columns.isin(['stage_at_sampling_n'])
df_cox = df_cox.loc[:, keep_columns]



In [None]:
# One indicator column per lesion type at sampling ('<type>_at_sampling').
lesion_sampling_dict = {'Patch': 1,'Plaque': 2,'Tumor': 3,'Erythroderma': 4}
lesion = list(lesion_sampling_dict.keys())
for lesion_code, lesion_label in enumerate(lesion, start=1):
    df_cox[f'{lesion_label}_at_sampling'] = (df_cox.lesion_sampling_n == lesion_code).astype(float)

In [None]:
df_cox = df_cox.loc[:, ~df_cox.columns.isin(['lesion_sampling_n'])]

In [None]:
df_cox.to_csv('cox_data_sampling.csv')

In [None]:
df_cox.columns.to_list()

### Add genetic mutations

In [None]:
# Get the common and significant mutations
df_s6 = pd.read_excel(io=file_genetics_data, sheet_name='Table S6', header=2)
# genes = df_s6.loc[df_s6['Number of cases with mutations'] >= 5, 'Gene Symbol'].tolist()
genes = df_s6['Gene Symbol'].tolist()
print(genes)

In [None]:
# Read and combine tables S2 and S3
def _mutation_rows(sheet_name):
    """Read one sheet of the genomics workbook and keep the rows whose gene is in `genes`."""
    table = pd.read_excel(io=file_genetics_data, sheet_name=sheet_name, header=2)
    in_gene_list = table['Gene symbol'].map(lambda symbol: symbol in genes)
    return table.loc[in_gene_list, ['Gene symbol', 'Chromosome', 'Sample ID']]


# Rows from Table S2 first, then Table S3, stacked vertically.
df_mutations = pd.concat([_mutation_rows('Table S2'), _mutation_rows('Table S3')], axis=0)

df_mutations.head()

In [None]:
# Patient x gene indicator matrix, initialised to 0 for every row of df_cox.
df_patient_mutations = pd.DataFrame(columns=genes, index=df_cox.index, data=0)
for patient_id, sample_id in df.sample_id.items():
    # Genes with at least one mutation row for this participant's sample ID.
    # NOTE(review): participants whose sample_id is '' (unmapped) match no rows and
    # are recorded as all-zero — indistinguishable from "no mutation found"; confirm.
    patient_genes = set(df_mutations.loc[df_mutations['Sample ID'] == sample_id, 'Gene symbol'].tolist())
    for gene in patient_genes:
        df_patient_mutations.loc[patient_id, gene] = 1
    
# Keep only genes mutated in more than one participant.
mut_freq = df_patient_mutations.sum()
df_patient_mutations = df_patient_mutations.loc[:, mut_freq > 1]
display(df_patient_mutations)

In [None]:
(mut_freq > 1).sum()

In [None]:
# Comma-separated list of the retained gene columns, each wrapped in single quotes.
s = [f"'{c}'" for c in df_patient_mutations.columns]
', '.join(s)

In [None]:
# Append the per-patient mutation indicators as extra columns.
df_cox = pd.concat([df_cox, df_patient_mutations], axis=1)

# patient ID 900-7 has hgb 9.2mmol, so we removed the unit
# Written through df_cox.loc: the chained form `df_cox.hgb[...] = ...` assigns
# into an intermediate Series and is not guaranteed to update df_cox. The
# membership check keeps .loc from appending a new row if the patient is absent.
if '900-7' in df_cox.index:
    df_cox.loc['900-7', 'hgb'] = '9.2'

df_cox.to_csv('cox_data_sampling_genes.csv')

In [None]:
df_cox.head(30)

In [None]:
df_cox.columns.to_list()

In [None]:
# Reload the exported table, replacing the in-memory df_cox.
# NOTE(review): without index_col the saved row index comes back as an ordinary
# first column rather than as the index — confirm this is intended.
df_cox = pd.read_csv('cox_data_sampling_genes.csv')

In [None]:
a = df_cox.columns.to_list()

In [None]:
"', '".join(a)