Equity in Post-HCT Survival Predictions

In [12]:
%%time
# Make sure lifelines is importable. If the import fails, install it from the
# local package files under /kaggle/input/pip-install-lifelines (pip is given
# file paths, not package names, so no package index is contacted).
try:
    from lifelines.utils import concordance_index
except ModuleNotFoundError:
    print('Installing lifelines...')
    # The packages installed before lifelines are presumably its dependencies
    # (TODO confirm); lifelines itself is installed last.
    !pip install -q /kaggle/input/pip-install-lifelines/autograd-1.7.0-py3-none-any.whl
    !pip install -q /kaggle/input/pip-install-lifelines/autograd-gamma-0.5.0.tar.gz
    !pip install -q /kaggle/input/pip-install-lifelines/interface_meta-1.3.0-py3-none-any.whl
    !pip install -q /kaggle/input/pip-install-lifelines/formulaic-1.0.2-py3-none-any.whl
    !pip install -q /kaggle/input/pip-install-lifelines/lifelines-0.30.0-py3-none-any.whl

CPU times: user 12 µs, sys: 3 µs, total: 15 µs
Wall time: 17.6 µs


In [13]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns # data plotting
import matplotlib.pyplot as plt # --
import matplotlib # --

from sklearn.impute import SimpleImputer # missing data imputation
from sklearn.impute import KNNImputer # --

import xgboost as xgb # model training
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score 
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import pandas.api.types
from lifelines.utils import concordance_index

from sklearn.model_selection import KFold
from tqdm import tqdm
from sklearn.base import clone

from colorama import Fore, Style
from lifelines import KaplanMeierFitter


In [None]:
# Read the three competition CSVs in one pass. Note that `sample` holds the
# data dictionary (data_dictionary.csv), not a sample submission.
train, test, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/equity-post-HCT-survival-predictions/train.csv',
        '/kaggle/input/equity-post-HCT-survival-predictions/test.csv',
        '/kaggle/input/equity-post-HCT-survival-predictions/data_dictionary.csv',
    )
)

# Preview the first five training rows.
train.head()

In [18]:
# Display the data dictionary loaded from data_dictionary.csv.
sample

Missing values in train dataset:
 ID                            0
dri_score                   154
psych_disturb              2062
cyto_score                 8068
diabetes                   2119
hla_match_c_high           4620
hla_high_res_8             5829
tbi_status                    0
arrhythmia                 2202
hla_low_res_6              3270
graft_type                    0
vent_hist                   259
renal_issue                1915
pulm_severe                2135
prim_disease_hct              0
hla_high_res_6             5284
cmv_status                  634
hla_high_res_10            7163
hla_match_dqb1_high        5199
tce_imm_match             11133
hla_nmdp_6                 4197
hla_match_c_low            2800
rituximab                  2148
hla_match_drb1_low         2643
hla_match_dqb1_low         4194
prod_type                     0
cyto_score_detail         11923
conditioning_intensity     4789
ethnicity                   587
year_hct                      0
obesit

TypeError: could not convert string to float: 'N/A - non-malignant indication'

In [None]:
# Numeric codes for the categorical columns: column name -> {raw label -> code}.
# Labels mapped to np.nan are encoded as missing. The three longest encodings
# use consecutive integers, so their labels are listed in code order and
# numbered with enumerate() from the stated starting code.
feature_mappings = {
    'dri_score': {'Intermediate': 1, 'High': 2, 'N/A - non-malignant indication': np.nan},
    'psych_disturb': {'Yes': 1, 'No': -1, 'Not done': np.nan},
    'cyto_score': {'Intermediate': 1, 'Favorable': 2, 'Poor': 3, 'TBD': -1},
    'diabetes': {'Yes': 1, 'No': -1, 'Not done': np.nan},
    'tbi_status': {
        'No TBI': 0,
        'TBI + Cy +- Other': 1,
        'TBI +- Other, < 200cGy': 2,
        'TBI, >= 200cGy': 3,
    },
    'arrhythmia': {'Yes': 1, 'No': -1, 'Not done': np.nan},
    'graft_type': {'Peripheral blood': 1, 'Bone marrow': 2},
    'vent_hist': {'Yes': 1, 'No': 0},
    'renal_issue': {'Yes': 1, 'No': -1, 'Not done': np.nan},
    'pulm_severe': {'Yes': 1, 'No': -1, 'Not done': np.nan},
    'cmv_status': {'+/-': 1, '+/+': 2, '-/-': 3, '-/+': 4},
    'tce_imm_match': {
        'P/P': 1, 'G/G': 2, 'H/H': 3, 'G/B': 4, 'H/B': 5, 'P/H': 6, 'P/G': 7,
    },
    'rituximab': {'Yes': 1, 'No': 0},
    'prod_type': {'PB': 1, 'BM': 2},
    'conditioning_intensity': {
        'RIC': 1, 'NMA': 2, 'MAC': 3, 'TBD': -1, 'No drugs reported': 0,
    },
    'ethnicity': {'Not Hispanic or Latino': 1, 'Hispanic or Latino': 2},
    'obesity': {'Yes': 1, 'No': -1, 'Not done': np.nan},
    'mrd_hct': {'Negative': 1, 'Positive': 2},
    'in_vivo_tcd': {'Yes': 1, 'No': 0},
    'tce_match': {'Permissive': 1, 'Fully matched': 2, 'GvH non-permissive': 3},
    # Unlike the other Yes/No/Not done columns, 'No' is coded 0 here.
    'hepatic_severe': {'Yes': 1, 'No': 0, 'Not done': np.nan},
    'prior_tumor': {'Yes': 1, 'No': -1, 'Not done': np.nan},
    'peptic_ulcer': {'Yes': 1, 'No': -1, 'Not done': np.nan},
    'rheum_issue': {'Yes': 1, 'No': -1, 'Not done': np.nan},
    'sex_match': {'M-M': 1, 'F-F': 2, 'F-M': 3, 'M-F': 4},
    'hepatic_mild': {'Yes': 1, 'No': -1, 'Not done': np.nan},
    'tce_div_match': {
        'Permissive mismatched': 1,
        'Bi-directional non-permissive': 2,
        'Other': 3,
    },
    'donor_related': {'Unrelated': 1, 'Related': 2, 'Multiple donor (non-UCB)': 3},
    'cardiac': {'Yes': 1, 'No': -1, 'Not done': np.nan},
    'pulm_moderate': {'Yes': 1, 'No': -1, 'Not done': np.nan},
    # Codes -5 .. 12
    'prim_disease_hct': {
        label: code
        for code, label in enumerate(
            [
                'ALL', 'MPN', 'IPA', 'AML', 'MDS', 'Other acute leukemia',
                'AI', 'SAA', 'IEA', 'NHL', 'PCD', 'IIS', 'HIS',
                'Other leukemia', 'Solid tumor', 'IMD', 'HD', 'CML',
            ],
            start=-5,
        )
    },
    'cyto_score_detail': {
        'Intermediate': -4, 'TBD': -3, 'Favorable': -2, 'Poor': -1, 'Not tested': np.nan,
    },
    # Codes -4 .. 12
    'gvhd_proph': {
        label: code
        for code, label in enumerate(
            [
                'FK+ MMF +- others',
                'Parent Q = yes, but no agent',
                'FK+ MTX +- others(not MMF)',
                'FKalone',
                'Cyclophosphamide alone',
                'CSA + MMF +- others(not FK)',
                'TDEPLETION +- other',
                'Cyclophosphamide +- others',
                'No GvHD Prophylaxis',
                'Other GVHD Prophylaxis',
                'CSA alone',
                'TDEPLETION alone',
                'CDselect alone',
                'CSA + MTX +- others(not MMF,FK)',
                'FK+- others(not MMF,MTX)',
                'CDselect +- other',
                'CSA +- others(not FK,MMF,MTX)',
            ],
            start=-4,
        )
    },
    # Codes -2 .. 3
    'race_group': {
        label: code
        for code, label in enumerate(
            [
                'White',
                'Black or African-American',
                'Native Hawaiian or other Pacific Islander',
                'Asian',
                'American Indian or Alaska Native',
                'More than one race',
            ],
            start=-2,
        )
    },
    'melphalan_dose': {'N/A, Mel not given': -1, 'MEL': 1},
}

In [None]:
class ParticipantVisibleError(Exception):
    """Raised by `score` when a submission column is not of a numeric dtype."""
    pass


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """
    Concordance index computed separately per `race_group`, summarised as the
    mean of the per-group values minus their standard deviation (square root
    of the population variance).

    Both frames are modified in place: `row_id_column_name` is deleted from
    each, so pass copies if the originals are still needed. The frames are
    joined column-wise on their index, so they must share the same index.

    Raises ParticipantVisibleError if any remaining submission column is not
    numeric.

    >>> import pandas as pd
    >>> row_id_column_name = "id"
    >>> y_pred = {'prediction': {0: 1.0, 1: 0.0, 2: 1.0}}
    >>> y_pred = pd.DataFrame(y_pred)
    >>> y_pred.insert(0, row_id_column_name, range(len(y_pred)))
    >>> y_true = { 'efs': {0: 1.0, 1: 0.0, 2: 0.0}, 'efs_time': {0: 25.1234,1: 250.1234,2: 2500.1234}, 'race_group': {0: 'race_group_1', 1: 'race_group_1', 2: 'race_group_1'}}
    >>> y_true = pd.DataFrame(y_true)
    >>> y_true.insert(0, row_id_column_name, range(len(y_true)))
    >>> score(y_true.copy(), y_pred.copy(), row_id_column_name)
    0.75
    """
    # The id column is only a row key; drop it from both inputs (in place).
    del solution[row_id_column_name]
    del submission[row_id_column_name]

    event_label = 'efs'
    interval_label = 'efs_time'
    prediction_label = 'prediction'

    # Reject the submission at the first non-numeric column.
    for name, values in submission.items():
        if not pandas.api.types.is_numeric_dtype(values):
            raise ParticipantVisibleError(f'Submission column {name} must be a number')

    # Column-wise join on the shared index; reset_index makes labels positional,
    # which is what allows the group labels below to be used with .iloc.
    merged_df = pd.concat([solution, submission], axis=1)
    merged_df.reset_index(inplace=True)
    rows_by_race = dict(merged_df.groupby(['race_group']).groups)

    metric_list = []
    for rows in rows_by_race.values():
        race_rows = merged_df.iloc[sorted(rows)]
        # Predictions are negated before being handed to concordance_index.
        metric_list.append(
            concordance_index(
                race_rows[interval_label],
                -race_rows[prediction_label],
                race_rows[event_label],
            )
        )

    mean_c_index = np.mean(metric_list)
    spread = np.sqrt(np.var(metric_list))
    return float(mean_c_index - spread)

In [None]:
# Replace the raw category labels in train and test with the numeric codes from
# feature_mappings. Series.map turns any label missing from a mapping into NaN.
#
# Each frame is checked on its own, and columns that are already numeric are
# skipped: re-running this cell would otherwise push the encoded numbers through
# the label->code dict again and turn the whole column into NaN.
for column, mapping in feature_mappings.items():
    for frame in (train, test):
        if column not in frame.columns:
            continue
        if pd.api.types.is_numeric_dtype(frame[column]):
            continue  # already encoded (or never a label column)
        frame[column] = frame[column].map(mapping)

test.head()

In [None]:
# Missing values per training column (isnull is an alias of isna).
nan_count_per_column = train.isnull().sum()
print("NaN count per column:\n", nan_count_per_column)

In [None]:
# Pairwise correlations between all training columns, drawn as a heatmap.
correlation_matrix = train.corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=False,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Heatmap')
plt.show()


In [None]:
# Keep the rows of the correlation matrix whose correlation with BOTH targets
# lies strictly inside (-thresh, thresh).
thresh = 0.01
weak_vs_efs = correlation_matrix['efs'].abs() < thresh
weak_vs_efs_time = correlation_matrix['efs_time'].abs() < thresh
low_correlation_features = correlation_matrix[weak_vs_efs & weak_vs_efs_time].index

# Print the features
print("Features with low correlation to both 'efs' and 'efs_time':")
print(low_correlation_features.tolist())

In [None]:
# Names of the variables the data dictionary marks as 'Categorical'.
is_categorical = sample['type'] == 'Categorical'
categorical_features = sample.loc[is_categorical, 'variable']
categorical_features


In [None]:
# Fill missing categorical codes with the most frequent value of each column.
#
# The imputer is fitted on train and then applied to test as well, so the model
# sees identically prepared inputs at training and prediction time (test was
# previously left un-imputed). Columns the data dictionary lists as categorical
# but that test does not contain are imputed on train only.
imputer = SimpleImputer(strategy='most_frequent')

train_cat_cols = [col for col in categorical_features if col in train.columns]
shared_cat_cols = [col for col in train_cat_cols if col in test.columns]
train_only_cat_cols = [col for col in train_cat_cols if col not in test.columns]

if shared_cat_cols:
    # index=... keeps the rebuilt frames aligned with the originals on assignment
    train[shared_cat_cols] = pd.DataFrame(
        imputer.fit_transform(train[shared_cat_cols]),
        columns=shared_cat_cols,
        index=train.index,
    )
    test[shared_cat_cols] = pd.DataFrame(
        imputer.transform(test[shared_cat_cols]),
        columns=shared_cat_cols,
        index=test.index,
    )

if train_only_cat_cols:
    train[train_only_cat_cols] = pd.DataFrame(
        SimpleImputer(strategy='most_frequent').fit_transform(train[train_only_cat_cols]),
        columns=train_only_cat_cols,
        index=train.index,
    )

In [None]:
def transform_survival_probability(df, time_col='efs_time', event_col='efs'):
    """Kaplan-Meier survival estimate evaluated at each row's own time.

    Fits a KaplanMeierFitter on `df[time_col]` with `df[event_col]` as the
    second (event) argument, then evaluates the fitted survival function at
    every value of `df[time_col]`.

    Returns the `.values` of that evaluation, one entry per row of `df`.
    """
    durations = df[time_col]
    km_fitter = KaplanMeierFitter()
    km_fitter.fit(durations, df[event_col])
    return km_fitter.survival_function_at_times(durations).values
# Regression target: the Kaplan-Meier survival estimate at each row's efs_time
# (the function's default columns are 'efs_time' and 'efs').
train["y"] = transform_survival_probability(train)
train.head()


In [None]:
def _score_predictions(rows, predictions):
    """Score `predictions` for the given rows of the training frame via `score`."""
    y_true = rows[["ID", "efs", "efs_time", "race_group"]].copy()
    y_pred = rows[["ID"]].copy()
    y_pred["prediction"] = predictions
    # `score` deletes the ID column from both frames; both are fresh copies here.
    return score(y_true, y_pred, "ID")


def train_wrapper(model_class, train, show_training=True, n_splits=10, seed=42):
    """Cross-validate a regressor on the 'y' target and report `score` per fold.

    Parameters
    ----------
    model_class : estimator
        Despite the name, an unfitted estimator *instance*; it is cloned with
        sklearn's `clone` for every fold and never fitted itself.
    train : pd.DataFrame
        Must contain 'ID', 'race_group', 'efs', 'efs_time' and 'y'. Every
        column except 'efs', 'efs_time' and 'y' is used as a feature (so 'ID'
        and 'race_group' are features too).
    show_training : bool
        When True, print the training and validation score of each fold.
    n_splits : int
        Number of shuffled KFold splits.
    seed : int
        `random_state` passed to KFold.

    Returns
    -------
    The model fitted on the LAST fold's training split (not on all of `train`).
    """
    y = train['y']
    X = train.drop(columns=['efs', 'efs_time', 'y'])

    train_scores = []
    test_scores = []
    model = None

    KF = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    for fold, (train_idx, test_idx) in enumerate(tqdm(KF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train = y.iloc[train_idx]

        model = clone(model_class)
        model.fit(X_train, y_train)

        train_score = _score_predictions(train.iloc[train_idx], model.predict(X_train))
        val_score = _score_predictions(train.iloc[test_idx], model.predict(X_val))
        train_scores.append(train_score)
        test_scores.append(val_score)

        if show_training:
            # Report each score from its own variable; both lines used to print
            # the validation score.
            print(f"Training Score = {train_score}")
            print(f"Validation Score = {val_score}")

    print(f"----> || Mean C-index training score :: {Fore.CYAN}{Style.BRIGHT} {sum(train_scores)/len(train_scores):.3f}{Style.RESET_ALL}")
    print(f"----> || Mean C-index validation score :: {Fore.CYAN}{Style.BRIGHT} {sum(test_scores)/len(test_scores):.3f}{Style.RESET_ALL}")

    return model

In [None]:
# XGBoost regressor hyper-parameters.
xgb_params = dict(
    max_depth=3,
    colsample_bytree=0.5,
    n_estimators=2000,
    learning_rate=0.02,
    min_child_weight=80,
    subsample=0.8,
)

# Cross-validate; `fitted` is whatever model train_wrapper returns.
xgb_model = xgb.XGBRegressor(**xgb_params)
fitted = train_wrapper(xgb_model, train, show_training=True)

In [None]:
# Predict on the test set with the model returned by train_wrapper. `test` is
# passed whole, so it must carry the same feature columns the model was fitted
# on (every train column except 'efs', 'efs_time' and 'y', i.e. including 'ID').
preds = fitted.predict(test)

sub = pd.DataFrame({
    'ID': test['ID'],
    'prediction': preds
})
# index=False: otherwise to_csv writes the row index as an extra unnamed first
# column, and the file would no longer contain just ID and prediction.
sub.to_csv('submission.csv', index=False)
sub
