## Install lifelines package

In [1]:
# Install lifelines from local wheel/sdist files under /kaggle/input.
# lifelines itself is installed last; the four packages before it are
# presumably its dependencies (TODO confirm against lifelines' requirements).
!pip install /kaggle/input/cibmtr_lifelines/autograd-1.7.0-py3-none-any.whl
!pip install /kaggle/input/cibmtr_lifelines/autograd-gamma-0.5.0.tar.gz
!pip install /kaggle/input/cibmtr_lifelines/interface_meta-1.3.0-py3-none-any.whl
!pip install /kaggle/input/cibmtr_lifelines/formulaic-1.1.1-py3-none-any.whl
!pip install /kaggle/input/cibmtr_lifelines/lifelines-0.30.0-py3-none-any.whl

Processing /kaggle/input/cibmtr_lifelines/autograd-1.7.0-py3-none-any.whl
autograd is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.
Processing /kaggle/input/cibmtr_lifelines/autograd-gamma-0.5.0.tar.gz
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: autograd-gamma
  Building wheel for autograd-gamma (setup.py) ... [?25l[?25hdone
  Created wheel for autograd-gamma: filename=autograd_gamma-0.5.0-py3-none-any.whl size=4031 sha256=2e01f7c4867887732d2041fda002b04d07824652bdcf47e69bf0f5ce821940e3
  Stored in directory: /root/.cache/pip/wheels/97/c4/e9/b8d72881091567d3cfbb1734056aa8ad5731785576546d141c
Successfully built autograd-gamma
Installing collected packages: autograd-gamma
Successfully installed autograd-gamma-0.5.0
Processing /kaggle/input/cibmtr_lifelines/interface_meta-1.3.0-py3-none-any.whl
Installing collected packages: interface-meta
Successfu

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch, math, time
import polars as pl
import glob, pathlib, json, warnings, lifelines
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error,r2_score
import missingno as msno
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, ClassifierMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearnex import patch_sklearn
from lifelines import CoxPHFitter, KaplanMeierFitter, NelsonAalenFitter, WeibullFitter,ExponentialFitter, LogNormalFitter, NelsonAalenFitter
from sklearn.pipeline import make_pipeline, Pipeline
import lightgbm as lgb
import xgboost as xgb
import logging, os, tqdm
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold
from metric import score as score_f
from sklearn.ensemble import VotingRegressor, RandomForestRegressor


# Print the full path of every file under the competition input directory.
for dirname, _, filenames in os.walk('/kaggle/input/equity-post-HCT-survival-predictions'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
# Raise pandas display limits so wide/long frames are not truncated.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Only let sklearnex log records of level WARNING or higher through.
logging.getLogger('sklearnex').setLevel(logging.WARNING)
# NOTE(review): the sklearn classes used in this notebook are imported above,
# before this call; confirm against the sklearnex docs whether patching after
# those imports has any effect.
patch_sklearn(verbose=False)

/kaggle/input/equity-post-HCT-survival-predictions/sample_submission.csv
/kaggle/input/equity-post-HCT-survival-predictions/data_dictionary.csv
/kaggle/input/equity-post-HCT-survival-predictions/train.csv
/kaggle/input/equity-post-HCT-survival-predictions/test.csv


In [3]:
# Load the competition tables: the data dictionary, then the test and train sets.
data_dic = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/data_dictionary.csv")
test = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/test.csv")
train = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/train.csv")

In [4]:
# Map every variable listed in the data dictionary to its declared type.
# Rows are unpacked positionally, so the dictionary must have exactly four
# columns with the variable name first and its type third.
feature_types = {
    variable: var_type
    for _, (variable, _, var_type, _) in data_dic.iterrows()
}

# Feature names grouped by declared type; `efs` and `efs_time` are left out.
categorical_columns = [
    name for name, kind in feature_types.items()
    if kind == "Categorical" and name != "efs"
]
numerical_columns = [
    name for name, kind in feature_types.items()
    if kind == "Numerical" and name != "efs_time"
]
seed = 11


def _top_k(scores, k):
    """Return the first `k` (key, value) items of `scores`, sorted by value then key, descending."""
    ranked = sorted(scores.items(), key=lambda item: (item[-1], item[0]), reverse=True)
    return ranked[:k]


with open("/kaggle/input/cibmtr/features_information_naf_v1.json", mode="r") as file:
    best_scores = json.load(file)

with open("/kaggle/input/cibmtr/features_information_naf.json", mode="r") as file:
    best_scores_naf = json.load(file)

# Best-scoring feature combinations from each file (keys are later split on "/").
top_k_comb_naf = _top_k(best_scores_naf, 10)
top_k_comb = _top_k(best_scores, 5)

In [5]:
class NANCategorical(BaseEstimator, TransformerMixin):
    """Fill missing values in the given columns with a placeholder value.

    Parameters
    ----------
    columns : list
        Names of the columns whose missing values are filled.
    key : object, default "UNK"
        Value written in place of missing entries.
    """

    def __init__(self, columns, key="UNK"):
        super().__init__()
        self.columns = columns
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned; present so the class can be used as a pipeline step."""
        return self

    def transform(self, X):
        """Return a copy of `X` with missing values in `self.columns` replaced by `self.key`."""
        # Work on a copy: writing into `X` would silently modify the caller's
        # frame (e.g. the `test` frame handed to `pipeline.predict`).
        X_filled = X.copy()
        X_filled[self.columns] = X_filled[self.columns].fillna(self.key)
        return X_filled

class NANNumerical(BaseEstimator, TransformerMixin):
    """Fill missing values in the given columns with a sentinel value.

    Parameters
    ----------
    columns : list
        Names of the columns whose missing values are filled.
    key : object, default -1
        Value written in place of missing entries.
    """

    def __init__(self, columns, key=-1):
        super().__init__()
        self.columns = columns
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned; present so the class can be used as a pipeline step."""
        return self

    def transform(self, X):
        """Return a copy of `X` with missing values in `self.columns` replaced by `self.key`."""
        # Work on a copy: writing into `X` would silently modify the caller's
        # frame (e.g. the `test` frame handed to `pipeline.predict`).
        X_filled = X.copy()
        X_filled[self.columns] = X_filled[self.columns].fillna(self.key)
        return X_filled

class NUMScaler(BaseEstimator, TransformerMixin):
    """Apply a scaler to a subset of columns, leaving the other columns as they are.

    Parameters
    ----------
    columns : list
        Names of the columns passed to the scaler.
    scaler : object, default StandardScaler()
        Object exposing `fit` and `transform`. Note that the default instance
        is created once, when the class is defined, and is therefore shared by
        every `NUMScaler` that does not receive its own scaler.
    """

    def __init__(self, columns, scaler=StandardScaler()):
        super().__init__()
        self.columns = columns
        self.scaler = scaler

    def fit(self, X, y=None):
        """Fit `self.scaler` on `X[self.columns]` and return `self`."""
        self.scaler.fit(X[self.columns])
        return self

    def transform(self, X):
        """Return a copy of `X` whose `self.columns` hold the scaled values."""
        # Work on a copy so the caller's frame is never overwritten with
        # scaled values.
        X_scaled = X.copy()
        X_scaled[self.columns] = self.scaler.transform(X_scaled[self.columns])
        return X_scaled

def make_score(X, predictions):
    """Evaluate `predictions` for the rows of `X` with the imported `score_f` metric.

    `X` must contain the columns "ID", "efs", "efs_time" and "race_group".
    `predictions` is stored in a "prediction" column next to the "ID" column of `X`.
    Returns whatever `score_f` returns.
    """
    id_column = "ID"
    solution = X[[id_column, "efs", "efs_time", "race_group"]].copy()
    submission = X[[id_column]].copy()
    submission["prediction"] = predictions
    # score_f receives its own copies of both frames.
    return score_f(solution.copy(), submission.copy(), id_column)

class FeatureEncoder(BaseEstimator, TransformerMixin):
    """Integer-encode categorical columns with encodings learned in `fit`.

    One clone of `encoder` is fitted per column during `fit`, and `transform`
    reuses those fitted encoders. Encoding therefore stays consistent between
    the data used for fitting and any data transformed later; refitting the
    encoder inside `transform` could assign different integers to the same
    category on different frames.

    Parameters
    ----------
    columns : list
        Names of the columns to encode.
    encoder : object, default LabelEncoder()
        Template encoder. It must expose `fit` and, once fitted, `classes_`
        (as `LabelEncoder` does). The template itself is never fitted.
    """

    def __init__(self, columns, encoder=LabelEncoder()):
        self.columns = columns
        self.encoder = encoder

    def fit(self, X, y=None):
        """Fit one independent encoder per column of `self.columns` and return `self`."""
        # Function-scope import: `clone` is not among the names imported at
        # the top of the file, although sklearn.base already is a dependency.
        from sklearn.base import clone

        self.encoders_ = {
            col: clone(self.encoder).fit(X[col]) for col in self.columns
        }
        return self

    def transform(self, X):
        """Return a copy of `X` with `self.columns` replaced by integer codes.

        Labels that were not seen during `fit` are encoded as -1.
        """
        if not hasattr(self, "encoders_"):
            # Backward compatibility: `fit` used to be a no-op, so transform()
            # could be called on an unfitted instance. In that case the first
            # frame seen defines the encoding.
            self.fit(X)

        X_transformed = X.copy()
        for col in self.columns:
            # Position in `classes_` is the code the fitted encoder assigns.
            codes = {
                label: code
                for code, label in enumerate(self.encoders_[col].classes_)
            }
            X_transformed[col] = (
                X[col].map(lambda value: codes.get(value, -1)).astype(int)
            )
        return X_transformed

class SurvivalEstimator(BaseEstimator, TransformerMixin):
    """Turn model predictions of a survival-derived target into one extra feature column.

    Parameters
    ----------
    columns : list
        Feature columns fed to the model.
    survival_function : callable
        Called as `survival_function(X, time_col='efs_time', event_col='efs')`
        to build the training target from the fitting frame.
    name : str
        Name of the single output column produced by `transform`.
    """

    def __init__(self, columns, survival_function, name):
        # LightGBM's option is called 'force_col_wise' (the key used for the
        # final model's params in this notebook); a 'model__'-prefixed key is
        # not a LightGBM parameter, so the setting was never applied.
        self.params = {'boosting_type': 'gbdt', 'colsample_bytree': 0.75, 'force_col_wise': True, 'learning_rate': 0.65,
                       'max_depth': 3, 'metric': 'rmse', 'min_child_samples': 64, 'min_split_gain': 0.03, 'n_estimators': 300,
                       'num_leaves': 4, 'verbose': -1}
        # One model per seed, combined by make_lgb.
        self.model = make_lgb(params=self.params, seeds=[234, 262, 342, 408])
        self.columns = columns
        self.name = name
        self.survival_function = survival_function

    def fit(self, X, y=None):
        """Build the target from `X` with `survival_function` and fit the model on `X[columns]`.

        `y` is ignored; `X` must contain the 'efs_time' and 'efs' columns.
        """
        y_kmf = self.survival_function(X, time_col='efs_time', event_col='efs')
        self.model.fit(X[self.columns], y_kmf)
        return self

    def transform(self, X):
        """Return a one-column DataFrame (named `self.name`, indexed like `X`) of model predictions."""
        return pd.DataFrame(self.model.predict(X[self.columns]), columns=[self.name], index=X.index)

class ClassEstimator(BaseEstimator, TransformerMixin):
    """Turn model predictions of an existing label column into one extra feature column.

    Parameters
    ----------
    columns : list
        Feature columns fed to the model.
    label : str
        Name of the column of the fitting frame used as the training target.
    name : str
        Name of the single output column produced by `transform`.
    """

    def __init__(self, columns, label, name):
        # LightGBM's option is called 'force_col_wise' (the key used for the
        # final model's params in this notebook); a 'model__'-prefixed key is
        # not a LightGBM parameter, so the setting was never applied.
        self.params = {'boosting_type': 'gbdt', 'colsample_bytree': 0.75, 'force_col_wise': True, 'learning_rate': 0.65,
                       'max_depth': 3, 'metric': 'rmse', 'min_child_samples': 64, 'min_split_gain': 0.03, 'n_estimators': 300,
                       'num_leaves': 4, 'verbose': -1}
        # One model per seed, combined by make_lgb.
        self.model = make_lgb(params=self.params, seeds=[234, 262, 342, 408])
        self.columns = columns
        self.name = name
        self.label = label

    def fit(self, X, y=None):
        """Fit the model on `X[columns]` against `X[label]`; `y` is ignored."""
        self.model.fit(X[self.columns], X[self.label])
        return self

    def transform(self, X):
        """Return a one-column DataFrame (named `self.name`, indexed like `X`) of model predictions."""
        return pd.DataFrame(self.model.predict(X[self.columns]), columns=[self.name], index=X.index)

class AddSurvivalEstimates(BaseEstimator, TransformerMixin):
    """Append model-based survival features and hand-crafted aggregates to a frame.

    Parameters
    ----------
    survival_functions : dict or falsy
        Maps an output column name to either a callable (a `SurvivalEstimator`
        is fitted with it) or one of the strings "efs" / "efs_time" (a
        `ClassEstimator` is fitted on that label column).
    compute_aggregates : callable
        Called with the input frame in `transform`; its result is appended.
    """

    def __init__(self, survival_functions, compute_aggregates):
        self.survival_functions = survival_functions
        self.compute_aggregates = compute_aggregates
        # name -> fitted estimator, filled in by fit().
        self.transformers = {}

    def fit(self, X, y=None):
        """Fit one estimator per entry of `survival_functions` on `X`; `y` is ignored."""
        if not self.survival_functions:
            return self

        for name, func in self.survival_functions.items():
            # Features are selected by position: everything except the first
            # column and the last two columns of X.
            feature_columns = list(X.columns[1:-2])
            if func in ("efs", "efs_time"):
                estimator = ClassEstimator(columns=feature_columns, label=func, name=name)
            else:
                estimator = SurvivalEstimator(survival_function=func, columns=feature_columns, name=name)
            estimator.fit(X)
            self.transformers[name] = estimator
        return self

    def transform(self, X):
        """Return `X` with the estimator outputs and the aggregates concatenated column-wise.

        When no estimator has been fitted, `X` is returned unchanged (the
        aggregates are not added either).
        """
        if not self.transformers:
            return X

        pieces = [X]
        pieces.extend(estimator.transform(X) for estimator in self.transformers.values())
        pieces.append(self.compute_aggregates(X))
        return pd.concat(pieces, axis=1)

class SELColumns(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only the configured columns of a frame."""

    def __init__(self, columns):
        super().__init__()
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; returns `self`."""
        return self

    def transform(self, X):
        """Return `X[self.columns]`."""
        selected = X[self.columns]
        return selected

def make_pipeline(columns, model, categorical_columns, numerical_columns, compute_aggregates,
                  survival_functions, plain=False):
    """Build the preprocessing (and optionally modelling) pipeline.

    The preprocessing steps are, in order: fill categorical NaNs with "UNK",
    fill numerical NaNs with -1, label-encode the categorical columns,
    min-max scale the numerical columns, and append survival estimates and
    aggregates. Unless `plain` is truthy, a column selection on `columns` and
    `model` are appended as the final two steps.

    Note: this definition replaces the `make_pipeline` name imported from
    `sklearn.pipeline` earlier in the notebook.
    """
    steps = [
        ('nancategorical', NANCategorical(columns=categorical_columns, key="UNK")),
        ('nannumerical', NANNumerical(columns=numerical_columns, key=-1)),
        ('labelencoder', FeatureEncoder(columns=categorical_columns, encoder=LabelEncoder())),
        ('numscaler', NUMScaler(columns=numerical_columns,
                                scaler=MinMaxScaler(feature_range=(0, 1), clip=False))),
        ('survivalestimates', AddSurvivalEstimates(survival_functions=survival_functions,
                                                   compute_aggregates=compute_aggregates)),
    ]

    if not plain:
        steps.append(('selcolumns', SELColumns(columns=columns)))
        steps.append(("model", model))

    return Pipeline(steps=steps)
    
def transform_survival_probability(df, time_col='efs_time', event_col='efs'):
    """
    Kaplan-Meier survival estimate for every row of `df`.

    A `KaplanMeierFitter` is fitted on the natural log of `df[time_col]` with
    `df[event_col]` as the event indicator. The fitted survival function is
    then evaluated at each row's own log time and returned as an array.
    """
    log_times = np.log(df[time_col].values)
    fitter = KaplanMeierFitter()
    fitter.fit(durations=log_times, event_observed=df[event_col])
    return fitter.survival_function_at_times(log_times).values

def transform_survival_nelson(df, time_col='efs_time', event_col='efs'):
    """
    Negated Nelson-Aalen cumulative hazard for every row of `df`.

    A `NelsonAalenFitter` is fitted on the natural log of `df[time_col]` with
    `df[event_col]` as the event indicator. The fitted cumulative hazard is
    then evaluated at each row's own log time, negated, and returned as an array.
    """
    log_times = np.log(df[time_col].values)
    fitter = NelsonAalenFitter()
    fitter.fit(durations=log_times, event_observed=df[event_col])
    return -fitter.cumulative_hazard_at_times(log_times).values
    
def make_lgb(params, seeds):
    """Build a `VotingRegressor` of LightGBM regressors that differ only in their seed.

    Parameters
    ----------
    params : dict
        Keyword arguments for `lgb.LGBMRegressor`. The dict is not modified;
        any "random_state" entry it contains is overridden per estimator.
    seeds : iterable
        One `random_state` per ensemble member; members are named
        "lgb_1", "lgb_2", ... in iteration order.

    Returns
    -------
    VotingRegressor
        Unfitted ensemble created with `n_jobs=-1`.
    """
    estimators = [
        # Merge into a fresh dict instead of writing `random_state` into the
        # caller's `params`, which would leak the last seed back to the caller.
        (f"lgb_{position}", lgb.LGBMRegressor(**{**params, "random_state": seed}))
        for position, seed in enumerate(seeds, start=1)
    ]
    return VotingRegressor(estimators, n_jobs=-1)

# Submission

In [6]:
def predict_kfold(X, y, test, pipeline, n_splits=5, seed=11, shuffle=True):
    """Average the test predictions of `pipeline` refitted on each K-fold training split.

    For every fold, the same `pipeline` object is fitted on the fold's
    training rows of `X` (selected with `iloc`) and the matching entries of
    `y` (indexed with the same integer positions), and then predicts `test`.
    The held-out rows of each fold are not used. Returns the sum of the
    per-fold predictions divided by `n_splits`.
    """
    splitter = KFold(n_splits=n_splits, random_state=seed, shuffle=shuffle)
    fold_sum = np.zeros(len(test))

    folds = tqdm.tqdm(enumerate(splitter.split(X)), desc="Evaluating Model")
    for _, (fit_rows, _) in folds:
        pipeline.fit(X.iloc[fit_rows], y[fit_rows])
        fold_sum += pipeline.predict(test)

    return fold_sum / n_splits

In [7]:
def compute_aggregates(X):
    """Derive row-wise aggregate features from `X` without modifying it.

    Returns a new DataFrame, indexed like `X`, with these columns in order:

    - gvhd_risk_score_max / gvhd_risk_score: row max / sum of
      `gvhd_proph`, `in_vivo_tcd`, `tce_imm_match`
    - comorbidity_index / _max / _mean: row sum / max / mean of the eight
      comorbidity columns
    - hla_match_score_max: row max over every column whose name contains
      'hla_match'
    - tce_mismatch_score: `tce_match` + `tce_div_match` (row sum)
    - years_since_first_transplant: `year_hct` minus the minimum `year_hct`
      found in `X` itself
    - donor_recipient_age_gap: `donor_age` - `age_at_hct`
    """
    hla_cols = [name for name in X.columns if 'hla_match' in name]
    gvhd = X[['gvhd_proph', 'in_vivo_tcd', 'tce_imm_match']]
    comorbidities = X[['diabetes', 'cardiac', 'arrhythmia', 'renal_issue',
                       'hepatic_mild', 'hepatic_severe', 'pulm_severe', 'peptic_ulcer']]
    year = X['year_hct']

    # Dict order defines the column order of the result.
    features = {
        'gvhd_risk_score_max': gvhd.max(axis=1),
        'comorbidity_index': comorbidities.sum(axis=1),
        'comorbidity_index_max': comorbidities.max(axis=1),
        'hla_match_score_max': X[hla_cols].max(axis=1),
        'comorbidity_index_mean': comorbidities.mean(axis=1),
        'gvhd_risk_score': gvhd.sum(axis=1),
        'tce_mismatch_score': X[['tce_match', 'tce_div_match']].sum(axis=1),
        # The reference year is the earliest year present in this frame.
        'years_since_first_transplant': year - year.min(),
        'donor_recipient_age_gap': X['donor_age'] - X['age_at_hct'],
    }
    return pd.DataFrame(features, index=X.index)
    

def generate_predictions(test, train, top_k_comb, categorical_columns, numerical_columns, y_func,
                         model, compute_aggregates, survival_functions,plain=False, n_splits=5, seed=11):
    """Average K-fold test predictions over several feature combinations.

    Parameters
    ----------
    test, train : DataFrame
        Frames to predict on and to fit on.
    top_k_comb : sequence of (str, object)
        Pairs whose first element is a "/"-separated list of column names;
        the second element is ignored.
    categorical_columns, numerical_columns : list
        Column groups forwarded to `make_pipeline`.
    y_func : callable
        Called once as `y_func(train)` to build the regression target.
    model : estimator
        Final pipeline step. The same instance is reused (refitted) for every
        feature combination and fold.
    compute_aggregates, survival_functions, plain
        Forwarded to `make_pipeline`.
    n_splits, seed
        Forwarded to `predict_kfold` (which is always called with shuffle=True).

    Returns
    -------
    numpy.ndarray
        Sum of the per-combination predictions divided by `len(top_k_comb)`.
    """
    # The target depends only on `train`, so build it once instead of once per
    # feature combination.
    target = y_func(train)

    predictions = np.zeros(len(test))
    for combination, _ in top_k_comb:
        feature_columns = combination.split("/")
        pipe = make_pipeline(columns=feature_columns, model=model, categorical_columns=categorical_columns,
                             numerical_columns=numerical_columns, compute_aggregates=compute_aggregates,
                             survival_functions=survival_functions, plain=plain)

        predictions += predict_kfold(
            train, y=target, test=test, pipeline=pipe, n_splits=n_splits, seed=seed, shuffle=True
        )

    return predictions/len(top_k_comb)

In [8]:
# Extra model-based features: callables are fitted through SurvivalEstimator,
# the string "efs" is fitted through ClassEstimator on that label column.
survival_funcs = {
    "kmf": transform_survival_probability,
    "nelson": transform_survival_nelson,
    "efs_label": "efs",
}

# LightGBM settings of the final model.
params = {
    'boosting_type': 'gbdt',
    'colsample_bytree': 0.9,
    'force_col_wise': True,
    'learning_rate': 0.15,
    'max_depth': 3,
    'metric': 'rmse',
    'min_child_samples': 84,
    'min_split_gain': 0.05,
    'n_estimators': 150,
    'num_leaves': 3,
    'verbose': -1,
}

# Despite its name, `oof_kmf` holds the averaged predictions for `test`, and
# the regression target is built with transform_survival_nelson.
oof_kmf = generate_predictions(
    test, train, top_k_comb, categorical_columns, numerical_columns,
    y_func=transform_survival_nelson,
    model=make_lgb(params=params, seeds=[234, 262, 342, 408]),
    compute_aggregates=compute_aggregates,
    survival_functions=survival_funcs,
    plain=False,
    n_splits=5,
    seed=seed,
)

Evaluating Model: 5it [00:57, 11.49s/it]
Evaluating Model: 5it [00:56, 11.34s/it]
Evaluating Model: 5it [00:49,  9.99s/it]
Evaluating Model: 5it [00:56, 11.20s/it]
Evaluating Model: 5it [00:52, 10.49s/it]


In [9]:
from scipy.stats import rankdata

# Put the predictions into the sample submission's "prediction" column
# (assigned by position) and write the result as submission.csv.
sub = pd.read_csv(
    "/kaggle/input/equity-post-HCT-survival-predictions/sample_submission.csv"
).assign(prediction=oof_kmf)
sub.to_csv("submission.csv", index=False)