In [52]:
# -------------- Imports und Einstellungen --------------
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from lifelines import KaplanMeierFitter
from sklearn.model_selection import KFold
from lightgbm import LGBMRegressor




In [53]:
# -------------- Load data --------------

# Locations of the three competition CSV files.
path = "/datasets/equity-post-HCT-survival-predictions/train.csv"
path_description = "/datasets/equity-post-HCT-survival-predictions/data_dictionary.csv"
path_test = "/datasets/equity-post-HCT-survival-predictions/test.csv"

# Training data used to fit the ML models.
df_train = pd.read_csv(path)

# Column descriptions (data dictionary).
df_description = pd.read_csv(path_description)

# Test data for the competition.
df_test = pd.read_csv(path_test)

# -------------- Keep the ID values of df_test --------------
ids = df_test["ID"]

In [54]:
# Background on KaplanMeierFitter: survival-analysis data is often censored,
# i.e. for some subjects we do not know whether the event will ever occur
# (for example because they left the study, or the event had not occurred by
# the end of the observation period).
# The Kaplan-Meier estimator is used to compute the survival probability over
# time: it states how likely it is that the event under study has not yet
# occurred after a given time.
#
# Here the Kaplan-Meier estimator is used to compute one survival probability
# per observation in the data set. These values can then be used as the target
# variable of a machine-learning model.


def transform_survival_probability(df, time_col='efs_time', event_col='efs'):
    """Return the Kaplan-Meier survival estimate evaluated at each row's time.

    A single KaplanMeierFitter is fitted on all rows of ``df`` and the fitted
    survival function is then evaluated at every value of ``df[time_col]``,
    so the result has one entry per row and depends only on that row's time.

    Parameters
    ----------
    df : DataFrame containing ``time_col`` and ``event_col``.
    time_col : name of the duration column.
    event_col : name of the event-indicator column.

    Returns
    -------
    The ``.values`` array of the evaluated survival function, in row order.
    """
    fitter = KaplanMeierFitter().fit(
        durations=df[time_col],
        event_observed=df[event_col],
    )
    return fitter.survival_function_at_times(df[time_col]).values


# Store the transformed target next to the raw survival columns.
df_train["y"] = transform_survival_probability(df_train, time_col='efs_time', event_col='efs')

In [55]:
# Data cleaning: every column except the identifier and the target-related
# columns listed in RMV is used as a model feature.
RMV = ["ID", "efs", "efs_time", "y"]
FEATURES = [col for col in df_train.columns if col not in RMV]
print(f"There are {len(FEATURES)} FEATURES: {FEATURES}")


There are 57 FEATURES: ['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'hla_match_c_high', 'hla_high_res_8', 'tbi_status', 'arrhythmia', 'hla_low_res_6', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'hla_high_res_6', 'cmv_status', 'hla_high_res_10', 'hla_match_dqb1_high', 'tce_imm_match', 'hla_nmdp_6', 'hla_match_c_low', 'rituximab', 'hla_match_drb1_low', 'hla_match_dqb1_low', 'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity', 'year_hct', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hla_match_a_high', 'hepatic_severe', 'donor_age', 'prior_tumor', 'hla_match_b_low', 'peptic_ulcer', 'age_at_hct', 'hla_match_a_low', 'gvhd_proph', 'rheum_issue', 'sex_match', 'hla_match_b_high', 'race_group', 'comorbidity_score', 'karnofsky_score', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose', 'hla_low_res_8', 'cardiac', 'hla_match_drb1_high', 'pulm_moderate', 'hla_low_res_10']


In [56]:
# Features stored with object dtype in the training frame are treated as
# categorical.
CATS = [col for col in FEATURES if df_train[col].dtype == "object"]

# Replace missing values in those columns with the literal marker "NAN",
# in both train and test, so missingness becomes its own category.
for col in CATS:
    df_train[col] = df_train[col].fillna("NAN")
    df_test[col] = df_test[col].fillna("NAN")

print(f"In these features, there are {len(CATS)} CATEGORICAL FEATURES: {CATS}")

In these features, there are 35 CATEGORICAL FEATURES: ['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'tbi_status', 'arrhythmia', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'cmv_status', 'tce_imm_match', 'rituximab', 'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hepatic_severe', 'prior_tumor', 'peptic_ulcer', 'gvhd_proph', 'rheum_issue', 'sex_match', 'race_group', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose', 'cardiac', 'pulm_moderate']


In [57]:
# Stack train on top of test so that each categorical value receives the same
# integer code in both frames; remember the train length to split them again.
n_train = len(df_train)
combined = pd.concat([df_train, df_test], axis=0, ignore_index=True)

print("We LABEL ENCODE the CATEGORICAL FEATURES: ", end="")
for col in FEATURES:
    if col in CATS:
        # Label-encode, shift so the smallest code is 0, then store as an
        # int32-backed pandas category.
        print(f"{col}, ", end="")
        codes, _ = combined[col].factorize()
        combined[col] = codes
        combined[col] = combined[col] - combined[col].min()
        combined[col] = combined[col].astype("int32").astype("category")
    elif combined[col].dtype == "float64":
        # Reduce 64-bit numerics to 32-bit to save memory.
        combined[col] = combined[col].astype("float32")
    elif combined[col].dtype == "int64":
        combined[col] = combined[col].astype("int32")

# Split back into the original train / test partitions.
df_train = combined.iloc[:n_train].copy()
df_test = combined.iloc[n_train:].reset_index(drop=True).copy()

We LABEL ENCODE the CATEGORICAL FEATURES: dri_score, psych_disturb, cyto_score, diabetes, tbi_status, arrhythmia, graft_type, vent_hist, renal_issue, pulm_severe, prim_disease_hct, cmv_status, tce_imm_match, rituximab, prod_type, cyto_score_detail, conditioning_intensity, ethnicity, obesity, mrd_hct, in_vivo_tcd, tce_match, hepatic_severe, prior_tumor, peptic_ulcer, gvhd_proph, rheum_issue, sex_match, race_group, hepatic_mild, tce_div_match, donor_related, melphalan_dose, cardiac, pulm_moderate, 

In [61]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold
from scipy.stats import rankdata

# FOLD SETTINGS
FOLDS = 2
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

# oof_lgb holds the out-of-fold prediction for every training row;
# pred_lgb accumulates the per-fold test predictions and is averaged below.
oof_lgb = np.zeros(len(df_train))
pred_lgb = np.zeros(len(df_test))

# The test feature matrix is identical for every fold, so build it once
# instead of re-copying it inside the loop.
x_test = df_test[FEATURES].copy()

# Loop through each fold
for i, (train_index, valid_index) in enumerate(kf.split(df_train)):

    print("#" * 25)
    print(f"### Fold {i+1}")
    print("#" * 25)

    # Train / validation split.
    # KFold.split yields *positional* row indices, so rows are selected with
    # .iloc; label-based .loc would only be correct while df_train happens to
    # carry a default 0..n-1 index.
    x_train = df_train[FEATURES].iloc[train_index].copy()
    y_train = df_train["y"].iloc[train_index]
    x_valid = df_train[FEATURES].iloc[valid_index].copy()
    y_valid = df_train["y"].iloc[valid_index]

    # Initialize the LGBMRegressor model
    model_lgb = LGBMRegressor(
        max_depth=3,
        colsample_bytree=0.4,
        n_estimators=2500,
        learning_rate=0.02,
        objective="regression",  # Regression problem
        verbose=-1,
    )

    # No early-stopping callback is passed, so all n_estimators trees are
    # trained; eval_set is only supplied so validation metrics are recorded.
    model_lgb.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],
    )

    # Out-of-fold predictions for the rows held out in this fold.
    oof_lgb[valid_index] = model_lgb.predict(x_valid)

    # Accumulate this fold's test predictions.
    pred_lgb += model_lgb.predict(x_test)

# Average the accumulated test predictions over the folds.
pred_lgb /= FOLDS
print(pred_lgb)
# NOTE: pred_lgb is left as the raw averaged model output; no ranking step is
# applied in this cell.



#########################
### Fold 1
#########################
#########################
### Fold 2
#########################
[0.4907011  0.64680948 0.43478722]


In [62]:
print(pred_lgb)

# Build the submission frame: the test IDs paired with the averaged model
# predictions. The predictions are written as-is (no ranking is applied).
y_pred = df_test[["ID"]].assign(prediction=pred_lgb)

# Save submission
y_pred.to_csv('submission.csv', index=False)

[0.4907011  0.64680948 0.43478722]


ID,prediction
28800,0.4907011043791599
28801,0.646809476705795
28802,0.4347872177494269
