# Data Collection

## Installing Libarary

In [3]:
# %pip install pandas
# %pip install scipy
# %pip install numpy
# %pip install matplotlib
# %pip install scikit-learn
# %pip install statsmodels
# %pip install tensorflow
# %pip install scikit-survival
# %pip install xgboost
# %pip install factor-analyzer

## Reading CSV

In [5]:
import pandas as pd
AA02_data = pd.read_csv(r"equity-post-HCT-survival-predictions\train.csv")

## Checking structure of Data

In [7]:
AA02_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28800 entries, 0 to 28799
Data columns (total 60 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ID                      28800 non-null  int64  
 1   dri_score               28646 non-null  object 
 2   psych_disturb           26738 non-null  object 
 3   cyto_score              20732 non-null  object 
 4   diabetes                26681 non-null  object 
 5   hla_match_c_high        24180 non-null  float64
 6   hla_high_res_8          22971 non-null  float64
 7   tbi_status              28800 non-null  object 
 8   arrhythmia              26598 non-null  object 
 9   hla_low_res_6           25530 non-null  float64
 10  graft_type              28800 non-null  object 
 11  vent_hist               28541 non-null  object 
 12  renal_issue             26885 non-null  object 
 13  pulm_severe             26665 non-null  object 
 14  prim_disease_hct        28800 non-null

In [8]:
AA02_data.head(3)

Unnamed: 0,ID,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,...,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,efs,efs_time
0,0,N/A - non-malignant indication,No,,No,,,No TBI,No,6.0,...,,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,42.356
1,1,Intermediate,No,Intermediate,No,2.0,8.0,"TBI +- Other, >cGy",No,6.0,...,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,Yes,10.0,1.0,4.672
2,2,N/A - non-malignant indication,No,,No,2.0,8.0,No TBI,No,6.0,...,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,19.793


# Data Preprocessing

## Supoorting Functions

In [11]:
def AA02_check_unique_values(dataframe):
    """
    Calculate the number of unique values, total values,
    and percentage of unique values for each column in the DataFrame.

    Args:
        dataframe (pd.DataFrame): The input DataFrame.

    Returns:
        pd.DataFrame: A summary DataFrame with unique value statistics.
    """
    # Calculate unique values, total values, and percentage of unique values
    unique_counts = dataframe.nunique()
    total_counts = dataframe.count()
    percentages = (unique_counts / total_counts) * 100

    # Combine the results into a DataFrame for better AA02_display
    summary_AA02_df = pd.DataFrame({
        'Unique Values': unique_counts,
        'Total Values': total_counts,
        'Percentage (%)': percentages
    })

    return summary_AA02_df
    
import pandas as pd

# Function to calculate missing data information
def AA02_missing_data_info(AA02_sample_data):
    # Calculate missing count and percentage
    AA02_missing_count = AA02_sample_data.isnull().sum()
    AA02_missing_percentage = (AA02_missing_count / len(AA02_sample_data)) * 100

    # Create a DataFrame with missing data information
    AA02_missing_info = pd.DataFrame({
        'AA02_Variable': AA02_sample_data.columns,
        'AA02_Missing_Count': AA02_missing_count.values,
        'AA02_Missing_Percentage': AA02_missing_percentage.values
    }).reset_index(drop=True)

    # Format the percentage column
    AA02_missing_info['AA02_Missing_Percentage'] = AA02_missing_info['AA02_Missing_Percentage'].round(2).astype(str) + '%'

    return AA02_missing_info

import pandas as pd
from IPython.display import display

def display_full_dataframe(df):
    # Set display options for max columns and rows
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)

    # Display the DataFrame
    display(df)

    # Reset options to defaults after displaying
    pd.reset_option('display.max_columns')
    pd.reset_option('display.max_rows')

## Data Sampling

In [13]:
# Sample 5001 of the data
# AA02_sample_data_0 = AA02_data.sample(n=5001, random_state=55002)

AA02_sample_data_0 = AA02_data.copy()

# AA02_display the first few rows of the sample
display_full_dataframe(AA02_sample_data_0.head(3))

Unnamed: 0,ID,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,graft_type,vent_hist,renal_issue,pulm_severe,prim_disease_hct,hla_high_res_6,cmv_status,hla_high_res_10,hla_match_dqb1_high,tce_imm_match,hla_nmdp_6,hla_match_c_low,rituximab,hla_match_drb1_low,hla_match_dqb1_low,prod_type,cyto_score_detail,conditioning_intensity,ethnicity,year_hct,obesity,mrd_hct,in_vivo_tcd,tce_match,hla_match_a_high,hepatic_severe,donor_age,prior_tumor,hla_match_b_low,peptic_ulcer,age_at_hct,hla_match_a_low,gvhd_proph,rheum_issue,sex_match,hla_match_b_high,race_group,comorbidity_score,karnofsky_score,hepatic_mild,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,efs,efs_time
0,0,N/A - non-malignant indication,No,,No,,,No TBI,No,6.0,Bone marrow,No,No,No,IEA,6.0,+/+,,2.0,,6.0,2.0,No,2.0,2.0,BM,,,Not Hispanic or Latino,2016,No,,Yes,,2.0,No,,No,2.0,No,9.942,2.0,FKalone,No,M-F,2.0,More than one race,0.0,90.0,No,,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,42.356
1,1,Intermediate,No,Intermediate,No,2.0,8.0,"TBI +- Other, >cGy",No,6.0,Peripheral blood,No,No,No,AML,6.0,+/+,10.0,2.0,P/P,6.0,2.0,No,2.0,2.0,PB,Intermediate,MAC,Not Hispanic or Latino,2008,No,Positive,No,Permissive,2.0,No,72.29,No,2.0,No,43.705,2.0,Other GVHD Prophylaxis,No,F-F,2.0,Asian,3.0,90.0,No,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,Yes,10.0,1.0,4.672
2,2,N/A - non-malignant indication,No,,No,2.0,8.0,No TBI,No,6.0,Bone marrow,No,No,No,HIS,6.0,+/+,10.0,2.0,P/P,6.0,2.0,No,2.0,2.0,BM,,,Not Hispanic or Latino,2019,No,,Yes,,2.0,No,,No,2.0,No,33.997,2.0,Cyclophosphamide alone,No,F-M,2.0,More than one race,0.0,90.0,No,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,19.793


## Feature Engeeniring

## Categorizing Variables

In [16]:
AA02_sample_data_0.columns

Index(['ID', 'dri_score', 'psych_disturb', 'cyto_score', 'diabetes',
       'hla_match_c_high', 'hla_high_res_8', 'tbi_status', 'arrhythmia',
       'hla_low_res_6', 'graft_type', 'vent_hist', 'renal_issue',
       'pulm_severe', 'prim_disease_hct', 'hla_high_res_6', 'cmv_status',
       'hla_high_res_10', 'hla_match_dqb1_high', 'tce_imm_match', 'hla_nmdp_6',
       'hla_match_c_low', 'rituximab', 'hla_match_drb1_low',
       'hla_match_dqb1_low', 'prod_type', 'cyto_score_detail',
       'conditioning_intensity', 'ethnicity', 'year_hct', 'obesity', 'mrd_hct',
       'in_vivo_tcd', 'tce_match', 'hla_match_a_high', 'hepatic_severe',
       'donor_age', 'prior_tumor', 'hla_match_b_low', 'peptic_ulcer',
       'age_at_hct', 'hla_match_a_low', 'gvhd_proph', 'rheum_issue',
       'sex_match', 'hla_match_b_high', 'race_group', 'comorbidity_score',
       'karnofsky_score', 'hepatic_mild', 'tce_div_match', 'donor_related',
       'melphalan_dose', 'hla_low_res_8', 'cardiac', 'hla_match_drb1_hi

In [17]:
# Check whether a Variable is index or not
AA02_check_unique_values(AA02_sample_data_0)

Unnamed: 0,Unique Values,Total Values,Percentage (%)
ID,28800,28800,100.0
dri_score,11,28646,0.0384
psych_disturb,3,26738,0.01122
cyto_score,7,20732,0.033764
diabetes,3,26681,0.011244
hla_match_c_high,3,24180,0.012407
hla_high_res_8,7,22971,0.030473
tbi_status,8,28800,0.027778
arrhythmia,3,26598,0.011279
hla_low_res_6,5,25530,0.019585


In [18]:
# Load the CSV file
file_path = r'equity-post-HCT-survival-predictions\data_dictionary.csv'  # Update with your actual file path
data = pd.read_csv(file_path)

# Extract all variables
AA02_columns = data['variable'].tolist()

# Display the results
print("All Columns:", AA02_columns)

All Columns: ['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'hla_match_c_high', 'hla_high_res_8', 'tbi_status', 'arrhythmia', 'hla_low_res_6', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'hla_high_res_6', 'cmv_status', 'hla_high_res_10', 'hla_match_dqb1_high', 'tce_imm_match', 'hla_nmdp_6', 'hla_match_c_low', 'rituximab', 'hla_match_drb1_low', 'hla_match_dqb1_low', 'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity', 'year_hct', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hla_match_a_high', 'hepatic_severe', 'donor_age', 'prior_tumor', 'hla_match_b_low', 'peptic_ulcer', 'age_at_hct', 'hla_match_a_low', 'gvhd_proph', 'rheum_issue', 'sex_match', 'hla_match_b_high', 'race_group', 'comorbidity_score', 'karnofsky_score', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose', 'hla_low_res_8', 'cardiac', 'hla_match_drb1_high', 'pulm_moderate', 'hla_low_res_10', 'efs', 'efs_time']


In [19]:
AA02_sample_data_1 = AA02_sample_data_0[AA02_columns]
AA02_sample_data_1.head(3)

Unnamed: 0,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,graft_type,...,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,efs,efs_time
0,N/A - non-malignant indication,No,,No,,,No TBI,No,6.0,Bone marrow,...,,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,42.356
1,Intermediate,No,Intermediate,No,2.0,8.0,"TBI +- Other, >cGy",No,6.0,Peripheral blood,...,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,Yes,10.0,1.0,4.672
2,N/A - non-malignant indication,No,,No,2.0,8.0,No TBI,No,6.0,Bone marrow,...,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,19.793


### ID Variable

In [21]:
# Verify whether all variables are non index variables
AA02_check_unique_values(AA02_sample_data_1)

Unnamed: 0,Unique Values,Total Values,Percentage (%)
dri_score,11,28646,0.0384
psych_disturb,3,26738,0.01122
cyto_score,7,20732,0.033764
diabetes,3,26681,0.011244
hla_match_c_high,3,24180,0.012407
hla_high_res_8,7,22971,0.030473
tbi_status,8,28800,0.027778
arrhythmia,3,26598,0.011279
hla_low_res_6,5,25530,0.019585
graft_type,2,28800,0.006944


### CAT/NON CAT Variable

In [23]:
data['type'] = data['type'].str.strip().str.lower()

# Separate categorical and numerical columns
AA02_categorical_columns = data[data['type'] == 'categorical']['variable'].tolist()
AA02_non_categorical_columns = data[data['type'] == 'numerical']['variable'].tolist()
print("Categorical Columns:", AA02_categorical_columns)
print("Numerical Columns:", AA02_non_categorical_columns)

Categorical Columns: ['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'tbi_status', 'arrhythmia', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'cmv_status', 'tce_imm_match', 'rituximab', 'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hepatic_severe', 'prior_tumor', 'peptic_ulcer', 'gvhd_proph', 'rheum_issue', 'sex_match', 'race_group', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose', 'cardiac', 'pulm_moderate', 'efs']
Numerical Columns: ['hla_match_c_high', 'hla_high_res_8', 'hla_low_res_6', 'hla_high_res_6', 'hla_high_res_10', 'hla_match_dqb1_high', 'hla_nmdp_6', 'hla_match_c_low', 'hla_match_drb1_low', 'hla_match_dqb1_low', 'year_hct', 'hla_match_a_high', 'donor_age', 'hla_match_b_low', 'age_at_hct', 'hla_match_a_low', 'hla_match_b_high', 'comorbidity_score', 'karnofsky_score', 'hla_low_res_8', 'hla_match_drb1_high', 'hla_low_res_10', 'efs_time']


### Ordinal/Nominal Variable

In [25]:
## Divde Categorical variable into two parts ordinal & nominal
AA02_categorical_ordinal_columns = []
AA02_categorical_nominal_columns = AA02_categorical_columns

### Dependent/Independent Variable

#### Supporting Function

In [28]:
def prepare_data(AA02_y_vars, AA02_cat_vars, AA02_non_cat_vars):
    """
    Prepares the data by calculating the feature set (AA02_x) while excluding dependent variables.

    Args:
    - AA02_y_vars: A list of dependent variable names (can handle multiple dependent variables).
    - AA02_cat_vars: A list of categorical variable names.
    - AA02_non_cat_vars: A list of non-categorical variable names.

    Returns:
    - AA02_y_vars: A list of dependent variable names.
    - AA02_x: A list of feature variable names, excluding dependent variables.
    """
    # Combine categorical and non-categorical variable lists
    AA02_all_vars = AA02_cat_vars + AA02_non_cat_vars

    # Ensure `AA02_y_vars` is a list for consistency
    if isinstance(AA02_y_vars, str):
        AA02_y_vars = [AA02_y_vars]

    # Calculate the feature set (x) as the difference between AA02_all_vars and y_vars
    AA02_x = [AA02_var for AA02_var in AA02_all_vars if AA02_var not in AA02_y_vars]

    return AA02_y_vars, AA02_x

#### Dependent variable

In [30]:
AA02_y = ['efs', 'efs_time']  # Target variable defined manually

#### Independent Variables

In [32]:
AA02_y_columns, AA02_x_columns = prepare_data(AA02_y, AA02_sample_data_1.columns.tolist(), [])

print("Target (y):", AA02_y_columns)
print("Feature Set (x):", AA02_x_columns)

Target (y): ['efs', 'efs_time']
Feature Set (x): ['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'hla_match_c_high', 'hla_high_res_8', 'tbi_status', 'arrhythmia', 'hla_low_res_6', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'hla_high_res_6', 'cmv_status', 'hla_high_res_10', 'hla_match_dqb1_high', 'tce_imm_match', 'hla_nmdp_6', 'hla_match_c_low', 'rituximab', 'hla_match_drb1_low', 'hla_match_dqb1_low', 'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity', 'year_hct', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hla_match_a_high', 'hepatic_severe', 'donor_age', 'prior_tumor', 'hla_match_b_low', 'peptic_ulcer', 'age_at_hct', 'hla_match_a_low', 'gvhd_proph', 'rheum_issue', 'sex_match', 'hla_match_b_high', 'race_group', 'comorbidity_score', 'karnofsky_score', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose', 'hla_low_res_8', 'cardiac', 'hla_match_drb1_high', 'pulm_moderate', 'hla_low_res_10']


## Imputation

### Checking Missing Percentage

#### Variable

In [36]:
import pandas as pd

# Function to calculate missing data information
def AA02_missing_data_info(AA02_sample_data):
    # Calculate missing count and percentage
    AA02_missing_count = AA02_sample_data.isnull().sum()
    AA02_missing_percentage = (AA02_missing_count / len(AA02_sample_data)) * 100

    # Create a DataFrame with missing data information
    AA02_missing_info = pd.DataFrame({
        'AA02_Variable': AA02_sample_data.columns,
        'AA02_Missing_Count': AA02_missing_count.values,
        'AA02_Missing_Percentage': AA02_missing_percentage.values
    }).reset_index(drop=True)

    # Format the percentage column
    AA02_missing_info['AA02_Missing_Percentage'] = AA02_missing_info['AA02_Missing_Percentage'].round(2).astype(str) + '%'

    return AA02_missing_info

# Call the function
AA02_missing_data_info(AA02_sample_data_1)

Unnamed: 0,AA02_Variable,AA02_Missing_Count,AA02_Missing_Percentage
0,dri_score,154,0.53%
1,psych_disturb,2062,7.16%
2,cyto_score,8068,28.01%
3,diabetes,2119,7.36%
4,hla_match_c_high,4620,16.04%
5,hla_high_res_8,5829,20.24%
6,tbi_status,0,0.0%
7,arrhythmia,2202,7.65%
8,hla_low_res_6,3270,11.35%
9,graft_type,0,0.0%


In [37]:
import pandas as pd

# Function to omit variables with more than a threshold of missing values and log omitted variables
def AA02_clean_data_with_logging(
    AA02_sample_data,
    AA02_categorical_columns,
    AA02_non_categorical_columns,
    AA02_columns,
    AA02_categorical_ordinal_columns,
    AA02_categorical_nominal_columns,
    AA02_y_columns,
    AA02_x_columns,
    missing_threshold=50
):
    # Calculate missing percentage for each variable
    AA02_missing_percentage = (AA02_sample_data.isnull().sum() / len(AA02_sample_data)) * 100

    # Identify variables to omit (missing percentage > threshold)
    variables_to_omit = AA02_missing_percentage[AA02_missing_percentage > missing_threshold]

    # Create a DataFrame for omitted variables
    omitted_info = []
    for variable, percentage in variables_to_omit.items():
        if variable in AA02_categorical_columns:
            source = "AA02_categorical_columns"
        elif variable in AA02_non_categorical_columns:
            source = "AA02_non_categorical_columns"
        elif variable in AA02_columns:
            source = "AA02_columns"
        else:
            source = "Unknown"

        omitted_info.append({
            "Variable": variable,
            "Missing_Percentage": round(percentage, 2),
            "Omitted_From": source
        })

    # Convert omitted info to DataFrame
    AA02_omitted_df = pd.DataFrame(omitted_info)

    # Identify variables to keep
    variables_to_keep = AA02_missing_percentage[AA02_missing_percentage <= missing_threshold].index.tolist()

    # Filter the dataset
    AA02_sample_data_cleaned = AA02_sample_data[variables_to_keep]

    # Update the lists (only keep variables that are not omitted)
    AA02_columns[:] = [col for col in AA02_columns if col in variables_to_keep]
    AA02_categorical_columns[:] = [col for col in AA02_categorical_columns if col in variables_to_keep]
    AA02_non_categorical_columns[:] = [col for col in AA02_non_categorical_columns if col in variables_to_keep]
    AA02_categorical_ordinal_columns[:] = [col for col in AA02_categorical_ordinal_columns if col in variables_to_keep]
    AA02_categorical_nominal_columns[:] = [col for col in AA02_categorical_nominal_columns if col in variables_to_keep]
    AA02_y_columns[:] = [col for col in AA02_y_columns if col in variables_to_keep]
    AA02_x_columns[:] = [col for col in AA02_x_columns if col in variables_to_keep]



    # Print the DataFrame of omitted variables
    print("Variables Omitted Due to Missing Values (> {}%):".format(missing_threshold))
    print(AA02_omitted_df)

    return AA02_sample_data_cleaned

# Example usage
# Assuming AA02_sample_data_1, AA02_categorical_columns, AA02_non_categorical_columns, and AA02_columns are defined
AA02_sample_data_cleaned = AA02_clean_data_with_logging(
    AA02_sample_data_1,
    AA02_categorical_columns,
    AA02_non_categorical_columns,
    AA02_columns,
    AA02_categorical_ordinal_columns,
    AA02_categorical_nominal_columns,
    AA02_y_columns,
    AA02_x_columns,
    missing_threshold=25
)


Variables Omitted Due to Missing Values (> 25%):
            Variable  Missing_Percentage              Omitted_From
0         cyto_score               28.01  AA02_categorical_columns
1      tce_imm_match               38.66  AA02_categorical_columns
2  cyto_score_detail               41.40  AA02_categorical_columns
3            mrd_hct               57.63  AA02_categorical_columns
4          tce_match               65.96  AA02_categorical_columns
5      tce_div_match               39.57  AA02_categorical_columns


In [38]:
print(AA02_columns)

['dri_score', 'psych_disturb', 'diabetes', 'hla_match_c_high', 'hla_high_res_8', 'tbi_status', 'arrhythmia', 'hla_low_res_6', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'hla_high_res_6', 'cmv_status', 'hla_high_res_10', 'hla_match_dqb1_high', 'hla_nmdp_6', 'hla_match_c_low', 'rituximab', 'hla_match_drb1_low', 'hla_match_dqb1_low', 'prod_type', 'conditioning_intensity', 'ethnicity', 'year_hct', 'obesity', 'in_vivo_tcd', 'hla_match_a_high', 'hepatic_severe', 'donor_age', 'prior_tumor', 'hla_match_b_low', 'peptic_ulcer', 'age_at_hct', 'hla_match_a_low', 'gvhd_proph', 'rheum_issue', 'sex_match', 'hla_match_b_high', 'race_group', 'comorbidity_score', 'karnofsky_score', 'hepatic_mild', 'donor_related', 'melphalan_dose', 'hla_low_res_8', 'cardiac', 'hla_match_drb1_high', 'pulm_moderate', 'hla_low_res_10', 'efs', 'efs_time']


In [39]:
# Dropping variables from data frame

In [40]:
# Select variables from AA02_sample_data_1 that are present in AA02_columns
AA02_sample_data_dropped_variable = AA02_sample_data_1[AA02_columns]

# Display the resulting dataset
AA02_sample_data_dropped_variable.head(3)

Unnamed: 0,dri_score,psych_disturb,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,graft_type,vent_hist,...,hepatic_mild,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,efs,efs_time
0,N/A - non-malignant indication,No,No,,,No TBI,No,6.0,Bone marrow,No,...,No,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,42.356
1,Intermediate,No,No,2.0,8.0,"TBI +- Other, >cGy",No,6.0,Peripheral blood,No,...,No,Related,"N/A, Mel not given",8.0,No,2.0,Yes,10.0,1.0,4.672
2,N/A - non-malignant indication,No,No,2.0,8.0,No TBI,No,6.0,Bone marrow,No,...,No,Related,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,19.793


#### Records

In [42]:
import pandas as pd

def AA02_remove_records_with_missing_values(AA02_sample_data_dropped_variable, percentage):
    """
    This function removes records from the DataFrame where the percentage of missing values 
    exceeds the specified threshold.

    Parameters:
        AA02_sample_data_dropped_variable (pd.DataFrame): The input DataFrame.
        percentage (float): The threshold percentage of missing values.

    Returns:
        pd.DataFrame: Modified DataFrame with records removed.
    """
    # Calculate the threshold for missing values based on the given percentage
    threshold = (percentage / 100) * AA02_sample_data_dropped_variable.shape[1]

    # Identify records with missing values exceeding the threshold
    AA02_records_with_excessive_missing = AA02_sample_data_dropped_variable[AA02_sample_data_dropped_variable.isnull().sum(axis=1) > threshold]

    # Print the records with excessive missing values
    print("Records with more than", percentage, "% missing values:")
    AA02_records_with_excessive_missing

    # Remove those records from the original DataFrame
    AA02_sample_data_dropped_records = AA02_sample_data_dropped_variable.drop(index=AA02_records_with_excessive_missing.index)

    # Return the modified DataFrame
    return AA02_sample_data_dropped_records, AA02_records_with_excessive_missing

# Example Usage:
# Assuming you have a DataFrame `df`:
AA02_sample_data_dropped_records, AA02_records_with_excessive_missing = AA02_remove_records_with_missing_values(AA02_sample_data_dropped_variable.copy(), 25)
AA02_records_with_excessive_missing

Records with more than 25 % missing values:


Unnamed: 0,dri_score,psych_disturb,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,graft_type,vent_hist,...,hepatic_mild,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,efs,efs_time
11,N/A - disease not classifiable,Not done,Yes,,,"TBI +- Other, >cGy",No,,Bone marrow,No,...,No,Related,"N/A, Mel not given",,No,2.0,No,,1.0,5.191
18,High,No,No,,,No TBI,No,,Peripheral blood,No,...,Yes,Related,"N/A, Mel not given",,No,,Not done,,1.0,10.083
35,Intermediate,No,No,,,No TBI,No,,Peripheral blood,No,...,No,Related,"N/A, Mel not given",,No,,No,,0.0,42.949
37,Intermediate - TED AML case <missing cytogenetics,,,,,No TBI,,,Peripheral blood,No,...,No,Multiple donor (non-UCB),,,,,,,1.0,3.735
39,N/A - non-malignant indication,No,Yes,,,No TBI,No,,Peripheral blood,No,...,Yes,Related,"N/A, Mel not given",,No,,No,,0.0,36.129
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28760,N/A - pediatric,,,2.0,8.0,No TBI,,6.0,Bone marrow,No,...,,Unrelated,,8.0,,2.0,,10.0,0.0,13.787
28765,Low,No,No,,,TBI + Cy +- Other,No,,Peripheral blood,No,...,No,Related,"N/A, Mel not given",,No,1.0,No,,1.0,4.108
28775,N/A - non-malignant indication,No,No,,,No TBI,No,,Peripheral blood,No,...,No,Related,"N/A, Mel not given",,No,,No,,1.0,5.567
28777,Intermediate,No,No,,,TBI + Cy +- Other,No,,Peripheral blood,No,...,No,Related,"N/A, Mel not given",,No,,No,,1.0,6.479


### Categorical Imputation

In [44]:
from sklearn.impute import SimpleImputer # type: ignore

AA02_sample_data_imputed = AA02_sample_data_dropped_variable.copy()

# # Initialize SimpleImputer with most_frequent strategy
# AA02_imputer = SimpleImputer(strategy='most_frequent')

# # Apply imputation
# AA02_sample_data_imputed[AA02_categorical_columns] = AA02_imputer.fit_transform(AA02_sample_data_imputed[AA02_categorical_columns])

### Non Categorical Imputation

In [46]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer # type: ignore
import numpy as np

def AA02_impute_columns_with_mean_or_median(AA02_df, columns):
    for col in columns:
        # Ensure column is numeric
        AA02_df[col] = pd.to_numeric(AA02_df[col], errors='coerce')

        # Replace invalid values with NaN
        AA02_df[col].replace([np.inf, -np.inf], np.nan, inplace=True)

        # Skip if column has no missing values
        if AA02_df[col].isnull().sum() == 0:
            continue

        # Calculate mean and median
        AA02_col_mean = AA02_df[col].mean()
        AA02_col_median = AA02_df[col].median()

        # Choose strategy based on significant difference
        if abs(AA02_col_mean - AA02_col_median) / max(abs(AA02_col_mean), abs(AA02_col_median)) > 0.1:  # Significant difference
            print(f"Imputing '{col}' with median (significant difference between mean and median)")
            AA02_imputer = SimpleImputer(strategy='median')
        else:
            print(f"Imputing '{col}' with mean (no significant difference between mean and median)")
            AA02_imputer = SimpleImputer(strategy='mean')

        # Apply the AA02_imputer
        AA02_df[[col]] = AA02_imputer.fit_transform(AA02_df[[col]])

    return AA02_df

# AA02_sample_data_imputed = AA02_impute_columns_with_mean_or_median(AA02_sample_data_imputed, AA02_non_categorical_columns)

### Deep Learning-Based Imputation

In [48]:
import pandas as pd
import numpy as np

def convert_to_numeric(dataframe):
    """
    Converts all columns in the DataFrame into numeric types where possible.
    - Strings will be converted to numeric if feasible.
    - True/False (case insensitive) will be converted to 1 and 0.
    - If a value cannot be converted to numeric, it will remain as is.

    Parameters:
        dataframe (pd.DataFrame): The input DataFrame to convert.

    Returns:
        pd.DataFrame: A DataFrame with numeric conversions applied where possible.
    """
    def safe_convert(value):
        # Handle case-insensitive True/False
        if isinstance(value, str):
            if value.strip().lower() == 'true':
                return 1
            elif value.strip().lower() == 'false':
                return 0
        
        # Try to convert other values to numeric
        try:
            return pd.to_numeric(value, errors='raise')
        except:
            return value

    # Apply safe conversion to all elements in the DataFrame
    dataframe = dataframe.applymap(safe_convert)

    return dataframe

# Example usage
AA02_sample_data_numeric = convert_to_numeric(AA02_sample_data_imputed)

In [49]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

# Step 1: Preprocess the Data

# Separate numeric and non-numeric columns
numeric_cols = AA02_sample_data_imputed.select_dtypes(include=[np.number]).columns
non_numeric_cols = AA02_sample_data_imputed.select_dtypes(exclude=[np.number]).columns

# Handle non-numeric columns
# Replace missing values in non-numeric columns with 'Unknown' or the most frequent value
for col in non_numeric_cols:
    most_frequent_value = AA02_sample_data_imputed[col].mode()[0]
    AA02_sample_data_imputed[col].fillna(most_frequent_value, inplace=True)

# Handle numeric columns: Replace NaN with the mean value
AA02_sample_data_imputed[numeric_cols] = AA02_sample_data_imputed[numeric_cols].fillna(
    AA02_sample_data_imputed[numeric_cols].mean()
)

# Normalize only numeric data
scaler = MinMaxScaler()
AA02_normalized_data = scaler.fit_transform(AA02_sample_data_imputed[numeric_cols])

# Convert normalized data back to a DataFrame
AA02_normalized_data = pd.DataFrame(AA02_normalized_data, columns=numeric_cols)

# Combine normalized numeric data and non-numeric data into one DataFrame
AA02_combined_data = pd.concat(
    [AA02_normalized_data, AA02_sample_data_imputed[non_numeric_cols].reset_index(drop=True)], axis=1
)

# Step 2: Build the Autoencoder
input_dim = AA02_normalized_data.shape[1]
latent_dim = int(input_dim / 2)  # Compression to half of the original dimensions

# Build the Autoencoder
input_layer = Input(shape=(input_dim,))
encoder = Dense(latent_dim, activation='relu')(input_layer)
decoder = Dense(input_dim, activation='sigmoid')(encoder)

autoencoder = Model(inputs=input_layer, outputs=decoder)

# Compile the Autoencoder
autoencoder.compile(optimizer='adam', loss='mse')

# Step 3: Train the Autoencoder
history = autoencoder.fit(
    AA02_normalized_data,
    AA02_normalized_data,
    epochs=3,
    batch_size=1,
    shuffle=True,
    validation_split=0.2
)

# Step 4: Reconstruct the Data
AA02_reconstructed_data = autoencoder.predict(AA02_normalized_data)

# Convert back to original scale
AA02_imputed_data = scaler.inverse_transform(AA02_reconstructed_data)

# Convert back to DataFrame
AA02_imputed_data = pd.DataFrame(AA02_imputed_data, columns=numeric_cols)

# Step 5: Replace Missing Values
final_imputed_data = AA02_sample_data_imputed.copy()
final_imputed_data[numeric_cols] = AA02_imputed_data

# Final Output
final_imputed_data

Epoch 1/3
[1m23040/23040[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 853us/step - loss: 0.0295 - val_loss: 0.0091
Epoch 2/3
[1m23040/23040[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 840us/step - loss: 0.0089 - val_loss: 0.0083
Epoch 3/3
[1m23040/23040[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 862us/step - loss: 0.0083 - val_loss: 0.0082
[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 397us/step


Unnamed: 0,dri_score,psych_disturb,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,graft_type,vent_hist,...,hepatic_mild,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,efs,efs_time
0,N/A - non-malignant indication,No,No,1.919227,7.520434,No TBI,No,5.773517,Bone marrow,No,...,No,Unrelated,"N/A, Mel not given",7.681392,No,1.858588,No,9.596973,0.000612,40.092350
1,Intermediate,No,No,1.928004,7.623692,"TBI +- Other, >cGy",No,5.790064,Peripheral blood,No,...,No,Related,"N/A, Mel not given",7.647401,No,1.886458,Yes,9.575131,0.999745,4.822805
2,N/A - non-malignant indication,No,No,1.943752,7.696064,No TBI,No,5.793385,Bone marrow,No,...,No,Related,"N/A, Mel not given",7.715603,No,1.900352,No,9.654185,0.003313,23.497608
3,High,No,No,1.960038,7.648953,No TBI,No,5.769296,Bone marrow,No,...,Yes,Unrelated,"N/A, Mel not given",7.744062,No,1.872368,No,9.672004,0.000010,103.068748
4,High,No,No,1.919519,7.630819,No TBI,No,5.773418,Peripheral blood,No,...,No,Related,MEL,7.660840,No,1.887722,No,9.580358,0.008412,21.632509
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28795,Intermediate - TED AML case <missing cytogenetics,No,No,1.962932,7.738688,No TBI,No,5.799851,Peripheral blood,No,...,No,Related,"N/A, Mel not given",7.754634,No,1.912281,No,9.695785,0.004517,24.894444
28796,High,No,Yes,1.165162,3.771065,No TBI,No,5.140062,Peripheral blood,No,...,No,Related,"N/A, Mel not given",6.678726,Yes,1.266866,Yes,8.285807,0.997062,3.675912
28797,TBD cytogenetics,No,No,1.945328,7.677855,No TBI,No,5.892027,Peripheral blood,No,...,No,Unrelated,"N/A, Mel not given",7.827389,No,1.889561,No,9.722552,0.008288,22.436764
28798,N/A - non-malignant indication,No,No,0.878877,3.845691,No TBI,No,2.703928,Peripheral blood,No,...,No,Related,MEL,3.523571,No,0.843187,No,4.962386,0.000901,39.115631


### Verify Missing Percetage = 0

In [51]:
AA02_missing_data_info(final_imputed_data)

Unnamed: 0,AA02_Variable,AA02_Missing_Count,AA02_Missing_Percentage
0,dri_score,0,0.0%
1,psych_disturb,0,0.0%
2,diabetes,0,0.0%
3,hla_match_c_high,0,0.0%
4,hla_high_res_8,0,0.0%
5,tbi_status,0,0.0%
6,arrhythmia,0,0.0%
7,hla_low_res_6,0,0.0%
8,graft_type,0,0.0%
9,vent_hist,0,0.0%


## Numerical Encoding

### Total Nominal unique value

In [54]:
def AA02_check_unique_values(dataframe):
    """
    Checks and returns the number of unique values for each column in the DataFrame.

    Parameters:
        dataframe (pd.DataFrame): The input DataFrame.

    Returns:
        pd.DataFrame: A DataFrame with column names and their unique value counts.
    """
    unique_values = dataframe.nunique()
    return pd.DataFrame({"Variable Name": unique_values.index, "Unique Values": unique_values.values})

# Example usage
unique_values_df = AA02_check_unique_values(AA02_sample_data_imputed[AA02_categorical_nominal_columns])
sum_unique_values = unique_values_df["Unique Values"].sum()

print("Sum of all unique values:", sum_unique_values)
display_full_dataframe(AA02_check_unique_values(AA02_sample_data_numeric[AA02_categorical_nominal_columns]))

Sum of all unique values: 133


Unnamed: 0,Variable Name,Unique Values
0,dri_score,11
1,psych_disturb,3
2,diabetes,3
3,tbi_status,8
4,arrhythmia,3
5,graft_type,2
6,vent_hist,2
7,renal_issue,3
8,pulm_severe,3
9,prim_disease_hct,18


### Ordinal/Nominal Encoding

In [56]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
import pandas as pd

def encode_categorical_columns(dataframe, ordinal_columns, nominal_columns, use_one_hot_for_nominal=False, ordinal_categories=None):
    """
    Encodes categorical columns in the dataframe using either OrdinalEncoder or OneHotEncoder for nominal columns
    and OrdinalEncoder for ordinal columns.

    Parameters:
        dataframe (pd.DataFrame): The input DataFrame to encode.
        ordinal_columns (list): List of ordinal column names to encode.
        nominal_columns (list): List of nominal column names to encode.
        use_one_hot_for_nominal (bool): If True, use OneHotEncoder for nominal columns. Otherwise, use OrdinalEncoder.
        ordinal_categories (list of lists): The order of categories for ordinal columns. Pass None if not applicable.

    Returns:
        pd.DataFrame: DataFrame with encoded categorical columns.
    """
    # Make a copy of the DataFrame
    dataframe_encoded = dataframe.copy()

    # Initialize OrdinalEncoder for ordinal columns with specified order
    if ordinal_categories:
        ordinal_encoder_ordinal = OrdinalEncoder(categories=ordinal_categories)
    else:
        ordinal_encoder_ordinal = OrdinalEncoder()

    # Encode ordinal columns
    if ordinal_columns:
        dataframe_encoded[ordinal_columns] = ordinal_encoder_ordinal.fit_transform(
            dataframe_encoded[ordinal_columns].astype(str)
        )

    # Encode nominal columns
    if use_one_hot_for_nominal:
        # Exclude numeric columns from one-hot encoding
        nominal_columns_to_encode = [col for col in nominal_columns if not pd.api.types.is_numeric_dtype(dataframe[col])]
        if nominal_columns_to_encode:
            one_hot_encoder = OneHotEncoder(sparse_output=False, drop=None)
            encoded_nominal_columns = one_hot_encoder.fit_transform(dataframe_encoded[nominal_columns_to_encode].astype(str))
            encoded_nominal_df = pd.DataFrame(
                encoded_nominal_columns,
                columns=one_hot_encoder.get_feature_names_out(nominal_columns_to_encode),
                index=dataframe_encoded.index
            )
            dataframe_encoded = dataframe_encoded.drop(nominal_columns_to_encode, axis=1)
            dataframe_encoded = pd.concat([dataframe_encoded, encoded_nominal_df], axis=1)
    else:
        ordinal_encoder_nominal = OrdinalEncoder()
        dataframe_encoded[nominal_columns] = ordinal_encoder_nominal.fit_transform(
            dataframe_encoded[nominal_columns].astype(str)
        )

    return dataframe_encoded

# Example usage
AA02_ordinal_categories = [#['low', 'mid', 'high'] #low is 0 mid is 1 hight is 2
    
]

AA02_sample_data_ordinally_encoded = encode_categorical_columns(
    dataframe=AA02_sample_data_numeric,
    ordinal_columns=AA02_categorical_ordinal_columns,
    nominal_columns=AA02_categorical_nominal_columns,
    use_one_hot_for_nominal=True,  # Set to False to use OrdinalEncoder for nominal columns
    ordinal_categories=AA02_ordinal_categories
)
pd.set_option('display.max_columns', None)
display(AA02_sample_data_ordinally_encoded)

Unnamed: 0,hla_match_c_high,hla_high_res_8,hla_low_res_6,hla_high_res_6,hla_high_res_10,hla_match_dqb1_high,hla_nmdp_6,hla_match_c_low,hla_match_drb1_low,hla_match_dqb1_low,year_hct,hla_match_a_high,donor_age,hla_match_b_low,age_at_hct,hla_match_a_low,hla_match_b_high,comorbidity_score,karnofsky_score,hla_low_res_8,hla_match_drb1_high,hla_low_res_10,efs,efs_time,dri_score_High,dri_score_High - TED AML case <missing cytogenetics,dri_score_Intermediate,dri_score_Intermediate - TED AML case <missing cytogenetics,dri_score_Low,dri_score_Missing disease status,dri_score_N/A - disease not classifiable,dri_score_N/A - non-malignant indication,dri_score_N/A - pediatric,dri_score_TBD cytogenetics,dri_score_Very high,dri_score_nan,psych_disturb_No,psych_disturb_Not done,psych_disturb_Yes,psych_disturb_nan,diabetes_No,diabetes_Not done,diabetes_Yes,diabetes_nan,tbi_status_No TBI,tbi_status_TBI + Cy +- Other,"tbi_status_TBI +- Other, -cGy, fractionated","tbi_status_TBI +- Other, -cGy, single","tbi_status_TBI +- Other, -cGy, unknown dose","tbi_status_TBI +- Other, <=cGy","tbi_status_TBI +- Other, >cGy","tbi_status_TBI +- Other, unknown dose",arrhythmia_No,arrhythmia_Not done,arrhythmia_Yes,arrhythmia_nan,graft_type_Bone marrow,graft_type_Peripheral blood,vent_hist_No,vent_hist_Yes,vent_hist_nan,renal_issue_No,renal_issue_Not done,renal_issue_Yes,renal_issue_nan,pulm_severe_No,pulm_severe_Not done,pulm_severe_Yes,pulm_severe_nan,prim_disease_hct_AI,prim_disease_hct_ALL,prim_disease_hct_AML,prim_disease_hct_CML,prim_disease_hct_HD,prim_disease_hct_HIS,prim_disease_hct_IEA,prim_disease_hct_IIS,prim_disease_hct_IMD,prim_disease_hct_IPA,prim_disease_hct_MDS,prim_disease_hct_MPN,prim_disease_hct_NHL,prim_disease_hct_Other acute leukemia,prim_disease_hct_Other leukemia,prim_disease_hct_PCD,prim_disease_hct_SAA,prim_disease_hct_Solid tumor,cmv_status_+/+,cmv_status_+/-,cmv_status_-/+,cmv_status_-/-,cmv_status_nan,rituximab_No,rituximab_Yes,rituximab_nan,prod_type_BM,prod_type_PB,conditioning_intensity_MAC,"conditioning_intensity_N/A, F(pre-TED) not submitted",conditioning_intensity_NMA,conditioning_intensity_No drugs reported,conditioning_intensity_RIC,conditioning_intensity_TBD,conditioning_intensity_nan,ethnicity_Hispanic or Latino,ethnicity_Non-resident of the U.S.,ethnicity_Not Hispanic or Latino,ethnicity_nan,obesity_No,obesity_Not done,obesity_Yes,obesity_nan,in_vivo_tcd_No,in_vivo_tcd_Yes,in_vivo_tcd_nan,hepatic_severe_No,hepatic_severe_Not done,hepatic_severe_Yes,hepatic_severe_nan,prior_tumor_No,prior_tumor_Not done,prior_tumor_Yes,prior_tumor_nan,peptic_ulcer_No,peptic_ulcer_Not done,peptic_ulcer_Yes,peptic_ulcer_nan,gvhd_proph_CDselect +- other,gvhd_proph_CDselect alone,gvhd_proph_CSA + MMF +- others(not FK),"gvhd_proph_CSA + MTX +- others(not MMF,FK)","gvhd_proph_CSA +- others(not FK,MMF,MTX)",gvhd_proph_CSA alone,gvhd_proph_Cyclophosphamide +- others,gvhd_proph_Cyclophosphamide alone,gvhd_proph_FK+ MMF +- others,gvhd_proph_FK+ MTX +- others(not MMF),"gvhd_proph_FK+- others(not MMF,MTX)",gvhd_proph_FKalone,gvhd_proph_No GvHD Prophylaxis,gvhd_proph_Other GVHD Prophylaxis,"gvhd_proph_Parent Q = yes, but no agent",gvhd_proph_TDEPLETION +- other,gvhd_proph_TDEPLETION alone,gvhd_proph_nan,rheum_issue_No,rheum_issue_Not done,rheum_issue_Yes,rheum_issue_nan,sex_match_F-F,sex_match_F-M,sex_match_M-F,sex_match_M-M,sex_match_nan,race_group_American Indian or Alaska Native,race_group_Asian,race_group_Black or African-American,race_group_More than one race,race_group_Native Hawaiian or other Pacific Islander,race_group_White,hepatic_mild_No,hepatic_mild_Not done,hepatic_mild_Yes,hepatic_mild_nan,donor_related_Multiple donor (non-UCB),donor_related_Related,donor_related_Unrelated,donor_related_nan,melphalan_dose_MEL,"melphalan_dose_N/A, Mel not given",melphalan_dose_nan,cardiac_No,cardiac_Not done,cardiac_Yes,cardiac_nan,pulm_moderate_No,pulm_moderate_Not done,pulm_moderate_Yes,pulm_moderate_nan
0,,,6.0,6.0,,2.0,6.0,2.0,2.0,2.0,2016,2.0,,2.0,9.942,2.0,2.0,0.0,90.0,8.0,2.0,10.0,0.0,42.356,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,2.0,8.0,6.0,6.0,10.0,2.0,6.0,2.0,2.0,2.0,2008,2.0,72.290,2.0,43.705,2.0,2.0,3.0,90.0,8.0,2.0,10.0,1.0,4.672,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2.0,8.0,6.0,6.0,10.0,2.0,6.0,2.0,2.0,2.0,2019,2.0,,2.0,33.997,2.0,2.0,0.0,90.0,8.0,2.0,10.0,0.0,19.793,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,2.0,8.0,6.0,6.0,10.0,2.0,6.0,2.0,2.0,2.0,2009,2.0,29.230,2.0,43.245,2.0,2.0,0.0,90.0,8.0,2.0,10.0,0.0,102.349,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,2.0,8.0,6.0,6.0,10.0,2.0,5.0,2.0,2.0,2.0,2018,2.0,56.810,2.0,29.740,2.0,2.0,1.0,90.0,8.0,2.0,10.0,0.0,16.223,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28795,2.0,8.0,6.0,6.0,10.0,2.0,6.0,2.0,2.0,2.0,2018,2.0,24.212,2.0,51.136,2.0,2.0,0.0,,8.0,2.0,10.0,0.0,18.633,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
28796,1.0,4.0,5.0,3.0,6.0,2.0,4.0,1.0,2.0,2.0,2017,1.0,30.770,1.0,18.075,2.0,1.0,3.0,90.0,6.0,1.0,8.0,1.0,4.892,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
28797,2.0,8.0,6.0,6.0,10.0,2.0,6.0,2.0,2.0,2.0,2018,2.0,22.627,2.0,51.005,2.0,2.0,5.0,90.0,8.0,2.0,10.0,0.0,23.157,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
28798,1.0,4.0,3.0,3.0,5.0,1.0,3.0,1.0,1.0,1.0,2018,1.0,58.074,1.0,0.044,1.0,1.0,1.0,90.0,4.0,1.0,5.0,0.0,52.351,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


### Redefine Independent variable

In [58]:
AA02_y_columns, AA02_x_columns = prepare_data(AA02_y, AA02_sample_data_ordinally_encoded.columns.tolist(), [])

## Transformation

In [60]:
import numpy as np
from scipy.stats import boxcox
from sklearn.preprocessing import PowerTransformer # type: ignore
import pandas as pd

# Function to handle transformations based on distribution characteristics
def apply_transformations(AA02_sample_data, columns):
    # Initialize a list to store transformation logs
    AA02_transformation_logs = []

    for column in columns:
        # Compute AA02_skewness and AA02_kurtosis
        AA02_skewness = AA02_sample_data[column].skew()
        AA02_kurtosis = AA02_sample_data[column].kurt()
        AA02_action = "None"  # Default AA02_action

        # Handle Right Skew (Positive Skew)
        if AA02_skewness > 1:
            AA02_action = "Log Transformation"
            AA02_sample_data[column] = np.log1p(AA02_sample_data[column])

        # Handle Left Skew (Negative Skew)
        elif AA02_skewness < -1:
            AA02_action = "Reflect and Log Transformation"
            AA02_sample_data[column] = np.log1p(AA02_sample_data[column].max() - AA02_sample_data[column])

        # Handle High Kurtosis (Heavy Tails)
        if AA02_kurtosis > 3:
            try:
                AA02_action = "Box-Cox Transformation"
                AA02_sample_data[column], _ = boxcox(AA02_sample_data[column].clip(lower=1))
            except ValueError:
                AA02_action = "Box-Cox Failed, Applied Yeo-Johnson"
                transformer = PowerTransformer(method='yeo-johnson')
                AA02_sample_data[column] = transformer.fit_transform(AA02_sample_data[[column]])

        # Handle Low Kurtosis (Light Tails)
        elif AA02_kurtosis < 3 and AA02_action == "None":
            AA02_action = "Yeo-Johnson Transformation"
            transformer = PowerTransformer(method='yeo-johnson')
            AA02_sample_data[column] = transformer.fit_transform(AA02_sample_data[[column]])

        AA02_skewness_after_transformation = AA02_sample_data[column].skew()
        AA02_kurtosis_after_transformation = AA02_sample_data[column].kurt()

        # Append the log entry
        AA02_transformation_logs.append({
            'Column Name': column,
            'Skewness Before Transformation': AA02_skewness,
            'Kurtosis Before Transformationv': AA02_kurtosis,
            'Action Taken': AA02_action,
            'Skewness After Transformation': AA02_skewness_after_transformation,
            'Kurtosis After Transformationv': AA02_kurtosis_after_transformation
        })

    # Create a DataFrame for transformation logs
    transformation_log_AA02_df = pd.DataFrame(AA02_transformation_logs)
    return AA02_sample_data, transformation_log_AA02_df

# Example usage with AA02_sample_data_encoded
AA02_sample_data_encoded = AA02_sample_data_ordinally_encoded.copy()
AA02_sample_data_transformed, AA02_transformation_logs = apply_transformations(AA02_sample_data_encoded, AA02_non_categorical_columns)

# AA02_display the transformation log DataFrame
print("Transformation Log:")
AA02_transformation_logs

Transformation Log:


Unnamed: 0,Column Name,Skewness Before Transformation,Kurtosis Before Transformationv,Action Taken,Skewness After Transformation,Kurtosis After Transformationv
0,hla_match_c_high,-1.367969,0.225165,Reflect and Log Transformation,1.297894,-0.234049
1,hla_high_res_8,-0.972463,-0.73125,Yeo-Johnson Transformation,-0.662376,-1.34679
2,hla_low_res_6,-0.949163,-0.815015,Yeo-Johnson Transformation,-0.673799,-1.369529
3,hla_high_res_6,-0.892156,-0.892141,Yeo-Johnson Transformation,-0.619505,-1.424301
4,hla_high_res_10,-0.998479,-0.648562,Yeo-Johnson Transformation,-0.653897,-1.321393
5,hla_match_dqb1_high,-1.184635,-0.288735,Reflect and Log Transformation,1.121569,-0.671686
6,hla_nmdp_6,-1.0141,-0.67685,Reflect and Log Transformation,0.779403,-1.13905
7,hla_match_c_low,-1.313565,0.045777,Reflect and Log Transformation,1.249757,-0.364855
8,hla_match_drb1_low,-0.954226,-1.089535,Yeo-Johnson Transformation,-0.954226,-1.089535
9,hla_match_dqb1_low,-1.450511,0.518563,Reflect and Log Transformation,1.368966,-0.029414


In [61]:
# Code for AA02_displaying transformed datset
AA02_sample_data_transformed.head(5)

Unnamed: 0,hla_match_c_high,hla_high_res_8,hla_low_res_6,hla_high_res_6,hla_high_res_10,hla_match_dqb1_high,hla_nmdp_6,hla_match_c_low,hla_match_drb1_low,hla_match_dqb1_low,year_hct,hla_match_a_high,donor_age,hla_match_b_low,age_at_hct,hla_match_a_low,hla_match_b_high,comorbidity_score,karnofsky_score,hla_low_res_8,hla_match_drb1_high,hla_low_res_10,efs,efs_time,dri_score_High,dri_score_High - TED AML case <missing cytogenetics,dri_score_Intermediate,dri_score_Intermediate - TED AML case <missing cytogenetics,dri_score_Low,dri_score_Missing disease status,dri_score_N/A - disease not classifiable,dri_score_N/A - non-malignant indication,dri_score_N/A - pediatric,dri_score_TBD cytogenetics,dri_score_Very high,dri_score_nan,psych_disturb_No,psych_disturb_Not done,psych_disturb_Yes,psych_disturb_nan,diabetes_No,diabetes_Not done,diabetes_Yes,diabetes_nan,tbi_status_No TBI,tbi_status_TBI + Cy +- Other,"tbi_status_TBI +- Other, -cGy, fractionated","tbi_status_TBI +- Other, -cGy, single","tbi_status_TBI +- Other, -cGy, unknown dose","tbi_status_TBI +- Other, <=cGy","tbi_status_TBI +- Other, >cGy","tbi_status_TBI +- Other, unknown dose",arrhythmia_No,arrhythmia_Not done,arrhythmia_Yes,arrhythmia_nan,graft_type_Bone marrow,graft_type_Peripheral blood,vent_hist_No,vent_hist_Yes,vent_hist_nan,renal_issue_No,renal_issue_Not done,renal_issue_Yes,renal_issue_nan,pulm_severe_No,pulm_severe_Not done,pulm_severe_Yes,pulm_severe_nan,prim_disease_hct_AI,prim_disease_hct_ALL,prim_disease_hct_AML,prim_disease_hct_CML,prim_disease_hct_HD,prim_disease_hct_HIS,prim_disease_hct_IEA,prim_disease_hct_IIS,prim_disease_hct_IMD,prim_disease_hct_IPA,prim_disease_hct_MDS,prim_disease_hct_MPN,prim_disease_hct_NHL,prim_disease_hct_Other acute leukemia,prim_disease_hct_Other leukemia,prim_disease_hct_PCD,prim_disease_hct_SAA,prim_disease_hct_Solid tumor,cmv_status_+/+,cmv_status_+/-,cmv_status_-/+,cmv_status_-/-,cmv_status_nan,rituximab_No,rituximab_Yes,rituximab_nan,prod_type_BM,prod_type_PB,conditioning_intensity_MAC,"conditioning_intensity_N/A, F(pre-TED) not submitted",conditioning_intensity_NMA,conditioning_intensity_No drugs reported,conditioning_intensity_RIC,conditioning_intensity_TBD,conditioning_intensity_nan,ethnicity_Hispanic or Latino,ethnicity_Non-resident of the U.S.,ethnicity_Not Hispanic or Latino,ethnicity_nan,obesity_No,obesity_Not done,obesity_Yes,obesity_nan,in_vivo_tcd_No,in_vivo_tcd_Yes,in_vivo_tcd_nan,hepatic_severe_No,hepatic_severe_Not done,hepatic_severe_Yes,hepatic_severe_nan,prior_tumor_No,prior_tumor_Not done,prior_tumor_Yes,prior_tumor_nan,peptic_ulcer_No,peptic_ulcer_Not done,peptic_ulcer_Yes,peptic_ulcer_nan,gvhd_proph_CDselect +- other,gvhd_proph_CDselect alone,gvhd_proph_CSA + MMF +- others(not FK),"gvhd_proph_CSA + MTX +- others(not MMF,FK)","gvhd_proph_CSA +- others(not FK,MMF,MTX)",gvhd_proph_CSA alone,gvhd_proph_Cyclophosphamide +- others,gvhd_proph_Cyclophosphamide alone,gvhd_proph_FK+ MMF +- others,gvhd_proph_FK+ MTX +- others(not MMF),"gvhd_proph_FK+- others(not MMF,MTX)",gvhd_proph_FKalone,gvhd_proph_No GvHD Prophylaxis,gvhd_proph_Other GVHD Prophylaxis,"gvhd_proph_Parent Q = yes, but no agent",gvhd_proph_TDEPLETION +- other,gvhd_proph_TDEPLETION alone,gvhd_proph_nan,rheum_issue_No,rheum_issue_Not done,rheum_issue_Yes,rheum_issue_nan,sex_match_F-F,sex_match_F-M,sex_match_M-F,sex_match_M-M,sex_match_nan,race_group_American Indian or Alaska Native,race_group_Asian,race_group_Black or African-American,race_group_More than one race,race_group_Native Hawaiian or other Pacific Islander,race_group_White,hepatic_mild_No,hepatic_mild_Not done,hepatic_mild_Yes,hepatic_mild_nan,donor_related_Multiple donor (non-UCB),donor_related_Related,donor_related_Unrelated,donor_related_nan,melphalan_dose_MEL,"melphalan_dose_N/A, Mel not given",melphalan_dose_nan,cardiac_No,cardiac_Not done,cardiac_Yes,cardiac_nan,pulm_moderate_No,pulm_moderate_Not done,pulm_moderate_Yes,pulm_moderate_nan
0,,,0.760798,0.786157,,0.0,0.0,0.0,0.63089,0.0,1.609438,0.644806,,0.0,-1.35433,0.637627,0.650365,0.0,0.489956,0.0,0.63921,0.0,0.0,1.468783,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.781744,0.760798,0.786157,0.802636,0.0,0.0,0.0,0.63089,0.0,2.564949,0.644806,1.671818,0.0,0.261483,0.637627,0.650365,1.386294,0.489956,0.0,0.63921,0.0,1.0,0.574851,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.781744,0.760798,0.786157,0.802636,0.0,0.0,0.0,0.63089,0.0,0.693147,0.644806,,0.0,-0.185358,0.637627,0.650365,0.0,0.489956,0.0,0.63921,0.0,0.0,1.208258,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.781744,0.760798,0.786157,0.802636,0.0,0.0,0.0,0.63089,0.0,2.484907,0.644806,-0.841334,0.0,0.240523,0.637627,0.650365,0.0,0.489956,0.0,0.63921,0.0,0.0,1.726025,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.781744,0.760798,0.786157,0.802636,0.0,0.693147,0.0,0.63089,0.0,1.098612,0.644806,0.961828,0.0,-0.384692,0.637627,0.650365,0.693147,0.489956,0.0,0.63921,0.0,0.0,1.132878,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


## Scaling

In [63]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler

def scale_dataframe(AA02_df,AA02_y_columns, method='standard'):
    """
    Scales numeric columns of the input DataFrame, excluding binary columns.

    Parameters:
        AA02_df (pd.DataFrame): Input DataFrame to scale.
        method (str): Scaling method, either 'standard' (default) for StandardScaler or 'minmax' for MinMaxScaler.

    Returns:
        pd.DataFrame: Scaled DataFrame with the same column names as the input.
    """
    if not isinstance(AA02_df, pd.DataFrame):
        raise ValueError("Input must be a pandas DataFrame.")

    # Select numeric columns only
    AA02_numeric_cols = AA02_df.select_dtypes(include=['float64', 'int64']).columns

    # Exclude binary columns (those with only two unique values)
    AA02_non_binary_cols = [col for col in AA02_numeric_cols if AA02_df[col].nunique() > 2]

    AA02_non_binary_cols = [
    var for var in AA02_non_binary_cols if var not in AA02_y_columns
]

    if method == 'standard':
        scaler = StandardScaler()
    elif method == 'minmax':
        scaler = MinMaxScaler()
    else:
        raise ValueError("Invalid method. Use 'standard' or 'minmax'.")

    # Scale non-binary numeric columns
    AA02_df_scaled = AA02_df.copy()
    AA02_df_scaled[AA02_non_binary_cols] = scaler.fit_transform(AA02_df[AA02_non_binary_cols])

    return AA02_df_scaled

AA02_sample_data_scaled =  AA02_sample_data_transformed.copy()
AA02_sample_data_scaled = scale_dataframe(AA02_sample_data_scaled, AA02_y_columns)

# Machine Learning

## Train Test Split

In [66]:
from sklearn.model_selection import train_test_split

AA02_sample_data_train_test_split = AA02_sample_data_scaled.copy()

# Extract the target columns (multi-output targets)
AA02_y_data = AA02_sample_data_train_test_split[AA02_y_columns]  # AA02_y_columns should be a list like ['efs', 'efs_time']

# Extract the features (as DataFrame)
AA02_x_data = AA02_sample_data_train_test_split[AA02_x_columns]  # AA02_x_columns is the list of feature column names

# Perform the train-test split
AA02_x_train, AA02_x_test, AA02_y_train, AA02_y_test = train_test_split(
    AA02_x_data, AA02_y_data, test_size=0.1, random_state=55002, stratify=AA02_y_data['efs']  # Stratify by 'efs'
)

## Train test split verification

In [68]:
# Print shapes of training and test sets
print("Shape of AA02_x_data:", AA02_x_data.shape)
print("Shape of AA02_y_data:", AA02_y_data.shape)

Shape of AA02_x_data: (28800, 177)
Shape of AA02_y_data: (28800, 2)


In [69]:
# Print shapes of training and test sets
print("Shape of AA02_x_train:", AA02_x_train.shape)
print("Shape of AA02_x_test:", AA02_x_test.shape)
print("Shape of AA02_y_train:", AA02_y_train.shape)
print("Shape of AA02_y_test:", AA02_y_test.shape)

Shape of AA02_x_train: (25920, 177)
Shape of AA02_x_test: (2880, 177)
Shape of AA02_y_train: (25920, 2)
Shape of AA02_y_test: (2880, 2)


In [70]:
import pandas as pd

# Compute unique value counts for AA02_x_train and AA02_x_test
AA02_unique_values_AA02_x_train = AA02_x_train.nunique()
AA02_unique_values_AA02_x_test = AA02_x_test.nunique()

# Safely compute unique value counts for dependent variables in AA02_y_train and AA02_y_test
AA02_unique_values_AA02_y_train = {}
AA02_unique_values_AA02_y_test = {}

for y in AA02_y_columns:
    if y in AA02_y_train.columns:
        AA02_unique_values_AA02_y_train[y] = AA02_y_train[y].nunique()
    else:
        print(f"Warning: '{y}' not found in AA02_y_train")
    
    if y in AA02_y_test.columns:
        AA02_unique_values_AA02_y_test[y] = AA02_y_test[y].nunique()
    else:
        print(f"Warning: '{y}' not found in AA02_y_test")

# Convert to pandas Series
AA02_unique_values_AA02_y_train = pd.Series(AA02_unique_values_AA02_y_train)
AA02_unique_values_AA02_y_test = pd.Series(AA02_unique_values_AA02_y_test)

# Combine the results into a single DataFrame
AA02_unique_values_AA02_df = pd.DataFrame({
    'AA02_x_train': AA02_unique_values_AA02_x_train,
    'AA02_x_test': AA02_unique_values_AA02_x_test,
    'AA02_y_train': AA02_unique_values_AA02_y_train,
    'AA02_y_test': AA02_unique_values_AA02_y_test
})

# Display the DataFrame
display_full_dataframe(AA02_unique_values_AA02_df)

Unnamed: 0,AA02_x_train,AA02_x_test,AA02_y_train,AA02_y_test
age_at_hct,20360.0,2708.0,,
arrhythmia_No,2.0,2.0,,
arrhythmia_Not done,2.0,2.0,,
arrhythmia_Yes,2.0,2.0,,
arrhythmia_nan,2.0,2.0,,
cardiac_No,2.0,2.0,,
cardiac_Not done,2.0,2.0,,
cardiac_Yes,2.0,2.0,,
cardiac_nan,2.0,2.0,,
cmv_status_+/+,2.0,2.0,,


## Neural Network

### Training

In [92]:
import pandas as pd
from itertools import combinations
from sklearn.metrics import r2_score
from sksurv.metrics import concordance_index_censored
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
import numpy as np
from concurrent.futures import ThreadPoolExecutor
import os

def evaluate_variable_combinations(AA02_x_train, AA02_x_test, AA02_y_train, AA02_y_test, important_variables_list, min_var_fraction=0.9999):
    """
    Evaluate all combinations of variables for a neural network model and calculate performance metrics.

    Parameters:
        AA02_x_train (pd.DataFrame): Training feature set.
        AA02_x_test (pd.DataFrame): Test feature set.
        AA02_y_train (pd.DataFrame): Training target set.
        AA02_y_test (pd.DataFrame): Test target set.
        important_variables_list (list): List of important variables.
        min_var_fraction (float): Minimum fraction of variables to consider in combinations.

    Returns:
        tuple: A tuple containing the best model, and a DataFrame with evaluation metrics and variable presence.
    """
    # Initialize the results list
    results_list = []
    best_model = None
    best_c_index = -float('inf')

    # Determine the minimum number of variables to include
    min_important_vars = max(int(min_var_fraction * len(important_variables_list)), 1)

    def train_model(x_train, y_train, input_shape):
        try:
            input_layer = Input(shape=(input_shape,), name="input_layer")
            shared_layer = Dense(128, activation="relu")(input_layer)
            shared_layer = Dropout(0.3)(shared_layer)
            shared_layer = Dense(64, activation="relu")(shared_layer)
            efs_time_output = Dense(1, activation="linear", name="efs_time_output")(shared_layer)

            model = Model(inputs=input_layer, outputs=efs_time_output)
            model.compile(optimizer="adam", loss="mse", metrics=["mse"])

            model.fit(x_train, y_train, epochs=3, batch_size=64, verbose=0)
            return model
        except Exception as e:
            print(f"Error in train_model: {e}")
            return None

    def evaluate_subset(subset):
        nonlocal best_model, best_c_index

        # Subset the train and test data
        AA02_x_train_subset = AA02_x_train[list(subset)]
        AA02_x_test_subset = AA02_x_test[list(subset)]

        # Convert to tensors
        x_train_tensor = tf.convert_to_tensor(AA02_x_train_subset.values, dtype=tf.float32)
        y_train_tensor = tf.convert_to_tensor(AA02_y_train["efs_time"].values, dtype=tf.float32)

        # Train the model
        model = train_model(x_train_tensor, y_train_tensor, x_train_tensor.shape[1])

        if model is None:
            return None

        # Predict and evaluate
        x_test_tensor = tf.convert_to_tensor(AA02_x_test_subset.values, dtype=tf.float32)
        efs_time_predictions = model.predict(x_test_tensor).ravel()
        event_indicator = AA02_y_test["efs"].values.astype(bool)
        event_time = AA02_y_test["efs_time"].values

        efs_time_cindex = concordance_index_censored(
            event_indicator,  # Event indicator
            event_time,       # Observed survival times
            -efs_time_predictions  # Predicted risk scores
        )
        if isinstance(efs_time_cindex, tuple):
            efs_time_cindex = efs_time_cindex[0]

        efs_time_r2 = r2_score(AA02_y_test["efs_time"], efs_time_predictions)

        # Check if this model is the best so far
        if efs_time_cindex > best_c_index:
            best_c_index = efs_time_cindex
            best_model = model

        # Store results in a dictionary
        result = {
            "C-index": efs_time_cindex,
            "R²": efs_time_r2
        }

        # Add variable presence
        for var in important_variables_list:
            result[var] = 1 if var in subset else 0

        return result

    def safe_evaluate_subset(subset):
        try:
            return evaluate_subset(subset)
        except Exception as e:
            print(f"Error evaluating subset {subset}: {e}")
            return None

    # Generate all combinations of variables
    all_combinations = []
    for r in range(min_important_vars, len(important_variables_list) + 1):
        all_combinations.extend(combinations(important_variables_list, r))

    # Log the total number of models to be trained
    print(f"Total models to be trained: {len(all_combinations)}")

    # Use multithreading to evaluate combinations
    with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
        results_list = list(filter(None, executor.map(safe_evaluate_subset, all_combinations)))

    # Convert results list to DataFrame
    results_df = pd.DataFrame(results_list)

    return best_model, results_df

# Example usage
best_model, results_df = evaluate_variable_combinations(AA02_x_train, AA02_x_test, AA02_y_train, AA02_y_test, AA02_x_train.columns.tolist())

Total models to be trained: 178
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step
[1m90/

### Evaluation

In [94]:
import pandas as pd
from IPython.display import display

def display_full_dataframe(df):
    # Set display options for max columns and rows
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)

    # Display the DataFrame
    display(df)

    # Reset options to defaults after displaying
    pd.reset_option('display.max_columns')
    pd.reset_option('display.max_rows')
display_full_dataframe(results_df)

Unnamed: 0,C-index,R²,hla_match_c_high,hla_high_res_8,hla_low_res_6,hla_high_res_6,hla_high_res_10,hla_match_dqb1_high,hla_nmdp_6,hla_match_c_low,hla_match_drb1_low,hla_match_dqb1_low,year_hct,hla_match_a_high,donor_age,hla_match_b_low,age_at_hct,hla_match_a_low,hla_match_b_high,comorbidity_score,karnofsky_score,hla_low_res_8,hla_match_drb1_high,hla_low_res_10,dri_score_High,dri_score_High - TED AML case <missing cytogenetics,dri_score_Intermediate,dri_score_Intermediate - TED AML case <missing cytogenetics,dri_score_Low,dri_score_Missing disease status,dri_score_N/A - disease not classifiable,dri_score_N/A - non-malignant indication,dri_score_N/A - pediatric,dri_score_TBD cytogenetics,dri_score_Very high,dri_score_nan,psych_disturb_No,psych_disturb_Not done,psych_disturb_Yes,psych_disturb_nan,diabetes_No,diabetes_Not done,diabetes_Yes,diabetes_nan,tbi_status_No TBI,tbi_status_TBI + Cy +- Other,"tbi_status_TBI +- Other, -cGy, fractionated","tbi_status_TBI +- Other, -cGy, single","tbi_status_TBI +- Other, -cGy, unknown dose","tbi_status_TBI +- Other, <=cGy","tbi_status_TBI +- Other, >cGy","tbi_status_TBI +- Other, unknown dose",arrhythmia_No,arrhythmia_Not done,arrhythmia_Yes,arrhythmia_nan,graft_type_Bone marrow,graft_type_Peripheral blood,vent_hist_No,vent_hist_Yes,vent_hist_nan,renal_issue_No,renal_issue_Not done,renal_issue_Yes,renal_issue_nan,pulm_severe_No,pulm_severe_Not done,pulm_severe_Yes,pulm_severe_nan,prim_disease_hct_AI,prim_disease_hct_ALL,prim_disease_hct_AML,prim_disease_hct_CML,prim_disease_hct_HD,prim_disease_hct_HIS,prim_disease_hct_IEA,prim_disease_hct_IIS,prim_disease_hct_IMD,prim_disease_hct_IPA,prim_disease_hct_MDS,prim_disease_hct_MPN,prim_disease_hct_NHL,prim_disease_hct_Other acute leukemia,prim_disease_hct_Other leukemia,prim_disease_hct_PCD,prim_disease_hct_SAA,prim_disease_hct_Solid tumor,cmv_status_+/+,cmv_status_+/-,cmv_status_-/+,cmv_status_-/-,cmv_status_nan,rituximab_No,rituximab_Yes,rituximab_nan,prod_type_BM,prod_type_PB,conditioning_intensity_MAC,"conditioning_intensity_N/A, F(pre-TED) not submitted",conditioning_intensity_NMA,conditioning_intensity_No drugs reported,conditioning_intensity_RIC,conditioning_intensity_TBD,conditioning_intensity_nan,ethnicity_Hispanic or Latino,ethnicity_Non-resident of the U.S.,ethnicity_Not Hispanic or Latino,ethnicity_nan,obesity_No,obesity_Not done,obesity_Yes,obesity_nan,in_vivo_tcd_No,in_vivo_tcd_Yes,in_vivo_tcd_nan,hepatic_severe_No,hepatic_severe_Not done,hepatic_severe_Yes,hepatic_severe_nan,prior_tumor_No,prior_tumor_Not done,prior_tumor_Yes,prior_tumor_nan,peptic_ulcer_No,peptic_ulcer_Not done,peptic_ulcer_Yes,peptic_ulcer_nan,gvhd_proph_CDselect +- other,gvhd_proph_CDselect alone,gvhd_proph_CSA + MMF +- others(not FK),"gvhd_proph_CSA + MTX +- others(not MMF,FK)","gvhd_proph_CSA +- others(not FK,MMF,MTX)",gvhd_proph_CSA alone,gvhd_proph_Cyclophosphamide +- others,gvhd_proph_Cyclophosphamide alone,gvhd_proph_FK+ MMF +- others,gvhd_proph_FK+ MTX +- others(not MMF),"gvhd_proph_FK+- others(not MMF,MTX)",gvhd_proph_FKalone,gvhd_proph_No GvHD Prophylaxis,gvhd_proph_Other GVHD Prophylaxis,"gvhd_proph_Parent Q = yes, but no agent",gvhd_proph_TDEPLETION +- other,gvhd_proph_TDEPLETION alone,gvhd_proph_nan,rheum_issue_No,rheum_issue_Not done,rheum_issue_Yes,rheum_issue_nan,sex_match_F-F,sex_match_F-M,sex_match_M-F,sex_match_M-M,sex_match_nan,race_group_American Indian or Alaska Native,race_group_Asian,race_group_Black or African-American,race_group_More than one race,race_group_Native Hawaiian or other Pacific Islander,race_group_White,hepatic_mild_No,hepatic_mild_Not done,hepatic_mild_Yes,hepatic_mild_nan,donor_related_Multiple donor (non-UCB),donor_related_Related,donor_related_Unrelated,donor_related_nan,melphalan_dose_MEL,"melphalan_dose_N/A, Mel not given",melphalan_dose_nan,cardiac_No,cardiac_Not done,cardiac_Yes,cardiac_nan,pulm_moderate_No,pulm_moderate_Not done,pulm_moderate_Yes,pulm_moderate_nan
0,0.5,-0.0007974736,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
1,0.5,-7.652691e-05,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1
2,0.5,-5.052146e-07,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1
3,0.5,-0.0001350812,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1
4,0.5,-0.0008437245,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1
5,0.5,-1.091723e-06,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1
6,0.5,-0.001645465,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1
7,0.5,-0.001100578,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1
8,0.5,-0.002792911,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1
9,0.5,-0.006038385,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1


### Filtering Important Variables

In [116]:
def filter_important_variables_v2(results_df, important_variables_list, c_index_threshold=None):
    """
    Identify important variables based on their frequency in high C-index models.

    Parameters:
        results_df (pd.DataFrame): DataFrame containing C-index and variables used in training.
        important_variables_list (list): List of all potential important variables.
        c_index_threshold (float, optional): Minimum C-index value to consider a model high-performing.
                                             If None, defaults to the 75th percentile of C-index.

    Returns:
        list: Updated list of important variables.
    """
    if c_index_threshold is None:
        # Default to 75th percentile if no threshold is provided
        c_index_threshold = results_df["C-index"].quantile(0.95)
    
    # Filter rows with C-index above the threshold
    high_performance_models = results_df[results_df["C-index"] >= c_index_threshold]
    
    # Calculate the frequency of each variable being used in high-performing models
    variable_scores = high_performance_models[important_variables_list].sum(axis=0) / len(high_performance_models)
    
    # Retain variables that are used in at least 50% of the high-performing models
    newest_important_variables = variable_scores[variable_scores >= 0.95].index.tolist()
    
    return newest_important_variables

# Example usage
updated_important_variables = filter_important_variables_v2(results_df, AA02_x_train.columns.tolist())

In [118]:
len(updated_important_variables)

177

### Re Training

In [121]:
# Re trian
best_model, updated_results_df = evaluate_variable_combinations(AA02_x_train, AA02_x_test, AA02_y_train, AA02_y_test, 
                                                        updated_important_variables, min_var_fraction=1.0000)


Total models to be trained: 1
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 632us/step


### Re Evaluation

In [123]:
display_full_dataframe(updated_results_df)

Unnamed: 0,C-index,R²,hla_match_c_high,hla_high_res_8,hla_low_res_6,hla_high_res_6,hla_high_res_10,hla_match_dqb1_high,hla_nmdp_6,hla_match_c_low,hla_match_drb1_low,hla_match_dqb1_low,year_hct,hla_match_a_high,donor_age,hla_match_b_low,age_at_hct,hla_match_a_low,hla_match_b_high,comorbidity_score,karnofsky_score,hla_low_res_8,hla_match_drb1_high,hla_low_res_10,dri_score_High,dri_score_High - TED AML case <missing cytogenetics,dri_score_Intermediate,dri_score_Intermediate - TED AML case <missing cytogenetics,dri_score_Low,dri_score_Missing disease status,dri_score_N/A - disease not classifiable,dri_score_N/A - non-malignant indication,dri_score_N/A - pediatric,dri_score_TBD cytogenetics,dri_score_Very high,dri_score_nan,psych_disturb_No,psych_disturb_Not done,psych_disturb_Yes,psych_disturb_nan,diabetes_No,diabetes_Not done,diabetes_Yes,diabetes_nan,tbi_status_No TBI,tbi_status_TBI + Cy +- Other,"tbi_status_TBI +- Other, -cGy, fractionated","tbi_status_TBI +- Other, -cGy, single","tbi_status_TBI +- Other, -cGy, unknown dose","tbi_status_TBI +- Other, <=cGy","tbi_status_TBI +- Other, >cGy","tbi_status_TBI +- Other, unknown dose",arrhythmia_No,arrhythmia_Not done,arrhythmia_Yes,arrhythmia_nan,graft_type_Bone marrow,graft_type_Peripheral blood,vent_hist_No,vent_hist_Yes,vent_hist_nan,renal_issue_No,renal_issue_Not done,renal_issue_Yes,renal_issue_nan,pulm_severe_No,pulm_severe_Not done,pulm_severe_Yes,pulm_severe_nan,prim_disease_hct_AI,prim_disease_hct_ALL,prim_disease_hct_AML,prim_disease_hct_CML,prim_disease_hct_HD,prim_disease_hct_HIS,prim_disease_hct_IEA,prim_disease_hct_IIS,prim_disease_hct_IMD,prim_disease_hct_IPA,prim_disease_hct_MDS,prim_disease_hct_MPN,prim_disease_hct_NHL,prim_disease_hct_Other acute leukemia,prim_disease_hct_Other leukemia,prim_disease_hct_PCD,prim_disease_hct_SAA,prim_disease_hct_Solid tumor,cmv_status_+/+,cmv_status_+/-,cmv_status_-/+,cmv_status_-/-,cmv_status_nan,rituximab_No,rituximab_Yes,rituximab_nan,prod_type_BM,prod_type_PB,conditioning_intensity_MAC,"conditioning_intensity_N/A, F(pre-TED) not submitted",conditioning_intensity_NMA,conditioning_intensity_No drugs reported,conditioning_intensity_RIC,conditioning_intensity_TBD,conditioning_intensity_nan,ethnicity_Hispanic or Latino,ethnicity_Non-resident of the U.S.,ethnicity_Not Hispanic or Latino,ethnicity_nan,obesity_No,obesity_Not done,obesity_Yes,obesity_nan,in_vivo_tcd_No,in_vivo_tcd_Yes,in_vivo_tcd_nan,hepatic_severe_No,hepatic_severe_Not done,hepatic_severe_Yes,hepatic_severe_nan,prior_tumor_No,prior_tumor_Not done,prior_tumor_Yes,prior_tumor_nan,peptic_ulcer_No,peptic_ulcer_Not done,peptic_ulcer_Yes,peptic_ulcer_nan,gvhd_proph_CDselect +- other,gvhd_proph_CDselect alone,gvhd_proph_CSA + MMF +- others(not FK),"gvhd_proph_CSA + MTX +- others(not MMF,FK)","gvhd_proph_CSA +- others(not FK,MMF,MTX)",gvhd_proph_CSA alone,gvhd_proph_Cyclophosphamide +- others,gvhd_proph_Cyclophosphamide alone,gvhd_proph_FK+ MMF +- others,gvhd_proph_FK+ MTX +- others(not MMF),"gvhd_proph_FK+- others(not MMF,MTX)",gvhd_proph_FKalone,gvhd_proph_No GvHD Prophylaxis,gvhd_proph_Other GVHD Prophylaxis,"gvhd_proph_Parent Q = yes, but no agent",gvhd_proph_TDEPLETION +- other,gvhd_proph_TDEPLETION alone,gvhd_proph_nan,rheum_issue_No,rheum_issue_Not done,rheum_issue_Yes,rheum_issue_nan,sex_match_F-F,sex_match_F-M,sex_match_M-F,sex_match_M-M,sex_match_nan,race_group_American Indian or Alaska Native,race_group_Asian,race_group_Black or African-American,race_group_More than one race,race_group_Native Hawaiian or other Pacific Islander,race_group_White,hepatic_mild_No,hepatic_mild_Not done,hepatic_mild_Yes,hepatic_mild_nan,donor_related_Multiple donor (non-UCB),donor_related_Related,donor_related_Unrelated,donor_related_nan,melphalan_dose_MEL,"melphalan_dose_N/A, Mel not given",melphalan_dose_nan,cardiac_No,cardiac_Not done,cardiac_Yes,cardiac_nan,pulm_moderate_No,pulm_moderate_Not done,pulm_moderate_Yes,pulm_moderate_nan
0,0.5,-5.1e-05,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


# Prediction

## Loading Dataset

In [None]:
test_file_path = r'equity-post-HCT-survival-predictions/test.csv'

In [None]:
test_dataframe = pd.read_csv(test_file_path)
test_dataframe

## Imputaion

### Missing Percentage

In [None]:
import pandas as pd

# Function to calculate missing data information
def AA02_missing_data_info(AA02_sample_data):
    # Calculate missing count and percentage
    AA02_missing_count = AA02_sample_data.isnull().sum()
    AA02_missing_percentage = (AA02_missing_count / len(AA02_sample_data)) * 100

    # Create a DataFrame with missing data information
    AA02_missing_info = pd.DataFrame({
        'AA02_Variable': AA02_sample_data.columns,
        'AA02_Missing_Count': AA02_missing_count.values,
        'AA02_Missing_Percentage': AA02_missing_percentage.values
    }).reset_index(drop=True)

    # Format the percentage column
    AA02_missing_info['AA02_Missing_Percentage'] = AA02_missing_info['AA02_Missing_Percentage'].round(2).astype(str) + '%'

    return AA02_missing_info

# Call the function
# AA02_missing_data_info(test_dataframe)

### Categorical imputaion

In [None]:
from sklearn.impute import SimpleImputer # type: ignore

test_dataframe_imputed = test_dataframe.copy()

# Initialize SimpleImputer with most_frequent strategy
AA02_imputer = SimpleImputer(strategy='most_frequent')

if 'efs' in AA02_categorical_columns:
    AA02_categorical_columns.remove('efs')

# Apply imputation
test_dataframe_imputed[AA02_categorical_columns] = AA02_imputer.fit_transform(test_dataframe_imputed[AA02_categorical_columns])

### Non Categorical Imputation 

In [None]:
import warnings
warnings.filterwarnings('ignore')

AA02_non_categorical_columns_test = [
    var for var in AA02_non_categorical_columns if var not in AA02_y_columns
]

test_dataframe_imputed = AA02_impute_columns_with_mean_or_median(test_dataframe_imputed, AA02_non_categorical_columns_test)

In [None]:
test_dataframe_numeric = convert_to_numeric(test_dataframe_imputed)

## Encoding

In [None]:
test_dataframe_imputed_ordinally_encoded = encode_categorical_columns(
    dataframe=test_dataframe_numeric,
    ordinal_columns=AA02_categorical_ordinal_columns,
    nominal_columns=AA02_categorical_nominal_columns,
    use_one_hot_for_nominal=True,  # Set to False to use OrdinalEncoder for nominal columns
    ordinal_categories=AA02_ordinal_categories
)

## Transformation

In [None]:
test_dataframe_transformed, log = apply_transformations(test_dataframe_imputed_ordinally_encoded.copy(), AA02_non_categorical_columns_test) 
test_dataframe_transformed

## Scaling

In [None]:
test_datframe_scaled = scale_dataframe(test_dataframe_transformed, AA02_y_columns)

## Aligining

In [None]:
def align_columns_with_updated_list(updated_important_variables, test_df):
    """
    Align the columns of the test DataFrame to match the updated important variables list by adding missing columns as 0.

    Parameters:
        updated_important_variables (list): The list of updated important variables.
        test_df (pd.DataFrame): The test DataFrame to align.

    Returns:
        pd.DataFrame: The test DataFrame with aligned columns.
    """
    # Identify missing columns in the test DataFrame
    missing_columns = [col for col in updated_important_variables if col not in test_df.columns]

    # Add missing columns to the test DataFrame and set their values to 0
    for col in missing_columns:
        test_df[col] = 0

    # Ensure the column order matches the updated important variables list
    test_df = test_df[updated_important_variables]

    return test_df

# Example usage:
# Align the test DataFrame to the updated important variables list
AA02_x_test_subset = align_columns_with_updated_list(updated_important_variables, test_datframe_scaled)

## Prediction

In [None]:
# Make predictions
efs_time_predictions = best_model.predict(AA02_x_test_subset)

# Convert predictions to a suitable format (flatten the array)
efs_time_predictions = efs_time_predictions.ravel()

# Add predictions to the dataset for reference
test_dataframe_transformed['prediction'] = efs_time_predictions

# Create a DataFrame for ID and predictions
predictions_output = test_dataframe_transformed[['ID', 'prediction']]

# Display the DataFrame with ID and predictions
predictions_output

# Submission

In [None]:
def save_predictions_to_csv(predictions_output, filename="submission.csv"):
    """
    Save the predictions DataFrame to a CSV file.

    Parameters:
        predictions_output (pd.DataFrame): DataFrame containing predictions.
        filename (str): The name of the file to save the predictions.
    """
    predictions_output.to_csv(filename, index=False)

# Example usage:
save_predictions_to_csv(predictions_output, "submission.csv")

# Notebook Closed