# Data Collection

## Installing Libraries

In [4]:
# %pip install pandas
# %pip install scipy
# %pip install numpy
# %pip install matplotlib
# %pip install scikit-learn
# %pip install statsmodels
# %pip install tensorflow
# %pip install scikit-survival
# %pip install xgboost
# %pip install factor-analyzer

In [5]:
import pandas as pd
AA02_data = pd.read_csv(r"equity-post-HCT-survival-predictions\train.csv")

## Checking structure of Data

In [7]:
# AA02_data.info()

# Data Preprocessing

## Data Sampling

In [10]:
AA02_sample_data = AA02_data.copy()

### Supporting Fucntions

In [12]:
def AA02_check_unique_values(dataframe):
    """
    Calculate the number of unique values, total values,
    and percentage of unique values for each column in the DataFrame.

    Args:
        dataframe (pd.DataFrame): The input DataFrame.

    Returns:
        pd.DataFrame: A summary DataFrame with unique value statistics.
    """
    # Calculate unique values, total values, and percentage of unique values
    unique_counts = dataframe.nunique()
    total_counts = dataframe.count()
    percentages = (unique_counts / total_counts) * 100

    # Combine the results into a DataFrame for better AA02_display
    summary_AA02_df = pd.DataFrame({
        'Unique Values': unique_counts,
        'Total Values': total_counts,
        'Percentage (%)': percentages
    })

    return summary_AA02_df
    
import pandas as pd

# Function to calculate missing data information
def AA02_missing_data_info(AA02_sample_data):
    # Calculate missing count and percentage
    AA02_missing_count = AA02_sample_data.isnull().sum()
    AA02_missing_percentage = (AA02_missing_count / len(AA02_sample_data)) * 100

    # Create a DataFrame with missing data information
    AA02_missing_info = pd.DataFrame({
        'AA02_Variable': AA02_sample_data.columns,
        'AA02_Missing_Count': AA02_missing_count.values,
        'AA02_Missing_Percentage': AA02_missing_percentage.values
    }).reset_index(drop=True)

    # Format the percentage column
    AA02_missing_info['AA02_Missing_Percentage'] = AA02_missing_info['AA02_Missing_Percentage'].round(2).astype(str) + '%'

    return AA02_missing_info

import pandas as pd
from IPython.display import display

def display_full_dataframe(df):
    # Set display options for max columns and rows
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)

    # Display the DataFrame
    display(df)

    # Reset options to defaults after displaying
    pd.reset_option('display.max_columns')
    pd.reset_option('display.max_rows')

## Feature engineering

### Supporting Function

In [15]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

def feature_engineering_data(AA02_sample_data_cleaned):
    """
    Perform feature engineering on the dataset by extracting day, month, and year
    from date-like columns. Also removes square brackets and angle brackets from column names.

    Parameters:
    AA02_sample_data_cleaned (pd.DataFrame): The dataset to which feature engineering will be applied.

    Returns:
    pd.DataFrame: The dataset with cleaned column names and additional engineered features.
    """
    # Remove square brackets and angle brackets from column names
    AA02_sample_data_cleaned.columns = [
        col.replace('[', '').replace(']', '').replace('<', '').replace('>', '') 
        for col in AA02_sample_data_cleaned.columns
    ]
    return AA02_sample_data_cleaned


### Function call

In [17]:
# AA02_sample_data = feature_engineering_data(AA02_sample_data)

## Categorizing Variables

In [19]:
# Load the CSV file
file_path = r'equity-post-HCT-survival-predictions\data_dictionary.csv'  # Update with your actual file path
data = pd.read_csv(file_path)

# Extract all variables
AA02_columns = data['variable'].tolist()

In [20]:
# Extract data type from data_dictionary's df
data['type'] = data['type'].str.strip().str.lower()

# Separate categorical and numerical columns
AA02_categorical_columns = data[data['type'] == 'categorical']['variable'].tolist()
AA02_non_categorical_columns = data[data['type'] == 'numerical']['variable'].tolist()

In [21]:
## Divde Categorical variable into two parts ordinal & nominal
AA02_categorical_ordinal_columns = []
AA02_categorical_nominal_columns = AA02_categorical_columns

In [22]:
AA02_y = ['efs', 'efs_time']  # Target variable

## Imputation

### Checking Missing Percentage

#### Variable

In [26]:
import pandas as pd

# Function to calculate missing data information
def AA02_missing_data_info(AA02_sample_data):
    # Calculate missing count and percentage
    AA02_missing_count = AA02_sample_data.isnull().sum()
    AA02_missing_percentage = (AA02_missing_count / len(AA02_sample_data)) * 100

    # Create a DataFrame with missing data information
    AA02_missing_info = pd.DataFrame({
        'AA02_Variable': AA02_sample_data.columns,
        'AA02_Missing_Count': AA02_missing_count.values,
        'AA02_Missing_Percentage': AA02_missing_percentage.values
    }).reset_index(drop=True)

    # Format the percentage column
    AA02_missing_info['AA02_Missing_Percentage'] = AA02_missing_info['AA02_Missing_Percentage'].round(2).astype(str) + '%'

    return AA02_missing_info

In [27]:
display_full_dataframe(AA02_missing_data_info(AA02_sample_data))

Unnamed: 0,AA02_Variable,AA02_Missing_Count,AA02_Missing_Percentage
0,ID,0,0.0%
1,dri_score,154,0.53%
2,psych_disturb,2062,7.16%
3,cyto_score,8068,28.01%
4,diabetes,2119,7.36%
5,hla_match_c_high,4620,16.04%
6,hla_high_res_8,5829,20.24%
7,tbi_status,0,0.0%
8,arrhythmia,2202,7.65%
9,hla_low_res_6,3270,11.35%


##### Supporting Fucntion

In [29]:
import pandas as pd

# Function to omit variables with more than a threshold of missing values and log omitted variables
def AA02_clean_data_with_logging(
    AA02_sample_data,
    AA02_categorical_columns,
    AA02_non_categorical_columns,
    AA02_columns,
    missing_threshold=50
):
    # Calculate missing percentage for each variable
    AA02_missing_percentage = (AA02_sample_data.isnull().sum() / len(AA02_sample_data)) * 100

    # Identify variables to omit (missing percentage > threshold)
    variables_to_omit = AA02_missing_percentage[AA02_missing_percentage > missing_threshold]

    # Create a DataFrame for omitted variables
    omitted_info = []
    for variable, percentage in variables_to_omit.items():
        if variable in AA02_categorical_columns:
            source = "AA02_categorical_columns"
        elif variable in AA02_non_categorical_columns:
            source = "AA02_non_categorical_columns"
        elif variable in AA02_columns:
            source = "AA02_columns"
        else:
            source = "Unknown"

        omitted_info.append({
            "Variable": variable,
            "Missing_Percentage": round(percentage, 2),
            "Omitted_From": source
        })

    # Convert omitted info to DataFrame
    AA02_omitted_df = pd.DataFrame(omitted_info)

    # Identify variables to keep
    variables_to_keep = AA02_missing_percentage[AA02_missing_percentage <= missing_threshold].index.tolist()

    # Filter the dataset
    AA02_sample_data_cleaned = AA02_sample_data[variables_to_keep]

    # Update the lists (only keep variables that are not omitted)
    AA02_columns[:] = [col for col in AA02_columns if col in variables_to_keep]
    AA02_categorical_columns[:] = [col for col in AA02_categorical_columns if col in variables_to_keep]
    AA02_non_categorical_columns[:] = [col for col in AA02_non_categorical_columns if col in variables_to_keep]

    # Print the DataFrame of omitted variables
    print("Variables Omitted Due to Missing Values (> {}%):".format(missing_threshold))
    display_full_dataframe(AA02_omitted_df)

    return AA02_sample_data_cleaned


##### Function Call

In [31]:
AA02_sample_data_cleaned = AA02_clean_data_with_logging(
    AA02_sample_data,
    AA02_categorical_columns,
    AA02_non_categorical_columns,
    AA02_columns,
    missing_threshold=99
)

Variables Omitted Due to Missing Values (> 99%):


In [32]:
# Dropping variables from data frame
# Select variables from AA02_sample_data that are present in AA02_columns
AA02_sample_data_dropped_variable = AA02_sample_data[AA02_columns]

# Display the resulting dataset
AA02_sample_data_dropped_variable.head(3)

Unnamed: 0,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,graft_type,...,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,efs,efs_time
0,N/A - non-malignant indication,No,,No,,,No TBI,No,6.0,Bone marrow,...,,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,42.356
1,Intermediate,No,Intermediate,No,2.0,8.0,"TBI +- Other, >cGy",No,6.0,Peripheral blood,...,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,Yes,10.0,1.0,4.672
2,N/A - non-malignant indication,No,,No,2.0,8.0,No TBI,No,6.0,Bone marrow,...,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,19.793


#### Records

In [34]:
import pandas as pd

def AA02_remove_records_with_missing_values(AA02_sample_data_dropped_variable, percentage):
    """
    This function removes records from the DataFrame where the percentage of missing values 
    exceeds the specified threshold.

    Parameters:
        AA02_sample_data_dropped_variable (pd.DataFrame): The input DataFrame.
        percentage (float): The threshold percentage of missing values.

    Returns:
        pd.DataFrame: Modified DataFrame with records removed.
    """
    # Calculate the threshold for missing values based on the given percentage
    threshold = (percentage / 100) * AA02_sample_data_dropped_variable.shape[1]

    # Identify records with missing values exceeding the threshold
    AA02_records_with_excessive_missing = AA02_sample_data_dropped_variable[AA02_sample_data_dropped_variable.isnull().sum(axis=1) > threshold]

    # Print the records with excessive missing values
    print("Records with more than", percentage, "% missing values:")
    AA02_records_with_excessive_missing

    # Remove those records from the original DataFrame
    AA02_sample_data_dropped_records = AA02_sample_data_dropped_variable.drop(index=AA02_records_with_excessive_missing.index)

    # Return the modified DataFrame
    return AA02_sample_data_dropped_records, AA02_records_with_excessive_missing


AA02_sample_data_dropped_records, AA02_records_with_excessive_missing = AA02_remove_records_with_missing_values(AA02_sample_data_dropped_variable.copy(), 50)
AA02_records_with_excessive_missing


Records with more than 50 % missing values:


Unnamed: 0,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,graft_type,...,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,efs,efs_time
108,Intermediate,,Favorable,,,,No TBI,,,Peripheral blood,...,,Related,,,,,No,,1.0,6.705
109,Low,,,,,,No TBI,,,Peripheral blood,...,,Related,,,,,,,1.0,7.669
772,,,Favorable,No,,,No TBI,,,Bone marrow,...,,Unrelated,"N/A, Mel not given",,,,,,0.0,45.330
896,Intermediate,,Intermediate,,,,No TBI,,,Peripheral blood,...,,Related,,,,2.0,,,0.0,67.147
984,High,,TBD,No,,,TBI + Cy +- Other,,,Peripheral blood,...,,Related,"N/A, Mel not given",,,,,,1.0,4.891
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27584,Intermediate,,Intermediate,,,,No TBI,,,Peripheral blood,...,,Related,"N/A, Mel not given",,,,,,1.0,6.185
27727,TBD cytogenetics,,Intermediate,,,,No TBI,,,Bone marrow,...,,Related,"N/A, Mel not given",,,,,,1.0,4.198
27773,TBD cytogenetics,,Poor,,,,"TBI +- Other, -cGy, unknown dose",,,Peripheral blood,...,,Related,,,,,,,1.0,2.113
28177,N/A - pediatric,,,,2.0,,No TBI,,,Peripheral blood,...,,Unrelated,,,,2.0,,,1.0,6.331


### Categorical Imputation

In [36]:
from sklearn.impute import SimpleImputer # type: ignore

AA02_sample_data_imputed = AA02_sample_data_dropped_variable.copy()

# Initialize SimpleImputer with most_frequent strategy
AA02_imputer = SimpleImputer(strategy='most_frequent')

# Apply imputation
AA02_sample_data_imputed[AA02_categorical_columns] = AA02_imputer.fit_transform(AA02_sample_data_imputed[AA02_categorical_columns])

### Non Categorical Imputation

#### Supporting Function

In [39]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer # type: ignore

def AA02_impute_columns_with_mean_or_median(AA02_df, columns):
    """
    Impute missing values in specified columns with mean or median based on significant difference.

    Parameters:
    AA02_df (pd.DataFrame): The dataframe to impute.
    columns (list): List of column names to impute.

    Returns:
    pd.DataFrame: The updated dataframe after imputation.
    pd.DataFrame: A dataframe with imputation details for each column.
    """
    imputation_details = []

    for col in columns:
        # Ensure column is numeric
        AA02_df[col] = pd.to_numeric(AA02_df[col], errors='coerce')

        # Replace invalid values with NaN
        AA02_df[col].replace([np.inf, -np.inf], np.nan, inplace=True)

        # Skip if column has no missing values
        if AA02_df[col].isnull().sum() == 0:
            imputation_details.append({
                'Variable': col,
                'Imputation Method': 'None (No Missing Values)',
                'Significant Difference': 0,
                'Percentage Difference': 0.00
            })
            continue

        # Calculate mean and median
        AA02_col_mean = AA02_df[col].mean()
        AA02_col_median = AA02_df[col].median()

        # Calculate percentage difference
        percentage_diff = abs(AA02_col_mean - AA02_col_median) / max(abs(AA02_col_mean), abs(AA02_col_median)) * 100
        significant_diff = int(percentage_diff > 10)  # Binary: 1 if significant, 0 otherwise

        # Choose strategy based on significant difference
        if significant_diff:
            imputation_method = 'Median'
            AA02_imputer = SimpleImputer(strategy='median')
        else:
            imputation_method = 'Mean'
            AA02_imputer = SimpleImputer(strategy='mean')

        # Apply the AA02_imputer
        AA02_df[[col]] = AA02_imputer.fit_transform(AA02_df[[col]])

        # Append details to the list
        imputation_details.append({
            'Variable': col,
            'Imputation Method': imputation_method,
            'Significant Difference': significant_diff,
            'Percentage Difference': round(percentage_diff, 2)
        })

    # Create a dataframe with imputation details
    imputation_details_df = pd.DataFrame(imputation_details)

    return AA02_df, imputation_details_df



#### Function call

In [41]:
AA02_sample_data_imputed, imputation_details = AA02_impute_columns_with_mean_or_median(AA02_sample_data_imputed, AA02_non_categorical_columns)
imputation_details

Unnamed: 0,Variable,Imputation Method,Significant Difference,Percentage Difference
0,hla_match_c_high,Median,1,11.77
1,hla_high_res_8,Median,1,14.04
2,hla_low_res_6,Median,1,14.28
3,hla_high_res_6,Median,1,14.85
4,hla_high_res_10,Median,1,13.83
5,hla_match_dqb1_high,Median,1,13.16
6,hla_nmdp_6,Median,1,13.99
7,hla_match_c_low,Median,1,12.11
8,hla_match_drb1_low,Median,1,14.24
9,hla_match_dqb1_low,Median,1,11.31


### Verify Missing Percetage = 0

In [43]:
AA02_missing_data_info(AA02_sample_data_imputed)

Unnamed: 0,AA02_Variable,AA02_Missing_Count,AA02_Missing_Percentage
0,dri_score,0,0.0%
1,psych_disturb,0,0.0%
2,cyto_score,0,0.0%
3,diabetes,0,0.0%
4,hla_match_c_high,0,0.0%
5,hla_high_res_8,0,0.0%
6,tbi_status,0,0.0%
7,arrhythmia,0,0.0%
8,hla_low_res_6,0,0.0%
9,graft_type,0,0.0%


## Covert to numeric if possible

### Supporting Function

In [46]:
import pandas as pd
import numpy as np

def convert_to_numeric(dataframe):
    """
    Converts all columns in the DataFrame into numeric types where possible.
    - Strings will be converted to numeric if feasible.
    - True/False (case insensitive) will be converted to 1 and 0.
    - If a value cannot be converted to numeric, it will remain as is.

    Parameters:
        dataframe (pd.DataFrame): The input DataFrame to convert.

    Returns:
        pd.DataFrame: A DataFrame with numeric conversions applied where possible.
    """
    def safe_convert(value):
        # Handle case-insensitive True/False
        if isinstance(value, str):
            if value.strip().lower() == 'true':
                return 1
            elif value.strip().lower() == 'false':
                return 0
        
        # Try to convert other values to numeric
        try:
            return pd.to_numeric(value, errors='raise')
        except:
            return value

    # Apply safe conversion to all elements in the DataFrame
    dataframe = dataframe.applymap(safe_convert)

    return dataframe

### Function call

In [48]:
# Example usage
AA02_sample_data_numeric = convert_to_numeric(AA02_sample_data_imputed)

## Numerical Encoding

### Total Nominal unique value

In [51]:
def AA02_check_unique_values(dataframe):
    """
    Checks and returns the number of unique values for each column in the DataFrame.

    Parameters:
        dataframe (pd.DataFrame): The input DataFrame.

    Returns:
        pd.DataFrame: A DataFrame with column names and their unique value counts.
    """
    unique_values = dataframe.nunique()
    return pd.DataFrame({"Variable Name": unique_values.index, "Unique Values": unique_values.values})

# Example usage
unique_values_df = AA02_check_unique_values(AA02_sample_data_imputed[AA02_categorical_nominal_columns])
sum_unique_values = unique_values_df["Unique Values"].sum()

print("Sum of all unique values:", sum_unique_values)
display_full_dataframe(AA02_check_unique_values(AA02_sample_data_numeric[AA02_categorical_nominal_columns]))

Sum of all unique values: 163


Unnamed: 0,Variable Name,Unique Values
0,dri_score,11
1,psych_disturb,3
2,cyto_score,7
3,diabetes,3
4,tbi_status,8
5,arrhythmia,3
6,graft_type,2
7,vent_hist,2
8,renal_issue,3
9,pulm_severe,3


### Ordinal/Nominal Encoding

#### Supporting function

In [54]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
import pandas as pd

def encode_categorical_columns(dataframe, ordinal_columns, nominal_columns, use_one_hot_for_nominal=False, ordinal_categories=None):
    """
    Encodes categorical columns in the dataframe using either OrdinalEncoder or OneHotEncoder for nominal columns
    and OrdinalEncoder for ordinal columns.

    Parameters:
        dataframe (pd.DataFrame): The input DataFrame to encode.
        ordinal_columns (list): List of ordinal column names to encode.
        nominal_columns (list): List of nominal column names to encode.
        use_one_hot_for_nominal (bool): If True, use OneHotEncoder for nominal columns. Otherwise, use OrdinalEncoder.
        ordinal_categories (list of lists): The order of categories for ordinal columns. Pass None if not applicable.

    Returns:
        pd.DataFrame: DataFrame with encoded categorical columns.
    """
    # Make a copy of the DataFrame
    dataframe_encoded = dataframe.copy()

    # Initialize OrdinalEncoder for ordinal columns with specified order
    if ordinal_categories:
        ordinal_encoder_ordinal = OrdinalEncoder(categories=ordinal_categories)
    else:
        ordinal_encoder_ordinal = OrdinalEncoder()

    # Encode ordinal columns
    if ordinal_columns:
        dataframe_encoded[ordinal_columns] = ordinal_encoder_ordinal.fit_transform(
            dataframe_encoded[ordinal_columns].astype(str)
        )

    # Encode nominal columns
    if use_one_hot_for_nominal:
        # Exclude numeric columns from one-hot encoding
        nominal_columns_to_encode = [col for col in nominal_columns if not pd.api.types.is_numeric_dtype(dataframe[col])]
        if nominal_columns_to_encode:
            one_hot_encoder = OneHotEncoder(sparse_output=False, drop='first')
            encoded_nominal_columns = one_hot_encoder.fit_transform(dataframe_encoded[nominal_columns_to_encode].astype(str))
            encoded_nominal_df = pd.DataFrame(
                encoded_nominal_columns,
                columns=one_hot_encoder.get_feature_names_out(nominal_columns_to_encode),
                index=dataframe_encoded.index
            )
            dataframe_encoded = dataframe_encoded.drop(nominal_columns_to_encode, axis=1)
            dataframe_encoded = pd.concat([dataframe_encoded, encoded_nominal_df], axis=1)
    else:
        ordinal_encoder_nominal = OrdinalEncoder()
        dataframe_encoded[nominal_columns] = ordinal_encoder_nominal.fit_transform(
            dataframe_encoded[nominal_columns].astype(str)
        )

    return dataframe_encoded

# Example usage
AA02_ordinal_categories = [#['low', 'mid', 'high'] #low is 0 mid is 1 hight is 2
]


#### Function Call

In [56]:
AA02_sample_data_ordinally_encoded = encode_categorical_columns(
    dataframe=AA02_sample_data_numeric,
    ordinal_columns=AA02_categorical_ordinal_columns,
    nominal_columns=AA02_categorical_nominal_columns,
    use_one_hot_for_nominal=True,  # Set to False to use OrdinalEncoder for nominal columns
    ordinal_categories=AA02_ordinal_categories
)
pd.set_option('display.max_columns', None)
display(AA02_sample_data_ordinally_encoded)

Unnamed: 0,hla_match_c_high,hla_high_res_8,hla_low_res_6,hla_high_res_6,hla_high_res_10,hla_match_dqb1_high,hla_nmdp_6,hla_match_c_low,hla_match_drb1_low,hla_match_dqb1_low,year_hct,hla_match_a_high,donor_age,hla_match_b_low,age_at_hct,hla_match_a_low,hla_match_b_high,comorbidity_score,karnofsky_score,hla_low_res_8,hla_match_drb1_high,hla_low_res_10,efs,efs_time,dri_score_High - TED AML case <missing cytogenetics,dri_score_Intermediate,dri_score_Intermediate - TED AML case <missing cytogenetics,dri_score_Low,dri_score_Missing disease status,dri_score_N/A - disease not classifiable,dri_score_N/A - non-malignant indication,dri_score_N/A - pediatric,dri_score_TBD cytogenetics,dri_score_Very high,psych_disturb_Not done,psych_disturb_Yes,cyto_score_Intermediate,cyto_score_Normal,cyto_score_Not tested,cyto_score_Other,cyto_score_Poor,cyto_score_TBD,diabetes_Not done,diabetes_Yes,tbi_status_TBI + Cy +- Other,"tbi_status_TBI +- Other, -cGy, fractionated","tbi_status_TBI +- Other, -cGy, single","tbi_status_TBI +- Other, -cGy, unknown dose","tbi_status_TBI +- Other, <=cGy","tbi_status_TBI +- Other, >cGy","tbi_status_TBI +- Other, unknown dose",arrhythmia_Not done,arrhythmia_Yes,graft_type_Peripheral blood,vent_hist_Yes,renal_issue_Not done,renal_issue_Yes,pulm_severe_Not done,pulm_severe_Yes,prim_disease_hct_ALL,prim_disease_hct_AML,prim_disease_hct_CML,prim_disease_hct_HD,prim_disease_hct_HIS,prim_disease_hct_IEA,prim_disease_hct_IIS,prim_disease_hct_IMD,prim_disease_hct_IPA,prim_disease_hct_MDS,prim_disease_hct_MPN,prim_disease_hct_NHL,prim_disease_hct_Other acute leukemia,prim_disease_hct_Other leukemia,prim_disease_hct_PCD,prim_disease_hct_SAA,prim_disease_hct_Solid tumor,cmv_status_+/-,cmv_status_-/+,cmv_status_-/-,tce_imm_match_G/G,tce_imm_match_H/B,tce_imm_match_H/H,tce_imm_match_P/B,tce_imm_match_P/G,tce_imm_match_P/H,tce_imm_match_P/P,rituximab_Yes,prod_type_PB,cyto_score_detail_Intermediate,cyto_score_detail_Not tested,cyto_score_detail_Poor,cyto_score_detail_TBD,"conditioning_intensity_N/A, F(pre-TED) not submitted",conditioning_intensity_NMA,conditioning_intensity_No drugs reported,conditioning_intensity_RIC,conditioning_intensity_TBD,ethnicity_Non-resident of the U.S.,ethnicity_Not Hispanic or Latino,obesity_Not done,obesity_Yes,mrd_hct_Positive,in_vivo_tcd_Yes,tce_match_GvH non-permissive,tce_match_HvG non-permissive,tce_match_Permissive,hepatic_severe_Not done,hepatic_severe_Yes,prior_tumor_Not done,prior_tumor_Yes,peptic_ulcer_Not done,peptic_ulcer_Yes,gvhd_proph_CDselect alone,gvhd_proph_CSA + MMF +- others(not FK),"gvhd_proph_CSA + MTX +- others(not MMF,FK)","gvhd_proph_CSA +- others(not FK,MMF,MTX)",gvhd_proph_CSA alone,gvhd_proph_Cyclophosphamide +- others,gvhd_proph_Cyclophosphamide alone,gvhd_proph_FK+ MMF +- others,gvhd_proph_FK+ MTX +- others(not MMF),"gvhd_proph_FK+- others(not MMF,MTX)",gvhd_proph_FKalone,gvhd_proph_No GvHD Prophylaxis,gvhd_proph_Other GVHD Prophylaxis,"gvhd_proph_Parent Q = yes, but no agent",gvhd_proph_TDEPLETION +- other,gvhd_proph_TDEPLETION alone,rheum_issue_Not done,rheum_issue_Yes,sex_match_F-M,sex_match_M-F,sex_match_M-M,race_group_Asian,race_group_Black or African-American,race_group_More than one race,race_group_Native Hawaiian or other Pacific Islander,race_group_White,hepatic_mild_Not done,hepatic_mild_Yes,tce_div_match_GvH non-permissive,tce_div_match_HvG non-permissive,tce_div_match_Permissive mismatched,donor_related_Related,donor_related_Unrelated,"melphalan_dose_N/A, Mel not given",cardiac_Not done,cardiac_Yes,pulm_moderate_Not done,pulm_moderate_Yes
0,2.0,8.0,6.0,6.0,10.0,2.0,6.0,2.0,2.0,2.0,2016,2.0,42.511591,2.0,9.942,2.0,2.0,0.0,90.00000,8.0,2.0,10.0,0.0,42.356,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
1,2.0,8.0,6.0,6.0,10.0,2.0,6.0,2.0,2.0,2.0,2008,2.0,72.290000,2.0,43.705,2.0,2.0,3.0,90.00000,8.0,2.0,10.0,1.0,4.672,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
2,2.0,8.0,6.0,6.0,10.0,2.0,6.0,2.0,2.0,2.0,2019,2.0,42.511591,2.0,33.997,2.0,2.0,0.0,90.00000,8.0,2.0,10.0,0.0,19.793,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,2.0,8.0,6.0,6.0,10.0,2.0,6.0,2.0,2.0,2.0,2009,2.0,29.230000,2.0,43.245,2.0,2.0,0.0,90.00000,8.0,2.0,10.0,0.0,102.349,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
4,2.0,8.0,6.0,6.0,10.0,2.0,5.0,2.0,2.0,2.0,2018,2.0,56.810000,2.0,29.740,2.0,2.0,1.0,90.00000,8.0,2.0,10.0,0.0,16.223,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28795,2.0,8.0,6.0,6.0,10.0,2.0,6.0,2.0,2.0,2.0,2018,2.0,24.212000,2.0,51.136,2.0,2.0,0.0,83.83208,8.0,2.0,10.0,0.0,18.633,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
28796,1.0,4.0,5.0,3.0,6.0,2.0,4.0,1.0,2.0,2.0,2017,1.0,30.770000,1.0,18.075,2.0,1.0,3.0,90.00000,6.0,1.0,8.0,1.0,4.892,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
28797,2.0,8.0,6.0,6.0,10.0,2.0,6.0,2.0,2.0,2.0,2018,2.0,22.627000,2.0,51.005,2.0,2.0,5.0,90.00000,8.0,2.0,10.0,0.0,23.157,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
28798,1.0,4.0,3.0,3.0,5.0,1.0,3.0,1.0,1.0,1.0,2018,1.0,58.074000,1.0,0.044,1.0,1.0,1.0,90.00000,4.0,1.0,5.0,0.0,52.351,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


## Transformation

#### Supporting Function

In [59]:
import numpy as np
from scipy.stats import boxcox
from sklearn.preprocessing import PowerTransformer # type: ignore
import pandas as pd

# Function to handle transformations based on distribution characteristics
def apply_transformations(AA02_sample_data, columns):
    # Initialize a list to store transformation logs
    AA02_transformation_logs = []

    for column in columns:
        # Compute AA02_skewness and AA02_kurtosis
        AA02_skewness = AA02_sample_data[column].skew()
        AA02_kurtosis = AA02_sample_data[column].kurt()
        AA02_action = "None"  # Default AA02_action

        # Handle Right Skew (Positive Skew)
        if AA02_skewness > 1:
            AA02_action = "Log Transformation"
            AA02_sample_data[column] = np.log1p(AA02_sample_data[column])

        # Handle Left Skew (Negative Skew)
        elif AA02_skewness < -1:
            AA02_action = "Reflect and Log Transformation"
            AA02_sample_data[column] = np.log1p(AA02_sample_data[column].max() - AA02_sample_data[column])

        # Handle High Kurtosis (Heavy Tails)
        if AA02_kurtosis > 3:
            try:
                AA02_action = "Box-Cox Transformation"
                AA02_sample_data[column], _ = boxcox(AA02_sample_data[column].clip(lower=1))
            except ValueError:
                AA02_action = "Box-Cox Failed, Applied Yeo-Johnson"
                transformer = PowerTransformer(method='yeo-johnson')
                AA02_sample_data[column] = transformer.fit_transform(AA02_sample_data[[column]])

        # Handle Low Kurtosis (Light Tails)
        elif AA02_kurtosis < 3 and AA02_action == "None":
            AA02_action = "Yeo-Johnson Transformation"
            transformer = PowerTransformer(method='yeo-johnson')
            AA02_sample_data[column] = transformer.fit_transform(AA02_sample_data[[column]])

        AA02_skewness_after_transformation = AA02_sample_data[column].skew()
        AA02_kurtosis_after_transformation = AA02_sample_data[column].kurt()

        # Append the log entry
        AA02_transformation_logs.append({
            'Column Name': column,
            'Skewness Before Transformation': AA02_skewness,
            'Kurtosis Before Transformationv': AA02_kurtosis,
            'Action Taken': AA02_action,
            'Skewness After Transformation': AA02_skewness_after_transformation,
            'Kurtosis After Transformationv': AA02_kurtosis_after_transformation
        })

    # Create a DataFrame for transformation logs
    transformation_log_AA02_df = pd.DataFrame(AA02_transformation_logs)
    return AA02_sample_data, transformation_log_AA02_df

#### Function Call

In [61]:
# Example usage with AA02_sample_data_encoded
AA02_sample_data_encoded = AA02_sample_data_ordinally_encoded.copy()
AA02_sample_data_transformed, AA02_transformation_logs = apply_transformations(AA02_sample_data_encoded, AA02_non_categorical_columns)

# AA02_display the transformation log DataFrame
print("Transformation Log:")
AA02_transformation_logs

Transformation Log:


Unnamed: 0,Column Name,Skewness Before Transformation,Kurtosis Before Transformationv,Action Taken,Skewness After Transformation,Kurtosis After Transformationv
0,hla_match_c_high,-1.641248,1.077994,Reflect and Log Transformation,1.569921,0.552994
1,hla_high_res_8,-1.315875,0.072317,Reflect and Log Transformation,1.048433,-0.648534
2,hla_low_res_6,-1.133016,-0.426111,Reflect and Log Transformation,0.923411,-0.927304
3,hla_high_res_6,-1.201192,-0.234368,Reflect and Log Transformation,0.981679,-0.807772
4,hla_high_res_10,-1.431305,0.432026,Reflect and Log Transformation,1.105414,-0.49075
5,hla_match_dqb1_high,-1.492601,0.561152,Reflect and Log Transformation,1.428966,0.118056
6,hla_nmdp_6,-1.253538,-0.12252,Reflect and Log Transformation,1.022901,-0.697168
7,hla_match_c_low,-1.47213,0.501754,Reflect and Log Transformation,1.407888,0.058617
8,hla_match_drb1_low,-1.10282,-0.783842,Reflect and Log Transformation,1.10282,-0.783842
9,hla_match_dqb1_low,-1.69791,1.331203,Reflect and Log Transformation,1.614817,0.711881


In [62]:
# Code for AA02_displaying transformed datset
AA02_sample_data_transformed.head(5)

Unnamed: 0,hla_match_c_high,hla_high_res_8,hla_low_res_6,hla_high_res_6,hla_high_res_10,hla_match_dqb1_high,hla_nmdp_6,hla_match_c_low,hla_match_drb1_low,hla_match_dqb1_low,year_hct,hla_match_a_high,donor_age,hla_match_b_low,age_at_hct,hla_match_a_low,hla_match_b_high,comorbidity_score,karnofsky_score,hla_low_res_8,hla_match_drb1_high,hla_low_res_10,efs,efs_time,dri_score_High - TED AML case <missing cytogenetics,dri_score_Intermediate,dri_score_Intermediate - TED AML case <missing cytogenetics,dri_score_Low,dri_score_Missing disease status,dri_score_N/A - disease not classifiable,dri_score_N/A - non-malignant indication,dri_score_N/A - pediatric,dri_score_TBD cytogenetics,dri_score_Very high,psych_disturb_Not done,psych_disturb_Yes,cyto_score_Intermediate,cyto_score_Normal,cyto_score_Not tested,cyto_score_Other,cyto_score_Poor,cyto_score_TBD,diabetes_Not done,diabetes_Yes,tbi_status_TBI + Cy +- Other,"tbi_status_TBI +- Other, -cGy, fractionated","tbi_status_TBI +- Other, -cGy, single","tbi_status_TBI +- Other, -cGy, unknown dose","tbi_status_TBI +- Other, <=cGy","tbi_status_TBI +- Other, >cGy","tbi_status_TBI +- Other, unknown dose",arrhythmia_Not done,arrhythmia_Yes,graft_type_Peripheral blood,vent_hist_Yes,renal_issue_Not done,renal_issue_Yes,pulm_severe_Not done,pulm_severe_Yes,prim_disease_hct_ALL,prim_disease_hct_AML,prim_disease_hct_CML,prim_disease_hct_HD,prim_disease_hct_HIS,prim_disease_hct_IEA,prim_disease_hct_IIS,prim_disease_hct_IMD,prim_disease_hct_IPA,prim_disease_hct_MDS,prim_disease_hct_MPN,prim_disease_hct_NHL,prim_disease_hct_Other acute leukemia,prim_disease_hct_Other leukemia,prim_disease_hct_PCD,prim_disease_hct_SAA,prim_disease_hct_Solid tumor,cmv_status_+/-,cmv_status_-/+,cmv_status_-/-,tce_imm_match_G/G,tce_imm_match_H/B,tce_imm_match_H/H,tce_imm_match_P/B,tce_imm_match_P/G,tce_imm_match_P/H,tce_imm_match_P/P,rituximab_Yes,prod_type_PB,cyto_score_detail_Intermediate,cyto_score_detail_Not tested,cyto_score_detail_Poor,cyto_score_detail_TBD,"conditioning_intensity_N/A, F(pre-TED) not submitted",conditioning_intensity_NMA,conditioning_intensity_No drugs reported,conditioning_intensity_RIC,conditioning_intensity_TBD,ethnicity_Non-resident of the U.S.,ethnicity_Not Hispanic or Latino,obesity_Not done,obesity_Yes,mrd_hct_Positive,in_vivo_tcd_Yes,tce_match_GvH non-permissive,tce_match_HvG non-permissive,tce_match_Permissive,hepatic_severe_Not done,hepatic_severe_Yes,prior_tumor_Not done,prior_tumor_Yes,peptic_ulcer_Not done,peptic_ulcer_Yes,gvhd_proph_CDselect alone,gvhd_proph_CSA + MMF +- others(not FK),"gvhd_proph_CSA + MTX +- others(not MMF,FK)","gvhd_proph_CSA +- others(not FK,MMF,MTX)",gvhd_proph_CSA alone,gvhd_proph_Cyclophosphamide +- others,gvhd_proph_Cyclophosphamide alone,gvhd_proph_FK+ MMF +- others,gvhd_proph_FK+ MTX +- others(not MMF),"gvhd_proph_FK+- others(not MMF,MTX)",gvhd_proph_FKalone,gvhd_proph_No GvHD Prophylaxis,gvhd_proph_Other GVHD Prophylaxis,"gvhd_proph_Parent Q = yes, but no agent",gvhd_proph_TDEPLETION +- other,gvhd_proph_TDEPLETION alone,rheum_issue_Not done,rheum_issue_Yes,sex_match_F-M,sex_match_M-F,sex_match_M-M,race_group_Asian,race_group_Black or African-American,race_group_More than one race,race_group_Native Hawaiian or other Pacific Islander,race_group_White,hepatic_mild_Not done,hepatic_mild_Yes,tce_div_match_GvH non-permissive,tce_div_match_HvG non-permissive,tce_div_match_Permissive mismatched,donor_related_Related,donor_related_Unrelated,"melphalan_dose_N/A, Mel not given",cardiac_Not done,cardiac_Yes,pulm_moderate_Not done,pulm_moderate_Yes
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.609438,0.0,0.133385,0.0,-1.35433,0.0,0.0,0.0,0.502736,0.0,0.0,0.0,0.0,1.468783,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.564949,0.0,1.735661,0.0,0.261483,0.0,0.0,1.386294,0.502736,0.0,0.0,0.0,1.0,0.574851,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.0,0.133385,0.0,-0.185358,0.0,0.0,0.0,0.502736,0.0,0.0,0.0,0.0,1.208258,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.484907,0.0,-0.880811,0.0,0.240523,0.0,0.0,0.0,0.502736,0.0,0.0,0.0,0.0,1.726025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.0,0.0,0.0,1.098612,0.0,0.983534,0.0,-0.384692,0.0,0.0,0.693147,0.502736,0.0,0.0,0.0,0.0,1.132878,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


## Scaling

#### Supporting function

In [65]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler

def scale_dataframe(AA02_df,AA02_y_columns, method='standard'):
    """
    Scales numeric columns of the input DataFrame, excluding binary columns.

    Parameters:
        AA02_df (pd.DataFrame): Input DataFrame to scale.
        method (str): Scaling method, either 'standard' (default) for StandardScaler or 'minmax' for MinMaxScaler.

    Returns:
        pd.DataFrame: Scaled DataFrame with the same column names as the input.
    """
    if not isinstance(AA02_df, pd.DataFrame):
        raise ValueError("Input must be a pandas DataFrame.")

    # Select numeric columns only
    AA02_numeric_cols = AA02_df.select_dtypes(include=['float64', 'int64']).columns

    # Exclude binary columns (those with only two unique values)
    AA02_non_binary_cols = [col for col in AA02_numeric_cols if AA02_df[col].nunique() > 2]

    AA02_non_binary_cols = [
    var for var in AA02_non_binary_cols if var not in AA02_y_columns
]

    if method == 'standard':
        scaler = StandardScaler()
    elif method == 'minmax':
        scaler = MinMaxScaler()
    else:
        raise ValueError("Invalid method. Use 'standard' or 'minmax'.")

    # Scale non-binary numeric columns
    AA02_df_scaled = AA02_df.copy()
    AA02_df_scaled[AA02_non_binary_cols] = scaler.fit_transform(AA02_df[AA02_non_binary_cols])

    return AA02_df_scaled



#### Function Call

In [67]:
AA02_sample_data_scaled =  AA02_sample_data_transformed.copy()

# Machine Learning

## Dependent/Independent variables

### Supporting Function

In [71]:
def prepare_data(AA02_y_vars, AA02_cat_vars, AA02_non_cat_vars):
    """
    Prepares the data by calculating the feature set (AA02_x) while excluding dependent variables.

    Args:
    - AA02_y_vars: A list of dependent variable names (can handle multiple dependent variables).
    - AA02_cat_vars: A list of categorical variable names.
    - AA02_non_cat_vars: A list of non-categorical variable names.

    Returns:
    - AA02_y_vars: A list of dependent variable names.
    - AA02_x: A list of feature variable names, excluding dependent variables.
    """
    # Combine categorical and non-categorical variable lists
    AA02_all_vars = AA02_cat_vars + AA02_non_cat_vars

    # Ensure `AA02_y_vars` is a list for consistency
    if isinstance(AA02_y_vars, str):
        AA02_y_vars = [AA02_y_vars]

    # Calculate the feature set (x) as the difference between AA02_all_vars and y_vars
    AA02_x = [AA02_var for AA02_var in AA02_all_vars if AA02_var not in AA02_y_vars]

    return AA02_y_vars, AA02_x

### Function Call

In [73]:
AA02_y = ['efs', 'efs_time']

AA02_y_columns, AA02_x_columns = prepare_data(AA02_y, AA02_sample_data_scaled.columns.tolist(), [])

print("Target (y):", AA02_y_columns)
# print("Feature Set (x):", AA02_x_columns)

Target (y): ['efs', 'efs_time']


## Train Test Split

In [75]:
from sklearn.model_selection import train_test_split

AA02_sample_data_train_test_split = AA02_sample_data_scaled.copy()

# Extract the target columns (multi-output targets)
AA02_y_data = AA02_sample_data_train_test_split[AA02_y_columns]  # AA02_y_columns should be a list like ['efs', 'efs_time']

# Extract the features (as DataFrame)
AA02_x_data = AA02_sample_data_train_test_split[AA02_x_columns]  # AA02_x_columns is the list of feature column names

# Perform the train-test split
AA02_x_train, AA02_x_test, AA02_y_train, AA02_y_test = train_test_split(
    AA02_x_data, AA02_y_data, test_size=0.1, random_state=55002, stratify=AA02_y_data['efs']  # Stratify by 'efs'
)

## Train test split verification

### Supporting Function

In [78]:
import pandas as pd

# Compute unique value counts for AA02_x_train and AA02_x_test
AA02_unique_values_AA02_x_train = AA02_x_train.nunique()
AA02_unique_values_AA02_x_test = AA02_x_test.nunique()

# Safely compute unique value counts for dependent variables in AA02_y_train and AA02_y_test
AA02_unique_values_AA02_y_train = {}
AA02_unique_values_AA02_y_test = {}

for y in AA02_y_columns:
    if y in AA02_y_train.columns:
        AA02_unique_values_AA02_y_train[y] = AA02_y_train[y].nunique()
    else:
        print(f"Warning: '{y}' not found in AA02_y_train")
    
    if y in AA02_y_test.columns:
        AA02_unique_values_AA02_y_test[y] = AA02_y_test[y].nunique()
    else:
        print(f"Warning: '{y}' not found in AA02_y_test")

# Convert to pandas Series
AA02_unique_values_AA02_y_train = pd.Series(AA02_unique_values_AA02_y_train)
AA02_unique_values_AA02_y_test = pd.Series(AA02_unique_values_AA02_y_test)

### Function Call

In [80]:
# Combine the results into a single DataFrame
AA02_unique_values_AA02_df = pd.DataFrame({
    'AA02_x_train': AA02_unique_values_AA02_x_train,
    'AA02_x_test': AA02_unique_values_AA02_x_test,
    'AA02_y_train': AA02_unique_values_AA02_y_train,
    'AA02_y_test': AA02_unique_values_AA02_y_test
})

# Display the DataFrame
display_full_dataframe(AA02_unique_values_AA02_df)

Unnamed: 0,AA02_x_train,AA02_x_test,AA02_y_train,AA02_y_test
age_at_hct,20360.0,2708.0,,
arrhythmia_Not done,2.0,2.0,,
arrhythmia_Yes,2.0,2.0,,
cardiac_Not done,2.0,2.0,,
cardiac_Yes,2.0,2.0,,
cmv_status_+/-,2.0,2.0,,
cmv_status_-/+,2.0,2.0,,
cmv_status_-/-,2.0,2.0,,
comorbidity_score,11.0,11.0,,
"conditioning_intensity_N/A, F(pre-TED) not submitted",2.0,2.0,,


In [81]:
AA02_x_train.columns = [
    col.replace('[', '').replace(']', '').replace('<', '').replace('>', '') 
    for col in AA02_x_train.columns]
AA02_x_test.columns = [
    col.replace('[', '').replace(']', '').replace('<', '').replace('>', '') 
    for col in AA02_x_test.columns]

## XGBoost

### Training

#### Function

In [85]:
{'n_estimators': 50,
 'learning_rate': 0.01,
 'max_depth': 2,
 'subsample': 0.6,
 'colsample_bytree': 0.6,
 'gamma': 0.0,
 'min_child_weight': 1,
 'reg_alpha': 0.0,
 'reg_lambda': 1.0}

{'n_estimators': 50,
 'learning_rate': 0.01,
 'max_depth': 2,
 'subsample': 0.6,
 'colsample_bytree': 0.6,
 'gamma': 0.0,
 'min_child_weight': 1,
 'reg_alpha': 0.0,
 'reg_lambda': 1.0}

In [86]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
from sksurv.metrics import concordance_index_censored
import pandas as pd
import os
import psutil

# Get the number of physical cores
physical_cores = psutil.cpu_count(logical=False)

# Set the process affinity to the first `physical_cores` CPUs
process = psutil.Process(os.getpid())
cpu_affinity = list(range(physical_cores))  # Only use physical cores (0 to 15)
process.cpu_affinity(cpu_affinity)

n_jobs = max(1, physical_cores - 1) # leave one core

xgb_param_grid = {
    'n_estimators': [500],  # Broad range to capture different levels of model complexity
    'learning_rate': [0.1],  # Covers small to moderately high learning rates
    'max_depth': [2],  # Gradual increase in depth to avoid overfitting
    'subsample': [0.8],  # Typical values for subsampling data
    'colsample_bytree': [0.6],  # Fraction of features to consider
    'gamma': [0.1],  # Minimum loss reduction for a split
    'min_child_weight': [7],  # Controls overfitting and tree size
    'reg_alpha': [0.5],  # L1 regularization, effective for feature sparsity
    'reg_lambda': [1.5]  # L2 regularization for controlling complexity
}

# Initialize RandomizedSearchCV
xgb_random_search = RandomizedSearchCV(
    estimator=XGBRegressor(random_state=None, 
                           tree_method='gpu_hist', 
                           predictor='gpu_predictor', 
                           verbosity=2
    ),
    param_distributions=xgb_param_grid,
    n_iter=1000,  # Number of random parameter combinations
    cv=5,  # Number of cross-validation folds
    scoring='neg_mean_squared_error',
    n_jobs=n_jobs,  # Use defined cores
    random_state=None,
    verbose = 3
)

print("Training...")
import numpy as np
from sksurv.metrics import concordance_index_censored

y_train_time = AA02_y_train['efs_time']
y_test_time = AA02_y_test['efs_time']

def calculate_cindex(model, X_test, y_test_event, y_test_time):
    predictions = model.predict(X_test)
    c_index = concordance_index_censored(
        y_test_event.astype(bool),
        y_test_time,
        -predictions
    )[0]
    return c_index

xgb_random_search.fit(AA02_x_train, y_train_time)
best_xgb_model = xgb_random_search.best_estimator_
print("Model Trained!")

Training...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Model Trained!


In [87]:
print("Calculating Metrics...")
# Store all parameters and their corresponding c-index
all_results = []
for i in range(len(xgb_random_search.cv_results_['params'])):
    params = xgb_random_search.cv_results_['params'][i]
    mean_score = xgb_random_search.cv_results_['mean_test_score'][i]

    # Extract individual parameter values
    n_estimators = params.get('n_estimators', None)
    learning_rate = params.get('learning_rate', None)
    max_depth = params.get('max_depth', None)
    subsample = params.get('subsample', None)
    colsample_bytree = params.get('colsample_bytree', None)
    gamma = params.get('gamma', None)
    min_child_weight = params.get('min_child_weight', None)
    reg_alpha = params.get('reg_alpha', None)
    reg_lambda = params.get('reg_lambda', None)

    # Use the already trained model to calculate c-index using GPU
    model = best_xgb_model  # Use the pre-trained model
    model.set_params(predictor='gpu_predictor')  # Ensure GPU is used for predictions
    predictions = model.predict(AA02_x_test)
    c_index = concordance_index_censored(
        AA02_y_test['efs'].astype(bool),
        y_test_time,
        -predictions
    )[0]

    all_results.append({
        "n_estimators": n_estimators,
        "learning_rate": learning_rate,
        "max_depth": max_depth,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "gamma": gamma,
        "min_child_weight": min_child_weight,
        "reg_alpha": reg_alpha,
        "reg_lambda": reg_lambda,
        "Mean Test Score": mean_score,
        "C-Index": c_index
    })
    print(i, end="|",)
        
print("\n")
# Create a DataFrame to store all results
results_df = pd.DataFrame(all_results)

# Normalize c-index values using softmax
def softmax(values):
    exp_values = np.exp(values - np.max(values))  # Subtract max for numerical stability
    return exp_values / exp_values.sum()

# Calculate softmax of the c-index
results_df["C-Index Softmax"] = softmax(results_df["C-Index"])

# Identify the parameter set with the highest c-index softmax value
best_params = results_df.loc[results_df["C-Index Softmax"].idxmax()]

# Calculate individual best parameters
individual_best_params = {}
for param in xgb_param_grid.keys():
    grouped = results_df.groupby(param)["C-Index"].mean()  # Average C-Index for each parameter value
    best_value = grouped.idxmax()  # Get the parameter value with the highest average C-Index
    individual_best_params[param] = best_value
print("Metrics Calculated")

Calculating Metrics...
0|

Metrics Calculated


### Evaluation

In [88]:
print("All Results:")
display_full_dataframe(results_df)

All Results:


Unnamed: 0,n_estimators,learning_rate,max_depth,subsample,colsample_bytree,gamma,min_child_weight,reg_alpha,reg_lambda,Mean Test Score,C-Index,C-Index Softmax
0,500,0.1,2,0.8,0.6,0.1,7,0.5,1.5,-0.146626,0.654714,1.0


# Prediction

## Loading Dataset

In [93]:
test_file_path = r'equity-post-HCT-survival-predictions/test.csv'

In [94]:
test_dataframe = pd.read_csv(test_file_path)
test_dataframe

Unnamed: 0,ID,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,...,karnofsky_score,hepatic_mild,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10
0,28800,N/A - non-malignant indication,No,,No,,,No TBI,No,6.0,...,90.0,No,,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0
1,28801,Intermediate,No,Intermediate,No,2.0,8.0,"TBI +- Other, >cGy",No,6.0,...,90.0,No,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,Yes,10.0
2,28802,N/A - non-malignant indication,No,,No,2.0,8.0,No TBI,No,6.0,...,90.0,No,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,No,10.0


## Imputaion

### Missing Percentage

In [97]:
import pandas as pd

# Function to calculate missing data information
def AA02_missing_data_info(AA02_sample_data):
    # Calculate missing count and percentage
    AA02_missing_count = AA02_sample_data.isnull().sum()
    AA02_missing_percentage = (AA02_missing_count / len(AA02_sample_data)) * 100

    # Create a DataFrame with missing data information
    AA02_missing_info = pd.DataFrame({
        'AA02_Variable': AA02_sample_data.columns,
        'AA02_Missing_Count': AA02_missing_count.values,
        'AA02_Missing_Percentage': AA02_missing_percentage.values
    }).reset_index(drop=True)

    # Format the percentage column
    AA02_missing_info['AA02_Missing_Percentage'] = AA02_missing_info['AA02_Missing_Percentage'].round(2).astype(str) + '%'

    return AA02_missing_info

# Call the function
# AA02_missing_data_info(test_dataframe)

### Categorical imputaion

In [99]:
from sklearn.impute import SimpleImputer # type: ignore

test_dataframe_imputed = test_dataframe.copy()

# Initialize SimpleImputer with most_frequent strategy
AA02_imputer = SimpleImputer(strategy='most_frequent')

if 'efs' in AA02_categorical_columns:
    AA02_categorical_columns.remove('efs')

# Apply imputation
test_dataframe_imputed[AA02_categorical_columns] = AA02_imputer.fit_transform(test_dataframe_imputed[AA02_categorical_columns])

### Non Categorical Imputation 

In [101]:
import warnings
warnings.filterwarnings('ignore')

AA02_non_categorical_columns_test = [
    var for var in AA02_non_categorical_columns if var not in AA02_y_columns
]

test_dataframe_imputed, temp = AA02_impute_columns_with_mean_or_median(test_dataframe_imputed, AA02_non_categorical_columns_test)

In [102]:
test_dataframe_numeric = convert_to_numeric(test_dataframe_imputed)

## Encoding

In [104]:
test_dataframe_imputed_ordinally_encoded = encode_categorical_columns(
    dataframe=test_dataframe_numeric,
    ordinal_columns=AA02_categorical_ordinal_columns,
    nominal_columns=AA02_categorical_nominal_columns,
    use_one_hot_for_nominal=True,  # Set to False to use OrdinalEncoder for nominal columns
    ordinal_categories=AA02_ordinal_categories
)

## Transformation

In [106]:
test_dataframe_transformed, log = apply_transformations(test_dataframe_imputed_ordinally_encoded.copy(), AA02_non_categorical_columns_test) 
display_full_dataframe(test_dataframe_transformed)

Unnamed: 0,ID,hla_match_c_high,hla_high_res_8,hla_low_res_6,hla_high_res_6,hla_high_res_10,hla_match_dqb1_high,hla_nmdp_6,hla_match_c_low,hla_match_drb1_low,hla_match_dqb1_low,year_hct,hla_match_a_high,donor_age,hla_match_b_low,age_at_hct,hla_match_a_low,hla_match_b_high,comorbidity_score,karnofsky_score,hla_low_res_8,hla_match_drb1_high,hla_low_res_10,dri_score_N/A - non-malignant indication,"tbi_status_TBI +- Other, >cGy",graft_type_Peripheral blood,prim_disease_hct_HIS,prim_disease_hct_IEA,prod_type_PB,in_vivo_tcd_Yes,gvhd_proph_FKalone,gvhd_proph_Other GVHD Prophylaxis,sex_match_F-M,sex_match_M-F,race_group_More than one race,donor_related_Unrelated,pulm_moderate_Yes
0,28800,2.0,8.0,6.0,6.0,10.0,2.0,6.0,2.0,2.0,2.0,1.386294,2.0,72.29,2.0,3.548554,2.0,2.0,0.0,90.0,8.0,2.0,10.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
1,28801,2.0,8.0,6.0,6.0,10.0,2.0,6.0,2.0,2.0,2.0,2.484907,2.0,72.29,2.0,0.0,2.0,2.0,1.386294,90.0,8.0,2.0,10.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,28802,2.0,8.0,6.0,6.0,10.0,2.0,6.0,2.0,2.0,2.0,0.0,2.0,72.29,2.0,2.370991,2.0,2.0,0.0,90.0,8.0,2.0,10.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


## Scaling

In [108]:
test_datframe_scaled = scale_dataframe(test_dataframe_transformed, AA02_y_columns)

## Aligining

In [110]:
def align_columns_with_updated_list(updated_important_variables, test_df):
    """
    Align the columns of the test DataFrame to match the updated important variables list by adding missing columns as 0.

    Parameters:
        updated_important_variables (list): The list of updated important variables.
        test_df (pd.DataFrame): The test DataFrame to align.

    Returns:
        pd.DataFrame: The test DataFrame with aligned columns.
    """
    # Identify missing columns in the test DataFrame
    missing_columns = [col for col in updated_important_variables if col not in test_df.columns]

    # Add missing columns to the test DataFrame and set their values to 0
    for col in missing_columns:
        test_df[col] = 0

    # Ensure the column order matches the updated important variables list
    test_df = test_df[updated_important_variables]

    return test_df

# Example usage:
# Align the test DataFrame to the updated important variables list
AA02_x_test_subset = align_columns_with_updated_list(AA02_x_columns, test_datframe_scaled)

In [119]:
AA02_x_test_subset.columns = [
    col.replace('[', '').replace(']', '').replace('<', '').replace('>', '') 
    for col in AA02_x_test_subset.columns]

## Prediction

In [122]:
# Make predictions
efs_time_predictions = best_xgb_model.predict(AA02_x_test_subset)

# Convert predictions to a suitable format (flatten the array)
efs_time_predictions = efs_time_predictions.ravel()

# Add predictions to the dataset for reference
test_dataframe_transformed['prediction'] = efs_time_predictions

# Create a DataFrame for ID and predictions
predictions_output = test_dataframe_transformed[['ID', 'prediction']]

# Display the DataFrame with ID and predictions
predictions_output

Unnamed: 0,ID,prediction
0,28800,1.064029
1,28801,1.114983
2,28802,1.204271


# Submission

In [124]:
def save_predictions_to_csv(predictions_output, filename="submission.csv"):
    """
    Save the predictions DataFrame to a CSV file.

    Parameters:
        predictions_output (pd.DataFrame): DataFrame containing predictions.
        filename (str): The name of the file to save the predictions.
    """
    predictions_output.to_csv(filename, index=False)

# Example usage:
save_predictions_to_csv(predictions_output, "submission.csv")

# Notebook Closed