In [1]:
# Imports
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix

In [2]:
class DataParser:
    def __init__(self, file_path, label_col=None, encoder = None):
        """
        Initialize the DataParser.

        :param file_path: Path to the CSV file (training or test).
        :param label_col: The label column name for training data; None for test data.
        """
        # Load the data
        data_df = pd.read_csv(file_path)
        self.encoder = encoder
        self.test = False

        # Extract features and labels (if present)
        if label_col:
            self.data_features = data_df.drop(columns=[label_col, 'efs_time'], errors='ignore')
            self.data_label = data_df[label_col].to_numpy()
        else:
            self.data_features = data_df
            self.data_label = None

    def normalize_data(self, binary_cols=None, ordinal_cols=None):
        """
        Normalize and preprocess data, handling binary, ordinal, and other categorical features.
    
        :param binary_cols: List of binary columns to encode.
        :param ordinal_cols: List of ordinal columns to encode.
        """
        if binary_cols is None:
            binary_cols = []
        if ordinal_cols is None:
            ordinal_cols = []
    
        # Treat missing values
        self.data_features = self.data_features.fillna({
            **{col: 'Missing' for col in binary_cols + ordinal_cols},  # Placeholder for binary/ordinal categorical
            **{col: self.data_features[col].mean() for col in self.data_features.select_dtypes(include=['int64', 'float64']).columns}  # Mean for numerical
        })
    
        # Handle binary data
        if binary_cols:
            self.data_features[binary_cols] = self.data_features[binary_cols].replace({
                'No': 0, 'Yes': 1, 'Not done': -1, 'nan': -1, 'Missing':-1
            })
    
        # Handle ordinal data
        if ordinal_cols:
            ordinal_encoder = OrdinalEncoder()
            self.data_features[ordinal_cols] = ordinal_encoder.fit_transform(self.data_features[ordinal_cols])
    
        # Identify remaining categorical columns (not binary or ordinal)
        categorical_cols = self.data_features.select_dtypes(include=['object']).columns
        other_categorical_cols = list(set(categorical_cols) - set(binary_cols) - set(ordinal_cols))
    
        # Handle remaining categorical data with one-hot encoding
        if (other_categorical_cols and not self.encoder):
            ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
            encoded_nominal = ohe.fit_transform(self.data_features[other_categorical_cols])
            nominal_df = pd.DataFrame(encoded_nominal, columns=ohe.get_feature_names_out(other_categorical_cols))
            self.data_features = self.data_features.drop(columns=other_categorical_cols)
            self.data_features = pd.concat([self.data_features, nominal_df], axis=1)
            self.encoder = ohe
        elif self.test:
            ohe = self.encoder
            encoded_nominal = ohe.transform(self.data_features[other_categorical_cols])
            nominal_df = pd.DataFrame(encoded_nominal, columns=ohe.get_feature_names_out(other_categorical_cols))
            self.data_features = self.data_features.drop(columns=other_categorical_cols)
            self.data_features = pd.concat([self.data_features, nominal_df], axis=1)
    
        # Normalize numerical features
        numerical_features = self.data_features.select_dtypes(include=['int64', 'float64']).columns
        if len(numerical_features) > 0:
            scaler = StandardScaler()
            self.data_features[numerical_features] = scaler.fit_transform(self.data_features[numerical_features])
    
        self.data_features = self.data_features.to_numpy()

In [3]:
# Load the train.csv into a DataFrame
train_file_path = "/kaggle/input/equity-post-HCT-survival-predictions/train.csv" 

binary_cols = ['diabetes', 'psych_disturb', 'arrhythmia']
ordinal_cols = ['cyto_score', 'dri_score']

data_parser = DataParser(file_path=train_file_path,label_col='efs')
data_parser.normalize_data(binary_cols=binary_cols, ordinal_cols=ordinal_cols)




# Access parsed data
print("Features shape:", data_parser.data_features.shape)
print("Labels shape:", data_parser.data_label.shape)



  self.data_features[binary_cols] = self.data_features[binary_cols].replace({


Features shape: (28800, 187)
Labels shape: (28800,)


In [4]:
# Split the data into training and testing sets
X, y = data_parser.data_features, data_parser.data_label

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=240402, stratify=y)

# Train the Logistic Regression model
clf = LogisticRegression(random_state=0, max_iter=1000)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)[:, 1]  # Get probabilities for the positive class (efs = 1)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.6859
ROC AUC Score: 0.7501
Confusion Matrix:
[[1539 1115]
 [ 694 2412]]


In [5]:
clf_for = RandomForestClassifier(max_depth=100, random_state=0)
clf_for.fit(X_train, y_train)

y_pred = clf_for.predict(X_test)
y_pred_proba = clf_for.predict_proba(X_test)[:, 1]  # Get probabilities for the positive class (efs = 1)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.6773
ROC AUC Score: 0.7334
Confusion Matrix:
[[1466 1188]
 [ 671 2435]]


In [6]:
test_file_path = "/kaggle/input/equity-post-HCT-survival-predictions/test.csv"

test_parser = DataParser(file_path=test_file_path)
test_parser.encoder = data_parser.encoder
test_parser.test = True
print(test_parser.test)
test_parser.normalize_data(binary_cols=binary_cols, ordinal_cols=ordinal_cols)


# Access parsed data
print("Features shape:", test_parser.data_features.shape)

True
Features shape: (3, 187)


  self.data_features[binary_cols] = self.data_features[binary_cols].replace({


In [7]:

# Process test data
test_df = pd.read_csv(test_file_path)

processed_test_features = test_parser.data_features


# Predict probabilities for test data
test_pred_proba = clf_for.predict_proba(processed_test_features)[:, 1]
print(test_pred_proba)

# Prepare the submission file
submission_df = pd.DataFrame({
    'ID': test_df['ID'],  # Ensure 'id' exists in the test dataset
    'prediction': test_pred_proba  # Predicted probabilities
})

# Save the submission file
submission_file_path = "/kaggle/working/submission.csv"
submission_df.to_csv(submission_file_path, index=False)
print(f"Submission file created: {submission_file_path}")


[0.26 0.71 0.23]
Submission file created: /kaggle/working/submission.csv
