In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.ensemble import RandomForestRegressor

In [2]:
train_data = pd.read_csv('cirrhosis.csv')

In [3]:
train_data['Status'] = train_data['Status'].map({'D': 1, 'C': 0, 'CL': 0})
le = LabelEncoder()
train_data['Sex'] = le.fit_transform(train_data['Sex'])
train_data['Drug'] = train_data['Drug'].map({'Placebo':0, 'penicillamine':1})
categorical_cols = ['Ascites', 'Hepatomegaly', 'Spiders', 'Edema']
for col in categorical_cols:
    train_data[col] = train_data[col].map({'Y': 1, 'N': 0, 'S': 0.5}).astype('float64')

In [4]:
train_data.isnull().sum()

ID                 0
N_Days             0
Status             0
Drug             264
Age                0
Sex                0
Ascites          106
Hepatomegaly     106
Spiders          106
Edema              0
Bilirubin          0
Cholesterol      134
Albumin            0
Copper           108
Alk_Phos         106
SGOT             106
Tryglicerides    136
Platelets         11
Prothrombin        2
Stage              6
dtype: int64

In [5]:
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42)
}

In [6]:
def train_model(X_train, y_train, X_val, y_val):
    scaler = StandardScaler()
    num_cols = X_train.select_dtypes(include=['float64', 'int64']).columns
    # cat_cols = train_data.select_dtypes(include=['object']).columns
    X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
    X_val[num_cols] = scaler.fit_transform(X_val[num_cols])
    sum = 0
    for name, model in models.items():
        model.fit(X_train, y_train)     
        y_pred = model.predict(X_val)
        accuracy = accuracy_score(y_val, y_pred)
        sum += accuracy
        
        print(f"{name} Results: {accuracy:.4f}")
    print(f"Average Accuracy: {sum/4:.4f}")

# 1 Ignore Missing Data

In [7]:
train_clean = train_data[train_data.notna().all(axis=1)].copy()
X = train_clean.drop(['ID', 'Status'], axis=1)
y = train_clean['Status']
X_train_clean, X_val_clean, y_train_clean, y_val_clean = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
train_model(X_train_clean, y_train_clean, X_val_clean, y_val_clean)

Random Forest Results: 0.7857
Gradient Boosting Results: 0.8571
Logistic Regression Results: 0.8214
XGBoost Results: 0.8571
Average Accuracy: 0.8304


# 2 Imputation

In [9]:
train_data_imputed = train_data.copy()

In [10]:
median_cols = ['Cholesterol', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin']
mode_cols = ['Stage']

median_imputer = SimpleImputer(strategy='median')
mode_imputer = SimpleImputer(strategy='most_frequent')

train_data_imputed[median_cols] = median_imputer.fit_transform(train_data_imputed[median_cols])
train_data_imputed[mode_cols] = mode_imputer.fit_transform(train_data_imputed[mode_cols])

In [11]:
missing_cols = ['Drug', 'Ascites', 'Hepatomegaly', 'Spiders']
knn_imputer = KNNImputer(n_neighbors=5)
train_data_imputed[missing_cols] = knn_imputer.fit_transform(train_data_imputed[missing_cols]) 

In [12]:
X = train_data_imputed.drop(['ID', 'Status'], axis=1)
y = train_data_imputed['Status']
X_train_imputed, X_val_imputed, y_train_imputed, y_val_imputed = train_test_split(X, y, test_size=0.2, random_state=42)
train_model(X_train_imputed, y_train_imputed, X_val_imputed, y_val_imputed)

Random Forest Results: 0.8452
Gradient Boosting Results: 0.8333
Logistic Regression Results: 0.8571
XGBoost Results: 0.8571
Average Accuracy: 0.8482


# 3 Label Missing

In [13]:
train_data_label = train_data.copy()

In [14]:
missing_flags = train_data_label.isnull().astype(int).add_suffix('_missing')

numeric_cols = ['Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin']
train_data_label[numeric_cols] = train_data_label[numeric_cols].fillna(train_data_label[numeric_cols].median())

cat_cols = ['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema', 'Stage']
train_data_label[cat_cols] = train_data_label[cat_cols].fillna(train_data_label[cat_cols].mode().iloc[0])

train_data_label = pd.concat([train_data_label, missing_flags], axis=1)

In [15]:
X = train_data_label.drop(['ID', 'Status'], axis=1)
y = train_data_label['Status']
X_train_label, X_val_label, y_train_label, y_val_label = train_test_split(X, y, test_size=0.2, random_state=42)
train_model(X_train_label, y_train_label, X_val_label, y_val_label)

Random Forest Results: 0.8571
Gradient Boosting Results: 0.8333
Logistic Regression Results: 0.8095
XGBoost Results: 0.8690
Average Accuracy: 0.8423


# 4 EM

In [16]:
train_data_em = train_data.copy()

In [17]:
def em_algorithm(df, max_iter=50, tol=1e-3):
    feature_cols = ['Age', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema',
                   'Bilirubin', 'Cholesterol', 'Albumin', 'Copper', 'Alk_Phos',
                   'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin', 'Stage']
    
    df_imputed = df.copy()
    numeric_cols = df_imputed.select_dtypes(include=['float64', 'int64']).columns
    categorical_cols = df_imputed.select_dtypes(include=['object']).columns
    
    for col in numeric_cols:
        df_imputed[col] = df_imputed[col].fillna(df_imputed[col].median())
    for col in categorical_cols:
        df_imputed[col] = df_imputed[col].fillna(df_imputed[col].mode()[0])
    
    scaler = StandardScaler()
    for iteration in range(max_iter):
        old_values = df_imputed.copy()
        
        for col in feature_cols:
            mask = df[col].isna()
            if mask.any():
                X = df_imputed.drop(columns=[col])
                y = df_imputed[col]
                numeric_features = X.select_dtypes(include=['float64', 'int64']).columns
                if len(numeric_features) > 0:
                    X[numeric_features] = scaler.fit_transform(X[numeric_features])
                
                model = RandomForestRegressor(
                    n_estimators=100,
                    max_depth=5,
                    min_samples_split=5,
                    min_samples_leaf=2,
                    random_state=42
                )
                model.fit(X[~mask], y[~mask])
                df_imputed.loc[mask, col] = model.predict(X[mask])

                if col in ['Sex', 'Drug', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema', 'Stage']:
                    df_imputed[col] = df_imputed[col].round()

        numeric_change = np.abs(old_values.select_dtypes(include=['float64', 'int64']) - 
                              df_imputed.select_dtypes(include=['float64', 'int64'])).max().max()
        
        if numeric_change < tol:
            print(f"Converged after {iteration + 1} iterations")
            break
    
    return df_imputed

In [18]:
X = train_data_em.drop(['ID', 'Status'], axis=1)
y = train_data_em['Status']

X_train_em, X_val_em, y_train_em, y_val_em = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_processed = em_algorithm(X_train_em)
X_val_processed = em_algorithm(X_val_em)

train_model(X_train_processed, y_train_em, X_val_processed, y_val_em)

Random Forest Results: 0.8452
Gradient Boosting Results: 0.8095
Logistic Regression Results: 0.8571
XGBoost Results: 0.8571
Average Accuracy: 0.8423


# 5 Random Imputation + EM

In [19]:
train_data_5 = train_data.copy()

In [20]:
rand_cat_missing_label = ['Drug', 'Ascites', 'Hepatomegaly', 'Spiders']
rand_num_missing_label = ['Cholesterol', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides']

In [21]:
def impute_missing_data(df, rand_cat_missing_label, rand_num_missing_label, reference_col='Ascites'):
    df_imputed = df.copy()
    
    target_missing_idx = df_imputed[df_imputed[reference_col].isna()].index
    
    for col in rand_cat_missing_label:
        valid_data = df_imputed[col].dropna()
        value_counts = valid_data.value_counts(normalize=True)
        
        current_missing = df_imputed.loc[target_missing_idx, col].isna()
        current_missing_idx = target_missing_idx[current_missing]
        
        if len(current_missing_idx) > 0:
            random_values = np.random.choice(
                value_counts.index,
                size=len(current_missing_idx),
                p=value_counts.values
            )
            
            df_imputed.loc[current_missing_idx, col] = random_values
    
    for col in rand_num_missing_label:
        valid_data = df_imputed[col].dropna()
        median = valid_data.median()
        std = valid_data.std()
        
        current_missing = df_imputed.loc[target_missing_idx, col].isna()
        current_missing_idx = target_missing_idx[current_missing]
        
        if len(current_missing_idx) > 0:
            random_values = np.random.normal(median, std, len(current_missing_idx))
            
            df_imputed.loc[current_missing_idx, col] = random_values
    
    return df_imputed

In [22]:
imputed_df = impute_missing_data(
    train_data_5,
    rand_cat_missing_label,
    rand_num_missing_label,
    reference_col='Ascites'
)

In [23]:

X = imputed_df.drop(['ID', 'Status'], axis=1)
y = train_data_5['Status']

X_train_5, X_val_5, y_train_5, y_val_5 = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_processed = em_algorithm(X_train_5)
X_val_processed = em_algorithm(X_val_5)

train_model(X_train_processed, y_train_5, X_val_processed, y_val_5)

Converged after 4 iterations
Random Forest Results: 0.8690
Gradient Boosting Results: 0.8333
Logistic Regression Results: 0.8452
XGBoost Results: 0.8571
Average Accuracy: 0.8512


# 6 Create Dataset

In [24]:
data = pd.read_csv('cirrhosis.csv')

In [25]:
data['Status'] = data['Status'].map({'D': 1, 'C': 0, 'CL': 0})
le = LabelEncoder()
data['Sex'] = le.fit_transform(data['Sex'])
data['Drug'] = data['Drug'].map({'Placebo':0, 'penicillamine':1})
categorical_cols = ['Ascites', 'Hepatomegaly', 'Spiders', 'Edema']
for col in categorical_cols:
    data[col] = data[col].map({'Y': 1, 'N': 0, 'S': 0.5}).astype('float64')

In [26]:
missing_flags = data.isnull().astype(int).add_suffix('_missing')
missing_flags = missing_flags.drop(['ID_missing', 'N_Days_missing', 'Status_missing', 'Age_missing', 'Sex_missing', 'Edema_missing', 'Bilirubin_missing', 'Albumin_missing'], axis=1)

In [27]:
rand_cat_missing_label = ['Drug', 'Ascites', 'Hepatomegaly', 'Spiders']
rand_num_missing_label = ['Cholesterol', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides']

In [28]:
df = impute_missing_data(
    data,
    rand_cat_missing_label,
    rand_num_missing_label,
    reference_col='Ascites'
)
df = em_algorithm(df)
data = pd.concat([df, missing_flags], axis=1)

Converged after 5 iterations


In [29]:
data.to_csv('./train.csv')

In [30]:
dataModified = data.copy()
dataModified['DiagnosedDay'] = dataModified['Age'] - dataModified['N_Days']
dataModified['Age_Group'] = pd.cut(
    round(dataModified['Age'] / 365.25).astype("int16"),
    bins=[19, 29, 39, 49, 59, 69, 99], 
    labels=[0, 1, 2, 3, 4, 5]
).astype('int16')
dataModified['BARatio'] = dataModified['Bilirubin'] / dataModified['Albumin']
dataModified['CARatio'] = dataModified['Copper'] / dataModified['Albumin']
dataModified['RiskScore'] = dataModified['Bilirubin'] + dataModified['Albumin'] - dataModified['Alk_Phos']
dataModified['Liver_Complication_Index'] = (dataModified['Ascites'] * dataModified['Hepatomegaly'] * dataModified['Spiders'])

In [31]:
dataModified.to_csv('./train2.csv')

In [32]:
dataModified['BiliRiskScore'] = (
    dataModified['Bilirubin'] + 
    dataModified['SGOT'] * 0.44 + 
    dataModified['Tryglicerides'] * 0.44 + 
    dataModified['Copper'] * 0.46
)

dataModified['BSRatio'] = dataModified['Bilirubin'] / dataModified['SGOT']
# dataModified['BSProduct'] = dataModified['Bilirubin'] * dataModified['SGOT']
dataModified['BTRatio'] = dataModified['Bilirubin'] / dataModified['Tryglicerides']
# dataModified['BTProduct'] = dataModified['Bilirubin'] * dataModified['Tryglicerides']
dataModified['BCRatio'] = dataModified['Bilirubin'] / dataModified['Copper']
# dataModified['BCProduct'] = dataModified['Bilirubin'] * dataModified['Copper']

In [33]:
dataModified.to_csv('./train3.csv')