In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, BayesianRidge
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder, PolynomialFeatures
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif

In [5]:
train_data = pd.read_csv('cirrhosis.csv')

In [6]:
train_data['Status'] = train_data['Status'].map({'D': 1, 'C': 0, 'CL': 0})
le = LabelEncoder()
train_data['Sex'] = le.fit_transform(train_data['Sex'])
train_data['Drug'] = train_data['Drug'].map({'Placebo':0, 'penicillamine':1})
categorical_cols = ['Ascites', 'Hepatomegaly', 'Spiders', 'Edema']
for col in categorical_cols:
    train_data[col] = train_data[col].map({'Y': 1, 'N': 0, 'S': 0.5}).astype('float64')

In [7]:
train_data.isnull().sum()

Unnamed: 0,0
ID,0
N_Days,0
Status,0
Drug,264
Age,0
Sex,0
Ascites,106
Hepatomegaly,106
Spiders,106
Edema,0


In [8]:
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42)
}

In [9]:
def train_model(X_train, y_train, X_val, y_val):
    scaler = StandardScaler()
    num_cols = X_train.select_dtypes(include=['float64', 'int64']).columns
    # cat_cols = train_data.select_dtypes(include=['object']).columns
    X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
    X_val[num_cols] = scaler.fit_transform(X_val[num_cols])
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        accuracy = accuracy_score(y_val, y_pred)

        print(f"\n{name} Results:")
        print(f"Accuracy: {accuracy:.4f}")

# 1 Ignore Missing Data

In [10]:
train_clean = train_data[train_data.notna().all(axis=1)].copy()
X = train_clean.drop(['ID', 'Status'], axis=1)
y = train_clean['Status']
X_train_clean, X_val_clean, y_train_clean, y_val_clean = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
train_model(X_train_clean, y_train_clean, X_val_clean, y_val_clean)


Random Forest Results:
Accuracy: 0.7857

Gradient Boosting Results:
Accuracy: 0.8571

Logistic Regression Results:
Accuracy: 0.8214

XGBoost Results:
Accuracy: 0.8571


# 2 Imputation (BayesianRidge + Categorical Iterative)

In [12]:
train_data_imputed = train_data.copy()

In [13]:
median_cols = ['Cholesterol', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin']
mode_cols = ['Stage']

median_imputer = SimpleImputer(strategy='median')
linear_iterative_imputer = IterativeImputer(estimator = BayesianRidge(), max_iter=10, random_state=0)
mode_imputer = SimpleImputer(strategy='most_frequent')

train_data_imputed[median_cols] = linear_iterative_imputer.fit_transform(train_data_imputed[median_cols])
train_data_imputed[mode_cols] = mode_imputer.fit_transform(train_data_imputed[mode_cols])

In [14]:
missing_cols = ['Drug', 'Ascites', 'Hepatomegaly', 'Spiders']
knn_imputer = KNNImputer(n_neighbors=5)
categorical_iterative_imputer = IterativeImputer(estimator = DecisionTreeRegressor(), max_iter=10, random_state=0)
train_data_imputed[missing_cols] = knn_imputer.fit_transform(train_data_imputed[missing_cols])

In [None]:
X = train_data_imputed.drop(['ID', 'Status'], axis=1)
y = train_data_imputed['Status']
X_train_imputed, X_val_imputed, y_train_imputed, y_val_imputed = train_test_split(X, y, test_size=0.2, random_state=42)
train_model(X_train_imputed, y_train_imputed, X_val_imputed, y_val_imputed)


Random Forest Results:
Accuracy: 0.8452

Gradient Boosting Results:
Accuracy: 0.8333

Logistic Regression Results:
Accuracy: 0.8571

XGBoost Results:
Accuracy: 0.8571


# 3 Label Missing

In [None]:
train_data_label = train_data.copy()

In [None]:
missing_flags = train_data_label.isnull().astype(int).add_suffix('_missing')

numeric_cols = ['Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin']
train_data_label[numeric_cols] = train_data_label[numeric_cols].fillna(train_data_label[numeric_cols].median())

cat_cols = ['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema', 'Stage']
train_data_label[cat_cols] = train_data_label[cat_cols].fillna(train_data_label[cat_cols].mode().iloc[0])

train_data_label = pd.concat([train_data_label, missing_flags], axis=1)

In [None]:
X = train_data_label.drop(['ID', 'Status'], axis=1)
y = train_data_label['Status']
X_train_label, X_val_label, y_train_label, y_val_label = train_test_split(X, y, test_size=0.2, random_state=42)
train_model(X_train_label, y_train_label, X_val_label, y_val_label)


Random Forest Results:
Accuracy: 0.8571

Gradient Boosting Results:
Accuracy: 0.8333

Logistic Regression Results:
Accuracy: 0.8095

XGBoost Results:
Accuracy: 0.8690


# 4 EM

In [17]:
train_data_em = train_data.copy()

In [18]:
def em_algorithm(df, max_iter=50, tol=1e-3):
    feature_cols = ['Age', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema',
                   'Bilirubin', 'Cholesterol', 'Albumin', 'Copper', 'Alk_Phos',
                   'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin', 'Stage']

    df_imputed = df[feature_cols].copy()
    numeric_cols = df_imputed.select_dtypes(include=['float64', 'int64']).columns
    categorical_cols = df_imputed.select_dtypes(include=['object']).columns

    for col in numeric_cols:
        df_imputed[col] = df_imputed[col].fillna(df_imputed[col].median())
    for col in categorical_cols:
        df_imputed[col] = df_imputed[col].fillna(df_imputed[col].mode()[0])

    scaler = StandardScaler()
    for iteration in range(max_iter):
        old_values = df_imputed.copy()

        for col in feature_cols:
            mask = df[col].isna()
            if mask.any():
                X = df_imputed.drop(columns=[col])
                y = df_imputed[col]
                numeric_features = X.select_dtypes(include=['float64', 'int64']).columns
                if len(numeric_features) > 0:
                    X[numeric_features] = scaler.fit_transform(X[numeric_features])

                model = RandomForestRegressor(
                    n_estimators=100,
                    max_depth=5,
                    min_samples_split=5,
                    min_samples_leaf=2,
                    random_state=42
                )
                model.fit(X[~mask], y[~mask])
                df_imputed.loc[mask, col] = model.predict(X[mask])

                if col in ['Sex', 'Drug', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema', 'Stage']:
                    df_imputed[col] = df_imputed[col].round()

        numeric_change = np.abs(old_values.select_dtypes(include=['float64', 'int64']) -
                              df_imputed.select_dtypes(include=['float64', 'int64'])).max().max()

        if numeric_change < tol:
            print(f"Converged after {iteration + 1} iterations")
            break

    return df_imputed

In [21]:
X = em_algorithm(train_data_em)
y = train_data_em['Status']

# Standardization using StandardScaler
scaler = StandardScaler()
Z = scaler.fit_transform(X)

# Apply PCA
pca = PCA(n_components=0.8)  # Retain 50% explained variance
x_pca = pca.fit_transform(Z)

# Convert PCA output to DataFrame
x_pca_df = pd.DataFrame(x_pca, columns=[f'PC{i+1}' for i in range(x_pca.shape[1])])

# Add selected columns back to the PCA DataFrame
# x_pca_df = pd.concat([x_pca_df, X[['Drug', 'Ascites', 'Hepatomegaly', 'Spiders']].reset_index(drop=True)], axis=1)

# # Split data into training and validation sets
# X_train_imputed, X_val_imputed, y_train_imputed, y_val_imputed = train_test_split(
#     x_pca_df, y, test_size=0.2, random_state=42
# )

X_train_em, X_val_em, y_train_em, y_val_em = train_test_split(x_pca_df, y, test_size=0.2, random_state=42)
train_model(X_train_em, y_train_em, X_val_em, y_val_em)


Random Forest Results:
Accuracy: 0.8214

Gradient Boosting Results:
Accuracy: 0.7619

Logistic Regression Results:
Accuracy: 0.8095

XGBoost Results:
Accuracy: 0.7976




# 5 RandomForestRegressor


In [22]:
train_randomForest = train_data.copy()

In [23]:
# Separate features and target
X = train_randomForest.drop(['ID', 'Status'], axis=1)  # Drop ID and target column 'Status'
y = train_randomForest['Status']  # Target column

# Encode categorical variables
categorical_cols = ['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema']
for col in categorical_cols:
    X[col] = pd.factorize(X[col])[0]  # Encode as integers

# Impute missing values via models
X_imputed = X.copy()
missing_cols = X.columns[X.isnull().any()]  # Identify columns with missing values

for col in missing_cols:
    missing = X[col].isnull()  # Rows with missing values
    not_missing = ~missing     # Rows without missing values

    # Train a RandomForestRegressor to predict missing values
    regressor = RandomForestRegressor(random_state=42, n_estimators=100)
    regressor.fit(X.loc[not_missing].drop(columns=[col]), X.loc[not_missing, col])

    # Predict and fill missing values
    X_imputed.loc[missing, col] = regressor.predict(X.loc[missing].drop(columns=[col]))


X_train_label, X_val_label, y_train_label, y_val_label = train_test_split(X_imputed, y, test_size=0.2, random_state=42)
train_model(X_train_label, y_train_label, X_val_label, y_val_label)


Random Forest Results:
Accuracy: 0.7976

Gradient Boosting Results:
Accuracy: 0.7857

Logistic Regression Results:
Accuracy: 0.8214

XGBoost Results:
Accuracy: 0.8333


#6 Bayesian Ridge + RandomForest

In [26]:
train_BayesianRidge = train_data.copy()

In [35]:
numeric_cols = ['Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper', 'Alk_Phos', 'SGOT', 'Platelets', 'Prothrombin']
cat_cols = ['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema', 'Stage']

# Impute Numerical Features using Bayesian Ridge
numerical_data = train_BayesianRidge[numeric_cols]
iterative_imputer = IterativeImputer(estimator=BayesianRidge(), max_iter=10, random_state=42)
numerical_imputed = pd.DataFrame(iterative_imputer.fit_transform(numerical_data), columns=numeric_cols)

# Impute Categorical Features using Random Forest
categorical_data = train_BayesianRidge[categorical_cols].copy()
for col in categorical_cols:
    missing = categorical_data[col].isnull()
    not_missing = ~missing

    # Encode categorical values to integers for RandomForestClassifier
    unique_classes = pd.factorize(categorical_data[col])[1]  # Get unique classes (including NaN)
    categorical_data[col] = pd.factorize(categorical_data[col])[0]

    if missing.sum() > 0:
        # Train RandomForestClassifier
        clf = RandomForestClassifier(n_estimators=10, random_state=42)
        clf.fit(categorical_data.loc[not_missing, :].drop(columns=[col]), categorical_data.loc[not_missing, col])

        # Predict missing values
        predictions = clf.predict(categorical_data.loc[missing, :].drop(columns=[col]))
        categorical_data.loc[missing, col] = predictions

    # Decode back to original categories
    categorical_data[col] = categorical_data[col].astype(int).map(dict(enumerate(unique_classes)))

# Combine Imputed Numerical and Categorical Data
imputed_data = pd.concat([numerical_imputed, categorical_data], axis=1)

X_train_label, X_val_label, y_train_label, y_val_label = train_test_split(imputed_data, y, test_size=0.2, random_state=42)
train_model(X_train_label, y_train_label, X_val_label, y_val_label)


Random Forest Results:
Accuracy: 0.8214

Gradient Boosting Results:
Accuracy: 0.8571

Logistic Regression Results:
Accuracy: 0.8095

XGBoost Results:
Accuracy: 0.7857
