In [1]:
import numpy as np
import pandas as pd

from sklearn.pipeline import make_pipeline
from sklearn.datasets import make_classification
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, PowerTransformer
from sklearn.model_selection import cross_val_score, train_test_split, KFold, cross_val_predict
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import log_loss
from sklearn.decomposition import PCA

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

import optuna

In [2]:
train = pd.read_csv('train.csv')
original = pd.read_csv('original.csv')
test = pd.read_csv('test.csv')
ss = pd.read_csv('sample_submission.csv')

In [3]:
def process(train, test, original):
    df_train = train.drop(['id', 'Status'], axis=1)
    original = original.dropna()
    df_original = original.drop(['ID', 'Status'], axis=1)
    df_train = pd.concat([df_train, df_original])
    df_test = test.drop(['id'], axis=1)
    
    # Categorical Cols
    # Train
    categorical_cols = ['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema', 'Stage']
    encoder = OneHotEncoder(drop='first')
    encoder.fit(df_train[categorical_cols])
    df_train_cat = pd.DataFrame(encoder.transform(df_train[categorical_cols]).toarray(), columns=encoder.get_feature_names_out())
    # Test
    df_test_cat = pd.DataFrame(encoder.transform(df_test[categorical_cols]).toarray(), columns=encoder.get_feature_names_out())
    
    # Numerical Cols
    # Train
    df_train_num = df_train.drop(categorical_cols, axis=1)
    df_train_num = np.log1p(df_train_num)
    scaler = StandardScaler()
    df_train_num = pd.DataFrame(scaler.fit_transform(df_train_num), columns=df_train_num.columns)
    p_transformer = PowerTransformer()
    df_train_num = pd.DataFrame(p_transformer.fit_transform(df_train_num), columns=df_train_num.columns)
    # Test
    df_test_num = df_test.drop(categorical_cols, axis=1)
    df_test_num = np.log1p(df_test_num)
    df_test_num = pd.DataFrame(scaler.transform(df_test_num), columns=df_test_num.columns)
    df_test_num = pd.DataFrame(p_transformer.transform(df_test_num), columns=df_test_num.columns)
    
    # Combine Num/Cat
    train_final = pd.concat([df_train_num, df_train_cat], axis=1)
    test_final = pd.concat([df_test_num, df_test_cat], axis=1)
    
    # PCA
    pca = PCA(n_components=3)
    pca_train = pca.fit_transform(train_final)
    pca_features_train = pd.DataFrame(pca_train, columns=['PCA1', 'PCA2', 'PCA3'])
    train_final = pd.concat([train_final, pca_features_train], axis=1)
    
    pca_test = pca.transform(test_final)
    pca_features_test = pd.DataFrame(pca_test, columns=['PCA1', 'PCA2', 'PCA3'])
    test_final = pd.concat([test_final, pca_features_test], axis=1)
    
    # Feature Engineering
    # https://www.kaggle.com/code/ashishkumarak/ps3e26-liver-cirrhosis-survival-prediction#%F0%9F%92%BB-Feature-Engineering
    threshold_platelets = 150
    train_final['thrombocytopenia'] = np.where(train_final['Platelets'] < threshold_platelets, 1, 0)
    test_final['thrombocytopenia'] = np.where(test_final['Platelets'] < threshold_platelets, 1, 0)
    threshold_alk_phos_upper = 147  # Upper limit of normal range
    threshold_alk_phos_lower = 44   # Lower limit of normal range
    train_final['elevated_alk_phos'] = np.where((train_final['Alk_Phos'] > threshold_alk_phos_upper) | (train_final['Alk_Phos'] < threshold_alk_phos_lower), 1, 0)
    test_final['elevated_alk_phos'] = np.where((test_final['Alk_Phos'] > threshold_alk_phos_upper) | (test_final['Alk_Phos'] < threshold_alk_phos_lower), 1, 0)
    normal_copper_range = (62, 140)
    train_final['normal_copper'] = np.where((train_final['Copper'] >= normal_copper_range[0]) & (train_final['Copper'] <= normal_copper_range[1]), 1, 0)
    test_final['normal_copper'] = np.where((test_final['Copper'] >= normal_copper_range[0]) & (test_final['Copper'] <= normal_copper_range[1]), 1, 0)
    normal_albumin_range = (3.4, 5.4)
    train_final['normal_albumin'] = np.where((train_final['Albumin'] >= normal_albumin_range[0]) & (train_final['Albumin'] <= normal_albumin_range[1]), 1, 0)
    test_final['normal_albumin'] = np.where((test_final['Albumin'] >= normal_albumin_range[0]) & (test_final['Albumin'] <= normal_albumin_range[1]), 1, 0)
    normal_bilirubin_range = (0.2, 1.2)
    train_final['normal_bilirubin'] = np.where((train_final['Bilirubin'] >= normal_bilirubin_range[0]) & (train_final['Bilirubin'] <= normal_bilirubin_range[1]), 1, 0)
    test_final['normal_bilirubin'] = np.where((test_final['Bilirubin'] >= normal_bilirubin_range[0]) & (test_final['Bilirubin'] <= normal_bilirubin_range[1]), 1, 0)
    
    # Encode Target
    le_encoder = LabelEncoder()
    y = le_encoder.fit_transform(train.Status)
    y_original = le_encoder.fit_transform(original.Status)
    
    # Split X, y
    X_train = train_final
    y_train = np.concatenate([y, y_original])
    
    X_test = test_final
    
    return X_train, y_train, X_test

In [4]:
# df_train = train.drop(['id', 'Status'], axis=1)
# original = original.dropna()
# df_original = original.drop(['ID', 'Status'], axis=1)
# X = pd.concat([df_train, df_original])
# y = pd.concat([train.Status, original.Status])
# categorical_cols = ['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema', 'Stage']

# class MyTransformer(BaseEstimator, TransformerMixin):
#     def __init__(self):
#         self.encoder = OneHotEncoder(drop='first')
#         self.scaler = StandardScaler()
#         self.p_transformer = PowerTransformer()
#         self.le_encoder = LabelEncoder()
        
#     def fit(self, X, y):
#         X_cat = pd.DataFrame(self.encoder.fit_transform(X[categorical_cols]).toarray(), columns=self.encoder.get_feature_names_out())
#         X_num = X.drop(categorical_cols, axis=1)
#         try:
#             X_num = np.log1p(X_num)
#         except Exception as E:
#             print(E)
#         X_num = pd.DataFrame(self.scaler.fit_transform(X_num), columns=X_num.columns)
#         X_num = pd.DataFrame(self.p_transformer.fit_transform(X_num), columns=X_num.columns)
#         X = pd.concat([X_num, X_cat], axis=1)
        
#         # https://www.kaggle.com/code/ashishkumarak/ps3e26-liver-cirrhosis-survival-prediction#%F0%9F%92%BB-Feature-Engineering
#         threshold_platelets = 150
#         X['thrombocytopenia'] = np.where(X['Platelets'] < threshold_platelets, 1, 0)
#         threshold_alk_phos_upper = 147  # Upper limit of normal range
#         threshold_alk_phos_lower = 44   # Lower limit of normal range
#         X['elevated_alk_phos'] = np.where((X['Alk_Phos'] > threshold_alk_phos_upper) | (X['Alk_Phos'] < threshold_alk_phos_lower), 1, 0)
#         normal_copper_range = (62, 140)
#         X['normal_copper'] = np.where((X['Copper'] >= normal_copper_range[0]) & (X['Copper'] <= normal_copper_range[1]), 1, 0)
#         normal_albumin_range = (3.4, 5.4)
#         X['normal_albumin'] = np.where((X['Albumin'] >= normal_albumin_range[0]) & (X['Albumin'] <= normal_albumin_range[1]), 1, 0)
#         normal_bilirubin_range = (0.2, 1.2)
#         X['normal_bilirubin'] = np.where((X['Bilirubin'] >= normal_bilirubin_range[0]) & (X['Bilirubin'] <= normal_bilirubin_range[1]), 1, 0)
            
#         y = self.le_encoder.fit_transform(y)
#         return self

#     def transform(self, X):
#         X_cat = pd.DataFrame(self.encoder.transform(X[categorical_cols]).toarray(), columns=self.encoder.get_feature_names_out())
#         X_num = X.drop(categorical_cols, axis=1)
#         X_num = np.log1p(X_num)
#         X_num = pd.DataFrame(self.scaler.transform(X_num), columns=X_num.columns)
#         X_num = pd.DataFrame(self.p_transformer.transform(X_num), columns=X_num.columns)
#         X = pd.concat([X_num, X_cat], axis=1)
        
#         threshold_platelets = 150
#         X['thrombocytopenia'] = np.where(X['Platelets'] < threshold_platelets, 1, 0)
#         threshold_alk_phos_upper = 147  # Upper limit of normal range
#         threshold_alk_phos_lower = 44   # Lower limit of normal range
#         X['elevated_alk_phos'] = np.where((X['Alk_Phos'] > threshold_alk_phos_upper) | (X['Alk_Phos'] < threshold_alk_phos_lower), 1, 0)
#         normal_copper_range = (62, 140)
#         X['normal_copper'] = np.where((X['Copper'] >= normal_copper_range[0]) & (X['Copper'] <= normal_copper_range[1]), 1, 0)
#         normal_albumin_range = (3.4, 5.4)
#         X['normal_albumin'] = np.where((X['Albumin'] >= normal_albumin_range[0]) & (X['Albumin'] <= normal_albumin_range[1]), 1, 0)
#         normal_bilirubin_range = (0.2, 1.2)
#         X['normal_bilirubin'] = np.where((X['Bilirubin'] >= normal_bilirubin_range[0]) & (X['Bilirubin'] <= normal_bilirubin_range[1]), 1, 0)
        
#         return X

In [5]:
# scores = abs(cross_val_predict(make_pipeline(MyTransformer(), RandomForestClassifier()), X, y, cv=5, method='predict_proba'))
# print("RF Log Loss:", round(log_loss(y, scores), 4))

# scores = abs(cross_val_predict(make_pipeline(MyTransformer(), XGBClassifier(verbosity=0, use_label_encoder=False)), X, y, cv=5, method='predict_proba'))
# print("XGB Log Loss:", round(log_loss(y, scores), 4))

# scores = abs(cross_val_predict(make_pipeline(MyTransformer(), LGBMClassifier(verbose=-1)), X, y, cv=5, method='predict_proba'))
# print("LGBM Log Loss:", round(log_loss(y, scores), 4))

# scores = abs(cross_val_predict(make_pipeline(MyTransformer(), CatBoostClassifier(verbose=0)), X, y, cv=5, method='predict_proba'))
# print("CatBoost Log Loss:", round(log_loss(y, scores), 4))

In [6]:
X, y, test_final = process(train, test, original)
cv = KFold(n_splits=5, shuffle=True, random_state=0)

In [7]:
def xgb(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'subsample': trial.suggest_float('subsample', 0.1, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-10, 1, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-10, 1, log=True),
        'gamma': trial.suggest_float('gamma', 1e-10, 1, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'objective': 'mlogloss', 
        'eval_metric': 'logloss',
        'verbosity': 0,
        'use_label_encoder': False,
        'random_state': 0,
        'n_jobs': -1
    }
    
    model = XGBClassifier(**params)

    model.fit(X, y)
    
    cv_scores = abs(cross_val_score(model, X, y, scoring='neg_log_loss', cv=cv))
    
    return np.mean(cv_scores)

def lgbm(trial):
    params = {
        'objective': 'multiclass',
        'boosting_type': 'gbdt',
        'metric': 'multi_logloss',
        'n_estimators': trial.suggest_int('n_estimators', 10, 500),
        'max_depth': trial.suggest_int('max_depth', 1, 50),
        'num_leaves': trial.suggest_int('num_leaves', 2, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
        'subsample': trial.suggest_float('subsample', 0.1, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-9, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-9, 10.0, log=True),
        'verbose': -1,
        'random_state': 0
    }

    model = LGBMClassifier(**params)

    model.fit(X, y)
    
    cv_scores = abs(cross_val_score(model, X, y, scoring='neg_log_loss', cv=cv))
    
    return np.mean(cv_scores)

def catboost(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.1, 10.0),
        'random_seed': 0,
        'loss_function': 'MultiClass',
        'eval_metric': 'MultiClass',
        'logging_level': 'Silent'
    }
    
    model = CatBoostClassifier(**params)
    
    model.fit(X, y)
    
    cv_scores = abs(cross_val_score(model, X, y, scoring='neg_log_loss', cv=cv))
    
    return np.mean(cv_scores)

In [8]:
# study = optuna.create_study(direction='minimize')
# optuna.logging.set_verbosity(optuna.logging.WARNING)
# study.optimize(xgb, n_trials=100)
# study.best_params

In [9]:
# study = optuna.create_study(direction='minimize')
# optuna.logging.set_verbosity(optuna.logging.WARNING)
# study.optimize(lgbm, n_trials=100)
# study.best_params

In [10]:
# study = optuna.create_study(direction='minimize')
# optuna.logging.set_verbosity(optuna.logging.WARNING)
# study.optimize(catboost, n_trials=100)
# study.best_params

[32m[I 2023-12-26 16:27:52,968][0m A new study created in memory with name: no-name-394d393b-902a-4804-8d54-28dad4f505e4[0m


{'iterations': 950,
 'depth': 3,
 'learning_rate': 0.07759505269501321,
 'l2_leaf_reg': 1.3999226082174185}

In [12]:
xgb_params = {
    'n_estimators': 807,
    'max_depth': 9,
    'learning_rate': 0.0188649127427861,
    'subsample': 0.8274515628887853,
    'colsample_bytree': 0.11623733296601546,
    'reg_alpha': 0.0017239053749286994,
    'reg_lambda': 0.001115134043916317,
    'gamma': 6.23346663072776e-05,
    'min_child_weight': 9,
    
    'objective': 'mlogloss', 
    'eval_metric': 'logloss',
    'verbosity': 0,
    'use_label_encoder': False
}

lgbm_params = {
    'n_estimators': 422,
    'max_depth': 27,
    'num_leaves': 31,
    'learning_rate': 0.017847511069651788,
    'min_child_samples': 96,
    'subsample': 0.26791725059736793,
    'colsample_bytree': 0.37621860984436856,
    'reg_alpha': 3.7054057635827013e-09,
    'reg_lambda': 4.861640891517785e-08,
    
    'verbose': -1
}

catboost_params = {
    'iterations': 950,
    'depth': 3,
    'learning_rate': 0.07759505269501321,
    'l2_leaf_reg': 1.3999226082174185,
    
    'loss_function': 'MultiClass',
    'eval_metric': 'MultiClass',
    'logging_level': 'Silent'
}

In [13]:
xgb_model = XGBClassifier(**xgb_params)
lgbm_model = LGBMClassifier(**lgbm_params)
catboost_model = CatBoostClassifier(**catboost_params)

In [14]:
models = {
    'xgb': xgb_model,
    'ltgbm': lgbm_model,
    'catboost': catboost_model
}

In [15]:
results_ensemble_models = {}

for name, model in models.items():
    print(name)
    res=[]
    for i, (train_index, test_index) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)
        res.append(log_loss(y_test, y_pred))
    results_ensemble_models[name] = res

xgb
ltgbm
catboost


In [16]:
for name, result in results_ensemble_models.items():
    print("----------\n" + name)
    print(np.mean(result))
    print(np.std(result))

----------
xgb
0.42128913385978456
0.012098906142766559
----------
ltgbm
0.43070148874977054
0.011599518861663459
----------
catboost
0.44118162297442554
0.011323651482178992


In [17]:
final_model = VotingClassifier(estimators=[('xgb', xgb_model),
                                           ('lgbm', lgbm_model)],
                               voting='soft')

In [18]:
results_ensemble = []

r_ensemble = abs(cross_val_score(final_model, X, y, scoring='neg_log_loss'))
results_ensemble.append(r_ensemble)
print(f'Log Loss: {np.mean(results_ensemble).round(4)}')

Log Loss: 0.4199


In [19]:
final_model.fit(X, y)

VotingClassifier(estimators=[('xgb',
                              XGBClassifier(base_score=0.5, booster='gbtree',
                                            colsample_bylevel=1,
                                            colsample_bynode=1,
                                            colsample_bytree=0.11623733296601546,
                                            enable_categorical=False,
                                            eval_metric='logloss',
                                            gamma=6.23346663072776e-05,
                                            gpu_id=-1, importance_type=None,
                                            interaction_constraints='',
                                            learning_rate=0.0188649127427861,
                                            max_delta_step=0, max_depth=...
                                            subsample=0.8274515628887853,
                                            tree_method='exact',
                       

In [20]:
final_predictions = final_model.predict_proba(test_final)

res = pd.DataFrame(final_predictions, columns=['Status_C', 'Status_CL', 'Status_D'])
res['id'] = test.id
res = res[['id', 'Status_C', 'Status_CL', 'Status_D']]
res.to_csv('submission_pca.csv', index=False)

In [16]:
# catboost.fit(X, y)
# pred = catboost.predict_proba(test_final)

# res = pd.DataFrame(pred, columns=['Status_C', 'Status_CL', 'Status_D'])
# res['id'] = test.id
# res = res[['id', 'Status_C', 'Status_CL', 'Status_D']]
# res.to_csv('submission.csv', index=False)