In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, PowerTransformer
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, log_loss

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

import optuna

In [2]:
train = pd.read_csv('train.csv')
original = pd.read_csv('original.csv')
test = pd.read_csv('test.csv')
ss = pd.read_csv('sample_submission.csv')

In [3]:
def process(train, test, original):
    df_train = train.drop(['id', 'Status'], axis=1)
    original = original.dropna()
    df_original = original.drop(['ID', 'Status'], axis=1)
    df_train = pd.concat([df_train, df_original])
    df_test = test.drop(['id'], axis=1)
    
    # Categorical Cols
    # Train
    categorical_cols = ['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema', 'Stage']
    encoder = OneHotEncoder(drop='first')
    encoder.fit(df_train[categorical_cols])
    df_train_cat = pd.DataFrame(encoder.transform(df_train[categorical_cols]).toarray(), columns=encoder.get_feature_names_out())
    # Test
    df_test_cat = pd.DataFrame(encoder.transform(df_test[categorical_cols]).toarray(), columns=encoder.get_feature_names_out())
    
    # Numerical Cols
    # Train
    df_train_num = df_train.drop(categorical_cols, axis=1)
    scaler = StandardScaler()
    scaler.fit(df_train_num)
    df_train_num = pd.DataFrame(scaler.transform(df_train_num), columns=df_train_num.columns)
    # Test
    df_test_num = df_test.drop(categorical_cols, axis=1)
    df_test_num = pd.DataFrame(scaler.transform(df_test_num), columns=df_test_num.columns)
    
    # Combine Num/Cat
    train_final = pd.concat([df_train_num, df_train_cat], axis=1)
    test_final = pd.concat([df_test_num, df_test_cat], axis=1)
    
    # Encode Target
    le_encoder = LabelEncoder()
    y = le_encoder.fit_transform(train.Status)
    y_original = le_encoder.fit_transform(original.Status)
    
    # Split X, y
    X_train = train_final
    y_train = np.concatenate([y, y_original])
    
    X_test = test_final
    
    return X_train, y_train, X_test

In [6]:
X, y, test_final = process(train, test, original)

In [7]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=0)
cv = KFold(n_splits=5, shuffle=True, random_state=0)

In [8]:
rf = RandomForestClassifier()
rf.fit(X, y)

# y_pred = rf.predict_proba(X_test)
print(f"RF Log Loss: {abs(cross_val_score(rf, X, y, scoring='neg_log_loss', cv=cv).mean())}")

xgb = XGBClassifier(verbosity=0, use_label_encoder=False)
xgb.fit(X, y)

print(f"XGB Log Loss: {abs(cross_val_score(xgb, X, y, scoring='neg_log_loss', cv=cv).mean())}")

lgbm = LGBMClassifier(verbose=-1)
lgbm.fit(X, y)

print(f"LGBM Log Loss: {abs(cross_val_score(lgbm, X, y, scoring='neg_log_loss', cv=cv).mean())}")

catboost = CatBoostClassifier(verbose=0)
catboost.fit(X, y)

print(f"CatBoost Log Loss: {abs(cross_val_score(catboost, X, y, scoring='neg_log_loss', cv=cv).mean())}")

RF Log Loss: 0.5103886681658997
XGB Log Loss: 0.49322068184159873
LGBM Log Loss: 0.4553787702144372
CatBoost Log Loss: 0.456030211002837


In [9]:
def rf(trial):
    max_depth = trial.suggest_int('max_depth', 1, 100)
    n_estimators = trial.suggest_int('n_estimators', 10, 500)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 100)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 100)
    random_state = 0
          
    model = RandomForestClassifier(
        max_depth=max_depth,
        n_estimators=n_estimators,
        min_samples_leaf=min_samples_leaf,
        min_samples_split=min_samples_split,
        random_state=random_state
    )
    
    model.fit(X, y)
    
    cv_scores = abs(cross_val_score(model, X, y, scoring='neg_log_loss', cv=cv))
    
    return np.mean(cv_scores)

def xgb(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'subsample': trial.suggest_float('subsample', 0.1, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-10, 1, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-10, 1, log=True),
        'gamma': trial.suggest_float('gamma', 1e-10, 1, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'objective': 'mlogloss', 
        'eval_metric': 'logloss',
        'verbosity': 0,
        'use_label_encoder': False,
        'random_state': 0,
        'n_jobs': -1
    }
    
    model = XGBClassifier(**params)

    model.fit(X, y)
    
    cv_scores = abs(cross_val_score(model, X, y, scoring='neg_log_loss', cv=cv))
    
    return np.mean(cv_scores)

def lgbm(trial):
    params = {
        'objective': 'multiclass',
        'boosting_type': 'gbdt',
        'metric': 'multi_logloss',
        'n_estimators': trial.suggest_int('n_estimators', 10, 500),
        'max_depth': trial.suggest_int('max_depth', 1, 50),
        'num_leaves': trial.suggest_int('num_leaves', 2, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
        'subsample': trial.suggest_float('subsample', 0.1, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-9, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-9, 10.0, log=True),
        'verbose': -1,
        'random_state': 0
    }

    model = LGBMClassifier(**params)

    model.fit(X, y)
    
    cv_scores = abs(cross_val_score(model, X, y, scoring='neg_log_loss', cv=cv))
    
    return np.mean(cv_scores)

def catboost(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.1, 10.0),
        'random_seed': 0,
        'loss_function': 'MultiClass',
        'eval_metric': 'MultiClass',
        'logging_level': 'Silent'
    }
    
    model = CatBoostClassifier(**params)
    
    model.fit(X, y)
    
    cv_scores = abs(cross_val_score(model, X, y, scoring='neg_log_loss', cv=cv))
    
    return np.mean(cv_scores)

In [10]:
# study = optuna.create_study(direction='minimize')
# optuna.logging.set_verbosity(optuna.logging.WARNING)
# study.optimize(rf, n_trials=100)
# study.best_params

In [11]:
# study = optuna.create_study(direction='minimize')
# optuna.logging.set_verbosity(optuna.logging.WARNING)
# study.optimize(xgb, n_trials=100)
# study.best_params

In [12]:
# study = optuna.create_study(direction='minimize')
# optuna.logging.set_verbosity(optuna.logging.WARNING)
# study.optimize(lgbm, n_trials=100)
# study.best_params

In [13]:
# study = optuna.create_study(direction='minimize')
# optuna.logging.set_verbosity(optuna.logging.WARNING)
# study.optimize(catboost, n_trials=100)
# study.best_params

In [14]:
rf_params = {
    'max_depth': 51,
    'n_estimators': 441,
    'min_samples_leaf': 3,
    'min_samples_split': 4
}

xgb_params = {
    'n_estimators': 493,
    'max_depth': 10,
    'learning_rate': 0.029422764737469166,
    'subsample': 0.9673567402087949,
    'colsample_bytree': 0.10677749949795919,
    'reg_alpha': 0.7911540877278536,
    'reg_lambda': 0.019175317980448966,
    'gamma': 1.437410086537951e-09,
    'min_child_weight': 2,
    
    'objective': 'mlogloss', 
    'eval_metric': 'logloss',
    'verbosity': 0,
    'use_label_encoder': False
}

lgbm_params = {
    'n_estimators': 395,
    'max_depth': 38,
    'num_leaves': 95,
    'learning_rate': 0.02169163462617219,
    'min_child_samples': 72,
    'subsample': 0.8499041194587593,
    'colsample_bytree': 0.12198327413783293,
    'reg_alpha': 2.4245399917324964e-07,
    'reg_lambda': 3.515465615561991e-07,
    'verbose': -1
}

catboost_params = {
    'iterations': 866,
    'depth': 5,
    'learning_rate': 0.055409157348978444,
    'l2_leaf_reg': 3.0358925329416366,
    
    'loss_function': 'MultiClass',
    'eval_metric': 'MultiClass',
    'logging_level': 'Silent'
}

In [15]:
rf_model = RandomForestClassifier(**rf_params)
xgb_model = XGBClassifier(**xgb_params)
lgbm_model = LGBMClassifier(**lgbm_params)
catboost_model = CatBoostClassifier(**catboost_params)

In [16]:
models = {
    'rf': rf_model,
    'xgb': xgb_model,
    'ltgbm': lgbm_model,
    'catboost': catboost_model
}

In [17]:
results_ensemble_models = {}

for name, model in models.items():
    print(name)
    res=[]
    for i, (train_index, test_index) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)#[:, 1].reshape(-1)
        res.append(log_loss(y_test, y_pred))
    results_ensemble_models[name] = res

rf
xgb
ltgbm
catboost


In [18]:
for name, result in results_ensemble_models.items():
    print("----------\n" + name)
    print(np.mean(result))
    print(np.std(result))

----------
rf
0.4524705042347875
0.010268871384998064
----------
xgb
0.4169351478855834
0.009326398997272563
----------
ltgbm
0.41859683835370787
0.011281942151841277
----------
catboost
0.43798087389648843
0.011490234969515446


In [19]:
final_model = VotingClassifier(estimators=[('xgb', xgb_model),
                                           ('lgbm', lgbm_model), 
                                           ('catboost', catboost_model)],
                               voting='soft', 
                               weights=[.4, .4, .2])

In [20]:
results_ensemble = []

r_ensemble = abs(cross_val_score(final_model, X, y, scoring='neg_log_loss'))
results_ensemble.append(r_ensemble)
print(f'Log Loss: {np.mean(results_ensemble).round(4)}')

Log Loss: 0.4136


In [21]:
final_model.fit(X, y)

VotingClassifier(estimators=[('xgb',
                              XGBClassifier(base_score=0.5, booster='gbtree',
                                            colsample_bylevel=1,
                                            colsample_bynode=1,
                                            colsample_bytree=0.10677749949795919,
                                            enable_categorical=False,
                                            eval_metric='logloss',
                                            gamma=1.437410086537951e-09,
                                            gpu_id=-1, importance_type=None,
                                            interaction_constraints='',
                                            learning_rate=0.029422764737469166,
                                            max_delta_step=0, max_dep...
                              LGBMClassifier(colsample_bytree=0.12198327413783293,
                                             learning_rate=0.02169163462617219,

In [None]:
final_predictions = final_model.predict_proba(test_final)

res = pd.DataFrame(final_predictions, columns=['Status_C', 'Status_CL', 'Status_D'])
res['id'] = test.id
res = res[['id', 'Status_C', 'Status_CL', 'Status_D']]
res.to_csv('submission.csv', index=False)

In [16]:
# catboost.fit(X, y)
# pred = catboost.predict_proba(test_final)

# res = pd.DataFrame(pred, columns=['Status_C', 'Status_CL', 'Status_D'])
# res['id'] = test.id
# res = res[['id', 'Status_C', 'Status_CL', 'Status_D']]
# res.to_csv('submission.csv', index=False)