### This notebook will serve as a template code for anyone starting a proper machine learning project.

## 1. Data and Library Load

In [95]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pickle
import os
import random

# ML
from sklearn.ensemble import RandomForestClassifier  # Bagging
from xgboost.sklearn import XGBClassifier            # GBM
from sklearn.linear_model import LogisticRegression  # LogisticRegression
from sklearn.svm import SVC                          # SVM

# DL
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Dense, ReLU, Softmax, Dropout
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

# for checking multi-collinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor

# KFold(CV), partial : for optuna
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from functools import partial
from imblearn.over_sampling import SMOTE

# AutoML framework
import optuna

In [96]:
# set configs
is_tuning = True
is_scaling = True
is_pca = False
apply_vif = False
sampling_method = 'under' # 'under' or 'over'
if is_tuning:
    n_trials=50
    
# Keras model compile
learning_rate = 1e-2
batch_size = 32
epochs = 10

In [97]:
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    tf.random.set_seed(seed)
    
seed_everything()

In [98]:
def balance_logloss(y_true, y_pred):
    y_pred = np.clip(y_pred, 1e-15, 1-1e-15)
    y_pred /= np.sum(y_pred, axis=1)[:, None]
    nc = np.bincount(y_true)
    
    logloss = (-1/nc[0]*(np.sum(np.where(y_true==0,1,0) * np.log(y_pred[:,0]))) - 1/nc[1]*(np.sum(np.where(y_true!=0,1,0) * np.log(y_pred[:,1])))) / 2
    
    return logloss

# def balance_loglossv2(y_true, y_pred):
#     from sklearn.metrics import log_loss
    
#     target_mean = y_true.mean()
#     w0 = 1/(1-target_mean)
#     w1 = 1/target_mean
#     sample_weight = [w0 if y == 0 else w1 for y in y_true]
#     loss = log_loss(y_true, y_pred, sample_weight=sample_weight)
    
#     return loss

def b_logloss_keras(y_true, y_pred):
    #(2, ) -> (1, ) inverse_one_hot encoding
    #y_true = y_true[:, 0] * (1 - y_true[:, 1]) + y_true[:, 1] * (1 - y_true[:, 0])
    y_true = y_true[:, 1] * (1 - y_true[:, 0])
    y_true = tf.cast(y_true, tf.int64)
    score = tf.py_function(func=balance_logloss, inp=[y_true, y_pred], Tout=tf.float32)
    return score

In [99]:
train = pd.read_csv('../input/icr-identify-age-related-conditions/train.csv')
test = pd.read_csv('../input/icr-identify-age-related-conditions/test.csv')
metadata = pd.read_csv('../input/icr-identify-age-related-conditions/greeks.csv')
print(train.shape, test.shape)

(617, 58) (5, 57)


In [100]:
#train.EJ.value_counts()
#train.Class.value_counts() # class imbalance

## categorical feature -> EJ
## 결측치들이 좀 있다 -> KNNImputer
## class imbalance가 있다 -> sampling (undersampling vs oversampling)

# undersampling : majority class에서 데이터를 sampling해서 1:1의 비율로 맞춰주는 기법.
#                e.g. class0 : 1000, class1 : 100 ----> class0: 100, class1: 100

# oversampling : minority class에서 데이터를 생성(generation)하여 1:1로 크기를 맞춰주는 기법.
#                e.g. class0 : 1000, class1 : 100 -----> class0 : 1000, class1 : 1000

## 2. EDA

In [101]:
lb = LabelEncoder()
train.EJ = lb.fit_transform(train.EJ)  # A->0, B->1

train = train.drop(columns=["Id"])

In [102]:
imp = KNNImputer()
data = imp.fit_transform(train)
train = pd.DataFrame(columns=train.columns,
                    data=data)
train

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
0,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,4126.58731,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1.0
1,0.145282,978.76416,85.200147,36.968889,8.138688,3.632190,0.025578,13.517790,1.229900,5496.92824,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.562750,29.135430,32.131996,21.978000,0.0
2,0.470030,2635.10654,85.200147,32.360553,8.138688,6.732840,0.025578,12.824570,1.229900,5135.78024,...,7.709560,0.97556,1.198821,37.077772,88.609437,13676.957810,28.022851,35.192676,0.196941,0.0
3,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.229900,4169.67738,...,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829,0.0
4,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.054810,3.396778,102.151980,5728.73412,...,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612,0.149555,3130.05946,123.763599,9.513984,13.020852,3.499305,0.077343,8.545512,2.804172,4157.68439,...,0.173229,1.26092,0.067730,8.967128,217.148554,8095.932828,24.640462,69.191944,21.978000,0.0
613,0.435846,5462.03438,85.200147,46.551007,15.973224,5.979825,0.025882,12.622906,3.777550,5654.07556,...,10.223150,1.24236,0.426699,35.896418,496.994214,3085.308063,29.648928,124.808872,0.145340,0.0
614,0.427300,2459.10720,130.138587,55.355778,10.005552,8.070549,0.025578,15.408390,1.229900,5888.87769,...,0.173229,0.49706,0.067730,19.962092,128.896894,6474.652866,26.166072,119.559420,21.978000,0.0
615,0.363205,1263.53524,85.200147,23.685856,8.138688,7.981959,0.025578,7.524588,1.229900,4517.86560,...,9.256996,0.78764,0.670527,24.594488,72.611063,1965.343176,25.116750,37.155112,0.184622,0.0


In [103]:
# check missing data
train[train.isnull().any(axis=1)]

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class


In [104]:
def check_vif(df):
    vifs = [variance_inflation_factor(df, i) for i in range(df.shape[1])]
    vif_df = pd.DataFrame({"features":df.columns, "VIF" : vifs})
    vif_df = vif_df.sort_values(by="VIF", ascending=False)
    remove_col = vif_df.iloc[0, 0]
    top_vif = vif_df.iloc[0, 1]
    return vif_df, remove_col, top_vif

In [105]:
# remove all features when VIF is over 10.
if apply_vif:
    top_vif = 100

    while(top_vif > 5):
        vif_df, remove_col, top_vif = check_vif(train)
        print(remove_col, top_vif)
        if top_vif < 5:
            break
        train = train.drop(columns=remove_col)

    display(train)

In [106]:
# feature selection via Feature Importance
X = train.drop(columns=["Class"])
y = train['Class']

rf = RandomForestClassifier()
rf.fit(X, y)
fi_df = pd.DataFrame({'feature':X.columns, 'importance':rf.feature_importances_})
selected_cols = fi_df.sort_values(by="importance", ascending=False)[:10]["feature"].values
selected_cols

array(['DU', 'FL', 'GL', 'DA', 'CR', 'DI', 'AF', 'AB', 'FD ', 'BC'],
      dtype=object)

## 3. Data preprocessing

In [107]:
# class imbalance handling
## 1. undersampling
if sampling_method == 'under':
    c1 = train[train.Class == 1]
    c0 = train[train.Class == 0]
    print(c1.shape, c0.shape) # 108, 509 -> 108, 108
    c0 = c0.sample(n=c1.shape[0]) # 509 -> 108
    train = pd.concat([c0, c1])
    print(train.shape)

(108, 57) (509, 57)
(216, 57)


In [108]:
# # oversampling전
# df = train[selected_cols]
# df["Class"] = train["Class"]
# pd.pivot_table(index="Class", data=df)

In [109]:
## 2. oversampling -> SMOTE
if sampling_method == 'over':
    X = train[selected_cols]
    y = train['Class']

    smote = SMOTE(k_neighbors=5)
    # fit_resample 함수가 자동으로 minority class를 y 기준으로 찾아서, 1:1로 맞춰줌.
    X_resampled, y_resampled = smote.fit_resample(X, y)
    print(X_resampled.shape, y_resampled.shape)
    X_resampled["Class"] = y_resampled
    train = X_resampled

In [110]:
# # SMOTE 적용 후
# df = X_resampled.copy()
# df["Class"] = y_resampled
# pd.pivot_table(index="Class", data=df)

In [111]:
#3 hybrid approach
## class 0 : 509 -> 300
## class 1 : 108 -> 300
# class imbalance handling
## 1. undersampling
if sampling_method == 'hybrid':
    N=300
    c1 = train[train.Class == 1]
    c0 = train[train.Class == 0]
    print(c1.shape, c0.shape) # 108, 509 -> 108, 108
    c0 = c0.sample(n=N) # 509 -> 300
    train = pd.concat([c0, c1])
    print(train.shape)
    
    X = train[selected_cols]
    y = train['Class']

    smote = SMOTE(k_neighbors=5)
    # fit_resample 함수가 자동으로 minority class를 y 기준으로 찾아서, 1:1로 맞춰줌.
    X_resampled, y_resampled = smote.fit_resample(X, y)
    print(X_resampled.shape, y_resampled.shape)
    X_resampled["Class"] = y_resampled
    train = X_resampled

In [112]:
# to make OOF prediction
from sklearn.model_selection import train_test_split

#X = train.drop(columns=["Class"])
X = train[selected_cols]
y = train['Class']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, stratify=y, random_state=42)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(183, 10) (33, 10) (183,) (33,)


### feature scaling

- Use StandardScaler

In [113]:
from sklearn.preprocessing import StandardScaler

if is_scaling:
    scaler = StandardScaler()
    data_ = scaler.fit_transform(X_train)
    X_train = pd.DataFrame(data=data_, columns=X_train.columns)
    data_ = scaler.transform(X_val)
    X_val = pd.DataFrame(data=data_, columns=X_val.columns)
    display(X_train)

Unnamed: 0,DU,FL,GL,DA,CR,DI,AF,AB,FD,BC
0,-0.183137,-0.178877,-0.721583,-0.528486,-0.363361,-0.616390,-1.252596,-0.371225,-0.108240,-0.152155
1,-0.254279,-0.533679,1.459620,-0.590467,0.472201,-0.259141,-0.117621,-0.434960,-0.132366,-0.152155
2,-0.254279,-0.533679,1.459620,0.362826,1.003869,-1.055136,-0.397176,-0.530561,-0.132366,-0.092919
3,-0.254193,-0.496279,-0.021642,1.176495,0.637934,-0.359297,-0.504276,-0.460453,-0.116935,-0.122596
4,-0.069880,-0.028440,-0.705577,0.093260,0.339786,-0.360947,-0.116977,1.088287,0.012842,-0.134059
...,...,...,...,...,...,...,...,...,...,...
178,0.000745,0.127586,-0.725154,0.603003,-0.253925,-0.418235,-0.430262,-0.415839,-0.072302,-0.105181
179,-0.244719,-0.333869,-0.650212,-0.874350,-0.405010,0.284073,-0.135461,-0.377599,-0.092206,-0.144603
180,-0.254279,-0.533679,1.459620,-0.389534,-1.265561,1.066867,0.568318,0.100407,-0.132366,-0.087706
181,0.133811,1.105255,-0.732516,-1.140211,-0.993551,1.201920,0.889754,0.094034,-0.131562,-0.152155


In [114]:
if is_pca:
    from sklearn.decomposition import PCA
    
    pca = PCA(n_components=0.90, random_state=42)
    data_ = pca.fit_transform(X_train)
    X_train = pd.DataFrame(data=data_, columns=[f"PC{i}" for i in range(1, data_.shape[1]+1)])
    data_ = pca.transform(X_val)
    X_val = pd.DataFrame(data=data_, columns=[f"PC{i}" for i in range(1, data_.shape[1]+1)])

    display(X_train)

## 4. Fitting and Evaluation

In [115]:
# Model ensemble of SVM, Logistic Regression, XGBoost, RandomForest, Simple NN.

svm = SVC(random_state=42)
lr = LogisticRegression(random_state=42, max_iter=300)
xgb = XGBClassifier(max_depth=3, colsample_bytree=0.8, reg_lambda=1, objective='binary:logistic', random_state=42)
rf = RandomForestClassifier(max_depth=3, max_features=0.8, criterion='log_loss', random_state=42)
nn = Sequential([
    Input(shape=(X_train.shape[1],)),
    #Dense(30), ReLU(), Dropout(0.2),
    #Dense(20), ReLU(), Dropout(0.2),
    #Dense(10), ReLU(), Dropout(0.1),
    Dense(5), ReLU(), Dropout(0.2),
    Dense(2), Softmax()
])
nn.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 5)                 55        
                                                                 
 re_lu_2 (ReLU)              (None, 5)                 0         
                                                                 
 dropout_2 (Dropout)         (None, 5)                 0         
                                                                 
 dense_5 (Dense)             (None, 2)                 12        
                                                                 
 softmax_2 (Softmax)         (None, 2)                 0         
                                                                 
Total params: 67
Trainable params: 67
Non-trainable params: 0
_________________________________________________________________


In [116]:
optimizer = Adam(learning_rate=learning_rate)
loss_fn = tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.9)   # [0.8, 0.2] <--> [0.9, 0] // [0, 0.9]
scheduler = ReduceLROnPlateau(monitor='val_loss',
                              factor=0.5,
                              patience=10,
                              min_lr=1e-6)
earlystopper = EarlyStopping(monitor='val_loss',
                             patience=20,
                             min_delta=1e-2)


nn.compile(optimizer=optimizer, loss=loss_fn, metrics=[b_logloss_keras])

nn_y_train = tf.keras.utils.to_categorical(y_train, num_classes=2)
nn_y_val = tf.keras.utils.to_categorical(y_val, num_classes=2)

In [117]:
print("\nFitting LogisticRegression...")
lr.fit(X_train, y_train)
print("\nFitting SVM...")
svm.fit(X_train, y_train)
print("\nFitting RandomForest...")
rf.fit(X_train, y_train)
print("\nFitting XGBoost...")
xgb.fit(X_train, y_train)
# print("\nFitting MLP...")
# history = nn.fit(X_train, nn_y_train,
#                 batch_size=batch_size,
#                 epochs=epochs,
#                 validation_data=[X_val, nn_y_val],
#                 callbacks=[scheduler, earlystopper])


Fitting LogisticRegression...

Fitting SVM...

Fitting RandomForest...

Fitting XGBoost...


In [118]:
# ## loss visualize
# import matplotlib.pyplot as plt

# plt.figure(figsize=(12, 6))
# plt.subplot(1, 2,1)
# plt.plot(history.history['loss'],'b-', label = "training")
# plt.plot(history.history['val_loss'], 'r:', label = "validation")
# plt.title("model - loss")
# plt.legend()

# plt.subplot(1, 2, 2)
# plt.title("model - val_logloss")

# plt.plot(history.history['b_logloss_keras'], 'b-', label = "training")
# plt.plot(history.history['val_b_logloss_keras'], 'r:', label = "validation")

# plt.legend()
# plt.tight_layout()
# plt.show()

In [119]:
# set metric
evaluation_metric = balance_logloss
evaluation_metric_keras = b_logloss_keras

In [120]:
print("--- Prediction with XGB ---")
pred_train = xgb.predict_proba(X_train)
pred_val = xgb.predict_proba(X_val)

train_score = evaluation_metric(y_train, pred_train)
val_score = evaluation_metric(y_val, pred_val)

print("Train Score : %.4f" % train_score)
print("Test Score : %.4f" % val_score)

# print("--- Prediction with MLP ---")
# pred_train = nn.predict(X_train)
# pred_val = nn.predict(X_val)

# train_score = evaluation_metric_keras(nn_y_train, pred_train)
# val_score = evaluation_metric_keras(nn_y_val, pred_val)

# print("Train Score : %.4f" % train_score)
# print("Validation Score : %.4f" % val_score)

--- Prediction with XGB ---
Train Score : 0.0232
Test Score : 0.3960


## 5. (Super)Hyper-parameter Tuning

Let's try hyper-parameter tuning using optuna, an AutoML framework.

Optuna defines a target function to optimize and then optimizes that function.

For each model, we define an optimizer function separately and then burn optuna.

In [121]:
def rf_optimizer(trial, X, y, K):
    # define parameter to tune
    n_estimators = trial.suggest_categorical('n_estimators', [50, 100, 200])
    max_depth = trial.suggest_int('max_depth', 4, 10)
    max_features = trial.suggest_categorical('max_features', [0.6, 0.7, 0.8])
    
    
    # set model
    model = RandomForestClassifier(n_estimators=n_estimators,
                                   max_depth=max_depth,
                                   max_features=max_features,
                                   criterion='log_loss',
                                   class_weight='balanced'
                                  )
    
    # K-Fold Cross validation
    folds = StratifiedKFold(n_splits=K)
    losses = []
    
    for train_idx, val_idx in folds.split(X, y):
        X_train = X.iloc[train_idx, :]
        y_train = y.iloc[train_idx]
        
        X_val = X.iloc[val_idx, :]
        y_val = y.iloc[val_idx]
        
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_val)
        loss = evaluation_metric(y_val, preds)
        losses.append(loss)
    
    
    # return mean score of CV
    return np.mean(losses)

In [122]:
def svm_optimizer(trial, X, y, K):
    C = trial.suggest_categorical('C', [1, 5, 10])
    kernel = trial.suggest_categorical('kernel', ['rbf'])
    

    model = SVC(C=C,
                kernel=kernel,
                class_weight='balanced', # if class imbalanced
                probability=True,
                cache_size=1000,
                random_state=42
               )
    
    folds = StratifiedKFold(n_splits=K)
    losses = []
    
    for train_idx, val_idx in folds.split(X, y):
        X_train = X.iloc[train_idx, :]
        y_train = y.iloc[train_idx]
        
        X_val = X.iloc[val_idx, :]
        y_val = y.iloc[val_idx]
        
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_val)
        loss = evaluation_metric(y_val, preds)
        losses.append(loss)
    
    
    return np.mean(losses)

In [123]:
def lr_optimizer(trial, X, y, K):

    C = trial.suggest_categorical('C', [5, 10, 20, 50, 100])
    solver = trial.suggest_categorical('solver', ['liblinear', 'newton-cg', 'newton-cholesky', 'saga'])    
    

    model = LogisticRegression(C=C,
                               solver=solver,
                               max_iter=500,
                               class_weight='balanced',
                               random_state=42,
                               n_jobs=-1)
    
    
    folds = StratifiedKFold(n_splits=K)
    losses = []
    
    for train_idx, val_idx in folds.split(X, y):
        X_train = X.iloc[train_idx, :]
        y_train = y.iloc[train_idx]
        
        X_val = X.iloc[val_idx, :]
        y_val = y.iloc[val_idx]
        
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_val)
        loss = evaluation_metric(y_val, preds)
        losses.append(loss)
    
    
    return np.mean(losses)

In [124]:
def xgb_optimizer(trial, X, y, K):
    n_estimators = trial.suggest_categorical('n_estimators', [500, 1000, 2000])
    max_depth = trial.suggest_int('max_depth', 4, 10)
    colsample_bytree = trial.suggest_categorical('colsample_bytree', [0.5, 0.6, 0.7, 0.8])
    learning_rate = trial.suggest_float('learning_rate', 1e-3, 1e-2)
    reg_lambda = trial.suggest_categorical('reg_lambda', [0.1, 0.5, 1, 2])
    
    
    model = XGBClassifier(n_estimators=n_estimators,
                          max_depth=max_depth,
                          colsample_bytree=colsample_bytree,
                          learning_rate=learning_rate,
                          reg_lambda=reg_lambda,
                          scale_pos_weight=4.71)
    
    
    folds = StratifiedKFold(n_splits=K)
    losses = []
    
    for train_idx, val_idx in folds.split(X, y):
        X_train = X.iloc[train_idx, :]
        y_train = y.iloc[train_idx]
        
        X_val = X.iloc[val_idx, :]
        y_val = y.iloc[val_idx]
        
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_val)
        loss = evaluation_metric(y_val, preds)
        losses.append(loss)
    
    
    return np.mean(losses)

In [125]:
K = 4 # set K of K-Fold
opt_func = partial(rf_optimizer, X=X_train, y=y_train, K=K)

if is_tuning:
    rf_study = optuna.create_study(direction="minimize") # determine minimize or maximize sth
    rf_study.optimize(opt_func, n_trials=n_trials)

[32m[I 2023-06-16 04:59:52,374][0m A new study created in memory with name: no-name-d4ffddc4-b48d-4857-9fdc-064053f5c1e9[0m
[32m[I 2023-06-16 04:59:53,453][0m Trial 0 finished with value: 0.3852976910980026 and parameters: {'n_estimators': 100, 'max_depth': 4, 'max_features': 0.7}. Best is trial 0 with value: 0.3852976910980026.[0m
[32m[I 2023-06-16 04:59:54,528][0m Trial 1 finished with value: 0.4094831944593792 and parameters: {'n_estimators': 100, 'max_depth': 5, 'max_features': 0.8}. Best is trial 0 with value: 0.3852976910980026.[0m
[32m[I 2023-06-16 04:59:55,088][0m Trial 2 finished with value: 0.5405278480805846 and parameters: {'n_estimators': 50, 'max_depth': 8, 'max_features': 0.8}. Best is trial 0 with value: 0.3852976910980026.[0m
[32m[I 2023-06-16 04:59:55,639][0m Trial 3 finished with value: 0.537937325412134 and parameters: {'n_estimators': 50, 'max_depth': 9, 'max_features': 0.8}. Best is trial 0 with value: 0.3852976910980026.[0m
[32m[I 2023-06-16 04:59

In [126]:
K = 4
opt_func = partial(lr_optimizer, X=X_train, y=y_train, K=K)

if is_tuning:
    lr_study = optuna.create_study(direction="minimize") 
    lr_study.optimize(opt_func, n_trials=n_trials)

[32m[I 2023-06-16 05:00:42,845][0m A new study created in memory with name: no-name-16344d44-69d6-4712-8a00-4b95d54061c0[0m
[32m[I 2023-06-16 05:00:45,152][0m Trial 0 finished with value: 0.4127726940370331 and parameters: {'C': 50, 'solver': 'newton-cholesky'}. Best is trial 0 with value: 0.4127726940370331.[0m
[32m[I 2023-06-16 05:00:45,237][0m Trial 1 finished with value: 0.43556616649770524 and parameters: {'C': 5, 'solver': 'newton-cg'}. Best is trial 0 with value: 0.4127726940370331.[0m

'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 4.


'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 4.


'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 4.


'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 4.

[32m[I 2023-06-16 05:00:45,271][0m Trial 2 finished with value: 0.4190425730754674 and parameters: {'C': 20, 's

In [127]:
K = 4 
opt_func = partial(svm_optimizer, X=X_train, y=y_train, K=K)

if is_tuning:
    svm_study = optuna.create_study(direction="minimize") 
    svm_study.optimize(opt_func, n_trials=n_trials)

[32m[I 2023-06-16 05:00:47,736][0m A new study created in memory with name: no-name-0f4dbd3d-a5d4-4550-85e7-56f38255975a[0m
[32m[I 2023-06-16 05:00:47,786][0m Trial 0 finished with value: 0.4436699226709506 and parameters: {'C': 10, 'kernel': 'rbf'}. Best is trial 0 with value: 0.4436699226709506.[0m
[32m[I 2023-06-16 05:00:47,830][0m Trial 1 finished with value: 0.4387213616986463 and parameters: {'C': 5, 'kernel': 'rbf'}. Best is trial 1 with value: 0.4387213616986463.[0m
[32m[I 2023-06-16 05:00:47,874][0m Trial 2 finished with value: 0.4436699226709506 and parameters: {'C': 10, 'kernel': 'rbf'}. Best is trial 1 with value: 0.4387213616986463.[0m
[32m[I 2023-06-16 05:00:47,917][0m Trial 3 finished with value: 0.44694004878038185 and parameters: {'C': 1, 'kernel': 'rbf'}. Best is trial 1 with value: 0.4387213616986463.[0m
[32m[I 2023-06-16 05:00:47,962][0m Trial 4 finished with value: 0.4387213616986463 and parameters: {'C': 5, 'kernel': 'rbf'}. Best is trial 1 with v

In [128]:
K = 4
opt_func = partial(xgb_optimizer, X=X_train, y=y_train, K=K)

if is_tuning:
    xgb_study = optuna.create_study(direction="minimize")
    xgb_study.optimize(opt_func, n_trials=n_trials)

[32m[I 2023-06-16 05:00:50,149][0m A new study created in memory with name: no-name-8adf06d7-0bb1-4fe8-917f-308abcccd899[0m
[32m[I 2023-06-16 05:00:51,154][0m Trial 0 finished with value: 0.4135132427621825 and parameters: {'n_estimators': 500, 'max_depth': 4, 'colsample_bytree': 0.5, 'learning_rate': 0.005005049128827971, 'reg_lambda': 0.1}. Best is trial 0 with value: 0.4135132427621825.[0m
[32m[I 2023-06-16 05:00:53,301][0m Trial 1 finished with value: 0.42158760151111707 and parameters: {'n_estimators': 500, 'max_depth': 5, 'colsample_bytree': 0.7, 'learning_rate': 0.007691550171475883, 'reg_lambda': 1}. Best is trial 0 with value: 0.4135132427621825.[0m
[32m[I 2023-06-16 05:00:58,193][0m Trial 2 finished with value: 0.4204337778574835 and parameters: {'n_estimators': 2000, 'max_depth': 8, 'colsample_bytree': 0.6, 'learning_rate': 0.0013275253626411157, 'reg_lambda': 2}. Best is trial 0 with value: 0.4135132427621825.[0m
[32m[I 2023-06-16 05:00:59,369][0m Trial 3 fini

In [129]:
# save all studies
if is_tuning:
    with open("rm_study.pk", 'wb') as f:
        pickle.dump(rf_study, f)
    with open("lr_study.pk", 'wb') as f:
        pickle.dump(lr_study, f)
    with open("svm_study.pk", 'wb') as f:
        pickle.dump(svm_study, f)
    with open("xgb_study.pk", 'wb') as f:
        pickle.dump(xgb_study, f)
    
    nn.save("./simple_nn_model.keras")

In [130]:
# visualize experiment logs
def display_experiment_log(study):
    display(study.trials_dataframe())
    print("Best Score: %.4f" % study.best_value)
    print("Best params: ", study.best_trial.params)
    history = study.trials_dataframe()
    display(history[history.value == study.best_value])
    optuna.visualization.plot_optimization_history(study).show()
    optuna.visualization.plot_param_importances(study).show()

In [131]:
if is_tuning:
    display_experiment_log(rf_study)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_max_depth,params_max_features,params_n_estimators,state
0,0,0.385298,2023-06-16 04:59:52.377563,2023-06-16 04:59:53.453135,0 days 00:00:01.075572,4,0.7,100,COMPLETE
1,1,0.409483,2023-06-16 04:59:53.455199,2023-06-16 04:59:54.528291,0 days 00:00:01.073092,5,0.8,100,COMPLETE
2,2,0.540528,2023-06-16 04:59:54.530169,2023-06-16 04:59:55.088250,0 days 00:00:00.558081,8,0.8,50,COMPLETE
3,3,0.537937,2023-06-16 04:59:55.090358,2023-06-16 04:59:55.639151,0 days 00:00:00.548793,9,0.8,50,COMPLETE
4,4,0.385107,2023-06-16 04:59:55.641179,2023-06-16 04:59:56.671916,0 days 00:00:01.030737,5,0.6,100,COMPLETE
5,5,0.373349,2023-06-16 04:59:56.673650,2023-06-16 04:59:57.231422,0 days 00:00:00.557772,6,0.8,50,COMPLETE
6,6,0.385745,2023-06-16 04:59:57.233094,2023-06-16 04:59:58.304283,0 days 00:00:01.071189,7,0.7,100,COMPLETE
7,7,0.537402,2023-06-16 04:59:58.306057,2023-06-16 04:59:58.961471,0 days 00:00:00.655414,10,0.6,50,COMPLETE
8,8,0.529374,2023-06-16 04:59:58.963250,2023-06-16 04:59:59.540000,0 days 00:00:00.576750,5,0.8,50,COMPLETE
9,9,0.394436,2023-06-16 04:59:59.542043,2023-06-16 05:00:00.097306,0 days 00:00:00.555263,4,0.8,50,COMPLETE


Best Score: 0.3614
Best params:  {'n_estimators': 50, 'max_depth': 9, 'max_features': 0.7}


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_max_depth,params_max_features,params_n_estimators,state
20,20,0.36137,2023-06-16 05:00:16.407702,2023-06-16 05:00:16.961494,0 days 00:00:00.553792,9,0.7,50,COMPLETE


In [132]:
if is_tuning:
    display_experiment_log(lr_study)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_C,params_solver,state
0,0,0.412773,2023-06-16 05:00:42.849449,2023-06-16 05:00:45.152252,0 days 00:00:02.302803,50,newton-cholesky,COMPLETE
1,1,0.435566,2023-06-16 05:00:45.155790,2023-06-16 05:00:45.237211,0 days 00:00:00.081421,5,newton-cg,COMPLETE
2,2,0.419043,2023-06-16 05:00:45.238824,2023-06-16 05:00:45.271586,0 days 00:00:00.032762,20,liblinear,COMPLETE
3,3,0.426487,2023-06-16 05:00:45.273158,2023-06-16 05:00:45.328188,0 days 00:00:00.055030,10,newton-cholesky,COMPLETE
4,4,0.410125,2023-06-16 05:00:45.329838,2023-06-16 05:00:45.365724,0 days 00:00:00.035886,100,liblinear,COMPLETE
5,5,0.426495,2023-06-16 05:00:45.367061,2023-06-16 05:00:45.443667,0 days 00:00:00.076606,10,newton-cg,COMPLETE
6,6,0.426487,2023-06-16 05:00:45.445130,2023-06-16 05:00:45.500300,0 days 00:00:00.055170,10,newton-cholesky,COMPLETE
7,7,0.435572,2023-06-16 05:00:45.501940,2023-06-16 05:00:45.558294,0 days 00:00:00.056354,5,newton-cholesky,COMPLETE
8,8,0.435572,2023-06-16 05:00:45.560649,2023-06-16 05:00:45.614633,0 days 00:00:00.053984,5,newton-cholesky,COMPLETE
9,9,0.412773,2023-06-16 05:00:45.616186,2023-06-16 05:00:45.672189,0 days 00:00:00.056003,50,newton-cholesky,COMPLETE


Best Score: 0.4101
Best params:  {'C': 100, 'solver': 'liblinear'}


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_C,params_solver,state
4,4,0.410125,2023-06-16 05:00:45.329838,2023-06-16 05:00:45.365724,0 days 00:00:00.035886,100,liblinear,COMPLETE
11,11,0.410125,2023-06-16 05:00:45.796618,2023-06-16 05:00:45.829678,0 days 00:00:00.033060,100,liblinear,COMPLETE
12,12,0.410125,2023-06-16 05:00:45.831122,2023-06-16 05:00:45.863979,0 days 00:00:00.032857,100,liblinear,COMPLETE
13,13,0.410125,2023-06-16 05:00:45.865505,2023-06-16 05:00:45.898639,0 days 00:00:00.033134,100,liblinear,COMPLETE
14,14,0.410125,2023-06-16 05:00:45.900310,2023-06-16 05:00:45.932361,0 days 00:00:00.032051,100,liblinear,COMPLETE
15,15,0.410125,2023-06-16 05:00:45.933660,2023-06-16 05:00:45.965872,0 days 00:00:00.032212,100,liblinear,COMPLETE
17,17,0.410125,2023-06-16 05:00:46.084038,2023-06-16 05:00:46.117639,0 days 00:00:00.033601,100,liblinear,COMPLETE
18,18,0.410125,2023-06-16 05:00:46.119004,2023-06-16 05:00:46.152869,0 days 00:00:00.033865,100,liblinear,COMPLETE
19,19,0.410125,2023-06-16 05:00:46.154281,2023-06-16 05:00:46.185946,0 days 00:00:00.031665,100,liblinear,COMPLETE
21,21,0.410125,2023-06-16 05:00:46.267652,2023-06-16 05:00:46.302502,0 days 00:00:00.034850,100,liblinear,COMPLETE


In [133]:
if is_tuning:
    display_experiment_log(svm_study)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_C,params_kernel,state
0,0,0.44367,2023-06-16 05:00:47.740760,2023-06-16 05:00:47.785870,0 days 00:00:00.045110,10,rbf,COMPLETE
1,1,0.438721,2023-06-16 05:00:47.787423,2023-06-16 05:00:47.829970,0 days 00:00:00.042547,5,rbf,COMPLETE
2,2,0.44367,2023-06-16 05:00:47.831427,2023-06-16 05:00:47.873983,0 days 00:00:00.042556,10,rbf,COMPLETE
3,3,0.44694,2023-06-16 05:00:47.875440,2023-06-16 05:00:47.916991,0 days 00:00:00.041551,1,rbf,COMPLETE
4,4,0.438721,2023-06-16 05:00:47.918519,2023-06-16 05:00:47.962345,0 days 00:00:00.043826,5,rbf,COMPLETE
5,5,0.44694,2023-06-16 05:00:47.963847,2023-06-16 05:00:48.008993,0 days 00:00:00.045146,1,rbf,COMPLETE
6,6,0.44694,2023-06-16 05:00:48.010586,2023-06-16 05:00:48.056152,0 days 00:00:00.045566,1,rbf,COMPLETE
7,7,0.44694,2023-06-16 05:00:48.057657,2023-06-16 05:00:48.101680,0 days 00:00:00.044023,1,rbf,COMPLETE
8,8,0.438721,2023-06-16 05:00:48.103264,2023-06-16 05:00:48.148682,0 days 00:00:00.045418,5,rbf,COMPLETE
9,9,0.44367,2023-06-16 05:00:48.150207,2023-06-16 05:00:48.195488,0 days 00:00:00.045281,10,rbf,COMPLETE


Best Score: 0.4387
Best params:  {'C': 5, 'kernel': 'rbf'}


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_C,params_kernel,state
1,1,0.438721,2023-06-16 05:00:47.787423,2023-06-16 05:00:47.829970,0 days 00:00:00.042547,5,rbf,COMPLETE
4,4,0.438721,2023-06-16 05:00:47.918519,2023-06-16 05:00:47.962345,0 days 00:00:00.043826,5,rbf,COMPLETE
8,8,0.438721,2023-06-16 05:00:48.103264,2023-06-16 05:00:48.148682,0 days 00:00:00.045418,5,rbf,COMPLETE
10,10,0.438721,2023-06-16 05:00:48.196963,2023-06-16 05:00:48.241071,0 days 00:00:00.044108,5,rbf,COMPLETE
11,11,0.438721,2023-06-16 05:00:48.242603,2023-06-16 05:00:48.284995,0 days 00:00:00.042392,5,rbf,COMPLETE
12,12,0.438721,2023-06-16 05:00:48.286588,2023-06-16 05:00:48.329519,0 days 00:00:00.042931,5,rbf,COMPLETE
13,13,0.438721,2023-06-16 05:00:48.331061,2023-06-16 05:00:48.375892,0 days 00:00:00.044831,5,rbf,COMPLETE
14,14,0.438721,2023-06-16 05:00:48.377384,2023-06-16 05:00:48.422174,0 days 00:00:00.044790,5,rbf,COMPLETE
15,15,0.438721,2023-06-16 05:00:48.423873,2023-06-16 05:00:48.469852,0 days 00:00:00.045979,5,rbf,COMPLETE
16,16,0.438721,2023-06-16 05:00:48.471354,2023-06-16 05:00:48.529772,0 days 00:00:00.058418,5,rbf,COMPLETE


In [134]:
if is_tuning:
    display_experiment_log(xgb_study)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_learning_rate,params_max_depth,params_n_estimators,params_reg_lambda,state
0,0,0.413513,2023-06-16 05:00:50.153019,2023-06-16 05:00:51.154388,0 days 00:00:01.001369,0.5,0.005005,4,500,0.1,COMPLETE
1,1,0.421588,2023-06-16 05:00:51.156618,2023-06-16 05:00:53.300854,0 days 00:00:02.144236,0.7,0.007692,5,500,1.0,COMPLETE
2,2,0.420434,2023-06-16 05:00:53.302867,2023-06-16 05:00:58.193091,0 days 00:00:04.890224,0.6,0.001328,8,2000,2.0,COMPLETE
3,3,0.412528,2023-06-16 05:00:58.195024,2023-06-16 05:00:59.369023,0 days 00:00:01.173999,0.5,0.009849,5,500,1.0,COMPLETE
4,4,0.434395,2023-06-16 05:00:59.370978,2023-06-16 05:01:00.352767,0 days 00:00:00.981789,0.7,0.004945,4,500,2.0,COMPLETE
5,5,0.482369,2023-06-16 05:01:00.354754,2023-06-16 05:01:03.731895,0 days 00:00:03.377141,0.5,0.009406,5,2000,1.0,COMPLETE
6,6,0.422941,2023-06-16 05:01:03.733843,2023-06-16 05:01:08.981789,0 days 00:00:05.247946,0.5,0.001011,10,2000,1.0,COMPLETE
7,7,0.463587,2023-06-16 05:01:08.983780,2023-06-16 05:01:13.181943,0 days 00:00:04.198163,0.7,0.004318,10,2000,0.5,COMPLETE
8,8,0.409256,2023-06-16 05:01:13.184080,2023-06-16 05:01:17.292789,0 days 00:00:04.108709,0.5,0.002471,5,2000,2.0,COMPLETE
9,9,0.420021,2023-06-16 05:01:17.294677,2023-06-16 05:01:19.734692,0 days 00:00:02.440015,0.7,0.003875,6,1000,2.0,COMPLETE


Best Score: 0.3938
Best params:  {'n_estimators': 2000, 'max_depth': 6, 'colsample_bytree': 0.5, 'learning_rate': 0.0017461909745651355, 'reg_lambda': 0.1}


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_learning_rate,params_max_depth,params_n_estimators,params_reg_lambda,state
24,24,0.393805,2023-06-16 05:02:17.650122,2023-06-16 05:02:22.674643,0 days 00:00:05.024521,0.5,0.001746,6,2000,0.1,COMPLETE


## 6. Test Prediction and Make Submission

In [135]:
## preprocessing in same way
X_test = test[X.columns].fillna(X.mean())
if is_scaling:
    X_test = scaler.transform(X_test)
    X_test = pd.DataFrame(data=X_test, columns=X.columns)

if is_pca:
    data_ = pca.transform(X_test)
    X_test = pd.DataFrame(data=data_, columns=[f"PC{i}" for i in range(1, data_.shape[1]+1)])

X_test

Unnamed: 0,DU,FL,GL,DA,CR,DI,AF,AB,FD,BC
0,-0.254623,-0.54584,-0.732647,-2.075146,-2.608664,-1.655132,-1.312602,-0.881099,-0.134879,-0.162522
1,-0.254623,-0.54584,-0.732647,-2.075146,-2.608664,-1.655132,-1.312602,-0.881099,-0.134879,-0.162522
2,-0.254623,-0.54584,-0.732647,-2.075146,-2.608664,-1.655132,-1.312602,-0.881099,-0.134879,-0.162522
3,-0.254623,-0.54584,-0.732647,-2.075146,-2.608664,-1.655132,-1.312602,-0.881099,-0.134879,-0.162522
4,-0.254623,-0.54584,-0.732647,-2.075146,-2.608664,-1.655132,-1.312602,-0.881099,-0.134879,-0.162522


In [136]:
# Finalize Models
if is_tuning:
    rf_best_params = rf_study.best_params
    lr_best_params = lr_study.best_params
    xgb_best_params = xgb_study.best_params
    svm_best_params = svm_study.best_params

    best_rf = RandomForestClassifier(**rf_best_params)
    best_lr = LogisticRegression(**lr_best_params)
    best_xgb = XGBClassifier(**xgb_best_params)
    best_svm = SVC(**svm_best_params, probability=True)

### 위에서 만든 testset으로 OOF Prediction 성능 확인 필요!

In [137]:
# first ensebmle model, then check it.
best_rf.fit(X_train, y_train)
best_lr.fit(X_train, y_train)
best_xgb.fit(X_train, y_train)
best_svm.fit(X_train, y_train)

# OOF-prediction
v_rf = best_rf.predict_proba(X_val)
v_lr = best_lr.predict_proba(X_val)
v_xgb = best_xgb.predict_proba(X_val)
v_svm = best_svm.predict_proba(X_val)
print(v_rf.shape, v_lr.shape, v_xgb.shape, v_svm.shape)

preds_rf = best_rf.predict_proba(X_test)
preds_lr = best_lr.predict_proba(X_test)
preds_xgb = best_xgb.predict_proba(X_test)
preds_svm = best_svm.predict_proba(X_test)
print(preds_rf.shape, preds_lr.shape, preds_xgb.shape, preds_svm.shape)

(33, 2) (33, 2) (33, 2) (33, 2)
(5, 2) (5, 2) (5, 2) (5, 2)


In [138]:
# # MLP predictions
# v_nn = nn.predict(X_val)
# preds_nn = nn.predict(X_test)
# print(v_nn.shape, preds_nn.shape)

In [139]:
# OOF prediction
#ensembles = np.mean([v_rf, v_lr, v_xgb, v_svm, v_nn], axis=0)
ensembles = np.mean([v_rf, v_lr, v_xgb, v_svm], axis=0)
print("OOF prediction logloss : %.4f" % evaluation_metric(y_val, ensembles))

OOF prediction logloss : 0.3699


In [140]:
submission = pd.read_csv('../input/icr-identify-age-related-conditions/sample_submission.csv')
submission

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.5,0.5
1,010ebe33f668,0.5,0.5
2,02fa521e1838,0.5,0.5
3,040e15f562a2,0.5,0.5
4,046e85c7cc7f,0.5,0.5


In [141]:
#voting_weights = [0.1, 0.1, 0.25, 0.25, 0.3]
voting_weights = [0.2, 0.2, 0.2, 0.2, 0.2]
submission['class_0'] = voting_weights[0]*preds_rf[:, 0] + voting_weights[1]*preds_lr[:, 0] + voting_weights[2]*preds_xgb[:, 0] + voting_weights[3]*preds_nn[:, 0] + voting_weights[4]*preds_svm[:, 0]
submission['class_1'] = voting_weights[0]*preds_rf[:, 1] + voting_weights[1]*preds_lr[:, 1] + voting_weights[2]*preds_xgb[:, 1] + voting_weights[3]*preds_nn[:, 1] + voting_weights[4]*preds_svm[:, 1]
submission

NameError: name 'preds_nn' is not defined

In [None]:
submission.to_csv("submission.csv", index=False)