#### [Version 25] TPESampler is added for using optuna as reproducible way.

**[Version 24] cuml LinearSVC is added.**

**[Version 23] cuml SVC is added.**

**[Version 22] Make config variable and refactor code.**

**[Version 21] Sampling method is applied to relieve class imbalance.**

**[~Version 20] Use KNNImputer, Standard Scaling, model ensemble(Random Forest & Support Vector Machine & XGBoost & Shallow Neural Network & Logistic Regression)**

This code will give you a general idea of how to do a machine learning project using scikit-learn and optuna.

It performs the necessary preprocessing, tunes the models with optuna, combines the tuned models, and performs ensemble (voting).

You can submit the finished result without any problem.

Below is the version history, so please refer to it when using it.

## 1. Data and Library Load

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pickle
import os
import random

# ML
from sklearn.ensemble import RandomForestClassifier  # Bagging
from xgboost.sklearn import XGBClassifier            # GBM
from sklearn.linear_model import LogisticRegression  # LogisticRegression

# DL
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Dense, ReLU, Softmax, Dropout
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

# for checking multi-collinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor

# KFold(CV), partial : for optuna
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from functools import partial
from imblearn.over_sampling import SMOTE

# AutoML framework
import optuna
from optuna.samplers import TPESampler

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [2]:
# set configs
is_tuning = True
if is_tuning:
    sampler = TPESampler(seed=42)  # Make the sampler behave in a deterministic way.
is_scaling = True
is_pca = False
apply_vif = False
is_cuml = True
is_debug = True
sampling_method = 'hybrid' # 'under' or 'over'
if is_tuning:
    n_trials=30

# import SVC
if is_cuml:
    from cuml.svm import SVC, LinearSVC
else:
    from sklearn.svm import SVC
    
# Keras model compile
learning_rate = 1e-2
batch_size = 32
epochs = 10

In [3]:
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    tf.random.set_seed(seed)
    
seed_everything()

In [4]:
def balance_logloss(y_true, y_pred):
    y_pred = np.clip(y_pred, 1e-15, 1-1e-15)
    y_pred /= np.sum(y_pred, axis=1)[:, None]
    nc = np.bincount(y_true)
    
    logloss = (-1/nc[0]*(np.sum(np.where(y_true==0,1,0) * np.log(y_pred[:,0]))) - 1/nc[1]*(np.sum(np.where(y_true!=0,1,0) * np.log(y_pred[:,1])))) / 2
    
    return logloss

def b_logloss_keras(y_true, y_pred):
    #(2, ) -> (1, ) inverse_one_hot encoding
    #y_true = y_true[:, 0] * (1 - y_true[:, 1]) + y_true[:, 1] * (1 - y_true[:, 0])
    y_true = y_true[:, 1] * (1 - y_true[:, 0])
    y_true = tf.cast(y_true, tf.int64)
    score = tf.py_function(func=balance_logloss, inp=[y_true, y_pred], Tout=tf.float32)
    return score

In [5]:
train = pd.read_csv('../input/icr-identify-age-related-conditions/train.csv')
test = pd.read_csv('../input/icr-identify-age-related-conditions/test.csv')
metadata = pd.read_csv('../input/icr-identify-age-related-conditions/greeks.csv')
print(train.shape, test.shape)

(617, 58) (5, 57)


In [6]:
test

Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL
0,00eed32682bb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,010ebe33f668,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,02fa521e1838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,040e15f562a2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,046e85c7cc7f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
#train.EJ.value_counts()
#train.Class.value_counts() # class imbalance

## categorical feature -> EJ
## class imbalance가 있다 -> sampling (undersampling vs oversampling)

# undersampling : A technique for sampling data from a majority class and fitting it to a 1:1 ratio.
#                e.g. class0 : 1000, class1 : 100 ----> class0: 100, class1: 100

# oversampling : A technique for generating data from a minority class and sizing it 1:1 ratio.
#                e.g. class0 : 1000, class1 : 100 -----> class0 : 1000, class1 : 1000

## 2. EDA

In [8]:
lb = LabelEncoder()
train.EJ = lb.fit_transform(train.EJ)  # A->0, B->1

train = train.drop(columns=["Id"])

In [9]:
imp = KNNImputer()
data = imp.fit_transform(train)
train = pd.DataFrame(columns=train.columns,
                    data=data)
train

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
0,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,4126.58731,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1.0
1,0.145282,978.76416,85.200147,36.968889,8.138688,3.632190,0.025578,13.517790,1.229900,5496.92824,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.562750,29.135430,32.131996,21.978000,0.0
2,0.470030,2635.10654,85.200147,32.360553,8.138688,6.732840,0.025578,12.824570,1.229900,5135.78024,...,7.709560,0.97556,1.198821,37.077772,88.609437,13676.957810,28.022851,35.192676,0.196941,0.0
3,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.229900,4169.67738,...,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829,0.0
4,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.054810,3.396778,102.151980,5728.73412,...,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612,0.149555,3130.05946,123.763599,9.513984,13.020852,3.499305,0.077343,8.545512,2.804172,4157.68439,...,0.173229,1.26092,0.067730,8.967128,217.148554,8095.932828,24.640462,69.191944,21.978000,0.0
613,0.435846,5462.03438,85.200147,46.551007,15.973224,5.979825,0.025882,12.622906,3.777550,5654.07556,...,10.223150,1.24236,0.426699,35.896418,496.994214,3085.308063,29.648928,124.808872,0.145340,0.0
614,0.427300,2459.10720,130.138587,55.355778,10.005552,8.070549,0.025578,15.408390,1.229900,5888.87769,...,0.173229,0.49706,0.067730,19.962092,128.896894,6474.652866,26.166072,119.559420,21.978000,0.0
615,0.363205,1263.53524,85.200147,23.685856,8.138688,7.981959,0.025578,7.524588,1.229900,4517.86560,...,9.256996,0.78764,0.670527,24.594488,72.611063,1965.343176,25.116750,37.155112,0.184622,0.0


In [10]:
# check missing data
train[train.isnull().any(axis=1)]

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class


In [11]:
def check_vif(df):
    vifs = [variance_inflation_factor(df, i) for i in range(df.shape[1])]
    vif_df = pd.DataFrame({"features":df.columns, "VIF" : vifs})
    vif_df = vif_df.sort_values(by="VIF", ascending=False)
    remove_col = vif_df.iloc[0, 0]
    top_vif = vif_df.iloc[0, 1]
    return vif_df, remove_col, top_vif

In [12]:
# remove all features when VIF is over 10.
if apply_vif:
    top_vif = 100

    while(top_vif > 5):
        vif_df, remove_col, top_vif = check_vif(train)
        print(remove_col, top_vif)
        if top_vif < 5:
            break
        train = train.drop(columns=remove_col)

    display(train)

In [13]:
# # feature selection via Feature Importance
# X = train.drop(columns=["Class"])
# y = train['Class']

# rf = RandomForestClassifier()
# rf.fit(X, y)
# fi_df = pd.DataFrame({'feature':X.columns, 'importance':rf.feature_importances_})
# selected_cols = fi_df.sort_values(by="importance", ascending=False)["feature"].values
# selected_cols

### 우선은 범위를 0.15이상으로 해보자

In [14]:
X = train.drop(columns=["Class"])
y = train['Class']

rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)
fi_df = pd.DataFrame({'feature':X.columns, 'importance':rf.feature_importances_})
selected_cols=fi_df[fi_df["importance"]>=0.015]
selected_cols = selected_cols.sort_values(by='importance', ascending=False)["feature"].values
selected_cols


array(['DU', 'FL', 'GL', 'DA', 'CR', 'DI', 'AF', 'AB', 'FD ', 'BC', 'FR',
       'EH', 'DE', 'EE', 'CC', 'FE', 'BQ', 'DH', 'FI', 'EB', 'DL', 'AM'],
      dtype=object)

## 3. Data preprocessing

In [15]:
# class imbalance handling
## 1. undersampling
if sampling_method == 'under':
    c1 = train[train.Class == 1]
    c0 = train[train.Class == 0]
    print(c1.shape, c0.shape) # 108, 509 -> 108, 108
    c0 = c0.sample(n=c1.shape[0]) # 509 -> 108
    train = pd.concat([c0, c1])
    print(train.shape)

In [16]:
## before oversampling
# df = train[selected_cols]
# df["Class"] = train["Class"]
# pd.pivot_table(index="Class", data=df)

In [17]:
## 2. oversampling -> SMOTE
if sampling_method == 'over':
    X = train[selected_cols]
    y = train['Class']

    smote = SMOTE(k_neighbors=5)
    # fit_resample 함수가 자동으로 minority class를 y 기준으로 찾아서, 1:1로 맞춰줌.
    X_resampled, y_resampled = smote.fit_resample(X, y)
    print(X_resampled.shape, y_resampled.shape)
    X_resampled["Class"] = y_resampled
    train = X_resampled

In [18]:
# After SMOTE
# df = X_resampled.copy()
# df["Class"] = y_resampled
# pd.pivot_table(index="Class", data=df)

In [19]:
# 3. hybrid approach
## class0 : 509 -> 300
## class1 : 108 -> 300
if sampling_method == 'hybrid':
    N = 300
    c1 = train[train.Class == 1]
    c0 = train[train.Class == 0]
    print(c1.shape, c0.shape)
    c0 = c0.sample(n=N) # 509 -> 300
    train = pd.concat([c0, c1])
    print(train.shape)
    
    X = train[selected_cols]
    y = train['Class']

    smote = SMOTE(k_neighbors=5)
    # The fit_resample function automatically finds the minority class by y and fits it 1:1.
    X_resampled, y_resampled = smote.fit_resample(X, y) # 300, 108 --> 300, 300
    print(X_resampled.shape, y_resampled.shape)
    X_resampled["Class"] = y_resampled
    train = X_resampled
    display(train)

(108, 57) (509, 57)
(408, 57)
(600, 22) (600,)


Unnamed: 0,DU,FL,GL,DA,CR,DI,AF,AB,FD,BC,...,EE,CC,FE,BQ,DH,FI,EB,DL,AM,Class
0,0.013794,0.794236,9.801000,31.618120,0.587475,154.063800,3331.381680,0.376024,4.636797,3.092320,...,2.184802,0.847975,12578.179320,45.017005,0.265101,13.363512,5.661468,79.463960,41.955059,0.0
1,1.455267,4.272654,0.242104,58.615160,0.921975,173.193765,4570.765320,0.452938,6.328842,7.042056,...,3.853636,0.778161,9247.148874,10.917555,0.374421,11.571787,8.347308,107.939800,15.175300,0.0
2,0.006897,2.456055,3.564000,47.559100,0.893025,60.232470,192.593280,0.162374,3.686877,2.168138,...,2.728987,0.579243,2564.145893,78.999230,0.281499,16.395662,7.495614,205.544600,11.347408,0.0
3,0.006897,1.633415,3.564000,48.212880,1.009950,110.386035,3093.425500,0.632404,2.451981,1.229900,...,3.502939,0.658518,9687.460422,72.043880,0.292431,7.486654,6.792348,107.496720,90.788555,0.0
4,0.005518,0.173229,21.978000,83.400600,0.851100,97.739400,5266.084560,0.329021,0.296850,1.229900,...,5.328982,0.926574,6472.462974,30.169808,0.409950,8.983434,6.467220,80.640560,15.466418,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,0.328642,3.445798,1.299001,62.237706,0.613078,103.980445,3363.297236,0.502494,4.190957,2.779509,...,1.550944,0.493170,7045.240471,166.439566,0.458726,10.737528,7.830864,90.657656,18.012764,1.0
596,10.529594,9.223532,0.105375,33.940339,0.755736,115.784904,1063.996903,0.411451,15.377253,3.572601,...,1.048692,0.584869,8226.392232,172.661026,0.287786,10.040700,6.630482,83.770171,105.940884,1.0
597,14.168004,18.525305,0.036568,50.148571,0.671807,127.977774,3988.333495,0.395103,11.040993,1.229900,...,0.821806,0.722569,3778.517526,94.471046,0.194496,13.318273,9.237563,60.417823,39.440192,1.0
598,4.967864,18.811686,4.432417,20.701586,0.405643,279.306170,7496.277990,0.871742,0.372689,4.178754,...,1.906641,0.522347,20955.836101,150.030485,0.260448,6.802149,9.385201,88.920100,439.896202,1.0


In [20]:
# to make OOF prediction
from sklearn.model_selection import train_test_split

#X = train.drop(columns=["Class"])
X = train[selected_cols]
y = train['Class']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, stratify=y, random_state=42)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(510, 22) (90, 22) (510,) (90,)


### feature scaling

- Use StandardScaler

In [21]:
# from sklearn.preprocessing import StandardScaler

# if is_scaling:
#     scaler = StandardScaler()
#     data_ = scaler.fit_transform(X_train)
#     X_train = pd.DataFrame(data=data_, columns=X_train.columns)
#     data_ = scaler.transform(X_val)
#     X_val = pd.DataFrame(data=data_, columns=X_val.columns)
#     display(X_train)

In [22]:
# if is_pca:
#     from sklearn.decomposition import PCA
    
#     pca = PCA(n_components=0.90, random_state=42)
#     data_ = pca.fit_transform(X_train)
#     X_train = pd.DataFrame(data=data_, columns=[f"PC{i}" for i in range(1, data_.shape[1]+1)])
#     data_ = pca.transform(X_val)
#     X_val = pd.DataFrame(data=data_, columns=[f"PC{i}" for i in range(1, data_.shape[1]+1)])

#     display(X_train)

## 4. Fitting and Evaluation


- does not need anymore.

In [23]:
# Model ensemble of SVM, Logistic Regression, XGBoost, RandomForest, Simple NN.
# The NN model performed too poorly, so we excluded it now.

# svm = SVC(random_state=42)
# lr = LogisticRegression(random_state=42, max_iter=300)
# xgb = XGBClassifier(max_depth=3, colsample_bytree=0.8, reg_lambda=1, objective='binary:logistic', random_state=42)
# rf = RandomForestClassifier(max_depth=3, max_features=0.8, criterion='log_loss', random_state=42)
# nn = Sequential([
#     Input(shape=(X_train.shape[1],)),
#     #Dense(30), ReLU(), Dropout(0.2),
#     #Dense(20), ReLU(), Dropout(0.2),
#     #Dense(10), ReLU(), Dropout(0.1),
#     Dense(5), ReLU(), Dropout(0.2),
#     Dense(2), Softmax()
# ])
# nn.summary()

In [24]:
# optimizer = Adam(learning_rate=learning_rate)
# loss_fn = tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.9)   # [0.8, 0.2] <--> [0.9, 0] // [0, 0.9]
# scheduler = ReduceLROnPlateau(monitor='val_loss',
#                               factor=0.5,
#                               patience=10,
#                               min_lr=1e-6)
# earlystopper = EarlyStopping(monitor='val_loss',
#                              patience=20,
#                              min_delta=1e-2)


# nn.compile(optimizer=optimizer, loss=loss_fn, metrics=[b_logloss_keras])

# nn_y_train = tf.keras.utils.to_categorical(y_train, num_classes=2)
# nn_y_val = tf.keras.utils.to_categorical(y_val, num_classes=2)

In [25]:
# print("\nFitting LogisticRegression...")
# lr.fit(X_train, y_train)
# print("\nFitting SVM...")
# svm.fit(X_train, y_train)
# print("\nFitting RandomForest...")
# rf.fit(X_train, y_train)
# print("\nFitting XGBoost...")
# xgb.fit(X_train, y_train)
# print("\nFitting MLP...")
# history = nn.fit(X_train, nn_y_train,
#                 batch_size=batch_size,
#                 epochs=epochs,
#                 validation_data=[X_val, nn_y_val],
#                 callbacks=[scheduler, earlystopper])

In [26]:
# ## loss visualize
# import matplotlib.pyplot as plt

# plt.figure(figsize=(12, 6))
# plt.subplot(1, 2,1)
# plt.plot(history.history['loss'],'b-', label = "training")
# plt.plot(history.history['val_loss'], 'r:', label = "validation")
# plt.title("model - loss")
# plt.legend()

# plt.subplot(1, 2, 2)
# plt.title("model - val_logloss")

# plt.plot(history.history['b_logloss_keras'], 'b-', label = "training")
# plt.plot(history.history['val_b_logloss_keras'], 'r:', label = "validation")

# plt.legend()
# plt.tight_layout()
# plt.show()

In [27]:
# set metric
evaluation_metric = balance_logloss
#평가
#evaluation_metric_keras = b_logloss_keras

In [28]:
# print("--- Prediction with XGB ---")
# pred_train = xgb.predict_proba(X_train)
# pred_val = xgb.predict_proba(X_val)

# train_score = evaluation_metric(y_train, pred_train)
# val_score = evaluation_metric(y_val, pred_val)

# print("Train Score : %.4f" % train_score)
# print("Test Score : %.4f" % val_score)

# print("--- Prediction with MLP ---")
# pred_train = nn.predict(X_train)
# pred_val = nn.predict(X_val)

# train_score = evaluation_metric_keras(nn_y_train, pred_train)
# val_score = evaluation_metric_keras(nn_y_val, pred_val)

# print("Train Score : %.4f" % train_score)
# print("Validation Score : %.4f" % val_score)

## 5. (Super)Hyper-parameter Tuning

Let's try hyper-parameter tuning using optuna, an AutoML framework.

Optuna defines a target function to optimize and then optimizes that function.

For each model, we define an optimizer function separately and then burn optuna.

In [29]:
def rf_optimizer(trial, X, y, K):
    # define parameter to tune
    n_estimators = trial.suggest_categorical('n_estimators', [50, 100, 200])
    max_depth = trial.suggest_int('max_depth', 4, 10)
    max_features = trial.suggest_categorical('max_features', [0.6, 0.7, 0.8])
    
    
    # set model
    model = RandomForestClassifier(n_estimators=n_estimators,
                                   max_depth=max_depth,
                                   max_features=max_features,
                                   criterion='log_loss',
                                   class_weight='balanced'
                                  )
    
    # K-Fold Cross validation
    folds = StratifiedKFold(n_splits=K, shuffle=True)
    losses = []
    
    for train_idx, val_idx in folds.split(X, y):
        X_train = X.iloc[train_idx, :]
        y_train = y.iloc[train_idx]
        
        X_val = X.iloc[val_idx, :]
        y_val = y.iloc[val_idx]
        
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_val)
        loss = evaluation_metric(y_val, preds)
        losses.append(loss)
    
    
    # return mean score of CV
    return np.mean(losses)

In [30]:
def svm_optimizer(trial, X, y, K):
    C = trial.suggest_int('C', 1, 100)
    kernel = trial.suggest_categorical('kernel', ['rbf', 'linear'])
    

    model = SVC(C=C,
                kernel=kernel,
                #class_weight='balanced', # if class imbalanced
                probability=True,
                #cache_size=1000,
                random_state=42
               )
    
    folds = StratifiedKFold(n_splits=K, shuffle=True)
    losses = []
    
    for train_idx, val_idx in folds.split(X, y):
        X_train = X.iloc[train_idx, :]
        y_train = y.iloc[train_idx]
        
        X_val = X.iloc[val_idx, :]
        y_val = y.iloc[val_idx]
        
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_val)
        loss = evaluation_metric(y_val, preds)
        losses.append(loss)
    
    
    return np.mean(losses)

In [31]:
def lr_optimizer(trial, X, y, K):

    C = trial.suggest_int('C', 1, 100)
    solver = trial.suggest_categorical('solver', ['liblinear', 'newton-cg', 'newton-cholesky', 'saga'])    
    

    model = LogisticRegression(C=C,
                               solver=solver,
                               max_iter=500,
                               class_weight='balanced',
                               random_state=42,
                               n_jobs=-1)
    
    
    folds = StratifiedKFold(n_splits=K, shuffle=True)
    losses = []
    
    for train_idx, val_idx in folds.split(X, y):
        X_train = X.iloc[train_idx, :]
        y_train = y.iloc[train_idx]
        
        X_val = X.iloc[val_idx, :]
        y_val = y.iloc[val_idx]
        
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_val)
        loss = evaluation_metric(y_val, preds)
        losses.append(loss)
    
    
    return np.mean(losses)

In [32]:
def xgb_optimizer(trial, X, y, K):
    n_estimators = trial.suggest_categorical('n_estimators', [500, 1000, 2000])
    max_depth = trial.suggest_int('max_depth', 4, 10)
    colsample_bytree = trial.suggest_categorical('colsample_bytree', [0.5, 0.6, 0.7, 0.8])
    learning_rate = trial.suggest_float('learning_rate', 1e-3, 1e-2)
    reg_lambda = trial.suggest_categorical('reg_lambda', [0.1, 0.5, 1, 2])
    
    
    model = XGBClassifier(n_estimators=n_estimators,
                          max_depth=max_depth,
                          colsample_bytree=colsample_bytree,
                          learning_rate=learning_rate,
                          reg_lambda=reg_lambda)
#                          scale_pos_weight=4.71)  ## we set class imbalance by using sampling method.
    
    
    folds = StratifiedKFold(n_splits=K, shuffle=True)
    losses = []
    
    for train_idx, val_idx in folds.split(X, y):
        X_train = X.iloc[train_idx, :]
        y_train = y.iloc[train_idx]
        
        X_val = X.iloc[val_idx, :]
        y_val = y.iloc[val_idx]
        
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_val)
        loss = evaluation_metric(y_val, preds)
        losses.append(loss)
    
    
    return np.mean(losses)

In [33]:
K = 4 # set K of K-Fold
opt_func = partial(rf_optimizer, X=X, y=y, K=K)

if is_tuning:
    rf_study = optuna.create_study(direction="minimize", sampler=sampler) # determine minimize or maximize sth
    rf_study.optimize(opt_func, n_trials=n_trials)

[32m[I 2023-06-22 00:23:37,896][0m A new study created in memory with name: no-name-656747a1-5df9-4b55-90d2-c8ae062c8ff5[0m
[32m[I 2023-06-22 00:23:39,568][0m Trial 0 finished with value: 0.21543058350660388 and parameters: {'n_estimators': 100, 'max_depth': 8, 'max_features': 0.6}. Best is trial 0 with value: 0.21543058350660388.[0m
[32m[I 2023-06-22 00:23:40,325][0m Trial 1 finished with value: 0.26970186825835374 and parameters: {'n_estimators': 50, 'max_depth': 4, 'max_features': 0.6}. Best is trial 0 with value: 0.21543058350660388.[0m
[32m[I 2023-06-22 00:23:43,986][0m Trial 2 finished with value: 0.2566438513278475 and parameters: {'n_estimators': 200, 'max_depth': 7, 'max_features': 0.8}. Best is trial 0 with value: 0.21543058350660388.[0m
[32m[I 2023-06-22 00:23:47,277][0m Trial 3 finished with value: 0.21139525140563648 and parameters: {'n_estimators': 200, 'max_depth': 7, 'max_features': 0.6}. Best is trial 3 with value: 0.21139525140563648.[0m
[32m[I 2023-06

In [34]:
K = 4
opt_func = partial(lr_optimizer, X=X, y=y, K=K)

if is_tuning:
    lr_study = optuna.create_study(direction="minimize", sampler=sampler) 
    lr_study.optimize(opt_func, n_trials=n_trials)

[32m[I 2023-06-22 00:24:31,374][0m A new study created in memory with name: no-name-f7f140c3-e942-4515-ba0f-fe72d2d1a673[0m
[32m[I 2023-06-22 00:24:32,903][0m Trial 0 finished with value: 0.4126445899894359 and parameters: {'C': 78, 'solver': 'newton-cholesky'}. Best is trial 0 with value: 0.4126445899894359.[0m
[32m[I 2023-06-22 00:24:33,022][0m Trial 1 finished with value: 0.38721948561649017 and parameters: {'C': 73, 'solver': 'liblinear'}. Best is trial 1 with value: 0.38721948561649017.[0m
[32m[I 2023-06-22 00:24:33,118][0m Trial 2 finished with value: 0.37672887007014033 and parameters: {'C': 87, 'solver': 'liblinear'}. Best is trial 2 with value: 0.37672887007014033.[0m
[32m[I 2023-06-22 00:24:33,179][0m Trial 3 finished with value: 0.37750131116010066 and parameters: {'C': 33, 'solver': 'newton-cholesky'}. Best is trial 2 with value: 0.37672887007014033.[0m
[32m[I 2023-06-22 00:24:33,580][0m Trial 4 finished with value: 0.647051664012497 and parameters: {'C': 1

In [35]:
K = 4 
#opt_func = partial(svm_optimizer, X=X, y=y, K=K)

if is_tuning:
    if is_cuml:
        #svm = SVC(C=5, probability=True)
        best_loss = 9999.0
        folds = StratifiedKFold(n_splits=K, random_state=42, shuffle=True) 

        for C in [1, 2, 5, 10, 100]:
            losses = []
            svm = LinearSVC(C=C, probability=True) ## cuml version. (faster model)
            
            for train_idx, val_idx in folds.split(X, y):
                X_train = X.iloc[train_idx, :]
                y_train = y.iloc[train_idx]
                X_val = X.iloc[val_idx, :]
                y_val = y.iloc[val_idx]
                
                svm.fit(X_train, y_train)
                preds = svm.predict_proba(X_val).values
                loss = evaluation_metric(y_val, preds)
                losses.append(loss)
                
            avg_loss = np.mean(losses)
            if avg_loss < best_loss:
                best_loss = avg_loss

        print("SVM log loss : %.4f" % best_loss)
    else:
        svm_study = optuna.create_study(direction="minimize", sampler=sampler)
        svm_study.optimize(opt_func, n_trials=n_trials)

[W] [00:25:02.213812] L-BFGS: max iterations reached
[W] [00:25:02.214023] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [00:25:02.628237] L-BFGS: max iterations reached
[W] [00:25:02.628441] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [00:25:03.024677] L-BFGS: max iterations reached
[W] [00:25:03.024859] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [00:25:03.423133] L-BFGS: max iterations reached
[W] [00:25:03.423321] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the inpu

In [36]:
K = 4
opt_func = partial(xgb_optimizer, X=X, y=y, K=K)

if is_tuning:
    xgb_study = optuna.create_study(direction="minimize", sampler=sampler)
    xgb_study.optimize(opt_func, n_trials=n_trials)

[32m[I 2023-06-22 00:25:09,873][0m A new study created in memory with name: no-name-1b624efc-3cf5-4124-9d35-4b0b30565991[0m
[32m[I 2023-06-22 00:25:14,964][0m Trial 0 finished with value: 0.36441519071658457 and parameters: {'n_estimators': 1000, 'max_depth': 4, 'colsample_bytree': 0.8, 'learning_rate': 0.0010625691747807163, 'reg_lambda': 0.1}. Best is trial 0 with value: 0.36441519071658457.[0m
[32m[I 2023-06-22 00:25:21,262][0m Trial 1 finished with value: 0.2005831590977808 and parameters: {'n_estimators': 1000, 'max_depth': 7, 'colsample_bytree': 0.7, 'learning_rate': 0.0032660406624282776, 'reg_lambda': 0.1}. Best is trial 1 with value: 0.2005831590977808.[0m
[32m[I 2023-06-22 00:25:23,470][0m Trial 2 finished with value: 0.17297184364094093 and parameters: {'n_estimators': 500, 'max_depth': 5, 'colsample_bytree': 0.5, 'learning_rate': 0.009870854086995406, 'reg_lambda': 1}. Best is trial 2 with value: 0.17297184364094093.[0m
[32m[I 2023-06-22 00:25:27,098][0m Trial

In [37]:
# save all studies
if is_tuning:
    with open("rm_study.pk", 'wb') as f:
        pickle.dump(rf_study, f)
    with open("lr_study.pk", 'wb') as f:
        pickle.dump(lr_study, f)
    with open("xgb_study.pk", 'wb') as f:
        pickle.dump(xgb_study, f)
        
    if is_cuml:
        with open("svm_model.pk", 'wb') as f:
            pickle.dump(svm, f)
    else:
        with open("svm_study.pk", 'wb') as f:
            pickle.dump(svm_study, f)

    #nn.save("./simple_nn_model.keras")

In [38]:
# visualize experiment logs
def display_experiment_log(study):
    display(study.trials_dataframe())
    print("Best Score: %.4f" % study.best_value)
    print("Best params: ", study.best_trial.params)
    history = study.trials_dataframe()
    display(history[history.value == study.best_value])
    optuna.visualization.plot_optimization_history(study).show()
    optuna.visualization.plot_param_importances(study).show()

In [39]:
if is_tuning:
    display_experiment_log(rf_study)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_max_depth,params_max_features,params_n_estimators,state
0,0,0.215431,2023-06-22 00:23:37.897924,2023-06-22 00:23:39.568267,0 days 00:00:01.670343,8,0.6,100,COMPLETE
1,1,0.269702,2023-06-22 00:23:39.569663,2023-06-22 00:23:40.325501,0 days 00:00:00.755838,4,0.6,50,COMPLETE
2,2,0.256644,2023-06-22 00:23:40.327353,2023-06-22 00:23:43.985828,0 days 00:00:03.658475,7,0.8,200,COMPLETE
3,3,0.211395,2023-06-22 00:23:43.987139,2023-06-22 00:23:47.276644,0 days 00:00:03.289505,7,0.6,200,COMPLETE
4,4,0.226655,2023-06-22 00:23:47.278053,2023-06-22 00:23:50.803819,0 days 00:00:03.525766,5,0.8,200,COMPLETE
5,5,0.257348,2023-06-22 00:23:50.805272,2023-06-22 00:23:51.743107,0 days 00:00:00.937835,8,0.8,50,COMPLETE
6,6,0.204951,2023-06-22 00:23:51.744503,2023-06-22 00:23:53.646107,0 days 00:00:01.901604,8,0.8,100,COMPLETE
7,7,0.220759,2023-06-22 00:23:53.650317,2023-06-22 00:23:55.944580,0 days 00:00:02.294263,10,0.8,100,COMPLETE
8,8,0.240902,2023-06-22 00:23:55.945863,2023-06-22 00:23:57.745033,0 days 00:00:01.799170,6,0.8,100,COMPLETE
9,9,0.250397,2023-06-22 00:23:57.746356,2023-06-22 00:24:01.058738,0 days 00:00:03.312382,4,0.8,200,COMPLETE


Best Score: 0.1976
Best params:  {'n_estimators': 50, 'max_depth': 10, 'max_features': 0.7}


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_max_depth,params_max_features,params_n_estimators,state
16,16,0.197563,2023-06-22 00:24:11.626983,2023-06-22 00:24:12.519917,0 days 00:00:00.892934,10,0.7,50,COMPLETE


In [40]:
if is_tuning:
    display_experiment_log(lr_study)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_C,params_solver,state
0,0,0.412645,2023-06-22 00:24:31.375468,2023-06-22 00:24:32.902794,0 days 00:00:01.527326,78,newton-cholesky,COMPLETE
1,1,0.387219,2023-06-22 00:24:32.904814,2023-06-22 00:24:33.021885,0 days 00:00:00.117071,73,liblinear,COMPLETE
2,2,0.376729,2023-06-22 00:24:33.023190,2023-06-22 00:24:33.118626,0 days 00:00:00.095436,87,liblinear,COMPLETE
3,3,0.377501,2023-06-22 00:24:33.119911,2023-06-22 00:24:33.179161,0 days 00:00:00.059250,33,newton-cholesky,COMPLETE
4,4,0.647052,2023-06-22 00:24:33.180522,2023-06-22 00:24:33.579719,0 days 00:00:00.399197,12,saga,COMPLETE
5,5,0.366482,2023-06-22 00:24:33.580991,2023-06-22 00:24:33.707815,0 days 00:00:00.126824,50,liblinear,COMPLETE
6,6,0.646264,2023-06-22 00:24:33.709068,2023-06-22 00:24:34.099385,0 days 00:00:00.390317,4,saga,COMPLETE
7,7,0.364098,2023-06-22 00:24:34.100697,2023-06-22 00:24:35.191889,0 days 00:00:01.091192,25,newton-cg,COMPLETE
8,8,0.426041,2023-06-22 00:24:35.193302,2023-06-22 00:24:36.223143,0 days 00:00:01.029841,29,newton-cg,COMPLETE
9,9,0.404287,2023-06-22 00:24:36.224585,2023-06-22 00:24:36.281836,0 days 00:00:00.057251,88,newton-cholesky,COMPLETE


Best Score: 0.3527
Best params:  {'C': 31, 'solver': 'newton-cg'}


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_C,params_solver,state
13,13,0.352661,2023-06-22 00:24:39.044118,2023-06-22 00:24:39.955263,0 days 00:00:00.911145,31,newton-cg,COMPLETE


In [41]:
if is_tuning:
    if is_cuml:
        print(svm.predict_proba(X_val))
        print("SVM log loss : %.4f" % evaluation_metric(y_val, svm.predict_proba(X_val).values))
    else:
        display_experiment_log(svm_study)

            0         1
0    0.649071  0.350929
1    0.915584  0.084416
2    0.820494  0.179506
3    0.905235  0.094765
4    0.931473  0.068527
..        ...       ...
145  0.095199  0.904801
146  0.773266  0.226734
147  0.064476  0.935524
148  0.061834  0.938166
149  0.725065  0.274935

[150 rows x 2 columns]
SVM log loss : 0.4615


In [42]:
if is_tuning:
    display_experiment_log(xgb_study)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_learning_rate,params_max_depth,params_n_estimators,params_reg_lambda,state
0,0,0.364415,2023-06-22 00:25:09.875719,2023-06-22 00:25:14.963650,0 days 00:00:05.087931,0.8,0.001063,4,1000,0.1,COMPLETE
1,1,0.200583,2023-06-22 00:25:14.967216,2023-06-22 00:25:21.262164,0 days 00:00:06.294948,0.7,0.003266,7,1000,0.1,COMPLETE
2,2,0.172972,2023-06-22 00:25:21.265920,2023-06-22 00:25:23.470303,0 days 00:00:02.204383,0.5,0.009871,5,500,1.0,COMPLETE
3,3,0.312727,2023-06-22 00:25:23.473908,2023-06-22 00:25:27.097080,0 days 00:00:03.623172,0.7,0.002679,8,500,1.0,COMPLETE
4,4,0.152119,2023-06-22 00:25:27.099478,2023-06-22 00:25:37.257864,0 days 00:00:10.158386,0.7,0.00407,5,2000,0.5,COMPLETE
5,5,0.203939,2023-06-22 00:25:37.261636,2023-06-22 00:25:43.290981,0 days 00:00:06.029345,0.8,0.006698,7,1000,2.0,COMPLETE
6,6,0.3868,2023-06-22 00:25:43.295177,2023-06-22 00:25:45.643017,0 days 00:00:02.347840,0.6,0.001913,4,500,0.1,COMPLETE
7,7,0.178596,2023-06-22 00:25:45.646766,2023-06-22 00:25:48.737830,0 days 00:00:03.091064,0.7,0.008643,8,500,0.1,COMPLETE
8,8,0.14761,2023-06-22 00:25:48.741571,2023-06-22 00:25:56.515447,0 days 00:00:07.773876,0.5,0.006192,6,2000,1.0,COMPLETE
9,9,0.158403,2023-06-22 00:25:56.519220,2023-06-22 00:26:01.818045,0 days 00:00:05.298825,0.5,0.009355,10,1000,0.5,COMPLETE


Best Score: 0.1412
Best params:  {'n_estimators': 2000, 'max_depth': 6, 'colsample_bytree': 0.5, 'learning_rate': 0.0073396762513940825, 'reg_lambda': 0.5}


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_learning_rate,params_max_depth,params_n_estimators,params_reg_lambda,state
14,14,0.141225,2023-06-22 00:26:36.227395,2023-06-22 00:26:43.262719,0 days 00:00:07.035324,0.5,0.00734,6,2000,0.5,COMPLETE


## 6. Test Prediction and Make Submission

In [43]:
## preprocessing in same way
X_test = test[X.columns].fillna(X.mean())
# if is_scaling:
#     X_test = scaler.transform(X_test)
#     X_test = pd.DataFrame(data=X_test, columns=X.columns)

# if is_pca:
#     data_ = pca.transform(X_test)
#     X_test = pd.DataFrame(data=data_, columns=[f"PC{i}" for i in range(1, data_.shape[1]+1)])

X_test

Unnamed: 0,DU,FL,GL,DA,CR,DI,AF,AB,FD,BC,...,DE,EE,CC,FE,BQ,DH,FI,EB,DL,AM
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
# Finalize Models
if is_tuning:
    rf_best_params = rf_study.best_params
    lr_best_params = lr_study.best_params
    xgb_best_params = xgb_study.best_params
    if is_cuml:
        pass
    else:
        svm_best_params = svm_study.best_params

    best_rf = RandomForestClassifier(**rf_best_params)
    best_lr = LogisticRegression(**lr_best_params)
    best_xgb = XGBClassifier(**xgb_best_params)
    if is_cuml:
        best_svm = svm
    else:
        best_svm = SVC(**svm_best_params, probability=True)

### 위에서 만든 testset으로 OOF Prediction 성능 확인 필요!

In [45]:
# first ensebmle model, then check it.
best_rf.fit(X_train, y_train)
best_lr.fit(X_train, y_train)
best_xgb.fit(X_train, y_train)
best_svm.fit(X_train, y_train)

# OOF-prediction
v_rf = best_rf.predict_proba(X_val)
v_lr = best_lr.predict_proba(X_val)
v_xgb = best_xgb.predict_proba(X_val)
v_svm = best_svm.predict_proba(X_val)
print(v_rf.shape, v_lr.shape, v_xgb.shape, v_svm.shape)


newton-cg failed to converge. Increase the number of iterations.



[W] [00:28:45.503591] L-BFGS: max iterations reached
[W] [00:28:45.503799] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
(150, 2) (150, 2) (150, 2) (150, 2)


In [46]:
# # MLP predictions
# v_nn = nn.predict(X_val)
# preds_nn = nn.predict(X_test)
# print(v_nn.shape, preds_nn.shape)

In [47]:
# OOF prediction
#ensembles = np.mean([v_rf, v_lr, v_xgb, v_svm, v_nn], axis=0)
ensembles = np.mean([v_rf, v_xgb, v_svm], axis=0)
#ensembles = np.mean([v_rf, v_xgb], axis=0)
print("OOF prediction logloss : %.4f" % evaluation_metric(y_val, ensembles))

OOF prediction logloss : 0.2372


In [48]:
# model finalization and make prediction
best_rf.fit(X, y)
best_lr.fit(X, y)
best_xgb.fit(X, y)
best_svm.fit(X, y)

preds_rf = best_rf.predict_proba(X_test)
preds_lr = best_lr.predict_proba(X_test)
preds_xgb = best_xgb.predict_proba(X_test)
preds_svm = best_svm.predict_proba(X_test).values
print(preds_rf.shape, preds_lr.shape, preds_xgb.shape, preds_svm.shape)


newton-cg failed to converge. Increase the number of iterations.



(5, 2) (5, 2) (5, 2) (5, 2)


In [49]:
v_rf = best_rf.predict_proba(X_val)
v_lr = best_lr.predict_proba(X_val)
v_xgb = best_xgb.predict_proba(X_val)
v_svm = best_svm.predict_proba(X_val).values
# ensembles = np.mean([v_rf, v_xgb, v_svm], axis=0)
ensembles = np.mean([v_rf, v_xgb, v_svm], axis=0)
print("(After finalization)OOF prediction logloss : %.4f" % evaluation_metric(y_val, ensembles))

(After finalization)OOF prediction logloss : 0.1273


In [50]:
print(preds_rf)
print(preds_lr)
print(preds_xgb)
#print(pre)


[[0.62 0.38]
 [0.62 0.38]
 [0.62 0.38]
 [0.62 0.38]
 [0.62 0.38]]
[[0.07982102 0.92017898]
 [0.07982102 0.92017898]
 [0.07982102 0.92017898]
 [0.07982102 0.92017898]
 [0.07982102 0.92017898]]
[[0.86787    0.13213003]
 [0.86787    0.13213003]
 [0.86787    0.13213003]
 [0.86787    0.13213003]
 [0.86787    0.13213003]]


In [51]:
submission = pd.read_csv('../input/icr-identify-age-related-conditions/sample_submission.csv')
submission

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.5,0.5
1,010ebe33f668,0.5,0.5
2,02fa521e1838,0.5,0.5
3,040e15f562a2,0.5,0.5
4,046e85c7cc7f,0.5,0.5


In [52]:
#voting_weights = [0.1, 0.1, 0.25, 0.25, 0.3]
#voting_weights = [0.2, 0.2, 0.2, 0.2, 0.2]
#voting_weights = [0.25, 0.25, 0.25, 0.25]
voting_weights = [0.5, 0.5]
#voting_weights = [0.35, 0.35, 0.3]
# submission['class_0'] = voting_weights[0]*preds_rf[:, 0] + voting_weights[1]*preds_lr[:, 0] + voting_weights[2]*preds_xgb[:, 0] + voting_weights[3]*preds_nn[:, 0] + voting_weights[4]*preds_svm[:, 0]
# submission['class_1'] = voting_weights[0]*preds_rf[:, 1] + voting_weights[1]*preds_lr[:, 1] + voting_weights[2]*preds_xgb[:, 1] + voting_weights[3]*preds_nn[:, 1] + voting_weights[4]*preds_svm[:, 1]
# submission['class_0'] = voting_weights[0]*preds_rf[:, 0] + voting_weights[1]*preds_lr[:, 0] + voting_weights[2]*preds_xgb[:, 0] + voting_weights[3]*preds_svm[:, 0]
# submission['class_1'] = voting_weights[0]*preds_rf[:, 1] + voting_weights[1]*preds_lr[:, 1] + voting_weights[2]*preds_xgb[:, 1] + voting_weights[3]*preds_svm[:, 1]
submission['class_0'] = voting_weights[0]*preds_rf[:, 0] + voting_weights[1]*preds_svm[:, 0]
submission['class_1'] = voting_weights[0]*preds_rf[:, 1] + voting_weights[1]*preds_svm[:, 1]

submission

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.543328,0.456672
1,010ebe33f668,0.543328,0.456672
2,02fa521e1838,0.543328,0.456672
3,040e15f562a2,0.543328,0.456672
4,046e85c7cc7f,0.543328,0.456672


In [53]:
submission.to_csv("submission.csv", index=False)