#### [Version 25] TPESampler is added for using optuna as reproducible way.

**[Version 24] cuml LinearSVC is added.**

**[Version 23] cuml SVC is added.**

**[Version 22] Make config variable and refactor code.**

**[Version 21] Sampling method is applied to relieve class imbalance.**

**[~Version 20] Use KNNImputer, Standard Scaling, model ensemble(Random Forest & Support Vector Machine & XGBoost & Shallow Neural Network & Logistic Regression)**

This code will give you a general idea of how to do a machine learning project using scikit-learn and optuna.

It performs the necessary preprocessing, tunes the models with optuna, combines the tuned models, and performs ensemble (voting).

You can submit the finished result without any problem.

Below is the version history, so please refer to it when using it.

## 1. Data and Library Load

In [85]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pickle
import os
import random

# ML
from sklearn.ensemble import RandomForestClassifier  # Bagging
from xgboost.sklearn import XGBClassifier            # GBM
from sklearn.linear_model import LogisticRegression  # LogisticRegression

# DL
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Dense, ReLU, Softmax, Dropout
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

# for checking multi-collinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor

# KFold(CV), partial : for optuna
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from functools import partial
from imblearn.over_sampling import SMOTE

# AutoML framework
import optuna
from optuna.samplers import TPESampler

In [86]:
# set configs
is_tuning = True
if is_tuning:
    sampler = TPESampler(seed=42)  # Make the sampler behave in a deterministic way.
is_scaling = False
is_scaling2 = False
is_pca = False
apply_vif = False
is_cuml = True
is_debug = True
sampling_method = 'over' # 'under' or 'over'
if is_tuning:
    n_trials=30

# import SVC
if is_cuml:
    from cuml.svm import SVC, LinearSVC
else:
    from sklearn.svm import SVC
    
# Keras model compile
learning_rate = 1e-2
batch_size = 32
epochs = 10

In [87]:
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    tf.random.set_seed(seed)
    
seed_everything()

In [88]:
def balance_logloss(y_true, y_pred):
    y_pred = np.clip(y_pred, 1e-15, 1-1e-15)
    y_pred /= np.sum(y_pred, axis=1)[:, None]
    nc = np.bincount(y_true)
    
    logloss = (-1/nc[0]*(np.sum(np.where(y_true==0,1,0) * np.log(y_pred[:,0]))) - 1/nc[1]*(np.sum(np.where(y_true!=0,1,0) * np.log(y_pred[:,1])))) / 2
    
    return logloss

def b_logloss_keras(y_true, y_pred):
    #(2, ) -> (1, ) inverse_one_hot encoding
    #y_true = y_true[:, 0] * (1 - y_true[:, 1]) + y_true[:, 1] * (1 - y_true[:, 0])
    y_true = y_true[:, 1] * (1 - y_true[:, 0])
    y_true = tf.cast(y_true, tf.int64)
    score = tf.py_function(func=balance_logloss, inp=[y_true, y_pred], Tout=tf.float32)
    return score

In [89]:
train = pd.read_csv('../input/icr-identify-age-related-conditions/train.csv')
test = pd.read_csv('../input/icr-identify-age-related-conditions/test.csv')
metadata = pd.read_csv('../input/icr-identify-age-related-conditions/greeks.csv')
print(train.shape, test.shape)

(617, 58) (5, 57)


In [90]:
test

Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL
0,00eed32682bb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,010ebe33f668,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,02fa521e1838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,040e15f562a2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,046e85c7cc7f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [91]:
#train.EJ.value_counts()
#train.Class.value_counts() # class imbalance

## categorical feature -> EJ
## class imbalance가 있다 -> sampling (undersampling vs oversampling)

# undersampling : A technique for sampling data from a majority class and fitting it to a 1:1 ratio.
#                e.g. class0 : 1000, class1 : 100 ----> class0: 100, class1: 100

# oversampling : A technique for generating data from a minority class and sizing it 1:1 ratio.
#                e.g. class0 : 1000, class1 : 100 -----> class0 : 1000, class1 : 1000

## 2. EDA

In [92]:
lb = LabelEncoder()
train.EJ = lb.fit_transform(train.EJ)  # A->0, B->1

train = train.drop(columns=["Id","AR","AY","BZ","AH"])

In [93]:
imp = KNNImputer()
data = imp.fit_transform(train)
train = pd.DataFrame(columns=train.columns,
                    data=data)
train

Unnamed: 0,AB,AF,AM,AX,AZ,BC,BD,BN,BP,BQ,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
0,0.209377,3109.03329,22.394407,0.699861,9.812214,5.555634,4126.58731,22.5984,175.638726,152.707705,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1.0
1,0.145282,978.76416,36.968889,3.632190,13.517790,1.229900,5496.92824,19.4205,155.868030,14.754720,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.562750,29.135430,32.131996,21.978000,0.0
2,0.470030,2635.10654,32.360553,6.732840,12.824570,1.229900,5135.78024,26.4825,128.988531,219.320160,...,7.709560,0.97556,1.198821,37.077772,88.609437,13676.957810,28.022851,35.192676,0.196941,0.0
3,0.252107,3819.65177,77.112203,3.685344,11.053708,1.229900,4169.67738,23.6577,237.282264,11.050410,...,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829,0.0
4,0.380297,3733.04844,14.103738,3.942255,3.396778,102.151980,5728.73412,24.0108,324.546318,149.717165,...,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612,0.149555,3130.05946,9.513984,3.499305,8.545512,2.804172,4157.68439,21.1860,167.877117,27.287375,...,0.173229,1.26092,0.067730,8.967128,217.148554,8095.932828,24.640462,69.191944,21.978000,0.0
613,0.435846,5462.03438,46.551007,5.979825,12.622906,3.777550,5654.07556,27.1887,285.628059,344.644105,...,10.223150,1.24236,0.426699,35.896418,496.994214,3085.308063,29.648928,124.808872,0.145340,0.0
614,0.427300,2459.10720,55.355778,8.070549,15.408390,1.229900,5888.87769,20.4798,178.661133,103.988995,...,0.173229,0.49706,0.067730,19.962092,128.896894,6474.652866,26.166072,119.559420,21.978000,0.0
615,0.363205,1263.53524,23.685856,7.981959,7.524588,1.229900,4517.86560,19.0674,119.162529,65.340173,...,9.256996,0.78764,0.670527,24.594488,72.611063,1965.343176,25.116750,37.155112,0.184622,0.0


In [94]:
# check missing data
train[train.isnull().any(axis=1)]

Unnamed: 0,AB,AF,AM,AX,AZ,BC,BD,BN,BP,BQ,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class


In [95]:
def check_vif(df):
    vifs = [variance_inflation_factor(df, i) for i in range(df.shape[1])]
    vif_df = pd.DataFrame({"features":df.columns, "VIF" : vifs})
    vif_df = vif_df.sort_values(by="VIF", ascending=False)
    remove_col = vif_df.iloc[0, 0]
    top_vif = vif_df.iloc[0, 1]
    return vif_df, remove_col, top_vif

In [96]:
# remove all features when VIF is over 10.
if apply_vif:
    top_vif = 100

    while(top_vif > 5):
        vif_df, remove_col, top_vif = check_vif(train)
        print(remove_col, top_vif)
        if top_vif < 5:
            break
        train = train.drop(columns=remove_col)

    display(train)

In [97]:
train

Unnamed: 0,AB,AF,AM,AX,AZ,BC,BD,BN,BP,BQ,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
0,0.209377,3109.03329,22.394407,0.699861,9.812214,5.555634,4126.58731,22.5984,175.638726,152.707705,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1.0
1,0.145282,978.76416,36.968889,3.632190,13.517790,1.229900,5496.92824,19.4205,155.868030,14.754720,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.562750,29.135430,32.131996,21.978000,0.0
2,0.470030,2635.10654,32.360553,6.732840,12.824570,1.229900,5135.78024,26.4825,128.988531,219.320160,...,7.709560,0.97556,1.198821,37.077772,88.609437,13676.957810,28.022851,35.192676,0.196941,0.0
3,0.252107,3819.65177,77.112203,3.685344,11.053708,1.229900,4169.67738,23.6577,237.282264,11.050410,...,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829,0.0
4,0.380297,3733.04844,14.103738,3.942255,3.396778,102.151980,5728.73412,24.0108,324.546318,149.717165,...,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612,0.149555,3130.05946,9.513984,3.499305,8.545512,2.804172,4157.68439,21.1860,167.877117,27.287375,...,0.173229,1.26092,0.067730,8.967128,217.148554,8095.932828,24.640462,69.191944,21.978000,0.0
613,0.435846,5462.03438,46.551007,5.979825,12.622906,3.777550,5654.07556,27.1887,285.628059,344.644105,...,10.223150,1.24236,0.426699,35.896418,496.994214,3085.308063,29.648928,124.808872,0.145340,0.0
614,0.427300,2459.10720,55.355778,8.070549,15.408390,1.229900,5888.87769,20.4798,178.661133,103.988995,...,0.173229,0.49706,0.067730,19.962092,128.896894,6474.652866,26.166072,119.559420,21.978000,0.0
615,0.363205,1263.53524,23.685856,7.981959,7.524588,1.229900,4517.86560,19.0674,119.162529,65.340173,...,9.256996,0.78764,0.670527,24.594488,72.611063,1965.343176,25.116750,37.155112,0.184622,0.0


In [98]:
# # feature selection via Feature Importance
# X = train.drop(columns=["Class"])
# y = train['Class']

# rf = RandomForestClassifier()
# rf.fit(X, y)
# fi_df = pd.DataFrame({'feature':X.columns, 'importance':rf.feature_importances_})
# selected_cols = fi_df.sort_values(by="importance", ascending=False)["feature"].values
# selected_cols

### 우선은 범위를 0.15이상으로 해보자

In [99]:
# X = train.drop(columns=["Class"])
# y = train['Class']

# rf = RandomForestClassifier(random_state=42)
# rf.fit(X, y)
# fi_df = pd.DataFrame({'feature':X.columns, 'importance':rf.feature_importances_})
# selected_cols=fi_df[fi_df["importance"]>=0.015]
# selected_cols = selected_cols.sort_values(by='importance', ascending=False)["feature"].values
# selected_cols


## 3. Data preprocessing

In [100]:
# class imbalance handling
## 1. undersampling
if sampling_method == 'under':
    c1 = train[train.Class == 1]
    c0 = train[train.Class == 0]
    print(c1.shape, c0.shape) # 108, 509 -> 108, 108
    c0 = c0.sample(n=c1.shape[0]) # 509 -> 108
    train = pd.concat([c0, c1])
    print(train.shape)


In [101]:
## before oversampling
# df = train[selected_cols]
# df["Class"] = train["Class"]
# pd.pivot_table(index="Class", data=df)

In [102]:
## 2. oversampling -> SMOTE
if sampling_method == 'over':
#     X = train[selected_cols]
    X = train.drop(columns=["Class"])
    y = train['Class']

    smote = SMOTE(k_neighbors=5)
    # fit_resample 함수가 자동으로 minority class를 y 기준으로 찾아서, 1:1로 맞춰줌.
    X_resampled, y_resampled = smote.fit_resample(X, y)
    print(X_resampled.shape, y_resampled.shape)
    X_resampled["Class"] = y_resampled
    train = X_resampled
    
    display(train)

(1018, 52) (1018,)


Unnamed: 0,AB,AF,AM,AX,AZ,BC,BD,BN,BP,BQ,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
0,0.209377,3109.033290,22.394407,0.699861,9.812214,5.555634,4126.587310,22.598400,175.638726,152.707705,...,7.298162,1.738550,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1.0
1,0.145282,978.764160,36.968889,3.632190,13.517790,1.229900,5496.928240,19.420500,155.868030,14.754720,...,0.173229,0.497060,0.568932,9.292698,72.611063,27981.562750,29.135430,32.131996,21.978000,0.0
2,0.470030,2635.106540,32.360553,6.732840,12.824570,1.229900,5135.780240,26.482500,128.988531,219.320160,...,7.709560,0.975560,1.198821,37.077772,88.609437,13676.957810,28.022851,35.192676,0.196941,0.0
3,0.252107,3819.651770,77.112203,3.685344,11.053708,1.229900,4169.677380,23.657700,237.282264,11.050410,...,6.122162,0.497060,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829,0.0
4,0.380297,3733.048440,14.103738,3.942255,3.396778,102.151980,5728.734120,24.010800,324.546318,149.717165,...,8.153058,48.501340,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1013,1.155454,7765.902089,83.583142,6.858764,12.591016,19.815860,5103.215400,20.605389,250.848486,91.990118,...,5.591759,0.852688,0.897227,13.136824,72.611063,4389.532908,41.622466,42.833882,0.071657,1.0
1014,0.260561,3093.427667,24.675984,4.059334,9.338525,4.116623,6089.906010,21.261362,302.308120,135.311779,...,14.634665,1.367253,0.206000,31.195220,76.178279,2502.988473,30.941968,52.579150,0.057795,1.0
1015,0.472959,7338.643022,19.601582,5.170608,10.895696,3.939001,3902.433665,18.928616,150.875151,101.146836,...,6.311454,0.877324,1.338959,15.057463,72.611063,1840.273858,37.198788,78.883867,0.285694,1.0
1016,0.319812,2499.331768,20.018933,3.692002,12.307878,5.334254,4789.315070,24.845685,237.340155,43.618964,...,10.107483,1.922875,0.861598,19.214067,72.611063,15864.203322,29.456832,38.933610,0.068886,1.0


In [103]:
# After SMOTE
# df = X_resampled.copy()
# df["Class"] = y_resampled
# pd.pivot_table(index="Class", data=df)

In [104]:
# 3. hybrid approach
## class0 : 509 -> 300
## class1 : 108 -> 300
if sampling_method == 'hybrid':
    N = 300
    c1 = train[train.Class == 1]
    c0 = train[train.Class == 0]
    print(c1.shape, c0.shape)
    c0 = c0.sample(n=N) # 509 -> 300
    train = pd.concat([c0, c1])
    print(train.shape)
    
    X = train.drop(columns=["Class"])
    y = train['Class']

    smote = SMOTE(k_neighbors=5)
    # The fit_resample function automatically finds the minority class by y and fits it 1:1.
    X_resampled, y_resampled = smote.fit_resample(X, y) # 300, 108 --> 300, 300
    print(X_resampled.shape, y_resampled.shape)
    X_resampled["Class"] = y_resampled
    train = X_resampled
    display(train)

In [105]:
# to make OOF prediction
from sklearn.model_selection import train_test_split

X = train.drop(columns=["Class"])
#X = train[selected_cols]
y = train['Class']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, stratify=y, random_state=42)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(865, 52) (153, 52) (865,) (153,)


### feature scaling

- Use StandardScaler

In [106]:
from sklearn.preprocessing import StandardScaler

if is_scaling:
    scaler = StandardScaler()
    data_ = scaler.fit_transform(X_train)
    X_train = pd.DataFrame(data=data_, columns=X_train.columns)
    data_ = scaler.transform(X_val)
    X_val = pd.DataFrame(data=data_, columns=X_val.columns)
    display(X_train)

In [107]:
from sklearn.preprocessing import MinMaxScaler

if is_scaling2:
    scaler = MinMaxScaler()
    data_ = scaler.fit_transform(X_train)
    X_train = pd.DataFrame(data=data_, columns=X_train.columns)
    data_ = scaler.transform(X_val)
    X_val = pd.DataFrame(data=data_, columns=X_val.columns)
    display(X_train) 


In [108]:
if is_pca:
    from sklearn.decomposition import PCA
    
    pca = PCA(n_components=0.90, random_state=42)
    data_ = pca.fit_transform(X_train)
    X_train = pd.DataFrame(data=data_, columns=[f"PC{i}" for i in range(1, data_.shape[1]+1)])
    data_ = pca.transform(X_val)
    X_val = pd.DataFrame(data=data_, columns=[f"PC{i}" for i in range(1, data_.shape[1]+1)])

    display(X_train)

## 4. Fitting and Evaluation


- does not need anymore.

In [109]:
# Model ensemble of SVM, Logistic Regression, XGBoost, RandomForest, Simple NN.
# The NN model performed too poorly, so we excluded it now.

# svm = SVC(random_state=42)
# lr = LogisticRegression(random_state=42, max_iter=300)
# xgb = XGBClassifier(max_depth=3, colsample_bytree=0.8, reg_lambda=1, objective='binary:logistic', random_state=42)
# rf = RandomForestClassifier(max_depth=3, max_features=0.8, criterion='log_loss', random_state=42)
# nn = Sequential([
#     Input(shape=(X_train.shape[1],)),
#     #Dense(30), ReLU(), Dropout(0.2),
#     #Dense(20), ReLU(), Dropout(0.2),
#     #Dense(10), ReLU(), Dropout(0.1),
#     Dense(5), ReLU(), Dropout(0.2),
#     Dense(2), Softmax()
# ])
# nn.summary()

In [110]:
# optimizer = Adam(learning_rate=learning_rate)
# loss_fn = tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.9)   # [0.8, 0.2] <--> [0.9, 0] // [0, 0.9]
# scheduler = ReduceLROnPlateau(monitor='val_loss',
#                               factor=0.5,
#                               patience=10,
#                               min_lr=1e-6)
# earlystopper = EarlyStopping(monitor='val_loss',
#                              patience=20,
#                              min_delta=1e-2)


# nn.compile(optimizer=optimizer, loss=loss_fn, metrics=[b_logloss_keras])

# nn_y_train = tf.keras.utils.to_categorical(y_train, num_classes=2)
# nn_y_val = tf.keras.utils.to_categorical(y_val, num_classes=2)

In [111]:
# print("\nFitting LogisticRegression...")
# lr.fit(X_train, y_train)
# print("\nFitting SVM...")
# svm.fit(X_train, y_train)
# print("\nFitting RandomForest...")
# rf.fit(X_train, y_train)
# print("\nFitting XGBoost...")
# xgb.fit(X_train, y_train)
# print("\nFitting MLP...")
# history = nn.fit(X_train, nn_y_train,
#                 batch_size=batch_size,
#                 epochs=epochs,
#                 validation_data=[X_val, nn_y_val],
#                 callbacks=[scheduler, earlystopper])

In [112]:
# ## loss visualize
# import matplotlib.pyplot as plt

# plt.figure(figsize=(12, 6))
# plt.subplot(1, 2,1)
# plt.plot(history.history['loss'],'b-', label = "training")
# plt.plot(history.history['val_loss'], 'r:', label = "validation")
# plt.title("model - loss")
# plt.legend()

# plt.subplot(1, 2, 2)
# plt.title("model - val_logloss")

# plt.plot(history.history['b_logloss_keras'], 'b-', label = "training")
# plt.plot(history.history['val_b_logloss_keras'], 'r:', label = "validation")

# plt.legend()
# plt.tight_layout()
# plt.show()

In [113]:
# set metric
evaluation_metric = balance_logloss
#평가
#evaluation_metric_keras = b_logloss_keras

In [114]:
# print("--- Prediction with XGB ---")
# pred_train = xgb.predict_proba(X_train)
# pred_val = xgb.predict_proba(X_val)

# train_score = evaluation_metric(y_train, pred_train)
# val_score = evaluation_metric(y_val, pred_val)

# print("Train Score : %.4f" % train_score)
# print("Test Score : %.4f" % val_score)

# print("--- Prediction with MLP ---")
# pred_train = nn.predict(X_train)
# pred_val = nn.predict(X_val)

# train_score = evaluation_metric_keras(nn_y_train, pred_train)
# val_score = evaluation_metric_keras(nn_y_val, pred_val)

# print("Train Score : %.4f" % train_score)
# print("Validation Score : %.4f" % val_score)

## 5. (Super)Hyper-parameter Tuning

Let's try hyper-parameter tuning using optuna, an AutoML framework.

Optuna defines a target function to optimize and then optimizes that function.

For each model, we define an optimizer function separately and then burn optuna.

In [115]:
def rf_optimizer(trial, X, y, K):
    # define parameter to tune
    n_estimators = trial.suggest_categorical('n_estimators', [50, 100, 200])
    max_depth = trial.suggest_int('max_depth', 4, 10)
    max_features = trial.suggest_categorical('max_features', [0.6, 0.7, 0.8])
    
    
    # set model
    model = RandomForestClassifier(n_estimators=n_estimators,
                                   max_depth=max_depth,
                                   max_features=max_features,
                                   criterion='log_loss',
                                   class_weight='balanced'
                                  )
    
    # K-Fold Cross validation
    folds = StratifiedKFold(n_splits=K, shuffle=True)
    losses = []
    
    for train_idx, val_idx in folds.split(X, y):
        X_train = X.iloc[train_idx, :]
        y_train = y.iloc[train_idx]
        
        X_val = X.iloc[val_idx, :]
        y_val = y.iloc[val_idx]
        
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_val)
        loss = evaluation_metric(y_val, preds)
        losses.append(loss)
    
    
    # return mean score of CV
    return np.mean(losses)

In [116]:
def svm_optimizer(trial, X, y, K):
    C = trial.suggest_int('C', 1, 100)
    kernel = trial.suggest_categorical('kernel', ['rbf', 'linear'])
    

    model = SVC(C=C,
                kernel=kernel,
                #class_weight='balanced', # if class imbalanced
                probability=True,
                #cache_size=1000,
                random_state=42
               )
    
    folds = StratifiedKFold(n_splits=K, shuffle=True)
    losses = []
    
    for train_idx, val_idx in folds.split(X, y):
        X_train = X.iloc[train_idx, :]
        y_train = y.iloc[train_idx]
        
        X_val = X.iloc[val_idx, :]
        y_val = y.iloc[val_idx]
        
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_val)
        loss = evaluation_metric(y_val, preds)
        losses.append(loss)
    
    
    return np.mean(losses)

In [117]:
def lr_optimizer(trial, X, y, K):

    C = trial.suggest_int('C', 1, 100)
    solver = trial.suggest_categorical('solver', ['liblinear', 'newton-cg', 'newton-cholesky', 'saga'])    
    

    model = LogisticRegression(C=C,
                               solver=solver,
                               max_iter=500,
                               class_weight='balanced',
                               random_state=42,
                               n_jobs=-1)
    
    
    folds = StratifiedKFold(n_splits=K, shuffle=True)
    losses = []
    
    for train_idx, val_idx in folds.split(X, y):
        X_train = X.iloc[train_idx, :]
        y_train = y.iloc[train_idx]
        
        X_val = X.iloc[val_idx, :]
        y_val = y.iloc[val_idx]
        
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_val)
        loss = evaluation_metric(y_val, preds)
        losses.append(loss)
    
    
    return np.mean(losses)

In [118]:
def xgb_optimizer(trial, X, y, K):
    n_estimators = trial.suggest_categorical('n_estimators', [500, 1000, 2000])
    max_depth = trial.suggest_int('max_depth', 4, 10)
    colsample_bytree = trial.suggest_categorical('colsample_bytree', [0.5, 0.6, 0.7, 0.8])
    learning_rate = trial.suggest_float('learning_rate', 1e-3, 1e-2)
    reg_lambda = trial.suggest_categorical('reg_lambda', [0.1, 0.5, 1, 2])
    
    
    model = XGBClassifier(n_estimators=n_estimators,
                          max_depth=max_depth,
                          colsample_bytree=colsample_bytree,
                          learning_rate=learning_rate,
                          reg_lambda=reg_lambda)
#                          scale_pos_weight=4.71)  ## we set class imbalance by using sampling method.
    
    
    folds = StratifiedKFold(n_splits=K, shuffle=True)
    losses = []
    
    for train_idx, val_idx in folds.split(X, y):
        X_train = X.iloc[train_idx, :]
        y_train = y.iloc[train_idx]
        
        X_val = X.iloc[val_idx, :]
        y_val = y.iloc[val_idx]
        
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_val)
        loss = evaluation_metric(y_val, preds)
        losses.append(loss)
    
    
    return np.mean(losses)

In [119]:
K = 4 # set K of K-Fold
opt_func = partial(rf_optimizer, X=X, y=y, K=K)

if is_tuning:
    rf_study = optuna.create_study(direction="minimize", sampler=sampler) # determine minimize or maximize sth
    rf_study.optimize(opt_func, n_trials=n_trials)

[32m[I 2023-06-22 05:46:49,293][0m A new study created in memory with name: no-name-e7c816f8-88fe-465c-9381-81b3104663d8[0m
[32m[I 2023-06-22 05:46:54,498][0m Trial 0 finished with value: 0.1754623854376599 and parameters: {'n_estimators': 100, 'max_depth': 8, 'max_features': 0.6}. Best is trial 0 with value: 0.1754623854376599.[0m
[32m[I 2023-06-22 05:46:56,351][0m Trial 1 finished with value: 0.2518999218467712 and parameters: {'n_estimators': 50, 'max_depth': 4, 'max_features': 0.6}. Best is trial 0 with value: 0.1754623854376599.[0m
[32m[I 2023-06-22 05:47:07,575][0m Trial 2 finished with value: 0.1741961211429051 and parameters: {'n_estimators': 200, 'max_depth': 7, 'max_features': 0.8}. Best is trial 2 with value: 0.1741961211429051.[0m
[32m[I 2023-06-22 05:47:16,666][0m Trial 3 finished with value: 0.17233130320826945 and parameters: {'n_estimators': 200, 'max_depth': 7, 'max_features': 0.6}. Best is trial 3 with value: 0.17233130320826945.[0m
[32m[I 2023-06-22 0

In [120]:
K = 4
opt_func = partial(lr_optimizer, X=X, y=y, K=K)

if is_tuning:
    lr_study = optuna.create_study(direction="minimize", sampler=sampler) 
    lr_study.optimize(opt_func, n_trials=n_trials)

[32m[I 2023-06-22 05:49:11,242][0m A new study created in memory with name: no-name-076a4e1f-aff5-408d-9f9c-4cebfd4a0116[0m
[32m[I 2023-06-22 05:49:12,766][0m Trial 0 finished with value: 0.29771179454858077 and parameters: {'C': 78, 'solver': 'newton-cholesky'}. Best is trial 0 with value: 0.29771179454858077.[0m

'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 2.


'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 2.


'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 2.


'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 2.

[32m[I 2023-06-22 05:49:14,420][0m Trial 1 finished with value: 0.28808596567270933 and parameters: {'C': 73, 'solver': 'liblinear'}. Best is trial 1 with value: 0.28808596567270933.[0m

'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 2.


'n_jobs' > 1 

In [121]:
K = 4 
#opt_func = partial(svm_optimizer, X=X, y=y, K=K)

if is_tuning:
    if is_cuml:
        #svm = SVC(C=5, probability=True)
        best_loss = 9999.0
        folds = StratifiedKFold(n_splits=K, random_state=42, shuffle=True) 

        for C in [1, 2, 5, 10, 100]:
            losses = []
            svm = LinearSVC(C=C, probability=True) ## cuml version. (faster model)
            
            for train_idx, val_idx in folds.split(X, y):
                X_train = X.iloc[train_idx, :]
                y_train = y.iloc[train_idx]
                X_val = X.iloc[val_idx, :]
                y_val = y.iloc[val_idx]
                
                svm.fit(X_train, y_train)
                preds = svm.predict_proba(X_val).values
                loss = evaluation_metric(y_val, preds)
                losses.append(loss)
                
            avg_loss = np.mean(losses)
            if avg_loss < best_loss:
                best_loss = avg_loss

        print("SVM log loss : %.4f" % best_loss)
    else:
        svm_study = optuna.create_study(direction="minimize", sampler=sampler)
        svm_study.optimize(opt_func, n_trials=n_trials)

[W] [05:50:47.523782] L-BFGS: max iterations reached
[W] [05:50:47.524012] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [05:50:47.912923] L-BFGS: max iterations reached
[W] [05:50:47.913128] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [05:50:48.349717] L-BFGS: max iterations reached
[W] [05:50:48.349922] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [05:50:48.770545] L-BFGS: max iterations reached
[W] [05:50:48.770753] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the inpu

In [122]:
K = 4
opt_func = partial(xgb_optimizer, X=X, y=y, K=K)

if is_tuning:
    xgb_study = optuna.create_study(direction="minimize", sampler=sampler)
    xgb_study.optimize(opt_func, n_trials=n_trials)

[32m[I 2023-06-22 05:50:55,145][0m A new study created in memory with name: no-name-6db212e6-18ac-4445-9b9d-e6321859a4b0[0m
[32m[I 2023-06-22 05:51:11,390][0m Trial 0 finished with value: 0.3536051771524234 and parameters: {'n_estimators': 1000, 'max_depth': 4, 'colsample_bytree': 0.8, 'learning_rate': 0.0010625691747807163, 'reg_lambda': 0.1}. Best is trial 0 with value: 0.3536051771524234.[0m
[32m[I 2023-06-22 05:51:32,023][0m Trial 1 finished with value: 0.15124681176092153 and parameters: {'n_estimators': 1000, 'max_depth': 7, 'colsample_bytree': 0.7, 'learning_rate': 0.0032660406624282776, 'reg_lambda': 0.1}. Best is trial 1 with value: 0.15124681176092153.[0m
[32m[I 2023-06-22 05:51:39,441][0m Trial 2 finished with value: 0.1293304422513642 and parameters: {'n_estimators': 500, 'max_depth': 5, 'colsample_bytree': 0.5, 'learning_rate': 0.009870854086995406, 'reg_lambda': 1}. Best is trial 2 with value: 0.1293304422513642.[0m
[32m[I 2023-06-22 05:51:51,381][0m Trial 3

In [123]:
# save all studies
if is_tuning:
    with open("rm_study.pk", 'wb') as f:
        pickle.dump(rf_study, f)
    with open("lr_study.pk", 'wb') as f:
        pickle.dump(lr_study, f)
    with open("xgb_study.pk", 'wb') as f:
        pickle.dump(xgb_study, f)
        
    if is_cuml:
        with open("svm_model.pk", 'wb') as f:
            pickle.dump(svm, f)
    else:
        with open("svm_study.pk", 'wb') as f:
            pickle.dump(svm_study, f)

    #nn.save("./simple_nn_model.keras")

In [124]:
# visualize experiment logs
def display_experiment_log(study):
    display(study.trials_dataframe())
    print("Best Score: %.4f" % study.best_value)
    print("Best params: ", study.best_trial.params)
    history = study.trials_dataframe()
    display(history[history.value == study.best_value])
    optuna.visualization.plot_optimization_history(study).show()
    optuna.visualization.plot_param_importances(study).show()

In [125]:
if is_tuning:
    display_experiment_log(rf_study)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_max_depth,params_max_features,params_n_estimators,state
0,0,0.175462,2023-06-22 05:46:49.294830,2023-06-22 05:46:54.498568,0 days 00:00:05.203738,8,0.6,100,COMPLETE
1,1,0.2519,2023-06-22 05:46:54.500061,2023-06-22 05:46:56.350621,0 days 00:00:01.850560,4,0.6,50,COMPLETE
2,2,0.174196,2023-06-22 05:46:56.352128,2023-06-22 05:47:07.574716,0 days 00:00:11.222588,7,0.8,200,COMPLETE
3,3,0.172331,2023-06-22 05:47:07.576451,2023-06-22 05:47:16.665749,0 days 00:00:09.089298,7,0.6,200,COMPLETE
4,4,0.215633,2023-06-22 05:47:16.667238,2023-06-22 05:47:27.514893,0 days 00:00:10.847655,5,0.8,200,COMPLETE
5,5,0.168784,2023-06-22 05:47:27.516443,2023-06-22 05:47:30.353824,0 days 00:00:02.837381,8,0.8,50,COMPLETE
6,6,0.167994,2023-06-22 05:47:30.355686,2023-06-22 05:47:36.082943,0 days 00:00:05.727257,8,0.8,100,COMPLETE
7,7,0.17464,2023-06-22 05:47:36.084357,2023-06-22 05:47:41.792879,0 days 00:00:05.708522,10,0.8,100,COMPLETE
8,8,0.1727,2023-06-22 05:47:41.794313,2023-06-22 05:47:47.266443,0 days 00:00:05.472130,6,0.8,100,COMPLETE
9,9,0.237765,2023-06-22 05:47:47.267960,2023-06-22 05:47:57.035800,0 days 00:00:09.767840,4,0.8,200,COMPLETE


Best Score: 0.1654
Best params:  {'n_estimators': 50, 'max_depth': 10, 'max_features': 0.7}


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_max_depth,params_max_features,params_n_estimators,state
22,22,0.165389,2023-06-22 05:48:38.407197,2023-06-22 05:48:41.054851,0 days 00:00:02.647654,10,0.7,50,COMPLETE


In [126]:
if is_tuning:
    display_experiment_log(lr_study)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_C,params_solver,state
0,0,0.297712,2023-06-22 05:49:11.243988,2023-06-22 05:49:12.765187,0 days 00:00:01.521199,78,newton-cholesky,COMPLETE
1,1,0.288086,2023-06-22 05:49:12.769047,2023-06-22 05:49:14.420084,0 days 00:00:01.651037,73,liblinear,COMPLETE
2,2,0.301741,2023-06-22 05:49:14.422116,2023-06-22 05:49:16.020368,0 days 00:00:01.598252,87,liblinear,COMPLETE
3,3,0.331805,2023-06-22 05:49:16.022492,2023-06-22 05:49:16.194798,0 days 00:00:00.172306,33,newton-cholesky,COMPLETE
4,4,0.602916,2023-06-22 05:49:16.196834,2023-06-22 05:49:17.837497,0 days 00:00:01.640663,12,saga,COMPLETE
5,5,0.296442,2023-06-22 05:49:17.839359,2023-06-22 05:49:19.518317,0 days 00:00:01.678958,50,liblinear,COMPLETE
6,6,0.610384,2023-06-22 05:49:19.520239,2023-06-22 05:49:21.261520,0 days 00:00:01.741281,4,saga,COMPLETE
7,7,0.35312,2023-06-22 05:49:21.267721,2023-06-22 05:49:36.178954,0 days 00:00:14.911233,25,newton-cg,COMPLETE
8,8,0.322958,2023-06-22 05:49:36.180882,2023-06-22 05:49:51.535788,0 days 00:00:15.354906,29,newton-cg,COMPLETE
9,9,0.338137,2023-06-22 05:49:51.537852,2023-06-22 05:49:51.715099,0 days 00:00:00.177247,88,newton-cholesky,COMPLETE


Best Score: 0.2828
Best params:  {'C': 99, 'solver': 'liblinear'}


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_C,params_solver,state
19,19,0.282818,2023-06-22 05:50:18.533913,2023-06-22 05:50:20.399968,0 days 00:00:01.866055,99,liblinear,COMPLETE


In [127]:
if is_tuning:
    if is_cuml:
        print(svm.predict_proba(X_val))
        print("SVM log loss : %.4f" % evaluation_metric(y_val, svm.predict_proba(X_val).values))
    else:
        display_experiment_log(svm_study)

                0         1
0    7.994130e-01  0.200587
1    1.030492e-07  1.000000
2    1.800139e-02  0.981999
3    2.442572e-02  0.975574
4    1.241448e-01  0.875855
..            ...       ...
249  5.308338e-01  0.469166
250  3.401162e-01  0.659884
251  6.847622e-02  0.931524
252  3.736550e-01  0.626345
253  8.645417e-01  0.135458

[254 rows x 2 columns]
SVM log loss : 0.4758


In [128]:
if is_tuning:
    display_experiment_log(xgb_study)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_learning_rate,params_max_depth,params_n_estimators,params_reg_lambda,state
0,0,0.353605,2023-06-22 05:50:55.147574,2023-06-22 05:51:11.390295,0 days 00:00:16.242721,0.8,0.001063,4,1000,0.1,COMPLETE
1,1,0.151247,2023-06-22 05:51:11.394375,2023-06-22 05:51:32.022759,0 days 00:00:20.628384,0.7,0.003266,7,1000,0.1,COMPLETE
2,2,0.12933,2023-06-22 05:51:32.024920,2023-06-22 05:51:39.440860,0 days 00:00:07.415940,0.5,0.009871,5,500,1.0,COMPLETE
3,3,0.294636,2023-06-22 05:51:39.442787,2023-06-22 05:51:51.380620,0 days 00:00:11.937833,0.7,0.002679,8,500,1.0,COMPLETE
4,4,0.109833,2023-06-22 05:51:51.382519,2023-06-22 05:52:20.398270,0 days 00:00:29.015751,0.7,0.00407,5,2000,0.5,COMPLETE
5,5,0.126169,2023-06-22 05:52:20.400252,2023-06-22 05:52:41.243792,0 days 00:00:20.843540,0.8,0.006698,7,1000,2.0,COMPLETE
6,6,0.376728,2023-06-22 05:52:41.245737,2023-06-22 05:52:47.432464,0 days 00:00:06.186727,0.6,0.001913,4,500,0.1,COMPLETE
7,7,0.146507,2023-06-22 05:52:47.434364,2023-06-22 05:52:57.488579,0 days 00:00:10.054215,0.7,0.008643,8,500,0.1,COMPLETE
8,8,0.088164,2023-06-22 05:52:57.492481,2023-06-22 05:53:20.413308,0 days 00:00:22.920827,0.5,0.006192,6,2000,1.0,COMPLETE
9,9,0.099373,2023-06-22 05:53:20.415371,2023-06-22 05:53:33.287233,0 days 00:00:12.871862,0.5,0.009355,10,1000,0.5,COMPLETE


Best Score: 0.0844
Best params:  {'n_estimators': 2000, 'max_depth': 10, 'colsample_bytree': 0.5, 'learning_rate': 0.007426089851264448, 'reg_lambda': 0.5}


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_learning_rate,params_max_depth,params_n_estimators,params_reg_lambda,state
12,12,0.084442,2023-06-22 05:54:18.572152,2023-06-22 05:54:39.617736,0 days 00:00:21.045584,0.5,0.007426,10,2000,0.5,COMPLETE


## 6. Test Prediction and Make Submission

In [129]:
## preprocessing in same way
X_test = test[X.columns].fillna(X.mean())
# if is_scaling:
#     X_test = scaler.transform(X_test)
#     X_test = pd.DataFrame(data=X_test, columns=X.columns)

# if is_pca:
#     data_ = pca.transform(X_test)
#     X_test = pd.DataFrame(data=data_, columns=[f"PC{i}" for i in range(1, data_.shape[1]+1)])

X_test.EJ = lb.fit_transform(X_test.EJ)

In [130]:
# Finalize Models
if is_tuning:
    rf_best_params = rf_study.best_params
    lr_best_params = lr_study.best_params
    xgb_best_params = xgb_study.best_params
    if is_cuml:
        pass
    else:
        svm_best_params = svm_study.best_params

    best_rf = RandomForestClassifier(**rf_best_params)
    best_lr = LogisticRegression(**lr_best_params)
    best_xgb = XGBClassifier(**xgb_best_params)
    if is_cuml:
        best_svm = svm
    else:
        best_svm = SVC(**svm_best_params, probability=True)

### 위에서 만든 testset으로 OOF Prediction 성능 확인 필요!

In [131]:
# first ensebmle model, then check it.
best_rf.fit(X_train, y_train)
best_lr.fit(X_train, y_train)
best_xgb.fit(X_train, y_train)
best_svm.fit(X_train, y_train)

# OOF-prediction
v_rf = best_rf.predict_proba(X_val)
v_lr = best_lr.predict_proba(X_val)
v_xgb = best_xgb.predict_proba(X_val)
v_svm = best_svm.predict_proba(X_val)
print(v_rf.shape, v_lr.shape, v_xgb.shape, v_svm.shape)

[W] [06:01:31.830684] L-BFGS: max iterations reached
[W] [06:01:31.830932] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
(254, 2) (254, 2) (254, 2) (254, 2)


In [132]:
# # MLP predictions
# v_nn = nn.predict(X_val)
# preds_nn = nn.predict(X_test)
# print(v_nn.shape, preds_nn.shape)

In [133]:
# OOF prediction
#ensembles = np.mean([v_rf, v_lr, v_xgb, v_svm, v_nn], axis=0)
ensembles = np.mean([v_rf, v_xgb, v_svm], axis=0)
#ensembles = np.mean([v_rf, v_xgb], axis=0)
print("OOF prediction logloss : %.4f" % evaluation_metric(y_val, ensembles))

OOF prediction logloss : 0.1673


In [134]:
# model finalization and make prediction
best_rf.fit(X, y)
best_lr.fit(X, y)
best_xgb.fit(X, y)
best_svm.fit(X, y)

preds_rf = best_rf.predict_proba(X_test)
preds_lr = best_lr.predict_proba(X_test)
preds_xgb = best_xgb.predict_proba(X_test)
preds_svm = best_svm.predict_proba(X_test)
print(preds_rf.shape, preds_lr.shape, preds_xgb.shape, preds_svm.shape)


Liblinear failed to converge, increase the number of iterations.



[W] [06:01:41.796665] L-BFGS: max iterations reached
[W] [06:01:41.796868] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
(5, 2) (5, 2) (5, 2) (5, 2)


In [135]:
v_rf = best_rf.predict_proba(X_val)
v_lr = best_lr.predict_proba(X_val)
v_xgb = best_xgb.predict_proba(X_val)
v_svm = best_svm.predict_proba(X_val).values
# ensembles = np.mean([v_rf, v_xgb, v_svm], axis=0)
ensembles = np.mean([v_rf, v_xgb, v_svm], axis=0)
print("(After finalization)OOF prediction logloss : %.4f" % evaluation_metric(y_val, ensembles))

(After finalization)OOF prediction logloss : 0.1059


In [136]:
print(preds_rf)
print(preds_lr)
print(preds_xgb)
#print(pre)


[[0.66 0.34]
 [0.66 0.34]
 [0.66 0.34]
 [0.66 0.34]
 [0.66 0.34]]
[[0.50005195 0.49994805]
 [0.50005195 0.49994805]
 [0.50005195 0.49994805]
 [0.50005195 0.49994805]
 [0.50005195 0.49994805]]
[[0.9769662  0.02303381]
 [0.9769662  0.02303381]
 [0.9769662  0.02303381]
 [0.9769662  0.02303381]
 [0.9769662  0.02303381]]


In [137]:
submission = pd.read_csv('../input/icr-identify-age-related-conditions/sample_submission.csv')
submission

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.5,0.5
1,010ebe33f668,0.5,0.5
2,02fa521e1838,0.5,0.5
3,040e15f562a2,0.5,0.5
4,046e85c7cc7f,0.5,0.5


In [138]:
#voting_weights = [0.1, 0.1, 0.25, 0.25, 0.3]
#voting_weights = [0.2, 0.2, 0.2, 0.2, 0.2]
#voting_weights = [0.25, 0.25, 0.25, 0.25]
voting_weights = [0.5, 0.5]
#voting_weights = [0.35, 0.35, 0.3]
# submission['class_0'] = voting_weights[0]*preds_rf[:, 0] + voting_weights[1]*preds_lr[:, 0] + voting_weights[2]*preds_xgb[:, 0] + voting_weights[3]*preds_nn[:, 0] + voting_weights[4]*preds_svm[:, 0]
# submission['class_1'] = voting_weights[0]*preds_rf[:, 1] + voting_weights[1]*preds_lr[:, 1] + voting_weights[2]*preds_xgb[:, 1] + voting_weights[3]*preds_nn[:, 1] + voting_weights[4]*preds_svm[:, 1]
# submission['class_0'] = voting_weights[0]*preds_rf[:, 0] + voting_weights[1]*preds_lr[:, 0] + voting_weights[2]*preds_xgb[:, 0] + voting_weights[3]*preds_svm[:, 0]
# submission['class_1'] = voting_weights[0]*preds_rf[:, 1] + voting_weights[1]*preds_lr[:, 1] + voting_weights[2]*preds_xgb[:, 1] + voting_weights[3]*preds_svm[:, 1]
submission['class_0'] = voting_weights[0]*preds_rf[:, 0] + voting_weights[1]*preds_svm[:, 0]
submission['class_1'] = voting_weights[0]*preds_rf[:, 1] + voting_weights[1]*preds_svm[:, 1]

submission

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.572297,0.427703
1,010ebe33f668,0.572297,0.427703
2,02fa521e1838,0.572297,0.427703
3,040e15f562a2,0.572297,0.427703
4,046e85c7cc7f,0.572297,0.427703


In [139]:
submission.to_csv("submission.csv", index=False)