# Import

In [54]:
import pandas as pd
import numpy as np
import os
import sys
import random
import math
import warnings 
warnings.filterwarnings(action='ignore')

import sklearn.neighbors._base
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from missingpy import MissForest
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, recall_score, precision_score

from imblearn.combine import SMOTETomek

import optuna
from optuna.samplers import TPESampler


# Data Load

In [33]:
train_df = pd.read_csv('./train.csv')
valid_df = pd.read_csv('./val.csv')

In [34]:
train_df = train_df[['ID','V14','V12','V10','V3','V9','V6','V16','V17','V7','V2','V4','V11']]
valid_df = valid_df[['ID','V14','V12','V10','V3','V9','V6','V16','V17','V7','V2','V4','V11','Class']]

In [35]:
train_df.head()

Unnamed: 0,ID,V14,V12,V10,V3,V9,V6,V16,V17,V7,V2,V4,V11
0,3,-0.165946,0.066084,0.207643,1.773209,-1.514654,1.800499,-2.890083,1.109969,0.791461,-1.340163,0.37978,0.624501
1,4,-0.287924,0.178228,-0.054952,1.792993,-1.387024,1.247203,-1.059647,-0.684093,0.237609,-0.185226,-0.863291,-0.226487
2,6,-0.137134,0.359894,-0.371407,1.141109,-0.568671,-0.029728,0.401726,-0.058133,0.476201,0.960523,-0.168252,1.341262
3,8,-1.323865,0.291474,1.249376,1.07438,0.615375,0.428118,-0.076127,-1.222127,1.120631,1.417964,-0.492199,-0.619468
4,9,0.074355,-0.110452,-0.41043,-0.113192,-0.392048,3.721818,-0.210077,-0.499768,0.370145,0.286157,-0.271526,-0.705117


In [36]:
valid_df.head()

Unnamed: 0,ID,V14,V12,V10,V3,V9,V6,V16,V17,V7,V2,V4,V11,Class
0,10,-0.443523,0.83639,-0.366846,1.044367,-0.736727,-0.246761,0.739453,-0.54098,0.651583,1.119593,-0.222187,1.017614,0
1,22,0.983739,0.406774,0.724396,-0.171479,-1.191311,1.696038,-0.602232,0.402484,0.107712,0.328461,2.109204,1.69033,0
2,63,0.788864,-0.660863,0.757952,0.194008,-0.896287,-1.04443,0.295404,-0.287878,0.531588,0.575068,2.598192,-0.448937,0
3,69,0.314166,1.38516,-0.225495,0.387585,0.226278,1.650358,-1.333716,1.002869,-0.427576,-0.323684,0.544474,1.944545,0
4,83,-2.837136,0.066851,6.051521,1.483691,4.009259,-0.824575,-0.303774,-1.942303,2.065426,2.600138,-2.418473,2.5725,0


# MissForest
https://github.com/epsilon-machine/missingpy/blob/master/missingpy/missforest.py

In [20]:
train_df['data_cd']='train'
valid_df['data_cd']='valid'

In [21]:
df = pd.concat([train_df.drop(columns=['ID']), valid_df.drop(columns=['ID'])])

df.set_index('data_cd', drop=True, inplace=True)
df['Class'] = df['Class'].astype('category')
cat_cols = [df.columns.get_loc(col) for col in df.select_dtypes(['category']).columns.tolist()]
print(cat_cols)

[12]


In [22]:
# 현재 df 데이터프레임에서 Class
# train은 결측치로 되어있음

# 이 결측치를 대체하는 방법 중 MissForest 방법 사용
df

Unnamed: 0_level_0,V14,V12,V10,V3,V9,V6,V16,V17,V7,V2,V4,V11,Class
data_cd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
train,-0.165946,0.066084,0.207643,1.773209,-1.514654,1.800499,-2.890083,1.109969,0.791461,-1.340163,0.379780,0.624501,
train,-0.287924,0.178228,-0.054952,1.792993,-1.387024,1.247203,-1.059647,-0.684093,0.237609,-0.185226,-0.863291,-0.226487,
train,-0.137134,0.359894,-0.371407,1.141109,-0.568671,-0.029728,0.401726,-0.058133,0.476201,0.960523,-0.168252,1.341262,
train,-1.323865,0.291474,1.249376,1.074380,0.615375,0.428118,-0.076127,-1.222127,1.120631,1.417964,-0.492199,-0.619468,
train,0.074355,-0.110452,-0.410430,-0.113192,-0.392048,3.721818,-0.210077,-0.499768,0.370145,0.286157,-0.271526,-0.705117,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
valid,1.018502,0.748099,-0.077753,-0.313252,-1.099446,-0.565669,-0.914161,0.143397,1.066075,1.433992,0.926044,0.534689,0.0
valid,-0.317789,0.162691,-1.245088,-0.907599,-0.730854,-0.760802,0.482847,0.681412,0.758545,0.588379,-0.418847,0.874312,0.0
valid,0.753473,-0.690868,0.076605,-0.168583,-0.417116,-0.605641,-1.129904,0.319074,1.253430,1.302212,0.981577,-1.291228,0.0
valid,0.626211,0.559047,0.267772,-1.196825,0.435402,-0.726571,0.225361,-0.605252,0.017050,-0.175233,0.234580,0.523316,0.0


In [23]:
imputer = MissForest(max_iter=10, n_estimators=28000, random_state=42, n_jobs=-1, verbose=1)
df_imputed = imputer.fit_transform(df, cat_vars=cat_cols)
df_imputed = pd.DataFrame(df_imputed, columns=df.columns, index=df.index)
df_imputed

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed:   18.8s
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed:   23.1s
[Parallel(n_jobs=-1)]: Done 6034 tasks      | elapsed:   27.9s
[Parallel(n_jobs=-1)]: Done 7184 tasks      | elapsed:   33.1s
[Parallel(n_jobs=-1)]: Done 8434 tasks      | elapsed:   38.9s
[Parallel(n_jobs=-1)]: Done 9784 tasks      | elapsed:   45.0s
[Parallel(n_jobs=-1)]: Done 11234 tasks  

Iteration: 0


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:   14.9s
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed:   18.7s
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed:   23.0s
[Parallel(n_jobs=-1)]: Done 6034 tasks      | elapsed:   28.0s
[Parallel(n_jobs=-1)]: Done 7184 tasks      | elapsed:   33.3s
[Parallel(n_jobs=-1)]: Done 8434 tasks      | elapsed:   39.1s
[Parallel(n_jobs=-1)]: Done 9784 tasks      | elapsed:   45.2s
[Parallel(n_jobs=-1)]: Done 11234 tasks  

Iteration: 1


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed:   18.4s
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed:   22.7s
[Parallel(n_jobs=-1)]: Done 6034 tasks      | elapsed:   27.5s
[Parallel(n_jobs=-1)]: Done 7184 tasks      | elapsed:   32.8s
[Parallel(n_jobs=-1)]: Done 8434 tasks      | elapsed:   38.5s
[Parallel(n_jobs=-1)]: Done 9784 tasks      | elapsed:   44.6s
[Parallel(n_jobs=-1)]: Done 11234 tasks  

Iteration: 2


[Parallel(n_jobs=8)]: Done 28000 out of 28000 | elapsed:   17.4s finished


Unnamed: 0_level_0,V14,V12,V10,V3,V9,V6,V16,V17,V7,V2,V4,V11,Class
data_cd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
train,-0.165946,0.066084,0.207643,1.773209,-1.514654,1.800499,-2.890083,1.109969,0.791461,-1.340163,0.379780,0.624501,0.0
train,-0.287924,0.178228,-0.054952,1.792993,-1.387024,1.247203,-1.059647,-0.684093,0.237609,-0.185226,-0.863291,-0.226487,0.0
train,-0.137134,0.359894,-0.371407,1.141109,-0.568671,-0.029728,0.401726,-0.058133,0.476201,0.960523,-0.168252,1.341262,0.0
train,-1.323865,0.291474,1.249376,1.074380,0.615375,0.428118,-0.076127,-1.222127,1.120631,1.417964,-0.492199,-0.619468,0.0
train,0.074355,-0.110452,-0.410430,-0.113192,-0.392048,3.721818,-0.210077,-0.499768,0.370145,0.286157,-0.271526,-0.705117,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
valid,1.018502,0.748099,-0.077753,-0.313252,-1.099446,-0.565669,-0.914161,0.143397,1.066075,1.433992,0.926044,0.534689,0.0
valid,-0.317789,0.162691,-1.245088,-0.907599,-0.730854,-0.760802,0.482847,0.681412,0.758545,0.588379,-0.418847,0.874312,0.0
valid,0.753473,-0.690868,0.076605,-0.168583,-0.417116,-0.605641,-1.129904,0.319074,1.253430,1.302212,0.981577,-1.291228,0.0
valid,0.626211,0.559047,0.267772,-1.196825,0.435402,-0.726571,0.225361,-0.605252,0.017050,-0.175233,0.234580,0.523316,0.0


In [39]:
# train dataset labeling 완료
train_df2 = df_imputed[df_imputed.index=='train']
train_df2.reset_index(drop=True, inplace=True)
train_df2['ID'] = train_df['ID']
train_df2

Unnamed: 0,V14,V12,V10,V3,V9,V6,V16,V17,V7,V2,V4,V11,Class,ID
0,-0.165946,0.066084,0.207643,1.773209,-1.514654,1.800499,-2.890083,1.109969,0.791461,-1.340163,0.379780,0.624501,0.0,3
1,-0.287924,0.178228,-0.054952,1.792993,-1.387024,1.247203,-1.059647,-0.684093,0.237609,-0.185226,-0.863291,-0.226487,0.0,4
2,-0.137134,0.359894,-0.371407,1.141109,-0.568671,-0.029728,0.401726,-0.058133,0.476201,0.960523,-0.168252,1.341262,0.0,6
3,-1.323865,0.291474,1.249376,1.074380,0.615375,0.428118,-0.076127,-1.222127,1.120631,1.417964,-0.492199,-0.619468,0.0,8
4,0.074355,-0.110452,-0.410430,-0.113192,-0.392048,3.721818,-0.210077,-0.499768,0.370145,0.286157,-0.271526,-0.705117,0.0,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113837,1.288401,2.834865,8.655320,-8.476671,4.893089,-1.394465,1.067864,0.586179,-3.632516,10.187818,-2.510473,-1.052365,0.0,284796
113838,0.002019,0.677729,0.120908,-0.999943,0.666458,-0.613638,-0.549982,-0.199950,0.190241,-0.143540,1.506772,-1.134176,0.0,284797
113839,0.237948,-0.066655,-0.284708,0.399806,0.106234,-1.343668,-0.245496,-0.174846,0.929369,0.712247,-0.463406,-0.612982,0.0,284798
113840,-0.731707,0.190916,-0.657422,-0.546012,-0.204064,-0.235973,0.599628,0.070441,0.812722,0.931005,-0.745097,0.644837,0.0,284802


In [52]:
valid_df2 = df_imputed[df_imputed.index=='valid']
valid_df2.reset_index(drop=True, inplace=True)
valid_df2['ID'] = valid_df['ID']
valid_df2

Unnamed: 0,V14,V12,V10,V3,V9,V6,V16,V17,V7,V2,V4,V11,Class,ID
0,-0.443523,0.836390,-0.366846,1.044367,-0.736727,-0.246761,0.739453,-0.540980,0.651583,1.119593,-0.222187,1.017614,0.0,10
1,0.983739,0.406774,0.724396,-0.171479,-1.191311,1.696038,-0.602232,0.402484,0.107712,0.328461,2.109204,1.690330,0.0,22
2,0.788864,-0.660863,0.757952,0.194008,-0.896287,-1.044430,0.295404,-0.287878,0.531588,0.575068,2.598192,-0.448937,0.0,63
3,0.314166,1.385160,-0.225495,0.387585,0.226278,1.650358,-1.333716,1.002869,-0.427576,-0.323684,0.544474,1.944545,0.0,69
4,-2.837136,0.066851,6.051521,1.483691,4.009259,-0.824575,-0.303774,-1.942303,2.065426,2.600138,-2.418473,2.572500,0.0,83
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28457,1.018502,0.748099,-0.077753,-0.313252,-1.099446,-0.565669,-0.914161,0.143397,1.066075,1.433992,0.926044,0.534689,0.0,284769
28458,-0.317789,0.162691,-1.245088,-0.907599,-0.730854,-0.760802,0.482847,0.681412,0.758545,0.588379,-0.418847,0.874312,0.0,284779
28459,0.753473,-0.690868,0.076605,-0.168583,-0.417116,-0.605641,-1.129904,0.319074,1.253430,1.302212,0.981577,-1.291228,0.0,284790
28460,0.626211,0.559047,0.267772,-1.196825,0.435402,-0.726571,0.225361,-0.605252,0.017050,-0.175233,0.234580,0.523316,0.0,284801


In [40]:
X = train_df2[['V14','V12','V10','V3','V9','V6','V16','V17','V7','V2','V4','V11']]
y = train_df2['Class']

# Over-Sampling

In [48]:
X_resampled, y_resampled =SMOTETomek(random_state=42).fit_resample(X, y)

In [49]:
X_resampled.head()

Unnamed: 0,V14,V12,V10,V3,V9,V6,V16,V17,V7,V2,V4,V11
0,-0.165946,0.066084,0.207643,1.773209,-1.514654,1.800499,-2.890083,1.109969,0.791461,-1.340163,0.37978,0.624501
1,-0.287924,0.178228,-0.054952,1.792993,-1.387024,1.247203,-1.059647,-0.684093,0.237609,-0.185226,-0.863291,-0.226487
2,-0.137134,0.359894,-0.371407,1.141109,-0.568671,-0.029728,0.401726,-0.058133,0.476201,0.960523,-0.168252,1.341262
3,-1.323865,0.291474,1.249376,1.07438,0.615375,0.428118,-0.076127,-1.222127,1.120631,1.417964,-0.492199,-0.619468
4,0.074355,-0.110452,-0.41043,-0.113192,-0.392048,3.721818,-0.210077,-0.499768,0.370145,0.286157,-0.271526,-0.705117


In [50]:
X_resampled['Class'] = y_resampled

# KNN + optuna

In [53]:
X_train = X_resampled.drop(columns=['Class'])
X_train = X_train[['V14','V12','V10','V3','V9','V6','V16','V17','V7','V2','V4','V11']]
y_train = X_resampled['Class']


X_valid = valid_df2.drop(columns=['ID', 'Class'])
X_valid = X_valid[['V14','V12','V10','V3','V9','V6','V16','V17','V7','V2','V4','V11']]
y_valid = valid_df2['Class']

In [55]:
sampler = TPESampler(seed=42)

def objective(trial):    
    params = {
        'n_neighbors':trial.suggest_int("n_neighbors", 1, 60),
        'weights': trial.suggest_categorical('weights', ["uniform", "distance"]),
        'metric': trial.suggest_categorical('metric', ['euclidean', 'manhattan', 'minkowski']),
        'algorithm': trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute']),
        'leaf_size':trial.suggest_int("leaf_size", 30, 60),
        'p': trial.suggest_categorical('p', [1, 2]),
        'n_jobs': -1,
        

    }
    
    model = KNeighborsClassifier(**params)
    model.fit(X_train, y_train)
    
    y_valid_pred = model.predict(X_valid)
    return f1_score(y_valid, y_valid_pred, average="macro")

In [56]:
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=50)

[32m[I 2022-11-30 16:53:57,601][0m A new study created in memory with name: no-name-c0e07a5e-78fa-4b64-b04f-e60876410aea[0m
[32m[I 2022-11-30 16:54:18,238][0m Trial 0 finished with value: 0.9236496787663914 and parameters: {'n_neighbors': 23, 'weights': 'uniform', 'metric': 'euclidean', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 1}. Best is trial 0 with value: 0.9236496787663914.[0m
[32m[I 2022-11-30 16:54:44,470][0m Trial 1 finished with value: 0.9165787375726882 and parameters: {'n_neighbors': 13, 'weights': 'distance', 'metric': 'manhattan', 'algorithm': 'ball_tree', 'leaf_size': 41, 'p': 2}. Best is trial 0 with value: 0.9236496787663914.[0m
[32m[I 2022-11-30 16:54:51,453][0m Trial 2 finished with value: 0.9236496787663914 and parameters: {'n_neighbors': 12, 'weights': 'distance', 'metric': 'manhattan', 'algorithm': 'kd_tree', 'leaf_size': 39, 'p': 2}. Best is trial 0 with value: 0.9236496787663914.[0m
[32m[I 2022-11-30 16:55:00,403][0m Trial 3 finished with valu

In [57]:
best_model = study.best_trial
best_params = best_model.params

print("Best model:")
print("  F1-score: {}".format(round(best_model.value, 4)))
print("  params  : {}".format(best_params))

Best model:
  F1-score: 0.9385
  params  : {'n_neighbors': 3, 'weights': 'uniform', 'metric': 'minkowski', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 1}


In [59]:
X = X_resampled[['V14','V12','V10','V3','V9','V6','V16','V17','V7','V2','V4','V11']]
y= X_resampled['Class']

In [60]:
model = KNeighborsClassifier(**best_params)
model.fit(X, y)

# Inference

In [61]:
test = pd.read_csv("./test.csv")
X_test = test.drop(columns=['ID'])
X_test = X_test[['V14','V12','V10','V3','V9','V6','V16','V17','V7','V2','V4','V11']]
y_test_pred = model.predict(X_test)

# Submit

In [62]:
submit = pd.read_csv("./sample_submission.csv")
submit['Class'] = y_test_pred
submit.to_csv("./knn_smote.csv", index=False)