# LightGBM(GPU) Install

In [None]:
# Build LightGBM from source with GPU support, then install its Python package.
# Fetch the sources together with the git submodules (external_libs/*).
!git clone --recursive https://github.com/Microsoft/LightGBM
%cd /content/LightGBM
# NOTE(review): `build/` is created but never entered, and cmake is invoked
# without a source-directory argument (the trailing "#avoid .." suggests this is
# deliberate) — confirm that this configures the project as intended.
!mkdir build
!cmake -DUSE_GPU=1 #avoid ..
# Compile using every available core.
!make -j$(nproc)
# Python tooling and runtime dependencies (versions are not pinned).
!sudo apt-get -y install python-pip
!sudo -H pip install setuptools pandas numpy scipy scikit-learn -U
%cd /content/LightGBM/python-package
# Install the Python wrapper against the library compiled above.
!sudo python setup.py install --precompile

Cloning into 'LightGBM'...
remote: Enumerating objects: 26285, done.[K
remote: Counting objects: 100% (86/86), done.[K
remote: Compressing objects: 100% (64/64), done.[K
remote: Total 26285 (delta 36), reused 56 (delta 21), pack-reused 26199[K
Receiving objects: 100% (26285/26285), 19.04 MiB | 16.72 MiB/s, done.
Resolving deltas: 100% (19418/19418), done.
Submodule 'include/boost/compute' (https://github.com/boostorg/compute) registered for path 'external_libs/compute'
Submodule 'eigen' (https://gitlab.com/libeigen/eigen.git) registered for path 'external_libs/eigen'
Submodule 'external_libs/fast_double_parser' (https://github.com/lemire/fast_double_parser.git) registered for path 'external_libs/fast_double_parser'
Submodule 'external_libs/fmt' (https://github.com/fmtlib/fmt.git) registered for path 'external_libs/fmt'
Cloning into '/content/LightGBM/external_libs/compute'...
remote: Enumerating objects: 21733, done.        
remote: Counting objects: 100% (5/5), done.        
remot

# Setup

In [14]:
import os
# python global seed
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it here
# presumably only affects child processes — confirm this is the intended scope.
os.environ['PYTHONHASHSEED'] = str(42)
# tensorflow seed (not working for GPU)
# os.environ['TF_DETERMINISTIC_OPS'] = '1'
# os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
# weight and bias
# SECURITY: the API key must never be committed with the notebook. Provide it
# through the environment (e.g. Colab secrets or `wandb.login()`); the key that
# used to be hard-coded here should be considered leaked and revoked.
if not os.environ.get("WANDB_API_KEY"):
    print("WARNING : WANDB_API_KEY is not set; export it or call wandb.login() before logging runs")

In [15]:
# Mount Google Drive; the dataset folder used later lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Deep-learning stack (pinned where a version is given).
# # TextVectorization layer in tf 2.6 don't support "sparse=True" option which is used on TF-IDF
# # !pip install -q tensorflow==2.6.0
!pip install -q tensorflow==2.8
!pip install -q tensorflow-recommenders==0.6
# !pip install -q scann==1.2.3
# !pip install -q scann==1.2.6
# # !pip install -q tensorflow-datasets
!pip install -q tensorflow-addons
# !pip install -q tensorflow-hub
# # !pip install -q keras-tuner

# !pip install -q transformers

# Classical ML / statistics helpers.
# NOTE(review): the packages below are installed without a version pin, so a
# fresh run may resolve to different releases.
!pip install -q statsmodels
# !pip install -q xgboost
# !pip install -q lightgbm
# !pip install -q catboost
!pip install -q missingpy

# Hyper-parameter search and experiment tracking.
!pip install -q optuna
!pip install -q wandb

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
def config_missingpy(candidate_paths=None):
    """Patch missingpy's ``knnimpute.py`` so its sklearn imports resolve.

    Lines 13 and 14 of the file (0-based indices 12 and 13) are overwritten with
    imports from ``sklearn.neighbors._base``. Every candidate path is tried; a
    path that cannot be read, written, or is too short to patch is reported with
    a "path error" message naming that path.

    Parameters
    ----------
    candidate_paths : iterable of str, optional
        Files to patch. Defaults to the Python 3.7 ``site-packages`` and
        ``dist-packages`` locations under ``/usr/local/lib``.
    """
    import_patch = {
        12: 'from sklearn.neighbors._base import _check_weights\n',
        13: 'from sklearn.neighbors._base import _get_weights\n',
    }
    if candidate_paths is None:
        candidate_paths = (
            "/usr/local/lib/python3.7/site-packages/missingpy/knnimpute.py",
            "/usr/local/lib/python3.7/dist-packages/missingpy/knnimpute.py",
        )
    for path in candidate_paths:
        try:
            # read a list of lines into data
            with open(path, 'r') as file:
                data = file.readlines()
            # replace the two import lines (each replacement carries its newline)
            for line_index, new_line in import_patch.items():
                data[line_index] = new_line
            # and write everything back
            with open(path, 'w') as file:
                file.writelines(data)
        except (OSError, IndexError):
            # OSError: missing or unwritable file; IndexError: file shorter than expected
            print("path error : " + path)

In [17]:
import sys
import shutil
from glob import glob
import multiprocessing as mp
import gc
from pathlib import Path
from scipy import stats
from scipy.special import boxcox, softmax
from scipy import sparse
import itertools

from multiprocessing import cpu_count
import copy
import pickle
import warnings
from datetime import datetime, timedelta
from time import time, sleep, mktime
from matplotlib import font_manager as fm, rc, rcParams
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import re
import random as rnd
import psutil
from optuna import Trial, create_study
from optuna.samplers import TPESampler
import wandb

import numpy as np
from numpy import array, nan, random as np_rnd, where
from numpy import dot
from numpy.linalg import norm
import pandas as pd
from pandas import DataFrame as dataframe, Series as series, isna, read_csv
from pandas.tseries.offsets import DateOffset

from sklearn.model_selection import train_test_split as tts, StratifiedKFold, GroupKFold, GroupShuffleSplit, StratifiedGroupKFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler, KBinsDiscretizer
from sklearn import metrics
# from sklearn.compose import ColumnTransformer
config_missingpy(); from missingpy import MissForest
# from sklearn.impute import KNNImputer
# from sklearn.feature_extraction.text import TfidfVectorizer

# try:
#     # RAPIDS config
#     os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'
#     os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'
#     os.environ['CONDA_PREFIX'] = '/usr/local'
#     import cudf as cd
#     import cupy as cp
#     from cuml.cluster import KMeans
#     from cuml.neighbors import NearestNeighbors
#     from cuml.metrics.cluster import silhouette_score
# except:
#     print("RAPIDS Import ERROR")

# import xgboost as xgb
import lightgbm as lgb
# import catboost as cat

# ===== tensorflow =====
import tensorflow as tf
from tensorflow import random as tf_rnd
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras import layers
from tensorflow.keras import metrics as tf_metrics
from tensorflow.keras import callbacks as tf_callbacks
from tqdm.keras import TqdmCallback
import tensorflow_addons as tfa
from tensorflow.keras.utils import plot_model
from keras.utils.layer_utils import count_params

# import keras_tuner as kt
# from keras_tuner import HyperModel
import tensorflow_hub as tf_hub
import tensorflow_recommenders as tfrs

# # # ===== pytorch =====
# import torch
# from torch.utils.data import DataLoader
# from transformers import AutoTokenizer
# from transformers import AutoModel

import librosa

# GPU memory setting
# Turn on memory growth for the first detected GPU only (gpus[0]); a RuntimeError
# raised by TensorFlow is printed rather than propagated.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
  try:
    tf.config.experimental.set_memory_growth(gpus[0], True)
  except RuntimeError as e:
    print(e)

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings.
warnings.filterwarnings(action='ignore')
# Plot / DataFrame display preferences.
rcParams['axes.unicode_minus'] = False
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 1000)
pd.set_option('max_colwidth', 200)
# plt.rc('font', family='NanumSquareB')

path error : /usr/local/lib/python3.7/site-packages/missingpy/knnimpute.py


In [18]:
# ===== utility functions =====
# seed every random number generator used in this notebook
def seed_everything(seed=42):
    """Seed Python's ``random``, NumPy and, when available, TF / CuPy / PyTorch.

    The TensorFlow, CuPy (``cp``) and PyTorch (``torch``) seeders are best-effort:
    those names may not be imported in this notebook, so any ordinary exception
    (including ``NameError``) is ignored for them. ``KeyboardInterrupt`` and
    ``SystemExit`` are no longer swallowed.

    Parameters
    ----------
    seed : int
        Seed value passed to every generator.
    """
    # python random module
    rnd.seed(seed)
    # numpy random
    np_rnd.seed(seed)
    # optional back-ends; lambdas defer the name lookup until inside the try block
    optional_seeders = (
        lambda: tf_rnd.set_seed(seed),     # tf random
        lambda: cp.random.seed(seed),      # RAPIDS random
        lambda: torch.manual_seed(seed),   # pytorch random
    )
    for seeder in optional_seeders:
        try:
            seeder()
        except Exception:
            pass
def which(bool_list):
    """Return the first-axis positions of the truthy entries of ``bool_list``."""
    nonzero_axes = where(bool_list)
    return nonzero_axes[0]
def easyIO(x=None, path=None, op="r"):
    """Pickle ``x`` to ``path`` (``op="w"``) or unpickle and return it (``op="r"``).

    Any other ``op`` prints "Unknown operation type" and returns ``None``.
    NOTE: unpickling executes arbitrary code; only load files you trust.
    """
    if op == "w":
        with open(path, "wb") as sink:
            pickle.dump(x, sink)
        return None
    if op != "r":
        print("Unknown operation type")
        return None
    with open(path, "rb") as source:
        return pickle.load(source)
def diff(first, second):
    """Return the items of ``first`` that are absent from ``second``, keeping order."""
    excluded = set(second)
    kept = []
    for candidate in first:
        if candidate in excluded:
            continue
        kept.append(candidate)
    return kept
def findIdx(data_x, col_names):
    """Return the integer positions of the elements of ``data_x`` found in ``col_names``."""
    positions = []
    for position, label in enumerate(data_x):
        if label in col_names:
            positions.append(int(position))
    return positions
def orderElems(for_order, using_ref):
    """Return the elements of ``using_ref`` that occur in ``for_order``, in reference order."""
    return list(filter(lambda ref_item: ref_item in for_order, using_ref))
# row-wise concatenation of two like-typed containers
def cbr(df1, df2):
    """Stack ``df2`` below ``df1``; the branch is chosen by the exact type of ``df1``.

    Series and DataFrame results get a fresh 0..n-1 index; ndarrays are joined
    along axis 0. For any other type a message is printed and ``df1`` is
    returned unchanged.
    """
    kind = type(df1)
    if kind == series:
        stacked = pd.concat([dataframe(df1), dataframe(df2)], axis=0, ignore_index=True)
        # only the first column of the stacked frame is kept
        first_column = series(stacked.iloc[:,0])
        return first_column.reset_index(drop=True)
    if kind == dataframe:
        stacked = pd.concat([df1, df2], axis=0, ignore_index=True)
        return stacked.reset_index(drop=True)
    if kind == np.ndarray:
        return np.concatenate([df1, df2], axis=0)
    print("Unknown Type: return 1st argument")
    return df1
def change_width(ax, new_value):
    """Set every patch in ``ax.patches`` to width ``new_value`` and re-centre it."""
    for bar in ax.patches:
        # how much narrower (or wider, if negative) the bar becomes
        shrink = bar.get_width() - new_value
        bar.set_width(new_value)
        # shift by half the difference so the bar keeps its centre
        bar.set_x(bar.get_x() + shrink * .5)
def week_of_month(date):
    """Return how many 7-day steps back from ``date`` (starting at 0) stay in its month.

    Days 1-7 of a month give 1, days 8-14 give 2, and so on.
    """
    weeks_back = 0
    while (date - timedelta(days=7 * weeks_back)).month == date.month:
        weeks_back += 1
    return weeks_back
def getSeason(date):
    """Map ``date.month`` to "Spring" (3-5), "Summer" (6-8), "Fall" (9-11), else "Winter"."""
    season_by_month = {
        3: "Spring", 4: "Spring", 5: "Spring",
        6: "Summer", 7: "Summer", 8: "Summer",
        9: "Fall", 10: "Fall", 11: "Fall",
    }
    return season_by_month.get(date.month, "Winter")
def createFolder(directory):
    """Create ``directory`` (with parents) unless the path already exists.

    An ``OSError`` during creation is reported with a printed message, not raised.
    """
    if os.path.exists(directory):
        return
    try:
        os.makedirs(directory)
    except OSError:
        print('Error: Creating directory. ' + directory)
def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, evaluated element-wise by NumPy."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator
def dispPerformance(result_dic):
    """Build, print and return a table of the ``"performance"`` entry of each result.

    ``result_dic`` maps a name to a mapping that contains a ``"performance"``
    item; each name becomes one row label. Rows are sorted ascending by the
    first column.
    """
    perf_table = dataframe()
    index_names = []
    for k, v in result_dic.items():
        index_names.append(k)
        # one-row frame per result, appended to the running table
        perf_table = pd.concat([perf_table, series(v["performance"]).to_frame().T], ignore_index=True, axis=0)
    perf_table.index = index_names
    # NOTE(review): with an empty result_dic there is no column 0 to sort by — confirm callers never pass {}
    perf_table.sort_values(perf_table.columns[0], inplace=True)
    print(perf_table)
    return perf_table
def powspace(start, stop, power, num):
    """Return ``num`` points from ``start`` to ``stop``, evenly spaced in ``x ** (1 / power)``."""
    root = 1/float(power)
    grid = np.linspace(np.power(start, root), np.power(stop, root), num=num)
    return np.power(grid, power)
def xgb_custom_lossfunction(alpha = 1):
    """Return a squared-error objective that scales under-predictions by ``alpha``.

    The returned callable takes ``(label, pred)`` and returns ``(grad, hess)``:
    where ``label - pred > 0`` both are multiplied by ``alpha``.
    """
    def support_under_mse(label, pred):
        # grad: first derivative, hess: second derivative
        err = (label - pred).astype("float")
        under_predicted = err > 0
        grad = np.where(under_predicted, -2 * alpha * err, -2 * err)
        hess = np.where(under_predicted, 2 * alpha, 2.0)
        return grad, hess
    return support_under_mse
def pd_flatten(df):
    """Unstack ``df`` into one Series whose labels are the index pairs joined by ``_``."""
    flat = df.unstack()
    flat.index = ["_".join((str(outer), str(inner))) for outer, inner in flat.index]
    return flat
def tf_losses_rmse(y_true, y_pred, sample_weight=None):
    """Root of the mean squared error; squared errors are multiplied by ``sample_weight`` if given."""
    squared_error = (y_true - y_pred) ** 2
    if sample_weight is not None:
        squared_error = squared_error * sample_weight
    return tf.sqrt(tf.reduce_mean(squared_error))
def tf_loss_nmae(y_true, y_pred, sample_weight=False):
    """Mean absolute error divided by the mean absolute target; ``sample_weight`` is ignored."""
    mean_abs_error = tf.reduce_mean(tf.math.abs(y_true - y_pred))
    mean_abs_target = tf.reduce_mean(tf.math.abs(y_true))
    return tf.math.divide(mean_abs_error, mean_abs_target)
def text_extractor(string, lang="eng", spacing=True):
    """Strip every character of ``string`` outside the alphabet selected by ``lang``.

    ``lang="eng"`` keeps A-Z/a-z, ``lang="kor"`` keeps Hangul (plus ``+``), any
    other value keeps both. Spaces are kept only when ``spacing`` is true.
    """
    # # regex that removes parentheses together with the text inside them
    # re.sub(r'\([^)]*\)', '', remove_text)
    # # regex that removes <> together with the text inside them
    # re.sub(r'\<[^)]*\>', '', remove_text)
    # (pattern keeping spaces, pattern dropping spaces) per language
    patterns = {
        "eng": ('[^ A-Za-z]', '[^A-Za-z]'),
        "kor": ('[^ ㄱ-ㅣ가-힣+]', '[^ㄱ-ㅣ가-힣+]'),
    }
    # default : kor + eng
    keep_space, drop_space = patterns.get(lang, ('[^ A-Za-zㄱ-ㅣ가-힣+]', '[^A-Za-zㄱ-ㅣ가-힣+]'))
    return re.sub(keep_space if spacing else drop_space, '', string)
def memory_usage(message='debug'):
    """Print and return the current process's resident set size in MB, tagged with ``message``."""
    bytes_per_mb = 2 ** 20
    rss = psutil.Process().memory_info().rss / bytes_per_mb
    print(f"[{message}] memory usage: {rss: 10.3f} MB")
    return rss
def cos_sim(a, b):
    """Cosine similarity: ``dot(a, b)`` divided by the product of the two norms."""
    magnitude = norm(a) * norm(b)
    return dot(a, b) / magnitude
class MyLabelEncoder:
    """Label-encode selected columns; missing and unseen values become NaN.

    ``dic_cat`` has the format ``{"col_name": {"value": code}}``. Codes are
    assigned in sorted order of the non-missing values seen at fit time.
    """
    def __init__(self, preset=None):
        # A fresh dict per instance: a mutable default argument would be shared by
        # every encoder created without a preset. A given preset is copied
        # (shallowly) so fitting does not modify the caller's dict.
        self.dic_cat = {} if preset is None else dict(preset)

    @staticmethod
    def _encode(column, mapping):
        """Map ``column`` through ``mapping``; anything not in ``mapping`` becomes NaN."""
        return column.astype("object").map(mapping).astype("float")

    def fit_transform(self, data_x, col_names):
        """Learn codes for ``col_names`` not already known, then encode those columns.

        Returns a deep copy of ``data_x`` with the encoded columns as float.
        """
        tmp_x = copy.deepcopy(data_x)
        for i in col_names:
            # if key is not in dic, update dic
            if i not in self.dic_cat:
                # dropna() reliably excludes NaN / None before sorting
                categories = sorted(tmp_x[i].dropna().unique())
                self.dic_cat[i] = {value: code for code, value in enumerate(categories)}
            tmp_x[i] = self._encode(tmp_x[i], self.dic_cat[i])
        return tmp_x

    def transform(self, data_x):
        """Encode every column known to ``dic_cat``; returns a deep copy of ``data_x``."""
        tmp_x = copy.deepcopy(data_x)
        for i in self.dic_cat.keys():
            tmp_x[i] = self._encode(tmp_x[i], self.dic_cat[i])
        return tmp_x

    def clear(self):
        """Forget all learned mappings."""
        self.dic_cat = {}
class MyOneHotEncoder:
    """One-hot encode selected columns of a DataFrame, passing the others through.

    ``label_preset`` (``{"col": {"label": code}}``) optionally renames the
    generated columns: a category equal to ``code`` is named ``col_label``.
    """
    def __init__(self, label_preset=None):
        self.dic_cat = {}
        self.label_preset = {} if label_preset is None else label_preset

    @staticmethod
    def _new_encoder():
        """Create a dense OneHotEncoder on both old and new scikit-learn releases."""
        try:
            # newer scikit-learn spells the dense-output switch `sparse_output`
            return OneHotEncoder(sparse_output=False, handle_unknown="ignore")
        except TypeError:
            return OneHotEncoder(sparse=False, handle_unknown="ignore")

    def _column_names(self, col):
        """Build the output column names for the fitted encoder of ``col``."""
        names = []
        for category in self.dic_cat[col].categories_[0]:
            if col in self.label_preset.keys():
                for k, v in self.label_preset[col].items():
                    if v == category:
                        names.append(str(col) + "_" + str(k))
            else:
                names.append(str(col) + "_" + str(category))
        return names

    def _encode(self, data_x, encoded_cols, fit):
        """Shared body of fit_transform / transform (``fit`` selects which)."""
        pieces = []
        for i in data_x:
            if i not in encoded_cols:
                pieces.append(dataframe(data_x[i]))
                continue
            if not ((data_x[i].dtype.name == "object") or (data_x[i].dtype.name == "category")):
                print(F"WARNING : {i} is not object or category")
            if fit:
                self.dic_cat[i] = self._new_encoder()
                conv = self.dic_cat[i].fit_transform(dataframe(data_x[i]))
            else:
                conv = self.dic_cat[i].transform(dataframe(data_x[i]))
            # keep the input's index so the column-wise concat stays row-aligned
            pieces.append(dataframe(conv.astype("int"), columns=self._column_names(i), index=data_x.index))
        # a single concat at the end instead of growing a frame inside the loop
        return pd.concat(pieces, axis=1) if pieces else dataframe()

    def fit_transform(self, data_x, col_names):
        """Fit one encoder per column in ``col_names`` and return the encoded frame."""
        return self._encode(data_x, col_names, fit=True)

    def transform(self, data_x):
        """Encode ``data_x`` with the encoders learned by ``fit_transform``."""
        return self._encode(data_x, self.dic_cat, fit=False)

    def clear(self):
        """Drop the fitted encoders and the label preset."""
        self.dic_cat = {}
        self.label_preset = {}
class MyKNNImputer:
    """KNN imputation that keeps imputed categorical codes inside their observed range.

    For each categorical column, imputed values are rounded to integers and the
    values at originally-missing rows are clipped to the smallest / largest
    category seen at fit time. Category codes are assumed to be integer-valued.
    """
    def __init__(self, k=5):
        # imported here because the notebook's import cell has this import commented out
        from sklearn.impute import KNNImputer
        self.imputer = KNNImputer(n_neighbors=k)
        # {"col_name": sorted list of observed (non-missing) category values}
        self.dic_cat = {}

    @staticmethod
    def _na_positions(x, cols):
        """Return ``{col: positional indices of the missing rows}`` for ``cols``."""
        return {col: x[col].isna().to_numpy().nonzero()[0] for col in cols}

    def _snap_categoricals(self, x_imp, na_positions):
        """Round categorical columns and clip each imputed row into the valid range."""
        for col, positions in na_positions.items():
            categories = self.dic_cat[col]
            rounded = x_imp[col].round()
            if len(categories) > 0 and len(positions) > 0:
                # clip every imputed row on its own (previously one out-of-range
                # row overwrote all imputed rows, and the upper bound was
                # compared against the minimum category)
                rounded.iloc[positions] = rounded.iloc[positions].clip(
                    lower=categories[0], upper=categories[-1])
            x_imp[col] = rounded.astype(int)
        return x_imp

    def fit_transform(self, x, cat_vars=None):
        """Fit the imputer on ``x`` and return the imputed frame.

        ``cat_vars`` lists the categorical columns to post-process; ``None``
        means plain imputation. The result has a fresh 0..n-1 index.
        """
        if cat_vars is None:
            return dataframe(self.imputer.fit_transform(x), columns=x.columns)
        for i in cat_vars:
            # dropna() keeps NaN out of the category list
            self.dic_cat[i] = sorted(x[i].dropna().unique().tolist())
        na_positions = self._na_positions(x, cat_vars)
        x_imp = dataframe(self.imputer.fit_transform(x), columns=x.columns)
        # if imputed categorical value are not in the range, adjust the value
        return self._snap_categoricals(x_imp, na_positions)

    def transform(self, x):
        """Impute ``x`` with the fitted imputer, post-processing the known categorical columns."""
        if len(self.dic_cat.keys()) == 0:
            return dataframe(self.imputer.transform(x), columns=x.columns)
        na_positions = self._na_positions(x, self.dic_cat.keys())
        x_imp = dataframe(self.imputer.transform(x), columns=x.columns)
        # if imputed categorical value are not in the range, adjust the value
        return self._snap_categoricals(x_imp, na_positions)

    def clear(self):
        """Drop the fitted imputer and the remembered categories."""
        self.imputer = None
        self.dic_cat = {}
def remove_outlier(df, std=3, mode="remove"):
    """Handle values whose absolute z-score exceeds ``std``; ``df`` itself is not modified.

    Parameters
    ----------
    df : DataFrame of numeric columns.
    std : z-score threshold.
    mode : ``"remove"`` drops every row holding at least one outlier;
        ``"interpolate"`` replaces outliers column by column with linearly
        interpolated values (leading gaps are back-filled). Any other value
        returns an unchanged copy.

    Returns
    -------
    DataFrame
        The cleaned copy. The number of outliers found is printed (rows for
        ``"remove"``, total cells for ``"interpolate"``).
    """
    tmp_df = df.copy()
    if mode == "remove":
        # a row is an outlier as soon as ANY of its columns is (the previous
        # `.all` only matched rows where every column was an outlier)
        outlier_mask = np.asarray(np.abs(stats.zscore(tmp_df)) > std).any(axis=1)
        print("found outlier :", outlier_mask.sum())
        tmp_df = tmp_df[~outlier_mask]
    elif mode == "interpolate":
        total_outliers = 0
        for i in tmp_df:
            outlier_mask = np.asarray(np.abs(stats.zscore(tmp_df[i])) > std)
            n_outliers = int(outlier_mask.sum())
            if n_outliers == 0:
                continue
            total_outliers += n_outliers
            # mask() blanks the outliers without chained assignment
            tmp_df[i] = tmp_df[i].mask(outlier_mask).interpolate(method='linear').bfill()
        # report the total over all columns, not just the last column's count
        print("found outlier :", total_outliers)
    return tmp_df
def convert_sparse_matrix_to_sparse_tensor(X, sorted=True):
    """Convert a SciPy sparse matrix into a ``tf.SparseTensor``.

    Parameters
    ----------
    X : sparse matrix exposing ``tocoo()``.
    sorted : when true the result is passed through ``tf.sparse.reorder``.
    """
    coo = X.tocoo()
    # (nnz, 2) array of [row, col] pairs; a plain ndarray replaces np.mat,
    # which NumPy 2.0 no longer provides
    indices = np.stack([coo.row, coo.col], axis=1).astype(np.int64)
    sparse_tensor = tf.SparseTensor(indices, coo.data, coo.shape)
    return tf.sparse.reorder(sparse_tensor) if sorted else sparse_tensor
# Seed all generators once, using seed_everything's default seed (42).
seed_everything()

In [19]:
# Project root on the mounted Google Drive; the paths built below are string-concatenated onto it.
folder_path = "/content/drive/MyDrive/Colab Notebooks/projects/Dacon/covid19_diagnostics/"

# Preprocessing

In [20]:
# Load the self-report tables; df_full also carries the `covid19` target column.
df_full = pd.read_csv(folder_path + 'open.zip (Unzipped Files)/train_data.csv')
df_test = pd.read_csv(folder_path + 'open.zip (Unzipped Files)/test_data.csv')

In [21]:
df_full.head()

Unnamed: 0,id,age,gender,respiratory_condition,fever_or_muscle_pain,covid19
0,1,24,female,0,1,0
1,2,51,male,0,0,0
2,3,22,male,0,0,0
3,4,29,female,1,0,0
4,5,23,male,0,0,0


In [22]:
df_test.head()

Unnamed: 0,id,age,gender,respiratory_condition,fever_or_muscle_pain
0,3806,48,female,1,0
1,3807,24,female,0,0
2,3808,29,male,0,0
3,3809,39,female,0,0
4,3810,34,male,0,0


In [23]:
def get_mfcc_feature(df, data_type):
    """Append time-averaged MFCC features of each recording to ``df``.

    For every value of ``df['id']`` the file ``<data_type>/<id zero-padded to 5>.wav``
    under the unzipped data folder is loaded at ``CFG['SR']`` and
    ``CFG['N_MFCC']`` coefficients are extracted and averaged.

    Parameters
    ----------
    df : DataFrame with an ``id`` column.
    data_type : sub-folder holding the audio files (e.g. ``'train'``).

    Returns
    -------
    DataFrame
        ``df`` with columns ``mfcc_1`` .. ``mfcc_<N_MFCC>`` appended.
    """
    # Data Folder path
    audio_dir = os.path.join(folder_path + 'open.zip (Unzipped Files)/', data_type)
    features = []
    for uid in tqdm(df['id']):
        path = os.path.join(audio_dir, str(uid).zfill(5)+'.wav')

        # load the wav file with librosa
        y, sr = librosa.load(path, sr=CFG['SR'])

        # extract the MFCCs with librosa
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CFG['N_MFCC'])

        # use the mean of each extracted MFCC coefficient as the feature
        features.append([np.mean(coefficient) for coefficient in mfcc])

    # append the extracted audio features to the self-report dataframe; reusing
    # df's index keeps the column-wise concat aligned row for row
    mfcc_df = pd.DataFrame(features, columns=['mfcc_'+str(x) for x in range(1,CFG['N_MFCC']+1)], index=df.index)
    df = pd.concat([df, mfcc_df], axis=1)
    # previously placed after the return statement and therefore never executed
    print('Done.')
    return df

In [24]:
def get_vggish_feature(df, data_type):
    """Append time-averaged VGGish embeddings of each recording to ``df``.

    For every value of ``df['id']`` the file ``<data_type>/<id zero-padded to 5>.wav``
    is loaded at 16 kHz, passed through the TF-Hub VGGish model, and the model
    output is average-pooled into one vector per recording.

    Parameters
    ----------
    df : DataFrame with an ``id`` column.
    data_type : sub-folder holding the audio files (e.g. ``'train'``).

    Returns
    -------
    DataFrame
        ``df`` with columns ``vggish_0`` .. ``vggish_127`` appended.
    """
    pretrained_vggish = tf_hub.load('https://tfhub.dev/google/vggish/1')
    avg_pooling = layers.GlobalAveragePooling1D()

    # Data Folder path
    audio_dir = os.path.join(folder_path + 'open.zip (Unzipped Files)/', data_type)
    features = []
    for uid in tqdm(df['id']):
        path = os.path.join(audio_dir, str(uid).zfill(5)+'.wav')

        # load the wav file with librosa
        # sampling rate : 16000 (16kHz)
        y, sr = librosa.load(path, sr=16000)

        # getting averaged vggish feature
        y = pretrained_vggish(y)
        features.append(avg_pooling(tf.expand_dims(y, 0)).numpy()[0])

    # append the extracted audio features to the self-report dataframe; reusing
    # df's index keeps the column-wise concat aligned row for row
    feature_df = pd.DataFrame(features, columns=['vggish_' + str(x) for x in range(128)], index=df.index)
    if feature_df.isna().sum().sum() > 0:
        # NOTE(review): presumably clips too short to yield any embedding frame — confirm
        print("INFO : df includes na values")
    df = pd.concat([df, feature_df], axis=1)
    # previously placed after the return statement and therefore never executed
    print('Done.')
    return df

In [26]:
# Audio feature configuration read by get_mfcc_feature (SR, N_MFCC).
# NOTE(review): 'SEED' is not read anywhere in the visible cells, and it differs
# from the seed_everything default of 42 — confirm which value is intended.
CFG = {
    'SR':16000,
    'N_MFCC':64, # number of MFCC vectors to extract
    'SEED':41
}

# Long-running step: the recorded progress bars show over an hour per split.
df_full = get_mfcc_feature(df_full, 'train')
df_test = get_mfcc_feature(df_test, 'test')

100%|██████████| 3805/3805 [1:04:44<00:00,  1.02s/it]
100%|██████████| 5732/5732 [1:32:44<00:00,  1.03it/s]


In [27]:
# Cache the extracted features. easyIO writes with pickle, so these files are
# pickles despite the .csv extension and must be read back with easyIO(op="r").
easyIO(df_full, folder_path + "dataset/train_mfcc_64.csv", "w")
easyIO(df_test, folder_path + "dataset/test_mfcc_64.csv", "w")

In [None]:
# df_full = get_mfcc_feature(df_full, 'train')
# df_test = get_mfcc_feature(df_test, 'test')

100%|██████████| 3805/3805 [1:15:31<00:00,  1.19s/it]
100%|██████████| 5732/5732 [1:59:49<00:00,  1.25s/it]


In [None]:
CFG = {
    'SR':16000,
    'N_MFCC':32, # MFCC 벡터를 추출할 개수
    'SEED':41
}

# easyIO(df_full, folder_path + "dataset/train_mfcc.csv", "w")
# easyIO(df_test, folder_path + "dataset/test_mfcc.csv", "w")

In [None]:
# df_full = get_vggish_feature(df_full, 'train')
# df_test = get_vggish_feature(df_test, 'test')

100%|██████████| 5732/5732 [1:41:12<00:00,  1.06s/it]


INFO : df includes na values


In [None]:
# easyIO(df_full, folder_path + "dataset/train_vggish.csv", "w")
# easyIO(df_test, folder_path + "dataset/test_vggish.csv", "w")

# Quick Start Session

In [None]:
# Select which cached audio feature set the rest of the notebook uses
# ("mfcc" or "vggish"); it drives both the column names and the cache file name.
# audio_feature_type = "mfcc"
audio_feature_type = "vggish"

In [None]:
# Column groups used throughout feature engineering.
target_var = "covid19"
num_vars = ["age"]
bin_vars = ["gender", "respiratory_condition", "fever_or_muscle_pain"]
cat_vars = []

# Names of the audio feature columns: vggish is 0-based (0..127), mfcc is
# 1-based (1..N_MFCC). Any other audio_feature_type leaves audio_vars undefined.
if audio_feature_type == "vggish":
    audio_vars = [audio_feature_type + "_" + str(i) for i in range(128)]
elif audio_feature_type == "mfcc":
    audio_vars = [audio_feature_type + "_" + str(i) for i in range(1,CFG['N_MFCC']+1)]

In [None]:
# Reload the cached (pickled) feature tables for the selected feature type.
# NOTE(review): for "mfcc" this resolves to train_mfcc.csv / test_mfcc.csv, while
# the 64-coefficient run above was saved as *_mfcc_64.csv — confirm the file names.
df_full = easyIO(None, folder_path + "dataset/train_" + audio_feature_type + ".csv", "r")
df_test = easyIO(None, folder_path + "dataset/test_" + audio_feature_type + ".csv", "r")

In [None]:
# Drop the identifier column, which is not used as a feature. This mutates the
# frames in place, so re-running the cell without reloading presumably fails
# because "id" is already gone.
df_full.drop("id", axis=1, inplace=True)
df_test.drop("id", axis=1, inplace=True)

In [None]:
df_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3805 entries, 0 to 3804
Columns: 133 entries, age to vggish_127
dtypes: float32(128), int64(4), object(1)
memory usage: 2.0+ MB


In [None]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5732 entries, 0 to 5731
Columns: 132 entries, age to vggish_127
dtypes: float32(128), int64(3), object(1)
memory usage: 3.0+ MB


# Feature Engineering

In [None]:
# for i in df_full:
#     print(df_full[i].value_counts(), "\n")

In [None]:
# Encode gender as a float: "male" -> 1.0, "female" -> 0.0.
gender_codes = {"male": 1.0, "female": 0.0}

# Train: any other gender value becomes NaN, and every row containing a NaN
# (in any column) is then dropped.
tmp = [gender_codes.get(raw_value, nan) for raw_value in df_full["gender"]]
df_full["gender"] = tmp
df_full = df_full.dropna().reset_index(drop=True)

# Test: rows cannot be dropped, so any other gender value is encoded as 0.5.
tmp = [gender_codes.get(raw_value, 0.5) for raw_value in df_test["gender"]]
df_test["gender"] = tmp

In [None]:
# age discretization for 8, 4 groups
# Bins are left-closed (right=False): [-inf,10), [10,20), ... for the 8-group
# version and [-inf,20), [20,40), [40,60), [60,inf) for the 4-group version.

df_full["age_disc_8"] = pd.cut(df_full["age"], bins=[-np.inf, 10, 20, 30, 40, 50, 60, 70, np.inf], right=False, labels=list(range(8))).astype("float32")
df_full["age_disc_4"] = pd.cut(df_full["age"], bins=[-np.inf, 20, 40, 60, np.inf], right=False, labels=list(range(4))).astype("float32")

df_test["age_disc_8"] = pd.cut(df_test["age"], bins=[-np.inf, 10, 20, 30, 40, 50, 60, 70, np.inf], right=False, labels=list(range(8))).astype("float32")
df_test["age_disc_4"] = pd.cut(df_test["age"], bins=[-np.inf, 20, 40, 60, np.inf], right=False, labels=list(range(4))).astype("float32")

# Replace each bin-index column by its one-hot expansion (age_disc8_0..7, age_disc4_0..3).
# The one-hot frames carry a default 0..n-1 index, so the column-wise concat
# assumes df_full / df_test are indexed the same way.
df_full = pd.concat([df_full.drop("age_disc_8", axis=1), dataframe(tf.keras.utils.to_categorical(df_full["age_disc_8"], num_classes=8, dtype='float32'), columns=["age_disc8_" + str(i) for i in range(8)])], axis=1)
df_full = pd.concat([df_full.drop("age_disc_4", axis=1), dataframe(tf.keras.utils.to_categorical(df_full["age_disc_4"], num_classes=4, dtype='float32'), columns=["age_disc4_" + str(i) for i in range(4)])], axis=1)

df_test = pd.concat([df_test.drop("age_disc_8", axis=1), dataframe(tf.keras.utils.to_categorical(df_test["age_disc_8"], num_classes=8, dtype='float32'), columns=["age_disc8_" + str(i) for i in range(8)])], axis=1)
df_test = pd.concat([df_test.drop("age_disc_4", axis=1), dataframe(tf.keras.utils.to_categorical(df_test["age_disc_4"], num_classes=4, dtype='float32'), columns=["age_disc4_" + str(i) for i in range(4)])], axis=1)

In [None]:
# covid19 symptom interaction (2:both, 1:either, 0:neither)
# Sum of the two symptom flags; assumes both columns hold 0/1 values.

df_full["symptom_class"] = (df_full["respiratory_condition"] + df_full["fever_or_muscle_pain"]).astype("float32")
df_test["symptom_class"] = (df_test["respiratory_condition"] + df_test["fever_or_muscle_pain"]).astype("float32")

# Replace the count by its one-hot expansion (symptom_class_0..2).
df_full = pd.concat([df_full.drop("symptom_class", axis=1), dataframe(tf.keras.utils.to_categorical(df_full["symptom_class"], num_classes=3, dtype='float32'), columns=["symptom_class_" + str(i) for i in range(3)])], axis=1)
df_test = pd.concat([df_test.drop("symptom_class", axis=1), dataframe(tf.keras.utils.to_categorical(df_test["symptom_class"], num_classes=3, dtype='float32'), columns=["symptom_class_" + str(i) for i in range(3)])], axis=1)

In [None]:
# vulnerable class (age>=60 and symptom_class==2)
# age_disc4_3 is the [60, inf) age bin and symptom_class_2 marks both symptoms present.

df_full["vulnerable"] = (df_full["symptom_class_2"].astype("bool") & df_full["age_disc4_3"].astype("bool")).astype("float32")
df_test["vulnerable"] = (df_test["symptom_class_2"].astype("bool") & df_test["age_disc4_3"].astype("bool")).astype("float32")

# Appends in place: re-running this cell adds "vulnerable" to bin_vars again.
bin_vars += ["vulnerable"]

In [None]:
# Every column not already listed as binary, categorical, audio or target is
# treated as numeric for now; this includes the one-hot columns created above.
num_vars = diff(df_full.columns, bin_vars + cat_vars + audio_vars + [target_var])

In [None]:
# Cast all feature columns to float32 and the target to int32; df_test has no target column.
df_full[num_vars + bin_vars + audio_vars] = df_full[num_vars + bin_vars + audio_vars].astype("float32")
df_full[target_var] = df_full[target_var].astype("int32")

df_test[num_vars + bin_vars + audio_vars] = df_test[num_vars + bin_vars + audio_vars].astype("float32")

In [None]:
num_vars

['age',
 'age_disc8_0',
 'age_disc8_1',
 'age_disc8_2',
 'age_disc8_3',
 'age_disc8_4',
 'age_disc8_5',
 'age_disc8_6',
 'age_disc8_7',
 'age_disc4_0',
 'age_disc4_1',
 'age_disc4_2',
 'age_disc4_3',
 'symptom_class_0',
 'symptom_class_1',
 'symptom_class_2']

In [None]:
bin_vars

['gender', 'respiratory_condition', 'fever_or_muscle_pain', 'vulnerable']

In [None]:
# Move every column after the first entry of num_vars (the one-hot columns)
# into the binary group, keeping only "age" as numeric. This relies on "age"
# being num_vars[0]; it also appends in place, so re-running only this cell is
# harmless (num_vars[1:] is then empty).
bin_vars += num_vars[1:]
num_vars = ["age"]

In [None]:
bin_vars

['gender',
 'respiratory_condition',
 'fever_or_muscle_pain',
 'vulnerable',
 'age_disc8_0',
 'age_disc8_1',
 'age_disc8_2',
 'age_disc8_3',
 'age_disc8_4',
 'age_disc8_5',
 'age_disc8_6',
 'age_disc8_7',
 'age_disc4_0',
 'age_disc4_1',
 'age_disc4_2',
 'age_disc4_3',
 'symptom_class_0',
 'symptom_class_1',
 'symptom_class_2']

In [None]:
cat_vars

[]

In [None]:
audio_vars

['vggish_0',
 'vggish_1',
 'vggish_2',
 'vggish_3',
 'vggish_4',
 'vggish_5',
 'vggish_6',
 'vggish_7',
 'vggish_8',
 'vggish_9',
 'vggish_10',
 'vggish_11',
 'vggish_12',
 'vggish_13',
 'vggish_14',
 'vggish_15',
 'vggish_16',
 'vggish_17',
 'vggish_18',
 'vggish_19',
 'vggish_20',
 'vggish_21',
 'vggish_22',
 'vggish_23',
 'vggish_24',
 'vggish_25',
 'vggish_26',
 'vggish_27',
 'vggish_28',
 'vggish_29',
 'vggish_30',
 'vggish_31',
 'vggish_32',
 'vggish_33',
 'vggish_34',
 'vggish_35',
 'vggish_36',
 'vggish_37',
 'vggish_38',
 'vggish_39',
 'vggish_40',
 'vggish_41',
 'vggish_42',
 'vggish_43',
 'vggish_44',
 'vggish_45',
 'vggish_46',
 'vggish_47',
 'vggish_48',
 'vggish_49',
 'vggish_50',
 'vggish_51',
 'vggish_52',
 'vggish_53',
 'vggish_54',
 'vggish_55',
 'vggish_56',
 'vggish_57',
 'vggish_58',
 'vggish_59',
 'vggish_60',
 'vggish_61',
 'vggish_62',
 'vggish_63',
 'vggish_64',
 'vggish_65',
 'vggish_66',
 'vggish_67',
 'vggish_68',
 'vggish_69',
 'vggish_70',
 'vggish_71',
 '

In [None]:
df_full.head()

Unnamed: 0,age,gender,respiratory_condition,fever_or_muscle_pain,covid19,vggish_0,vggish_1,vggish_2,vggish_3,vggish_4,vggish_5,vggish_6,vggish_7,vggish_8,vggish_9,vggish_10,vggish_11,vggish_12,vggish_13,vggish_14,vggish_15,vggish_16,vggish_17,vggish_18,vggish_19,vggish_20,vggish_21,vggish_22,vggish_23,vggish_24,vggish_25,vggish_26,vggish_27,vggish_28,vggish_29,vggish_30,vggish_31,vggish_32,vggish_33,vggish_34,vggish_35,vggish_36,vggish_37,vggish_38,vggish_39,vggish_40,vggish_41,vggish_42,vggish_43,vggish_44,...,vggish_94,vggish_95,vggish_96,vggish_97,vggish_98,vggish_99,vggish_100,vggish_101,vggish_102,vggish_103,vggish_104,vggish_105,vggish_106,vggish_107,vggish_108,vggish_109,vggish_110,vggish_111,vggish_112,vggish_113,vggish_114,vggish_115,vggish_116,vggish_117,vggish_118,vggish_119,vggish_120,vggish_121,vggish_122,vggish_123,vggish_124,vggish_125,vggish_126,vggish_127,age_disc8_0,age_disc8_1,age_disc8_2,age_disc8_3,age_disc8_4,age_disc8_5,age_disc8_6,age_disc8_7,age_disc4_0,age_disc4_1,age_disc4_2,age_disc4_3,symptom_class_0,symptom_class_1,symptom_class_2,vulnerable
0,24.0,0.0,0.0,1.0,0,0.074367,-0.304656,0.444292,-0.57865,-0.150864,-0.474929,-0.407196,-0.495602,-0.878,0.271428,-0.53989,-0.369853,-0.417934,-0.303931,-0.1107,-0.402886,-0.192091,0.479554,-0.33311,-0.19015,0.298838,-0.561066,-0.413499,-0.085026,0.340623,0.327,0.929875,-0.025833,-0.176998,-0.227616,-0.287722,-0.334935,0.053041,0.741705,0.431359,-0.11028,-0.567165,0.028924,-0.363699,-0.180546,0.569059,-0.23553,0.069347,-0.232013,0.163835,...,0.03011,0.276897,-0.025295,-0.758403,-0.413647,0.010866,-0.096699,-0.015649,0.273723,0.238724,-0.394732,-0.30355,0.195753,-0.705073,0.280866,-0.297937,0.119591,-0.420765,-0.227821,-0.245124,-0.604481,-0.158429,-0.603614,-0.202446,-0.87717,0.174617,0.055384,-0.305561,-0.340612,0.682212,0.114898,-0.233344,-0.286952,-0.310891,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,51.0,1.0,0.0,0.0,0,-0.09308,-0.323049,0.228054,-0.253214,-0.201783,-0.593411,-0.281786,-0.253731,-0.278699,-0.072951,-0.490047,-0.503245,-0.350297,-0.339051,-0.002062,-0.231233,-0.1815,0.311492,-0.262562,-0.192817,0.545472,-0.518024,-0.137717,0.070471,0.228461,0.560039,0.63305,0.086444,-0.147594,-0.133464,-0.160392,-0.177724,0.185062,0.741338,0.196328,-0.07637,-0.549249,-0.055279,-0.254215,-0.142908,0.512294,-0.259097,0.063067,-0.161039,0.093234,...,0.027641,0.237238,-0.089823,-0.546613,-0.461019,-0.062111,0.3779,0.031675,-0.014577,0.253654,-0.366609,-0.412507,0.349866,-0.466986,0.31855,-0.39975,0.162004,-0.444358,-0.055705,-0.01406,-0.34255,-0.25498,-0.435774,0.006863,-0.590584,-0.026447,0.032024,-0.168996,-0.115205,0.099536,-0.089518,-0.368148,-0.261253,-0.418885,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,22.0,1.0,0.0,0.0,0,-0.1647,-0.172472,0.426343,-0.175452,0.01017,-0.466768,-0.258091,-0.289365,-0.301333,-0.068874,-0.358127,-0.365384,-0.336614,-0.281883,-0.151231,-0.172183,0.01382,0.290865,-0.238683,-0.257594,0.262282,-0.429282,-0.177642,0.039664,0.27347,0.383673,0.420238,0.081119,0.034558,-0.084735,-0.173926,-0.308173,0.111967,0.845682,0.424526,-0.100941,-0.512767,-0.11316,-0.115093,-0.165239,0.395894,-0.116302,0.221822,-0.130291,0.172963,...,-0.02254,0.287411,-0.111606,-0.499588,-0.290404,-0.099194,0.083804,-0.012489,0.108274,0.191909,-0.301285,-0.48636,0.326836,-0.340729,0.354032,-0.345735,0.107432,-0.253056,-0.178793,0.008702,-0.317845,0.021927,-0.378676,-0.128969,-0.65766,-0.003223,0.178623,-0.175769,-0.10084,0.230077,-0.01484,-0.122037,-0.041008,-0.299935,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,29.0,0.0,1.0,0.0,0,0.08807,-0.352619,0.365449,-0.385953,-0.079941,-0.321737,-0.384355,-0.383507,-0.441021,0.317373,-0.634525,-0.614604,-0.300173,-0.422857,-0.180514,-0.445056,-0.255017,0.234226,-0.319845,-0.33016,0.205797,-0.450726,-0.244148,0.105189,0.334995,0.458771,0.730664,-0.042996,-0.139413,-0.277734,-0.203148,-0.286275,0.150699,0.536051,0.327327,-0.108489,-0.634628,0.270626,-0.255406,-0.30275,0.563379,-0.224966,-0.0915,-0.209208,0.210078,...,-0.112439,0.254594,0.0731,-0.697025,-0.288771,-0.160099,0.091154,0.062764,0.260474,0.147259,-0.253866,-0.402676,0.284678,-0.506566,0.442841,-0.41639,0.04335,-0.337494,-0.098187,-0.183057,-0.53177,-0.114612,-0.496977,-0.225663,-0.748375,0.061227,-0.070348,-0.39837,-0.204521,0.566465,0.11668,-0.291255,-0.378922,-0.411382,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,23.0,1.0,0.0,0.0,0,-0.213872,-0.087589,0.194127,0.047982,-0.028293,-0.442092,-0.229419,-0.223341,-0.548548,0.003911,-0.370923,-0.303497,-0.118739,-0.098019,-0.071205,-0.242102,0.114097,0.210606,-0.001671,-0.155339,0.113149,-0.26741,-0.238237,0.022462,0.379822,0.415068,0.302623,0.063197,0.330626,-0.10919,-0.114288,-0.164801,-0.037144,0.464741,0.239383,-0.099023,-0.302174,-0.041982,-0.129017,-0.04468,0.290609,-0.001871,0.060366,-0.173853,0.300038,...,0.330181,0.518418,-0.106621,-0.396754,-0.224137,0.075243,0.167808,0.10396,0.08984,0.297089,-0.397926,-0.261434,0.334759,-0.288069,0.315628,-0.143351,-0.089417,0.233368,0.028403,-0.030995,-0.320734,0.033128,-0.283002,0.118382,-0.593944,0.07305,-0.171575,-0.130059,-0.189703,0.150264,0.068701,-0.183185,0.023927,-0.136294,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [None]:
# Preview the first five rows of the test frame.
df_test.head(n=5)

Unnamed: 0,age,gender,respiratory_condition,fever_or_muscle_pain,vggish_0,vggish_1,vggish_2,vggish_3,vggish_4,vggish_5,vggish_6,vggish_7,vggish_8,vggish_9,vggish_10,vggish_11,vggish_12,vggish_13,vggish_14,vggish_15,vggish_16,vggish_17,vggish_18,vggish_19,vggish_20,vggish_21,vggish_22,vggish_23,vggish_24,vggish_25,vggish_26,vggish_27,vggish_28,vggish_29,vggish_30,vggish_31,vggish_32,vggish_33,vggish_34,vggish_35,vggish_36,vggish_37,vggish_38,vggish_39,vggish_40,vggish_41,vggish_42,vggish_43,vggish_44,vggish_45,...,vggish_94,vggish_95,vggish_96,vggish_97,vggish_98,vggish_99,vggish_100,vggish_101,vggish_102,vggish_103,vggish_104,vggish_105,vggish_106,vggish_107,vggish_108,vggish_109,vggish_110,vggish_111,vggish_112,vggish_113,vggish_114,vggish_115,vggish_116,vggish_117,vggish_118,vggish_119,vggish_120,vggish_121,vggish_122,vggish_123,vggish_124,vggish_125,vggish_126,vggish_127,age_disc8_0,age_disc8_1,age_disc8_2,age_disc8_3,age_disc8_4,age_disc8_5,age_disc8_6,age_disc8_7,age_disc4_0,age_disc4_1,age_disc4_2,age_disc4_3,symptom_class_0,symptom_class_1,symptom_class_2,vulnerable
0,48.0,0.0,1.0,0.0,-0.249435,-0.078036,-0.000848,0.146939,-0.045798,-0.159696,-0.1076,-0.118128,-0.229618,-0.085126,-0.235071,-0.259579,0.385549,0.086088,-0.055775,-0.087479,0.14316,0.361288,0.108084,-0.330335,-0.034617,-0.22137,-0.095338,-0.120964,0.247845,0.081271,0.015907,0.196405,0.30776,-0.038643,-0.072448,-0.123231,0.097034,0.271419,0.324465,-0.217953,-0.258507,-0.049462,-0.06454,-0.146689,0.439659,-0.045267,0.085254,-0.078951,0.541457,-0.1158,...,0.206394,0.211534,-0.039425,-0.022362,-0.067527,0.073929,0.098903,0.455746,0.028536,0.340663,-0.370271,-0.11477,0.18322,-0.307789,0.077042,-0.072964,-0.03734,0.489214,0.029793,-0.089756,-0.226815,0.011774,-0.221705,0.017363,-0.237012,-0.02108,-0.035114,-0.075009,-0.027227,-0.024461,0.132206,-0.164106,0.161094,0.090653,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,24.0,0.0,0.0,0.0,-0.365054,-0.158612,0.1049,0.102022,-0.092483,-0.316609,-0.133178,-0.212336,-0.312242,-0.101015,-0.244832,-0.253776,0.163712,-0.070663,-0.103565,-0.139592,0.103058,0.406807,0.004901,-0.35194,0.073389,-0.343818,-0.119598,-0.132139,0.204288,0.271953,0.231882,0.171854,0.29667,-0.11312,-0.056802,-0.17738,0.06866,0.481462,0.246493,-0.197,-0.326874,-0.126799,-0.089023,-0.181524,0.418555,-0.128327,0.135198,-0.115831,0.394721,-0.05956,...,0.181847,0.237135,-0.081053,-0.158384,-0.192924,-0.014655,0.253809,0.304555,0.006955,0.353097,-0.396016,-0.249069,0.200262,-0.315313,0.082205,-0.155889,0.002312,0.22383,-0.03376,-0.098017,-0.280393,0.0095,-0.320952,0.001696,-0.34381,-0.123523,-0.024368,-0.177301,-0.066536,-0.02295,0.021847,-0.32603,0.092071,-0.040008,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,29.0,1.0,0.0,0.0,-0.266173,-0.051474,-0.135684,0.300016,-0.04896,-0.072094,-0.043378,-0.105313,-0.176645,-0.053787,-0.157787,-0.235288,0.587219,0.159131,-0.093491,-0.038367,0.234486,0.322382,0.23274,-0.354743,-0.063508,-0.165087,-0.044461,-0.133523,0.226273,-0.02589,-0.119254,0.260348,0.438883,-0.014112,-0.03956,-0.134782,0.144375,0.078759,0.245981,-0.261044,-0.198053,-0.057812,-0.076063,-0.125583,0.376237,-0.013202,0.051288,-0.074818,0.592029,-0.133659,...,0.244246,0.253046,-0.031076,0.113678,-0.04972,0.099078,0.119885,0.541312,-0.055865,0.269268,-0.384827,-0.104771,0.138724,-0.228172,0.007556,-0.020042,-0.065018,0.687625,0.074215,-0.04564,-0.177862,0.054304,-0.169483,0.023931,-0.093769,-0.032683,-0.112096,-0.030724,-0.032954,-0.105866,0.207592,-0.156844,0.227877,0.175801,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,39.0,0.0,0.0,0.0,0.081596,-0.050033,0.091697,-0.195561,-0.050593,-0.308308,-0.194966,-0.248588,-0.25836,0.204068,-0.289227,-0.391996,-0.264257,-0.129608,-0.087989,-0.15957,-0.054177,0.388775,-0.044596,-0.044411,0.229416,-0.203256,-0.181956,-0.072175,0.544561,0.309737,0.507254,0.135759,0.06949,-0.173852,-0.133196,-0.101946,-0.019489,0.627335,0.28204,-0.153673,-0.309699,-0.13842,-0.322991,-0.158139,0.379478,0.034558,0.04664,-0.202109,0.188757,-0.107157,...,0.14331,0.389718,-0.006484,-0.479807,-0.220119,0.121994,-0.074072,0.04081,0.052372,0.176469,-0.292621,-0.362384,0.321456,-0.293191,0.136225,-0.115177,0.002313,-0.085969,-0.092449,-0.088201,-0.316369,-0.092986,-0.476835,0.087741,-0.567856,0.282586,0.031993,-0.020578,-0.174614,0.170744,0.138038,-0.023719,0.018652,-0.154509,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,34.0,1.0,0.0,0.0,-0.201897,-0.207558,0.470994,-0.310761,0.003336,-0.356504,-0.182039,-0.243932,-0.548647,-0.098514,-0.256118,-0.366254,-0.196931,-0.145483,-0.109275,-0.194539,0.027524,0.261817,-0.250724,-0.22427,0.219435,-0.439139,-0.232983,-0.032888,0.12531,0.373186,0.390432,0.04233,-0.040499,-0.108811,-0.235327,-0.236051,0.065434,0.777719,0.33053,-0.066482,-0.326282,-0.138608,-0.063585,-0.179068,0.56787,-0.308946,0.269137,-0.144893,0.251727,-0.050229,...,0.04737,0.288538,-0.127788,-0.439673,-0.255373,-0.06371,0.102492,0.096573,0.30008,0.282765,-0.288261,-0.193158,0.233706,-0.515813,0.223579,-0.186225,0.244542,-0.14701,-0.128894,-0.201853,-0.376645,-0.102247,-0.34113,-0.181266,-0.486127,0.070523,0.06055,-0.203526,-0.092903,0.160788,0.006748,-0.235371,-0.048243,-0.208136,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [None]:
# Total number of missing cells: training frame first, test frame second.
print(df_full.isna().sum().sum(), df_test.isna().sum().sum(), sep="\n")

0
896


In [None]:
# Names of the test columns that contain at least one missing value.
df_test.columns[df_test.isna().any()]

Index(['vggish_0', 'vggish_1', 'vggish_2', 'vggish_3', 'vggish_4', 'vggish_5', 'vggish_6', 'vggish_7', 'vggish_8', 'vggish_9',
       ...
       'vggish_118', 'vggish_119', 'vggish_120', 'vggish_121', 'vggish_122', 'vggish_123', 'vggish_124', 'vggish_125', 'vggish_126', 'vggish_127'], dtype='object', length=128)

In [None]:
# linear_model,
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

# make_pipeline(StandardScaler(), LinearRegression())

# Iterative imputer whose per-feature regressor is a standardised linear regression.
imputor = IterativeImputer(
    estimator=make_pipeline(StandardScaler(), LinearRegression()),
    max_iter=10, random_state=42
)

# imputing na values on audio features in test dataset
# Pick the feature columns by name so that fit() and transform() are guaranteed to
# see the same columns in the same order; relying on positional alignment between
# df_full (minus the target) and df_test would silently impute the wrong columns
# if the two frames were ever ordered differently.
feature_cols = df_full.columns.drop(target_var)
imputor.fit(df_full[feature_cols].to_numpy())

df_test.loc[:, feature_cols] = imputor.transform(df_test[feature_cols].to_numpy())

In [None]:
# # imputing na values on audio features in test dataset

# imputor = MissForest(max_iter=10, n_estimators=100, max_depth=4, min_samples_leaf=3, verbose=1, random_state=42)
# imputor.fit(df_full.drop(target_var, axis=1).to_numpy())

# df_test[:,:] = imputor.transform(df_test.to_numpy())

In [None]:
# Re-check the number of missing cells after imputation (training frame, then test frame).
print(df_full.isna().to_numpy().sum())
print(df_test.isna().to_numpy().sum())

0
0


In [None]:
# Remove exact duplicate rows and renumber the index.
df_full = df_full.drop_duplicates().reset_index(drop=True)
# Shuffle the rows with a fixed seed. DataFrame.sample returns a new frame, so the
# result has to be assigned back; without the assignment the shuffle is only
# displayed and df_full keeps its original order.
df_full = df_full.sample(frac=1, random_state=42).reset_index(drop=True)
df_full

Unnamed: 0,age,gender,respiratory_condition,fever_or_muscle_pain,covid19,vggish_0,vggish_1,vggish_2,vggish_3,vggish_4,vggish_5,vggish_6,vggish_7,vggish_8,vggish_9,vggish_10,vggish_11,vggish_12,vggish_13,vggish_14,vggish_15,vggish_16,vggish_17,vggish_18,vggish_19,vggish_20,vggish_21,vggish_22,vggish_23,vggish_24,vggish_25,vggish_26,vggish_27,vggish_28,vggish_29,vggish_30,vggish_31,vggish_32,vggish_33,vggish_34,vggish_35,vggish_36,vggish_37,vggish_38,vggish_39,vggish_40,vggish_41,vggish_42,vggish_43,vggish_44,...,vggish_94,vggish_95,vggish_96,vggish_97,vggish_98,vggish_99,vggish_100,vggish_101,vggish_102,vggish_103,vggish_104,vggish_105,vggish_106,vggish_107,vggish_108,vggish_109,vggish_110,vggish_111,vggish_112,vggish_113,vggish_114,vggish_115,vggish_116,vggish_117,vggish_118,vggish_119,vggish_120,vggish_121,vggish_122,vggish_123,vggish_124,vggish_125,vggish_126,vggish_127,age_disc8_0,age_disc8_1,age_disc8_2,age_disc8_3,age_disc8_4,age_disc8_5,age_disc8_6,age_disc8_7,age_disc4_0,age_disc4_1,age_disc4_2,age_disc4_3,symptom_class_0,symptom_class_1,symptom_class_2,vulnerable
0,27.0,1.0,0.0,0.0,0,0.237233,-0.073406,0.142055,-0.539462,-0.054550,-0.457476,-0.277106,-0.374962,-0.193999,0.135108,-0.685005,-0.552347,-0.382332,-0.179146,0.085340,-0.287859,-0.125214,0.629780,-0.346372,-0.098754,0.320120,-0.379453,-0.325220,-0.036740,0.455812,0.213303,0.531806,0.213881,-0.179417,-0.270110,-0.239606,-0.155629,-0.018713,0.639012,0.427209,-0.128684,-0.552983,-0.000528,-0.309293,-0.021275,0.377659,-0.134463,0.198480,-0.261788,0.014866,...,0.097233,0.221708,0.016513,-0.646305,-0.445976,0.163898,-0.216428,-0.002031,0.221547,0.330069,-0.362868,-0.170088,0.376592,-0.461542,0.362308,-0.227808,-0.194761,-0.259064,-0.121842,0.059296,-0.381911,-0.278796,-0.497189,0.052893,-0.858647,0.253602,0.176621,-0.052145,-0.088078,0.347476,0.031137,-0.037026,-0.059619,-0.262939,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,26.0,0.0,0.0,0.0,0,0.246134,-0.148192,0.244273,-0.538205,-0.337523,-0.593265,-0.311994,-0.429375,-0.532566,0.282096,-0.808344,-0.636284,-0.525758,-0.101947,0.089822,-0.137083,-0.029002,0.338129,-0.202859,-0.015267,0.312917,-0.102637,-0.262040,0.089277,0.297634,0.217412,0.472763,0.407411,-0.235386,-0.319075,0.004311,-0.177443,0.011335,0.692153,0.180619,0.222378,-0.572982,-0.159462,-0.438346,0.069036,0.315911,0.065443,0.149918,-0.115082,0.204928,...,-0.058818,0.199952,-0.109857,-0.661890,-0.535261,-0.008445,-0.197057,0.303818,-0.003170,0.225542,-0.452083,-0.424431,0.354409,-0.191299,0.330623,-0.153216,-0.113831,-0.202335,-0.015565,-0.017402,-0.653620,-0.330262,-0.408605,0.028116,-1.047375,0.246120,0.070484,-0.083004,-0.088040,0.252036,-0.007256,0.090935,-0.067936,-0.358746,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,27.0,1.0,0.0,0.0,0,-0.061355,-0.104770,0.196464,0.256682,0.008873,-0.406786,-0.341376,-0.238717,-0.307947,0.039246,-0.475410,-0.464350,-0.124527,-0.061451,-0.096154,-0.199233,0.069042,0.077799,0.014512,-0.261682,0.219502,-0.286834,-0.178618,0.106887,0.232501,0.102398,0.084113,0.075658,0.324012,-0.062375,-0.094696,-0.273715,0.143767,0.459655,0.268433,-0.109267,-0.437196,0.252963,-0.247513,-0.107284,0.291054,-0.064484,0.082671,-0.155126,0.336953,...,0.224578,0.291128,0.063938,-0.272569,-0.237854,-0.038205,0.077042,0.109512,0.081803,0.132673,-0.329243,-0.353951,0.336664,-0.255079,0.467882,-0.175494,-0.225324,0.365005,0.024953,0.074411,-0.251430,0.023905,-0.355071,-0.024995,-0.566465,-0.008698,-0.067261,-0.180592,-0.229741,0.200737,0.144086,-0.227922,-0.003502,-0.203552,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,37.0,1.0,0.0,0.0,0,-0.134299,-0.150072,0.405094,-0.510863,-0.086288,-0.418875,0.001268,-0.251929,-0.498573,0.072656,-0.201595,-0.357154,-0.282462,-0.169092,-0.099354,-0.152573,-0.107116,0.530403,-0.247494,-0.064046,0.335060,-0.395883,-0.224059,-0.209563,0.361936,0.483415,0.785008,0.148584,-0.128865,-0.204075,-0.192620,-0.088760,-0.030923,0.896775,0.330546,-0.102979,-0.244311,-0.307154,-0.237432,-0.107218,0.666889,-0.215102,0.207663,-0.194329,0.128014,...,0.059770,0.377729,-0.127123,-0.457038,-0.179448,0.072016,0.060581,0.187417,0.153641,0.205097,-0.260940,-0.244228,0.232359,-0.593012,0.237451,-0.104367,0.126822,-0.298137,-0.204943,-0.079213,-0.305575,-0.057126,-0.387948,-0.043517,-0.397912,0.062316,0.193401,-0.161362,-0.163191,0.223964,-0.052330,-0.260159,0.033872,-0.185345,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,30.0,1.0,0.0,0.0,0,-0.003999,-0.111768,0.203734,-0.315624,0.017499,-0.426326,-0.214551,-0.391016,-0.603783,0.163266,-0.374928,-0.397450,-0.407722,-0.112975,-0.052706,-0.255393,-0.050976,0.384551,-0.172650,-0.037434,0.243605,-0.092323,-0.337894,0.015879,0.259033,0.281155,0.412646,0.127118,0.044594,-0.146128,-0.161955,-0.105097,0.054556,0.359450,0.364263,-0.158441,-0.350316,-0.143393,-0.397220,0.020893,0.451105,-0.084920,0.002359,-0.116057,0.249334,...,0.063617,0.294652,-0.040682,-0.571301,-0.259738,0.110305,-0.209054,0.250293,0.106468,0.062130,-0.293578,-0.304958,0.187921,-0.595932,0.258083,-0.161055,-0.107944,-0.097554,-0.127796,-0.115905,-0.373250,-0.201690,-0.280232,-0.184360,-0.616527,0.065762,0.004534,-0.077136,-0.274606,0.431737,0.111949,-0.047648,-0.066311,-0.203175,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3769,12.0,1.0,0.0,0.0,0,-0.246177,-0.034868,-0.129657,0.322590,-0.035816,-0.070364,-0.049894,-0.113831,-0.177191,-0.038279,-0.144693,-0.224197,0.585655,0.157981,-0.093875,-0.038411,0.241772,0.322176,0.244930,-0.338993,-0.056917,-0.147660,-0.060325,-0.120323,0.223656,-0.026892,-0.111760,0.266951,0.456231,-0.000417,-0.045390,-0.142784,0.138059,0.080557,0.251560,-0.261386,-0.196583,-0.055117,-0.084223,-0.119749,0.373634,0.003988,0.048978,-0.088418,0.591958,...,0.249724,0.246064,-0.026176,0.104687,-0.049091,0.103767,0.124341,0.511819,-0.057119,0.265238,-0.373399,-0.099381,0.150342,-0.215783,0.000922,-0.016490,-0.073646,0.701981,0.070872,-0.037070,-0.181049,0.067465,-0.207613,0.023224,-0.092705,-0.034168,-0.087253,-0.025519,-0.038996,-0.108309,0.217582,-0.135895,0.222678,0.166042,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3770,40.0,1.0,0.0,0.0,0,-0.158730,-0.199019,0.178682,-0.004860,-0.096780,-0.397793,-0.399140,-0.317382,-0.261474,-0.093012,-0.539748,-0.491350,-0.229894,-0.181734,-0.172093,-0.309356,0.008652,0.062677,-0.019963,-0.379514,0.162992,-0.463177,-0.180228,0.160992,0.264425,0.313263,0.199507,-0.000034,0.079976,-0.126106,-0.204884,-0.226403,-0.036004,0.847588,0.416150,-0.128587,-0.468832,0.187249,-0.124072,-0.239662,0.444036,-0.207752,0.118965,-0.136653,0.361643,...,0.237841,0.110573,0.063766,-0.413122,-0.240286,-0.102974,0.177032,-0.009918,0.065011,0.486265,-0.365485,-0.320821,0.291606,-0.350020,0.355880,-0.246199,-0.097304,0.109610,-0.077862,-0.122714,-0.315635,-0.025315,-0.448783,0.000860,-0.650030,0.067289,-0.048413,-0.276386,-0.096411,0.200029,0.124908,-0.157733,-0.173501,-0.230978,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3771,52.0,1.0,0.0,0.0,0,-0.218338,-0.279103,0.525833,-0.108297,-0.185997,-0.617595,-0.306278,-0.382984,-0.547010,0.008517,-0.510100,-0.514966,-0.360227,0.016236,-0.086525,-0.028767,0.089325,0.223187,-0.172069,-0.165562,0.263279,-0.417298,-0.165531,0.017997,0.256129,0.302277,0.371870,0.194196,0.004732,-0.241855,-0.086430,-0.312172,0.114562,1.078538,0.250224,-0.017264,-0.434224,-0.053257,-0.174554,0.029593,0.315223,-0.163284,0.309744,-0.101035,0.075791,...,0.042481,0.285718,-0.229131,-0.468140,-0.270336,-0.151340,-0.048428,0.090265,0.073199,0.167030,-0.454323,-0.515871,0.255998,-0.334426,0.503163,-0.247414,-0.052052,-0.059043,-0.129739,0.022377,-0.360486,-0.064188,-0.370561,-0.008928,-0.680877,-0.029006,0.163282,-0.156853,-0.127626,0.225402,-0.064305,-0.306678,0.023402,-0.257915,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3772,62.0,0.0,0.0,1.0,0,0.051889,-0.291487,0.405999,-0.360844,-0.227700,-0.609881,-0.569284,-0.463257,-0.558362,0.112397,-0.745912,-0.751715,-0.424379,-0.127704,-0.187913,-0.384001,-0.098747,0.129411,-0.093938,-0.226741,0.305927,-0.403796,-0.308442,0.151738,0.250305,0.173764,0.363731,-0.176612,-0.156437,-0.152733,-0.283506,-0.171910,0.050520,0.606909,0.503767,-0.050151,-0.605851,0.566846,-0.262849,-0.235550,0.624544,-0.238147,0.128646,-0.127235,0.462242,...,0.056546,0.076265,0.064280,-0.595994,-0.325737,-0.139608,0.055215,0.007831,0.270474,0.381247,-0.347967,-0.388788,0.231242,-0.356458,0.468366,-0.242079,-0.061456,0.057655,-0.316158,-0.149110,-0.493682,-0.235335,-0.522741,-0.275454,-0.837066,-0.012493,-0.046785,-0.308019,-0.093133,0.641744,0.198202,-0.256241,-0.471158,-0.419664,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


# Define utility functions

In [None]:
# optuna function
def optuna_objective_function(trial: Trial, fold, train_x, train_y, train_groups, val_x, val_y, val_groups, categoIdx,
                              model_name, output_container, ntrees=1000, eta=1e-2):
    """Optuna objective: fit one candidate model and return its validation F1 score.

    Parameters
    ----------
    trial : Trial
        Optuna trial used to sample the hyper-parameters.
    fold : int
        Fold index; used as the ``random_state`` of the model.
    train_x, train_y
        Training features and labels passed to ``model.fit``.
    train_groups
        Passed as ``group`` to the ``XGB_GBT`` ranker only.
    val_x, val_y
        Validation features and labels. They feed LightGBM early stopping and
        the F1 score; ``val_y`` must provide ``tolist()``.
    val_groups
        Accepted for signature compatibility; not used in the body.
    categoIdx
        Categorical feature specification, forwarded as ``categorical_feature``
        (LightGBM) or ``cat_features`` (CatBoost).
    model_name : str
        One of ``"LGB_GOSS"``, ``"XGB_GBT"``, ``"CAT_GBM"``, ``"CAT_ORD"``.
    output_container : dict
        Dict with the keys ``"model"``, ``"pred"`` and ``"score"``. It is updated
        in place whenever this trial scores strictly higher than the stored score.
    ntrees : int
        LightGBM uses it as ``n_estimators`` and ``int(ntrees * 0.2)`` as the
        early-stopping patience; the other models train ``int(ntrees * 0.2)`` trees.
    eta : float
        Learning rate for LightGBM; the other models use ``eta / 10``.

    Returns
    -------
    float
        F1 score of the thresholded positive-class probabilities on the
        validation data.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.

    Notes
    -----
    The decision threshold is read from the module-level ``threshold`` variable,
    which therefore has to be defined before the study is run.
    """
    # CatBoost variants share one search space and differ only in the boosting scheme.
    catboost_boosting_types = {"CAT_GBM": "Plain", "CAT_ORD": "Ordered"}

    if model_name == "LGB_GOSS":
        # NOTE(review): "subsample" is tuned while subsample_freq keeps its default;
        # LightGBM documents that bagging needs a non-zero frequency, so this
        # parameter may have no effect here - confirm.
        tuning_params = {
            "num_leaves": trial.suggest_categorical("num_leaves", [pow(2, i) - 1 for i in [4, 5, 6, 7, 8]]),
            "subsample": trial.suggest_float("subsample", 0.5, 0.8, step=0.05),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 0.8, step=0.05),
            "reg_lambda": trial.suggest_float("reg_lambda", 0.001, 10.0, log=True),
            "min_child_weight": trial.suggest_float("min_child_weight", 0.001, 10.0, log=True),
            "min_child_samples": trial.suggest_int("min_child_samples", 3, 99, log=True),
            "scale_pos_weight": trial.suggest_float("scale_pos_weight", 0.2, 5.0, log=True),
        }
        # Stop when the first metric (AUC) has not improved by min_delta for 20% of the tree budget.
        cb_list = [
            lgb.early_stopping(stopping_rounds=int(ntrees * 0.2), first_metric_only=True, verbose=False, min_delta=0.001),
        ]

        model = lgb.LGBMClassifier(boosting_type="goss", objective="binary",
                                   n_estimators=ntrees, learning_rate=eta,
                                   random_state=fold, device_type="gpu",
                                   verbose=-1, **tuning_params)
        model.fit(train_x, train_y, categorical_feature=categoIdx,
                  eval_set=(val_x,val_y), eval_metric="auc", callbacks=cb_list)
    elif model_name == "XGB_GBT":
        tuning_params = {
            "max_depth": trial.suggest_int("max_depth", 4, 8, step=1),
            "subsample": trial.suggest_float("subsample", 0.5, 0.8, step=0.05),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 0.8, step=0.05),
            "reg_lambda": trial.suggest_float("reg_lambda", 0.01, 10.0, step=0.01),
            "min_child_weight": trial.suggest_float("min_child_weight", 0.01, 10.0, step=0.01),
            "gamma": trial.suggest_float("gamma", 0.01, 10.0, step=0.01),
            "max_delta_step":  trial.suggest_float("max_delta_step", 0.01, 10.0, step=0.01)
        }
        # NOTE(review): the scoring code below calls model.predict_proba(); confirm that
        # XGBRanker provides it, otherwise this branch cannot be scored as written.
        model = xgb.XGBRanker(booster="gbtree", objective="rank:ndcg",
                            tree_method="gpu_hist", sampling_method="gradient_based",
                            n_estimators=int(ntrees * 0.2),
                            learning_rate=eta / 10,
                            random_state=fold, verbosity=0,
                            **tuning_params)
        model.fit(train_x, train_y, group=train_groups, verbose=False)
    elif model_name in catboost_boosting_types:
        tuning_params = {
            # "n_estimators" : trial.suggest_int("n_estimators", 500, 5000, step=500),
            # "learning_rate" : trial.suggest_categorical("learning_rate", [1e-3, 5e-3, 1e-2, 5e-2, 1e-1]),
            "max_depth": trial.suggest_int("max_depth", 4, 8, step=1),
            "bagging_temperature": trial.suggest_float("bagging_temperature", 0.05, 0.95, step=0.05),
            # "rsm": trial.suggest_float("rsm", 0.5, 0.9, step=0.05),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.01, 10.0, step=0.01),
            "random_strength": trial.suggest_float("random_strength", 0.01, 1.5, step=0.01),
            "min_child_samples": trial.suggest_int("min_child_samples", 3, 99, step=2),
            "scale_pos_weight": trial.suggest_float("scale_pos_weight", 1.0, 3.0, step=0.01)
        }

        model = cat.CatBoostClassifier(boosting_type=catboost_boosting_types[model_name],
                                    loss_function="MultiClass", task_type="GPU",
                                    n_estimators=int(ntrees * 0.2),
                                    learning_rate=eta / 10,
                                    one_hot_max_size=3, leaf_estimation_method="Gradient",
                                    # leaf_estimation_iterations=5,
                                    # max_ctr_complexity=2,
                                    logging_level="Silent", random_state=fold, thread_count=cpu_count(),
                                    **tuning_params)
        model.fit(train_x, train_y, cat_features=categoIdx)
    else:
        # Fail loudly: returning a dummy score would let the study run to completion
        # without ever producing a model.
        raise ValueError(f"unknown model_name: {model_name!r}")

    pred = model.predict_proba(val_x)
    # f1 score on the positive-class probability (column 1) cut at the global threshold
    hard_labels = [1.0 if i > threshold else 0.0 for i in pred[:,1]]
    optuna_score = metrics.f1_score(val_y.tolist(), hard_labels)

    # Remember the best model of the study together with its validation predictions.
    if optuna_score > output_container["score"]:
        output_container["model"] = model
        output_container["pred"] = pred
        output_container["score"] = optuna_score

    return optuna_score

# Training

In [None]:
# Name of this experiment; it is also used as the output sub-folder and as the wandb group.
architecture_name = "vggish_lgb_goss_thresholdOri_tuningPosWeight_try1"
createFolder(f"{folder_path}architecture/{architecture_name}")

In [None]:
def do_fold_training(fold, train_idx, val_idx, sample_weight=None, finetuning=False):
    """Tune and train the LightGBM GOSS model for one cross-validation fold.

    Runs an Optuna study on the fold, keeps the best model, writes its
    validation probabilities into ``val_pred``, adds its test probabilities
    (divided by ``n_folds``) to ``test_pred`` and logs the fold metrics to
    Weights & Biases.

    Parameters
    ----------
    fold : int
        Fold index, used for logging and as the model seed.
    train_idx, val_idx
        Positional row indices into ``df_full`` for the training / validation split.
    sample_weight, finetuning
        Currently unused; kept so existing calls keep working.

    Raises
    ------
    RuntimeError
        If the study ends without a single completed trial (for example because
        the timeout expired), so there is no model to use for this fold.

    Notes
    -----
    Relies on module-level state: ``df_full``, ``df_test``, the feature lists
    ``num_vars`` / ``bin_vars`` / ``audio_vars``, ``target_var``, ``ntrees``,
    ``eta``, ``n_folds``, ``kfolds_spliter``, ``architecture_name`` and the
    result collectors ``model_list``, ``params_list``, ``fold_metric``,
    ``val_pred``, ``test_pred``.
    """
    tmp_time = time()
    print("\n===== Fold", fold, "=====\n")
    global val_pred, test_pred

    wandb.init(
        project="dacon_covid19_classification",
        group=architecture_name,
        name="fold_" + str(fold)
    )
    try:
        wandb.config.step = 0

        feature_cols = num_vars + bin_vars + audio_vars
        train_x = df_full.iloc[train_idx][feature_cols].copy()
        train_y = df_full.iloc[train_idx][target_var].copy()

        val_x = df_full.iloc[val_idx][feature_cols].copy()
        val_y = df_full.iloc[val_idx][target_var].copy()

        output_container = {"model": None, "pred": None, "score": -np.inf}
        # Six hours in total, split evenly over the folds.
        optuna_timeout = int(6 * 3600 / kfolds_spliter.get_n_splits())
        optuna_study = create_study(direction='maximize', sampler=TPESampler())
        optuna_study.optimize(
            lambda trial: optuna_objective_function(
                trial, fold, train_x, train_y, None, val_x, val_y, None, categoIdx=None, model_name="LGB_GOSS", output_container=output_container, ntrees=ntrees, eta=eta
            ),
            n_jobs=1, n_trials=300, timeout=optuna_timeout
        )

        # Check before touching the shared result lists, so a failed fold does not
        # leave a None entry behind in model_list.
        if output_container["model"] is None:
            raise RuntimeError("fold " + str(fold) + ": the Optuna study finished without a completed trial")

        model_list.append(output_container["model"])
        params_list.append(optuna_study.best_params)
        print("fold", fold, "best params :", params_list[-1])
        val_pred[val_idx] = output_container["pred"]
        fold_metric.append(output_container["score"])
        print("fold", fold, "score :", fold_metric[-1])

        test_x = df_test[feature_cols].copy()
        # Each fold contributes 1 / n_folds of the final test probabilities.
        test_pred += model_list[-1].predict_proba(test_x) / n_folds

        wandb.log({"fold": fold,
                   "logloss": metrics.log_loss(val_y.tolist(), val_pred[val_idx, 1]),
                   "auc": metrics.roc_auc_score(val_y.tolist(), val_pred[val_idx, 1]),
                   "f1_score": fold_metric[-1]})
    except BaseException:
        # Close the wandb run (marked as failed) even when the fold is aborted or
        # interrupted, then let the original error propagate.
        wandb.finish(exit_code=1)
        raise
    wandb.finish()

    print("fold", fold, "time to training :", round(time() - tmp_time, 3))

In [None]:
# learning parameter setting
# The range of converged trees is about 2000 ~ 4000
ntrees, eta = 5000, 5e-3
# alternative setting kept for reference: ntrees = 1000, eta = 1e-2

# Decision threshold: mean of the target column rounded to 5 decimals
# (a fixed 0.5 was the alternative).
threshold = round(df_full[target_var].mean(), 5)

In [None]:
# Collectors filled by do_fold_training: best model, best params and score per fold.
model_list, params_list, fold_metric = [], [], []

# Two-column probability buffers: out-of-fold predictions and averaged test predictions.
val_pred = np.zeros((len(df_full), 2))
test_pred = np.zeros((len(df_test), 2))
seed_everything()

n_folds = 5
kfolds_spliter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

start_time_training = time()
# fold training
for fold, (train_idx, val_idx) in enumerate(kfolds_spliter.split(df_full, y=df_full[target_var])):
    start_mem = memory_usage()
    do_fold_training(fold, train_idx, val_idx, sample_weight=None, finetuning=False)
    gc.collect()
    end_mem = memory_usage()
    # difference in reported memory usage across the fold, after garbage collection
    print("@Memory leaked :", end_mem - start_mem, "\n")
end_time_training = time()

[debug] memory usage:    746.559 MB

===== Fold 0 =====



[34m[1mwandb[0m: Currently logged in as: [33mfrony[0m. Use [1m`wandb login --relogin`[0m to force relogin


[32m[I 2022-06-16 04:21:35,078][0m A new study created in memory with name: no-name-2c51b461-fff4-4f95-aa28-9ec6ab3251ad[0m
[32m[I 2022-06-16 04:21:50,690][0m Trial 0 finished with value: 0.1794871794871795 and parameters: {'num_leaves': 15, 'subsample': 0.8, 'colsample_bytree': 0.65, 'reg_lambda': 0.02589094297065214, 'min_child_weight': 0.6880660203135626, 'min_child_samples': 17, 'scale_pos_weight': 1.107691836763628}. Best is trial 0 with value: 0.1794871794871795.[0m
[32m[I 2022-06-16 04:22:11,505][0m Trial 1 finished with value: 0.1809045226130653 and parameters: {'num_leaves': 127, 'subsample': 0.75, 'colsample_bytree': 0.5, 'reg_lambda': 0.02386718429551674, 'min_child_weight': 1.1668092599252171, 'min_child_samples': 18, 'scale_pos_weight': 0.5432022857929192}. Best is trial 1 with value: 0.1809045226130653.[0m
[32m[I 2022-06-16 04:22:25,046][0m Trial 2 finished with value: 0.1884498480243161 and parameters: {'num_leaves': 255, 'subsample': 0.75, 'colsample_bytree':

In [None]:
# Print every fold score and accumulate score / n_folds into the running average.
avg_score = 0
for idx, value in enumerate(fold_metric, start=0):
    print("fold", idx, "score :", value)
    avg_score = avg_score + value / n_folds
print("average score :", avg_score)

# Submission

In [None]:
# Store the validation and test probability matrices in this architecture's folder.
dataframe(val_pred).to_csv(f"{folder_path}architecture/{architecture_name}/valPred_{architecture_name}.csv", index=False)
dataframe(test_pred).to_csv(f"{folder_path}architecture/{architecture_name}/testPred_{architecture_name}.csv", index=False)

In [None]:
# Fill the sample submission with hard labels: 1 where the averaged positive-class
# probability exceeds the threshold, 0 otherwise.
submission = pd.read_csv(f"{folder_path}open.zip (Unzipped Files)/sample_submission.csv")
submission['covid19'] = np.where(test_pred[:, 1] > threshold, 1, 0)
submission.to_csv(f"{folder_path}architecture/{architecture_name}/submission_{architecture_name}.csv", index=False)