## Setup

In [4]:
# Seed value exported to the PYTHONHASHSEED environment variable below.
GLOBAL_SEED = 42

import os
# NOTE(review): CPython reads PYTHONHASHSEED at interpreter start-up, so this
# assignment presumably only affects processes spawned from this kernel, not
# the hashing of the already-running interpreter — confirm if that matters.
os.environ['PYTHONHASHSEED'] = str(GLOBAL_SEED)
import glob
import sys
import shutil
import pickle
import random as rnd
from tqdm import tqdm
import time

import numpy as np
from numpy import random as np_rnd
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split as tts

from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn import linear_model as lm
from sklearn import metrics

import xgboost as xgb

import librosa

import torch
from torch import nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from transformers import get_polynomial_decay_schedule_with_warmup
# Select the CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
def seed_everything(seed=42):
    """Seed every random number generator this notebook may rely on.

    Always sets ``PYTHONHASHSEED`` and seeds Python's ``random`` module and
    NumPy's global generator. TensorFlow (``tf_rnd``), CuPy (``cupy``) and
    PyTorch (``torch``) are seeded only when those names exist in the module
    namespace.

    Parameters
    ----------
    seed : int, default 42
        Seed value handed to every generator.

    Notes
    -----
    A back-end whose name is not defined (``NameError``) is skipped silently,
    because it is optional. Any other failure while seeding a back-end is
    reported with ``warnings.warn`` instead of being swallowed, so a run that
    is not actually reproducible does not go unnoticed. ``KeyboardInterrupt``
    and ``SystemExit`` are never intercepted.
    """
    import warnings  # local import keeps the function self-contained

    def _seed_optional(label, seeder):
        # Run one optional seeder without letting its failure abort the others.
        try:
            seeder()
        except NameError:
            # The library was never imported into this namespace: nothing to seed.
            pass
        except Exception as exc:
            warnings.warn(f"seed_everything: could not seed {label}: {exc!r}")

    def _seed_torch():
        torch.manual_seed(seed)
        # Covers every visible CUDA device, not only the current one.
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        # Auto-tuning may pick different kernels between runs; keep it off so
        # the deterministic flag above is not undermined.
        torch.backends.cudnn.benchmark = False

    os.environ['PYTHONHASHSEED'] = str(seed)
    # python random
    rnd.seed(seed)
    # numpy random
    np_rnd.seed(seed)
    # tf random
    _seed_optional("tensorflow", lambda: tf_rnd.set_seed(seed))
    # RAPIDS random
    _seed_optional("cupy", lambda: cupy.random.seed(seed))
    # pytorch random
    _seed_optional("torch", _seed_torch)

def pickleIO(obj, src, op="w"):
    """Write ``obj`` to, or read an object from, the pickle file ``src``.

    Parameters
    ----------
    obj : object
        Object to serialise when ``op == "w"``; ignored when reading.
    src : str
        Path of the pickle file.
    op : str, default "w"
        ``"w"`` dumps ``obj`` to ``src``; ``"r"`` loads and returns the
        object stored in ``src``.

    Returns
    -------
    object or None
        The unpickled object for ``"r"``, ``None`` for ``"w"``. For any other
        ``op`` the text "unknown operation" is printed and ``obj`` is
        returned unchanged.

    Notes
    -----
    Unpickling can execute arbitrary code; only read files you trust.
    """
    if op not in ("w", "r"):
        print("unknown operation")
        return obj

    mode = op + "b"
    if op == "r":
        with open(src, mode) as handle:
            return pickle.load(handle)

    with open(src, mode) as handle:
        pickle.dump(obj, handle)
    
def findIdx(data_x, col_names):
    """Return the positions of the items of ``data_x`` found in ``col_names``.

    ``data_x`` is iterated once; every item that satisfies
    ``item in col_names`` contributes its zero-based position, in order.
    """
    positions = []
    for position, item in enumerate(data_x):
        if item in col_names:
            positions.append(int(position))
    return positions

def diff(first, second):
    """Return the items of ``first`` that do not occur in ``second``.

    The order and duplicates of ``first`` are preserved. ``second`` is turned
    into a set for the membership test, so its items must be hashable.
    """
    excluded = set(second)
    kept = []
    for item in first:
        if item not in excluded:
            kept.append(item)
    return kept
    
def createFolder(directory):
    """Create ``directory`` (including missing parents) if it does not exist.

    Parameters
    ----------
    directory : str or os.PathLike
        Path of the directory to create.

    Notes
    -----
    Failures are reported on stdout rather than raised, so callers never see
    an ``OSError``. A path that already exists as a regular file is reported
    as an error, because no directory can be created there.
    """
    try:
        # exist_ok=True removes the window between a separate existence check
        # and the creation in which another process could create the path.
        os.makedirs(directory, exist_ok=True)
    except OSError:
        print(f'Error: Creating directory. {directory}')
        
def week_of_month(dt):
    """Return the 1-based week of the month that contains ``dt``.

    The day of the month is shifted by ``(1 + weekday of the 1st) % 7`` and
    the result is divided by seven, rounding up. For ``datetime`` objects,
    whose ``weekday()`` is Monday=0 … Sunday=6, this counts weeks that start
    on Sunday.
    """
    # Offset contributed by the weekday on which the month starts.
    leading_days = (dt.replace(day=1).weekday() + 1) % 7
    shifted_day = dt.day + leading_days
    # Integer ceiling of shifted_day / 7.
    return int((shifted_day + 6) // 7)

def get_season(dt):
    """Map a month number to a season index.

    ``dt`` is converted with ``int``. Months 3-5 give 0, 6-8 give 1,
    9-11 give 2, and every other value gives 3.
    """
    season_by_month = {
        3: 0, 4: 0, 5: 0,
        6: 1, 7: 1, 8: 1,
        9: 2, 10: 2, 11: 2,
    }
    # Anything outside March-November falls into the last bucket.
    return season_by_month.get(int(dt), 3)

In [6]:
class CFG:
    """Static configuration values shared across the notebook."""

    debug = False
    # Data directory with a trailing separator. Built with os.path.join so it
    # is ".\\data\\" on Windows (the original literal) and "./data/" on
    # POSIX, where a backslash literal would not name a directory.
    data_path = os.path.join(".", "data", "")

    # presumably the number of MFCC / chroma features extracted per clip —
    # TODO confirm against the feature-extraction code
    n_mfcc = 32
    n_chroma = 16

    epochs = 50
    batch_size = 32
    # 10% of the epochs
    warmup = int(epochs * 0.1)
    # 20% of the epochs
    early_stopping_rounds = int(epochs * 0.2)

    # presumably the optimiser learning rate — TODO confirm at the call site
    eta = 5e-4
    weight_decay = 1e-4

## Inference

In [37]:
# Root folder of the saved architectures.
architecture_root_path = "./architectures/"
# Model name -> private-set scores. Every value starts as None and is replaced
# by a score dict in the scoring loop.
architecture_name_list = dict.fromkeys(
    [
        "lg_v1",
        "elasticnet_v1",
        "randomforest_v1",
        "knn_v1",
        "xgboost_v1",
        "dnn_v1",
        "ensemble_v1",
    ]
)
# The folder of a given model is architecture_root_path + <model name> + "/".

In [38]:
# Load the private split and binarise its label: "abdominal" becomes 1,
# every other value becomes 0.
df_private = pickleIO(None, "./dataset/df_private.pkl", "r")
df_private["type"] = df_private["type"].map(
    lambda label: int(label == "abdominal")
)
# int32 copy of the encoded label, used as ground truth when scoring.
df_private_y = df_private["type"].astype("int32")

In [39]:
# Score each architecture's stored submission against the private labels and
# keep the resulting metrics under the model's name.
for model_name in architecture_name_list:
    architecture_path = f"{architecture_root_path}{model_name}/"
    output = pickleIO(None, architecture_path + "./submission.pkl", "r")
    y_pred_prob, y_pred = output["prob"], output["pred"]
    # Column 1 of the probability matrix is used for the probability metrics.
    positive_prob = y_pred_prob[:, 1]
    score_dic = dict(
        logloss=metrics.log_loss(df_private_y, positive_prob),
        roc_auc=metrics.roc_auc_score(df_private_y, positive_prob),
        accuracy=metrics.accuracy_score(df_private_y, y_pred),
        f1=metrics.f1_score(df_private_y, y_pred, average="macro"),
    )
    print(f"Model {model_name} Private Score !")
    display(score_dic)
    # Replacing the value of an existing key is safe while iterating the dict.
    architecture_name_list[model_name] = score_dic

Model lg_v1 Private Score !


{'logloss': 0.1759513420781306,
 'roc_auc': 0.97845287641206,
 'accuracy': 0.9335839598997494,
 'f1': 0.9238740125166718}

Model elasticnet_v1 Private Score !


{'logloss': 0.17836701588416268,
 'roc_auc': 0.9781305291509373,
 'accuracy': 0.9385964912280702,
 'f1': 0.9297629846510334}

Model randomforest_v1 Private Score !


{'logloss': 0.25222669378092255,
 'roc_auc': 0.9779801004290801,
 'accuracy': 0.9285714285714286,
 'f1': 0.9181286549707602}

Model knn_v1 Private Score !


{'logloss': 0.30105869748166014,
 'roc_auc': 0.9765080479366193,
 'accuracy': 0.924812030075188,
 'f1': 0.9126277372262774}

Model xgboost_v1 Private Score !


{'logloss': 0.10904101810282689,
 'roc_auc': 0.9903080923489086,
 'accuracy': 0.9611528822055138,
 'f1': 0.955654189440398}

Model dnn_v1 Private Score !


{'logloss': 0.15466274567646401,
 'roc_auc': 0.9864184353980272,
 'accuracy': 0.9536340852130326,
 'f1': 0.946409416377018}

Model ensemble_v1 Private Score !


{'logloss': 0.16744313197341587,
 'roc_auc': 0.9877866204396817,
 'accuracy': 0.9573934837092731,
 'f1': 0.9511142181734318}

In [40]:
# Show the collected scores as a table: one row per model, one column per metric.
pd.DataFrame(architecture_name_list).T

Unnamed: 0,logloss,roc_auc,accuracy,f1
lg_v1,0.175951,0.978453,0.933584,0.923874
elasticnet_v1,0.178367,0.978131,0.938596,0.929763
randomforest_v1,0.252227,0.97798,0.928571,0.918129
knn_v1,0.301059,0.976508,0.924812,0.912628
xgboost_v1,0.109041,0.990308,0.961153,0.955654
dnn_v1,0.154663,0.986418,0.953634,0.946409
ensemble_v1,0.167443,0.987787,0.957393,0.951114


In [42]:
# Write the model-by-metric score table to ./private_score.csv.
pd.DataFrame(architecture_name_list).T.to_csv("./private_score.csv")