## Setup

In [1]:
# Seed value; in the visible cells it is only used for PYTHONHASHSEED below.
GLOBAL_SEED = 42

import os
# NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so assigning it here does
# not change string hashing in the already-running kernel; it is only inherited by
# child processes started afterwards.
os.environ['PYTHONHASHSEED'] = str(GLOBAL_SEED)
import glob
import sys
import shutil
import pickle
import random as rnd
from tqdm import tqdm
import time

import numpy as np
from numpy import random as np_rnd
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split as tts

from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn import linear_model as lm
from sklearn import metrics

import xgboost as xgb

import librosa

import torch
from torch import nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from transformers import get_polynomial_decay_schedule_with_warmup
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
def seed_everything(seed=42):
    """Seed the random number generators this notebook may rely on.

    ``PYTHONHASHSEED`` (environment variable), Python's ``random`` module and
    NumPy's global generator are always seeded. TensorFlow (``tf_rnd``), CuPy
    (``cupy``) and PyTorch (``torch``) are seeded on a best-effort basis: when the
    name is not available in the notebook, or seeding fails, that library is
    silently skipped.

    Args:
        seed: Integer seed applied to every generator.

    Returns:
        None.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # python random
    rnd.seed(seed)
    # numpy random
    np_rnd.seed(seed)

    # The optional blocks below catch ``Exception`` rather than using a bare
    # ``except:`` so that KeyboardInterrupt / SystemExit are never swallowed.

    # tf random (typically skipped with a NameError when TensorFlow is not imported)
    try:
        tf_rnd.set_seed(seed)
    except Exception:
        pass
    # RAPIDS random (typically skipped with a NameError when CuPy is not imported)
    try:
        cupy.random.seed(seed)
    except Exception:
        pass
    # pytorch random
    try:
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
    except Exception:
        pass

def pickleIO(obj, src, op="w"):
    """Write an object to, or read an object from, a pickle file.

    Args:
        obj: Object to serialise when ``op == "w"``; not used when reading.
        src: Path of the pickle file.
        op: ``"w"`` to write ``obj`` to ``src``, ``"r"`` to load from ``src``.

    Returns:
        The unpickled object for ``"r"`` and ``None`` for ``"w"``. Any other
        ``op`` prints "unknown operation" and hands ``obj`` back unchanged.
    """
    if op not in ("w", "r"):
        print("unknown operation")
        return obj

    binary_mode = op + "b"
    if op == "r":
        # NOTE: unpickling executes arbitrary code; only load files you trust.
        with open(src, binary_mode) as handle:
            return pickle.load(handle)

    with open(src, binary_mode) as handle:
        pickle.dump(obj, handle)
    
def findIdx(data_x, col_names):
    """Return the positions of the items of ``data_x`` that are contained in ``col_names``.

    Positions are reported in iteration order as plain ``int`` values.
    """
    positions = []
    for position, item in enumerate(data_x):
        if item in col_names:
            positions.append(int(position))
    return positions

def diff(first, second):
    """Return the items of ``first`` that are absent from ``second``.

    The order of ``first`` is kept and duplicates in ``first`` are preserved.
    """
    excluded = set(second)
    remaining = []
    for item in first:
        if item not in excluded:
            remaining.append(item)
    return remaining
    
def createFolder(directory):
    """Create ``directory`` (including missing parents) when it does not exist yet.

    Failures are reported on stdout instead of being raised, so callers continue
    running even when the directory could not be created.

    Args:
        directory: Path of the folder to create.

    Returns:
        None.
    """
    try:
        # exist_ok=True removes the window between "does it exist?" and "create it"
        # in which another process could create the folder and make makedirs fail.
        os.makedirs(directory, exist_ok=True)
    except OSError:
        # str() keeps the message working for path-like arguments as well.
        print('Error: Creating directory. ' + str(directory))
        
def week_of_month(dt):
    """Return the 1-based week of the month that ``dt`` falls in.

    Weeks run Sunday to Saturday (given ``weekday()`` follows the datetime
    convention Monday=0 ... Sunday=6): the days before the month's first Sunday
    form week 1 unless the month itself starts on a Sunday.
    """
    # Days between the preceding Sunday and the first day of the month.
    leading_offset = (dt.replace(day=1).weekday() + 1) % 7
    week_position = (dt.day + leading_offset) / 7.0
    return int(np.ceil(week_position))

def get_season(dt):
    """Map a month number to a season index.

    ``dt`` is converted with ``int`` first. Months 3-5 give 0, 6-8 give 1,
    9-11 give 2 and every other value gives 3 (presumably spring, summer,
    autumn and winter respectively).
    """
    season_by_month = {
        3: 0, 4: 0, 5: 0,
        6: 1, 7: 1, 8: 1,
        9: 2, 10: 2, 11: 2,
    }
    return season_by_month.get(int(dt), 3)

In [3]:
class CFG:
    """Configuration constants grouped as class attributes."""
    # Debug switch; not referenced in the visible cells.
    debug = False
    # Data directory written with Windows-style separators; the visible cells
    # read from "./dataset/" and "./architectures/" instead.
    data_path = ".\\data\\"
    
    # Presumably the number of MFCC / chroma audio features to extract
    # (librosa is imported above) — TODO confirm against the feature notebook.
    n_mfcc = 32
    n_chroma = 16
    
    # Training schedule: warm-up is 10% and early-stopping patience 20% of the
    # epoch count, both truncated to an integer.
    epochs = 50
    batch_size = 32
    warmup = int(epochs * 0.1)
    early_stopping_rounds = int(epochs * 0.2)
    
    # Presumably the optimiser learning rate and weight decay — TODO confirm.
    eta = 5e-4
    weight_decay = 1e-4

## Loading public data & Ensemble

In [4]:
# Load the public dataset and binarise its label: "abdominal" -> 1, any other value -> 0.
df_public = pickleIO(None, "./dataset/df_public.pkl", "r")
df_public["type"] = df_public["type"].apply(lambda label: int(label == "abdominal"))
# Ground-truth target used by the scoring cell.
df_public_y = df_public["type"].astype("int32")

In [5]:
# Folder layout: every model (and this ensemble) owns "<root>/<name>/".
architecture_root_path = "./architectures/"
architecture_name = "ensemble_v1"
architecture_path = architecture_root_path + architecture_name + "/"
createFolder(architecture_path)

# Blend weight of each base model; the keys are the base models' folder names.
model_weight = {
    "elasticnet_v1": 0.1,
    "randomforest_v1": 0.2,
    "knn_v1": 0.2,
    "xgboost_v1":  0.25,
    "dnn_v1": 0.25,
}
# The blended output is only a convex combination of the base probabilities
# (and of the base thresholds) when the weights add up to one.
assert abs(sum(model_weight.values()) - 1.0) < 1e-9, "ensemble weights must sum to 1"

# One slot per base model, filled with that model's predictions by later cells.
# Derived from model_weight so both dicts always hold the same keys in the same
# order, which the positional pairing in the blending cells relies on.
architecture_name_list = dict.fromkeys(model_weight)

In [6]:
# Fill each base model's slot with the contents of its lb_submission.pkl.
for model_name in architecture_name_list:
    lb_submission_file = architecture_root_path + model_name + "/lb_submission.pkl"
    architecture_name_list[model_name] = pickleIO(None, lb_submission_file, "r")

In [7]:
# Weighted soft vote over the base models' probability arrays. Weights are looked
# up by model name rather than paired by position, so a key mismatch between the
# two dicts raises a KeyError instead of silently applying the wrong weight.
y_pred_prob = np.stack(
    [model_weight[name] * architecture_name_list[name]["prob"] for name in architecture_name_list],
    axis=0,
).sum(axis=0)

# Blend the base models' thresholds with the same weights. Element [1] of each
# saved threshold_opt_resut.pkl is used (presumably the tuned threshold — confirm
# against the base-model notebooks).
best_threshold = np.sum(
    [
        model_weight[name]
        * pickleIO(None, architecture_root_path + name + "/threshold_opt_resut.pkl", "r")[1]
        for name in architecture_name_list
    ]
)

# Column 1 of the blended probabilities is compared against the blended threshold.
y_pred = (y_pred_prob[:, 1] > best_threshold).astype("int32")
print("ensembled threshold :", best_threshold)

ensembled threshold : 0.42500000000000004


In [8]:
# Probability-based metrics use column 1 of the blended probabilities;
# label-based metrics use the thresholded predictions.
positive_prob = y_pred_prob[:, 1]
score_dic = dict(
    logloss=metrics.log_loss(df_public_y, positive_prob),
    roc_auc=metrics.roc_auc_score(df_public_y, positive_prob),
    accuracy=metrics.accuracy_score(df_public_y, y_pred),
    f1=metrics.f1_score(df_public_y, y_pred, average="macro"),
)
print("LB Score !")
display(score_dic)

LB Score !


{'logloss': 0.15918331248678821,
 'roc_auc': 0.9910477120335406,
 'accuracy': 0.967459324155194,
 'f1': 0.9626388409300184}

In [9]:
# Save the scores as a two-column CSV (metric, value) in the ensemble's folder.
score_dic = pd.Series(score_dic, name="value").rename_axis("metric")
score_dic.to_csv(architecture_path + "./lb_score.csv")

# Pickle the ensemble's inputs, settings and public-data predictions alongside it.
for payload, file_name in (
    (architecture_name_list, "./architecture_name_list.pkl"),
    (model_weight, "./model_weight.pkl"),
    (best_threshold, "./threshold_opt_resut.pkl"),
    ({"prob": y_pred_prob, "pred": y_pred}, "./lb_submission.pkl"),
):
    pickleIO(payload, architecture_path + file_name, "w")

## Inference on private data

In [10]:
# Replace each slot's contents with the base model's submission.pkl predictions.
# NOTE: this overwrites the lb_submission.pkl predictions loaded earlier into the same dict.
for model_name in architecture_name_list:
    submission_file = architecture_root_path + model_name + "/submission.pkl"
    architecture_name_list[model_name] = pickleIO(None, submission_file, "r")

In [11]:
# Weighted soft vote over the base models' probability arrays. Weights are looked
# up by model name rather than paired by position, so a key mismatch between the
# two dicts raises a KeyError instead of silently applying the wrong weight.
y_pred_prob = np.stack(
    [model_weight[name] * architecture_name_list[name]["prob"] for name in architecture_name_list],
    axis=0,
).sum(axis=0)
# The threshold is not recomputed here: `best_threshold` from the earlier
# ensemble cell is reused.
y_pred = (y_pred_prob[:, 1] > best_threshold).astype("int32")

In [12]:
# Preview the first ten rows of the blended probabilities.
y_pred_prob[:10]

array([[0.05686288, 0.9431371 ],
       [0.21562597, 0.78437402],
       [0.57055397, 0.42944603],
       [0.67106817, 0.32893185],
       [0.17026489, 0.82973513],
       [0.67779343, 0.32220658],
       [0.20540322, 0.79459678],
       [0.0615923 , 0.9384077 ],
       [0.24983181, 0.75016818],
       [0.03753989, 0.96246013]])

In [13]:
# Preview the first ten thresholded predictions.
y_pred[:10]

array([1, 1, 1, 0, 1, 0, 1, 1, 1, 1])

In [14]:
# Store the blended probabilities and thresholded predictions as this ensemble's submission.pkl.
private_submission = {"prob": y_pred_prob, "pred": y_pred}
pickleIO(private_submission, architecture_path + "./submission.pkl", "w")