## Setup

In [2]:
GLOBAL_SEED = 42

import os
os.environ['PYTHONHASHSEED'] = str(GLOBAL_SEED)
import glob
import sys
import shutil
import pickle
import random as rnd
from tqdm import tqdm

import numpy as np
from numpy import random as np_rnd
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split as tts

from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn import linear_model as lm

from sklearn import metrics

import librosa

import torch
from torch import nn
from torch.nn import functional as F

In [3]:
def seed_everything(seed=42):
    os.environ['PYTHONHASHSEED'] = str(seed)
    # python random
    rnd.seed(seed)
    # numpy random
    np_rnd.seed(seed)
    # tf random
    try:
        tf_rnd.set_seed(seed)
    except:
        pass
    # RAPIDS random
    try:
        cupy.random.seed(seed)
    except:
        pass
    # pytorch random
    try:
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
    except:
        pass

def pickleIO(obj, src, op="w"):
    if op == "w":
        with open(src, op + "b") as f:
            pickle.dump(obj, f)
    elif op == "r":
        with open(src, op + "b") as f:
            tmp = pickle.load(f)
        return tmp
    else:
        print("unknown operation")
        return obj
    
def findIdx(data_x, col_names):
    return [int(i) for i, j in enumerate(data_x) if j in col_names]

def diff(first, second):
        second = set(second)
        return [item for item in first if item not in second]
    
def createFolder(directory):
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print('Error: Creating directory. ' + directory)
        
def week_of_month(dt):
    """ 
        Returns the week of the month for the specified date.
    """
    first_day = dt.replace(day=1)
    dom = dt.day
    adjusted_dom = dom + (1 + first_day.weekday()) % 7
    return int(np.ceil(adjusted_dom/7.0))

def get_season(dt):
    dt = int(dt)
    if dt in [3, 4, 5]:
        return 0
    elif dt in [6, 7, 8]:
        return 1
    elif dt in [9, 10, 11]:
        return 2
    else:
        return 3

In [4]:
class CFG:
    debug = True
    data_path = ".\\data\\"
    
    n_mfcc = 32
    n_chroma = 16

## Loading data

In [5]:
# Save datasets
df_train = pickleIO(None, "./dataset/df_train.pkl", "r")
df_valid = pickleIO(None, "./dataset/df_valid.pkl", "r")

In [6]:
df_train

Unnamed: 0,data_path,type,timestamp,zcr,mfcc_0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,...,rms_0,rms_1,rms_2,rms_3,rms_4,rms_5,month,season,temp,hum
8053,.\data\abdominal\abdominal_202209\train_abdomi...,abdominal,2022-09-01,0.100375,-152.008209,114.103264,-6.809704,-13.400922,2.236301,4.635829,...,0.043832,0.006729,0.053669,0.031314,0.022355,0.713902,9,2,26.532222,78.021111
9055,.\data\abdominal\abdominal_202210\train_abdomi...,abdominal,2022-10-01,0.071383,-194.701126,152.761902,-20.654911,4.816885,-13.634759,10.529870,...,0.042325,0.008508,0.054016,0.025527,0.028489,1.116019,10,2,20.590323,64.034409
7528,.\data\abdominal\abdominal_202209\train_abdomi...,abdominal,2022-09-01,0.105982,-154.330490,111.065445,-28.823948,20.789288,-3.794430,21.345968,...,0.041040,0.009133,0.056500,0.025861,0.030639,1.184786,9,2,26.532222,78.021111
9001,.\data\abdominal\abdominal_202210\train_abdomi...,abdominal,2022-10-01,0.091739,-168.230530,135.824020,-32.273739,10.225665,-4.109786,9.913898,...,0.047252,0.009714,0.065793,0.035313,0.030480,0.863135,10,2,20.590323,64.034409
7073,.\data\abdominal\abdominal_202209\train_abdomi...,abdominal,2022-09-01,0.133309,-55.229809,104.472702,-40.752758,12.543762,-34.147350,11.086745,...,0.126325,0.065481,0.240394,0.043419,0.196975,4.536574,9,2,26.532222,78.021111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8510,.\data\abdominal\abdominal_202210\train_abdomi...,abdominal,2022-10-01,0.094740,-166.271500,128.656921,-20.329794,-2.984057,-4.430411,5.766476,...,0.043196,0.008440,0.055936,0.025718,0.030217,1.174952,10,2,20.590323,64.034409
4441,.\data\dry\dry_202302\train_dry_13890.wav,dry,2023-02-01,0.110137,-103.181274,112.578857,-19.416920,17.093378,-13.513755,12.859381,...,0.067428,0.010854,0.084913,0.049394,0.035519,0.719080,2,3,7.042693,51.873399
2303,.\data\dry\dry_202211\train_dry_07809.wav,dry,2022-11-01,0.110293,-97.368134,109.255257,-16.161146,21.142416,-20.549458,7.729898,...,0.075805,0.014018,0.097809,0.048181,0.049628,1.030047,11,2,12.853333,55.112222
2420,.\data\dry\dry_202212\train_dry_08049.wav,dry,2022-12-01,0.126861,-54.028755,107.528053,-27.415445,25.679974,-29.958881,6.513360,...,0.109673,0.020581,0.141579,0.058601,0.082978,1.415970,12,3,4.760215,47.618280


In [7]:
df_valid

Unnamed: 0,data_path,type,timestamp,zcr,mfcc_0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,...,rms_0,rms_1,rms_2,rms_3,rms_4,rms_5,month,season,temp,hum
15964,.\data\abdominal\abdominal_202303\train_abdomi...,abdominal,2023-03-01,0.127317,-129.935684,120.849190,-57.607597,-1.361679,9.187469,-1.416911,...,0.078874,0.038915,0.138307,0.018661,0.119646,6.411605,3,0,13.506452,58.259140
5592,.\data\abdominal\abdominal_202208\train_abdomi...,abdominal,2022-08-01,0.107439,-172.269699,118.385941,-35.123390,-0.768732,-13.109409,7.783858,...,0.037951,0.014019,0.058832,0.013265,0.045568,3.435256,8,1,30.219355,80.798925
2569,.\data\dry\dry_202212\train_dry_08383.wav,dry,2022-12-01,0.131083,-70.272850,93.061905,-23.908194,19.563688,-16.200703,12.072376,...,0.099741,0.049849,0.226470,0.057848,0.168622,2.914919,12,3,4.760215,47.618280
6694,.\data\abdominal\abdominal_202208\train_abdomi...,abdominal,2022-08-01,0.069195,-205.911087,147.817291,-35.009632,9.234331,-5.329868,13.523920,...,0.046749,0.015951,0.068912,0.015650,0.053262,3.403417,8,1,30.219355,80.798925
9192,.\data\abdominal\abdominal_202210\train_abdomi...,abdominal,2022-10-01,0.103156,-170.419708,130.740021,-37.892384,14.527298,-9.853578,9.035569,...,0.037664,0.010373,0.051824,0.018760,0.033065,1.762541,10,2,20.590323,64.034409
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8329,.\data\abdominal\abdominal_202210\train_abdomi...,abdominal,2022-10-01,0.108994,-71.564240,133.406113,-64.242844,-0.741359,-14.942988,-2.030332,...,0.137886,0.055763,0.209324,0.038656,0.170668,4.415051,10,2,20.590323,64.034409
6962,.\data\abdominal\abdominal_202208\train_abdomi...,abdominal,2022-08-01,0.104523,-193.763641,129.363861,-47.244011,11.891455,-7.298118,11.752427,...,0.031929,0.008256,0.043753,0.016167,0.027585,1.706237,8,1,30.219355,80.798925
5585,.\data\abdominal\abdominal_202208\train_abdomi...,abdominal,2022-08-01,0.079260,-154.604126,146.838577,-37.803913,-2.129235,-11.512072,18.544447,...,0.080659,0.031508,0.125135,0.033550,0.091585,2.729756,8,1,30.219355,80.798925
2677,.\data\dry\dry_202212\train_dry_08619.wav,dry,2022-12-01,0.176620,-17.093391,79.770050,-11.531411,22.224131,-26.905506,5.901452,...,0.140793,0.032006,0.203969,0.093442,0.110527,1.182833,12,3,4.760215,47.618280


In [8]:
df_train = df_train.reset_index(drop=True)
df_valid = df_valid.reset_index(drop=True)

## Preprocessing

In [9]:
df_train = df_train.drop(["data_path", "timestamp"], axis=1)
df_valid = df_valid.drop(["data_path", "timestamp"], axis=1)

In [10]:
df_train["type"] = df_train["type"].apply(lambda x: 1 if x == "abdominal" else 0)
df_valid["type"] = df_valid["type"].apply(lambda x: 1 if x == "abdominal" else 0)

In [11]:
df_train["month"] = df_train["month"] - 1
df_valid["month"] = df_valid["month"] - 1

In [12]:
ohe_cols = []
for i, j in zip(["month", "season"], [list(range(12)), list(range(4))]):
    ohe_cols.extend([i + "_ohe_" + str(z) for z in j])
ohe = OneHotEncoder(categories=[list(range(12)), list(range(4))], sparse=False)

df_train[ohe_cols] = ohe.fit_transform(df_train[["month", "season"]])
df_train = df_train.drop(["month", "season"], axis=1)

df_valid[ohe_cols] = ohe.transform(df_valid[["month", "season"]])
df_valid = df_valid.drop(["month", "season"], axis=1)

In [13]:
feature_info = {
    "target_var": "type",
    "num_vars": [],
    "cat_vars": ["month", "season"],
    "cat_ohe_vars": ohe_cols,
}
feature_info["num_vars"] = diff(df_train.drop(feature_info["target_var"], axis=1).columns, feature_info["cat_ohe_vars"])
assert df_train.shape[1] == 1 + len(feature_info["num_vars"]) + len(feature_info["cat_ohe_vars"])

In [14]:
df_train_y = df_train[feature_info["target_var"]].astype("int32")
df_train_x = df_train.drop(feature_info["target_var"], axis=1)

In [15]:
df_valid_y = df_valid[feature_info["target_var"]].astype("int32")
df_valid_x = df_valid.drop(feature_info["target_var"], axis=1)

In [16]:
# Scaling
feature_scaler = StandardScaler()
df_train_x[feature_info["num_vars"]] = feature_scaler.fit_transform(df_train_x[feature_info["num_vars"]])
df_valid_x[feature_info["num_vars"]] = feature_scaler.transform(df_valid_x[feature_info["num_vars"]])

In [17]:
df_train_x = df_train_x.astype("float32")
df_valid_x = df_valid_x.astype("float32")

In [18]:
df_train_x

Unnamed: 0,zcr,mfcc_0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,...,month_ohe_6,month_ohe_7,month_ohe_8,month_ohe_9,month_ohe_10,month_ohe_11,season_ohe_0,season_ohe_1,season_ohe_2,season_ohe_3
0,-0.435507,-0.701487,-0.235354,2.179530,-2.545707,2.157758,-0.942752,0.369791,-0.929737,-0.071672,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-1.274409,-1.569641,1.434420,1.097911,-0.617446,0.376357,-0.055125,-0.719684,-0.786294,-0.612519,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-0.273249,-0.748710,-0.366566,0.459726,1.073150,1.480856,1.573752,0.654565,0.025341,0.207848,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,-0.685386,-1.031365,0.702826,0.190221,-0.044955,1.445459,-0.147889,-1.383263,-0.815970,-0.899398,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.517470,1.266489,-0.651325,-0.472180,0.200404,-1.926019,0.028739,0.456741,-0.635872,-1.384631,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12779,-0.598555,-0.991529,0.393259,1.123309,-1.443136,1.409472,-0.772480,-0.346586,-0.247062,-0.280758,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
12780,-0.153026,0.291402,-0.301197,1.194625,0.681957,0.389939,0.295694,0.267415,0.955887,0.943488,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
12781,-0.148508,0.409612,-0.444753,1.448974,1.110527,-0.399763,-0.476794,0.501708,0.914052,0.135142,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
12782,0.330900,1.290912,-0.519355,0.569762,1.590804,-1.455896,-0.660001,0.451398,0.646493,-0.955529,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [19]:
df_valid_x

Unnamed: 0,zcr,mfcc_0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,...,month_ohe_6,month_ohe_7,month_ohe_8,month_ohe_9,month_ohe_10,month_ohe_11,season_ohe_0,season_ohe_1,season_ohe_2,season_ohe_3
0,0.344096,-0.252645,0.056022,-1.788920,-1.271415,2.937971,-1.854280,-2.427330,-0.289635,-1.604205,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.231109,-1.113501,-0.050373,-0.032401,-1.208655,0.435323,-0.468667,-0.858219,-0.603325,0.520652,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.453055,0.960591,-1.144189,0.843756,0.943427,0.088350,0.177173,0.698636,0.966421,-0.178522,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,-1.337716,-1.797594,1.220849,-0.023514,-0.149882,1.308515,0.395772,-1.831484,-1.920407,-2.063777,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-0.355023,-1.075882,0.483234,-0.248721,0.410351,0.800764,-0.280163,-0.431077,0.670784,-0.403621,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1592,-0.186108,0.934330,0.598390,-2.307281,-1.205758,0.229518,-1.946660,-0.636601,1.006245,3.228349,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1593,-0.315489,-1.550578,0.423794,-0.979292,0.131361,1.087595,0.128989,-1.442697,-1.140377,0.097957,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1594,-1.046496,-0.754274,1.178576,-0.241810,-1.352657,0.614612,1.151850,0.806476,-0.532744,-1.188527,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1595,1.770716,2.041988,-1.718301,1.810660,1.225021,-1.113179,-0.752153,0.096642,0.503089,-0.368399,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


## Training & Evaluation

In [20]:
architecture_root_path = "./architectures/"
architecture_name = "lg_v1"
architecture_path = architecture_root_path + architecture_name + "/"
createFolder(architecture_path)

In [21]:
seed_everything()

# define model
model = lm.LogisticRegression(penalty="none", solver="saga")

# training
model.fit(df_train_x.values, df_train_y.values)



LogisticRegression(penalty='none', solver='saga')

In [22]:
# threshold optimization
def threshold_optimization(y_pred_prob, y_true):
    search_space = np.arange(5e-2, 0.95 + 1e+3, 5e-2)
    best_score = -np.inf
    best_threshold = 0.5
    for threshold in tqdm(search_space):
        y_pred = (y_pred_prob[:, 1] > threshold).astype("int32")
        score = metrics.f1_score(y_true, y_pred, average="macro")
        if best_score < score:
            best_score = score
            best_threshold = threshold
    return best_score, best_threshold

result = threshold_optimization(
    np.concatenate([model.predict_proba(df_train_x.values), model.predict_proba(df_valid_x.values)], axis=0),
    np.concatenate([df_train_y.values, df_valid_y.values], axis=0),
)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20018/20018 [01:08<00:00, 292.08it/s]


In [23]:
print("best threshold :", result[1])
print("best f1 :", result[0])
best_threshold = result[1]

best threshold : 0.55
best f1 : 0.9317236789422241


In [24]:
# evaluation
y_pred_prob = model.predict_proba(df_valid_x.values)
y_pred = (y_pred_prob[:, 1] > best_threshold).astype("int32")

In [25]:
y_pred_prob

array([[2.4914622e-02, 9.7508538e-01],
       [1.9652069e-02, 9.8034793e-01],
       [7.6687562e-01, 2.3312436e-01],
       ...,
       [6.8080425e-04, 9.9931920e-01],
       [9.9814218e-01, 1.8578203e-03],
       [9.5885390e-01, 4.1146096e-02]], dtype=float32)

In [26]:
score_dic = {
    "logloss": metrics.log_loss(df_valid_y, y_pred_prob[:, 1]),
    "roc_auc": metrics.roc_auc_score(df_valid_y, y_pred_prob[:, 1]),
    "accuracy": metrics.accuracy_score(df_valid_y, y_pred),
    "f1": metrics.f1_score(df_valid_y, y_pred, average="macro"),
}
print("Valid Score !")
display(score_dic)

Valid Score !


{'logloss': 0.15812132965323536,
 'roc_auc': 0.9822497830748159,
 'accuracy': 0.9461490294301816,
 'f1': 0.9391680485868978}

In [27]:
score_dic = pd.Series(score_dic)
score_dic.index.name = "metric"
score_dic.name = "value"
score_dic.to_csv(architecture_path + "./valid_score.csv")

In [28]:
pickleIO(feature_info, architecture_path + "feature_info.pkl", "w")
pickleIO(feature_scaler, architecture_path + "feature_scaler.pkl", "w")
pickleIO(model, architecture_path + "model.pkl", "w")
pickleIO(model, architecture_path + "model.pkl", "w")
pickleIO(result, architecture_path + "threshold_opt_resut.pkl", "w")

## Inference

In [29]:
# Save datasets
df_public = pickleIO(None, "./dataset/df_public.pkl", "r")
df_private = pickleIO(None, "./dataset/df_private.pkl", "r")

In [30]:
df_public = df_public.reset_index(drop=True)
df_private = df_private.reset_index(drop=True)

In [31]:
df_public = df_public.drop(["data_path", "timestamp"], axis=1)
df_private = df_private.drop(["data_path", "timestamp"], axis=1)

In [32]:
df_public["type"] = df_public["type"].apply(lambda x: 1 if x == "abdominal" else 0)
df_private["type"] = df_private["type"].apply(lambda x: 1 if x == "abdominal" else 0)

In [33]:
df_public["month"] = df_public["month"] - 1
df_private["month"] = df_private["month"] - 1

In [34]:
# ohe_cols = []
# for i, j in zip(["month", "season"], [list(range(12)), list(range(4))]):
#     ohe_cols.extend([i + "_ohe_" + str(z) for z in j])
# ohe = OneHotEncoder(categories=[list(range(12)), list(range(4))], sparse=False)

df_public[ohe_cols] = ohe.transform(df_public[["month", "season"]])
df_public = df_public.drop(["month", "season"], axis=1)

df_private[ohe_cols] = ohe.transform(df_private[["month", "season"]])
df_private = df_private.drop(["month", "season"], axis=1)

In [35]:
# feature_info = {
#     "target_var": "type",
#     "num_vars": [],
#     "cat_vars": ["month", "season"],
#     "cat_ohe_vars": ohe_cols,
# }
# feature_info["num_vars"] = diff(df_train_x.columns, feature_info["cat_ohe_vars"])
# assert df_train.shape[1] == 1 + len(feature_info["num_vars"]) + len(feature_info["cat_ohe_vars"])

In [36]:
df_public_y = df_public[feature_info["target_var"]]
df_public_x = df_public.drop(feature_info["target_var"], axis=1)

In [37]:
# df_private_y = df_valid[feature_info["target_var"]]
df_private_x = df_private.drop(feature_info["target_var"], axis=1)

In [38]:
# Scaling
# feature_scaler = StandardScaler()
df_public_x[feature_info["num_vars"]] = feature_scaler.transform(df_public_x[feature_info["num_vars"]])
df_private_x[feature_info["num_vars"]] = feature_scaler.transform(df_private_x[feature_info["num_vars"]])

In [39]:
df_public_x

Unnamed: 0,zcr,mfcc_0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,...,month_ohe_6,month_ohe_7,month_ohe_8,month_ohe_9,month_ohe_10,month_ohe_11,season_ohe_0,season_ohe_1,season_ohe_2,season_ohe_3
0,-0.129326,0.568441,0.238056,-0.926726,-1.523925,-2.325457,-0.461698,1.186348,-1.038703,-0.066611,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.807884,0.750091,1.576654,-0.026673,-2.544739,-0.925768,0.617175,-0.296513,-0.362152,1.736116,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2.070139,1.357889,-1.883178,0.487301,-0.367036,-0.539941,0.446380,0.893243,0.002299,-0.618204,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,-0.082111,0.027244,0.185897,0.011895,-0.169248,-0.383471,1.926082,-0.024591,-3.743301,-1.532121,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,-0.429296,-0.432416,1.359626,-1.701725,-2.082217,-0.168553,-0.293178,-1.966107,-0.173617,1.827145,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
794,-0.313031,-0.666389,0.973856,-1.517662,-1.453320,0.244174,1.210261,-0.333510,-1.422155,-0.491757,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
795,-0.608180,-0.335359,0.001088,0.935999,-0.151907,-0.453964,-1.179227,1.162312,0.925322,1.243863,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
796,-0.619873,0.291625,0.058192,1.228921,0.924250,0.266496,-0.247784,0.464956,0.331954,1.551092,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
797,2.407531,1.315496,-2.338611,-0.250043,-0.470138,-0.179471,-0.522758,0.364130,0.267767,-0.200799,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [40]:
df_private_x

Unnamed: 0,zcr,mfcc_0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,...,month_ohe_6,month_ohe_7,month_ohe_8,month_ohe_9,month_ohe_10,month_ohe_11,season_ohe_0,season_ohe_1,season_ohe_2,season_ohe_3
0,-0.856551,1.542045,1.313773,-1.056625,-1.317837,-0.114841,-0.925214,0.634648,-1.767448,-0.074750,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,-0.891395,0.139097,0.507568,0.733639,0.832912,1.371877,0.266273,0.682436,0.138965,0.016581,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.494753,0.113429,-1.070624,0.710673,2.066977,1.534694,2.162259,0.359770,-0.024369,0.869389,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.166616,-1.567166,-0.103923,1.148674,0.285086,-0.191956,0.424920,0.170466,0.605194,0.941097,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.050899,-0.002851,0.155180,-1.072893,0.328857,-2.030894,-0.297123,0.277364,1.386337,-1.291279,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
793,-0.473979,0.142081,0.335835,-1.664565,-0.720774,0.100608,-1.555453,-1.558996,-1.930325,0.994621,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
794,1.213622,0.394419,-1.352246,0.641277,1.191257,-0.624456,-0.637435,0.890834,0.962613,-0.613859,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
795,-0.005044,-1.035779,0.413180,-2.037291,0.476375,0.003272,-0.770577,-1.754144,-0.992100,-0.301869,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
796,-0.078694,0.835793,0.206998,-1.198152,-1.388965,0.067360,0.011800,0.447557,-0.438236,0.346065,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


### public dataset

In [41]:
y_pred_prob = model.predict_proba(df_public_x)
y_pred = (y_pred_prob[:, 1] > best_threshold).astype("int32")



In [42]:
y_pred_prob

array([[1.08092577e-02, 9.89190742e-01],
       [5.67397693e-04, 9.99432602e-01],
       [9.89825167e-01, 1.01748326e-02],
       ...,
       [9.31475928e-01, 6.85240721e-02],
       [9.99053756e-01, 9.46243917e-04],
       [3.30407025e-01, 6.69592975e-01]])

In [43]:
score_dic = {
    "logloss": metrics.log_loss(df_public_y, y_pred_prob[:, 1]),
    "roc_auc": metrics.roc_auc_score(df_public_y, y_pred_prob[:, 1]),
    "accuracy": metrics.accuracy_score(df_public_y, y_pred),
    "f1": metrics.f1_score(df_public_y, y_pred, average="macro"),
}
print("LB Score !")
display(score_dic)

LB Score !


{'logloss': 0.17231512251443923,
 'roc_auc': 0.9788576679541114,
 'accuracy': 0.9299123904881101,
 'f1': 0.9206486578514272}

In [44]:
score_dic = pd.Series(score_dic)
score_dic.index.name = "metric"
score_dic.name = "value"
score_dic.to_csv(architecture_path + "./lb_score.csv")

### private dataset

In [45]:
y_pred_prob = model.predict_proba(df_private_x)
y_pred = [1 if i > result[1] else 0 for i in y_pred_prob[:, 1]]
pickleIO({"prob": y_pred_prob, "pred": y_pred}, architecture_path + "./submission.pkl", "w")



In [46]:
y_pred_prob

array([[3.56791285e-04, 9.99643209e-01],
       [2.46777237e-01, 7.53222763e-01],
       [3.42512904e-01, 6.57487096e-01],
       ...,
       [3.48245294e-03, 9.96517547e-01],
       [5.36804476e-04, 9.99463196e-01],
       [2.41241986e-01, 7.58758014e-01]])