## Setup

In [38]:
import os
import glob
import sys
import shutil
import pickle
import random as rnd
from tqdm import tqdm

import numpy as np
from numpy import random as np_rnd
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split as tts

import librosa

import torch
from torch import nn
from torch.nn import functional as F

In [39]:
def seed_everything(seed=42):
    """Seed every random number generator this notebook may touch.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), and asks cuDNN for deterministic behaviour.
    TensorFlow (``tf_rnd``) and CuPy (``cupy``) are seeded on a best-effort
    basis: they are only touched if those names happen to be defined in the
    session, and any failure is ignored.

    Parameters
    ----------
    seed : int, default 42
        Seed value handed to every generator.

    Notes
    -----
    ``PYTHONHASHSEED`` is read by the interpreter at start-up, so setting it
    here only influences child processes, not the running kernel.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # python random
    rnd.seed(seed)
    # numpy random
    np_rnd.seed(seed)
    # tf random (optional dependency; `tf_rnd` is undefined unless imported elsewhere)
    try:
        tf_rnd.set_seed(seed)
    except Exception:  # not a bare except: Ctrl-C / SystemExit must still propagate
        pass
    # RAPIDS random (optional dependency; `cupy` is undefined unless imported elsewhere)
    try:
        cupy.random.seed(seed)
    except Exception:
        pass
    # pytorch random
    try:
        torch.manual_seed(seed)
        # seed every visible GPU, not only the current device
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        # the cuDNN autotuner may pick different kernels from run to run
        torch.backends.cudnn.benchmark = False
    except Exception:
        pass

def pickleIO(obj, src, op="w"):
    """Dump ``obj`` to, or load an object from, the pickle file ``src``.

    Parameters
    ----------
    obj : object
        Object to serialise when ``op == "w"``; ignored when reading.
    src : str
        Path of the pickle file.
    op : {"w", "r"}, default "w"
        ``"w"`` writes ``obj`` to ``src``; ``"r"`` reads ``src`` back.

    Returns
    -------
    object
        ``None`` after a write, the unpickled object after a read, and
        ``obj`` unchanged (after printing a message) for any other ``op``.

    Notes
    -----
    Unpickling executes arbitrary code; only read files you created yourself.
    """
    if op not in ("w", "r"):
        print("unknown operation")
        return obj

    mode = op + "b"  # pickle streams are binary
    if op == "r":
        with open(src, mode) as handle:
            return pickle.load(handle)

    with open(src, mode) as handle:
        pickle.dump(obj, handle)
    
def findIdx(data_x, col_names):
    """Return the positions of the items of ``data_x`` that occur in ``col_names``.

    Positions are 0-based, reported as plain ``int`` and in iteration order.
    """
    positions = []
    for position, item in enumerate(data_x):
        if item in col_names:
            positions.append(int(position))
    return positions

def createFolder(directory):
    """Create ``directory`` (including missing parents) if it does not exist.

    Calling it for a directory that already exists is a no-op. Any ``OSError``
    (for example, the path exists but is a regular file, or permissions are
    missing) is reported with a printed message instead of being raised.

    Parameters
    ----------
    directory : str
        Path of the folder to create.
    """
    try:
        # exist_ok=True replaces the former "check, then create" sequence, which could
        # fail if the folder appeared between the check and the creation.
        os.makedirs(directory, exist_ok=True)
    except OSError:
        print('Error: Creating directory. ' + directory)
        
def week_of_month(dt):
    """Return the 1-based week of the month that ``dt`` falls in.

    The day of month is shifted by ``(weekday of the 1st + 1) % 7`` before
    being divided into blocks of seven. Because ``weekday()`` counts Monday
    as 0, a month starting on a Sunday gets no shift, i.e. weeks run from
    Sunday to Saturday.

    Parameters
    ----------
    dt : datetime.date-like
        Any object offering ``replace(day=...)``, ``weekday()`` and ``day``.

    Returns
    -------
    int
    """
    # number of days the first week "borrows" from the previous month
    lead_days = (dt.replace(day=1).weekday() + 1) % 7
    shifted_day = dt.day + lead_days
    return int(np.ceil(shifted_day / 7.0))

def get_season(dt):
    """Map a month number to a season code.

    ``dt`` is converted with ``int()`` first. Months 3-5 give 0, 6-8 give 1,
    9-11 give 2 and every other value (including 12, 1 and 2) gives 3.
    """
    month = int(dt)
    season_months = ((3, 4, 5), (6, 7, 8), (9, 10, 11))
    for code, months in enumerate(season_months):
        if month in months:
            return code
    return 3

In [40]:
class CFG:
    """Notebook-wide configuration values (used as a plain namespace, never instantiated)."""

    # Number of coefficients requested from librosa.feature.mfcc
    n_mfcc = 32
    # Number of chroma bins requested from librosa.feature.chroma_stft
    n_chroma = 16

    # Root folder of the audio data (Windows-style relative path, trailing separator included)
    data_path = ".\\data\\"
    # Debug switch; not referenced by any cell visible in this notebook
    debug = True

## Config data archive metadata

In [41]:
# Month-level sub-folders of each cough type.
# glob.glob() returns entries in arbitrary, OS-dependent order. The row order of the
# dataframe built from these folders must be reproducible, because the cached audio
# features are matched to rows by position later on, so sort the folder lists.
CFG.data_archive = {
    cough_type: sorted(glob.glob(CFG.data_path + cough_type + "\\*"))
    for cough_type in ("dry", "abdominal")
}
CFG.data_archive

{'dry': ['.\\data\\dry\\dry_202208',
  '.\\data\\dry\\dry_202209',
  '.\\data\\dry\\dry_202210',
  '.\\data\\dry\\dry_202211',
  '.\\data\\dry\\dry_202212',
  '.\\data\\dry\\dry_202301',
  '.\\data\\dry\\dry_202302',
  '.\\data\\dry\\dry_202303'],
 'abdominal': ['.\\data\\abdominal\\abdominal_202208',
  '.\\data\\abdominal\\abdominal_202209',
  '.\\data\\abdominal\\abdominal_202210',
  '.\\data\\abdominal\\abdominal_202211',
  '.\\data\\abdominal\\abdominal_202212',
  '.\\data\\abdominal\\abdominal_202301',
  '.\\data\\abdominal\\abdominal_202302',
  '.\\data\\abdominal\\abdominal_202303']}

In [42]:
# Create dataframe
# Collect the files of each cough type independently. Zipping the two folder lists
# together would silently skip trailing folders whenever one type has more month
# folders than the other.
file_paths = {"dry": [], "abdominal": []}
for cough_type, paths in file_paths.items():
    for folder in CFG.data_archive[cough_type]:
        # sorted(): keep the file order inside each folder deterministic
        paths.extend(sorted(glob.glob(folder + "\\*")))

# One row per audio file: all "dry" files first, then all "abdominal" files
df = pd.DataFrame({
    "data_path": file_paths["dry"] + file_paths["abdominal"],
    "type": ["dry"] * len(file_paths["dry"]) + ["abdominal"] * len(file_paths["abdominal"]),
})
display(df.head())
display(df.tail())

Unnamed: 0,data_path,type
0,.\data\dry\dry_202208\train_dry_00004.wav,dry
1,.\data\dry\dry_202208\train_dry_00005.wav,dry
2,.\data\dry\dry_202208\train_dry_00006.wav,dry
3,.\data\dry\dry_202208\train_dry_00028.wav,dry
4,.\data\dry\dry_202208\train_dry_00029.wav,dry


Unnamed: 0,data_path,type
15995,.\data\abdominal\abdominal_202303\train_abdomi...,abdominal
15996,.\data\abdominal\abdominal_202303\train_abdomi...,abdominal
15997,.\data\abdominal\abdominal_202303\train_abdomi...,abdominal
15998,.\data\abdominal\abdominal_202303\train_abdomi...,abdominal
15999,.\data\abdominal\abdominal_202303\train_abdomi...,abdominal


In [43]:
# Add date feature
# The month folder is named "<type>_YYYYMM". Take the six digits that sit between an
# underscore and a path separator, instead of splitting on "_", which breaks as soon as
# any parent folder contains an underscore. Both "\" and "/" separators are accepted.
df["timestamp"] = df["data_path"].str.extract(r"_(\d{6})[\\/]", expand=False)
assert df["timestamp"].notna().all(), "could not find a YYYYMM folder in some data_path values"
df["timestamp"] = pd.to_datetime(df["timestamp"], format="%Y%m")

In [44]:
df

Unnamed: 0,data_path,type,timestamp
0,.\data\dry\dry_202208\train_dry_00004.wav,dry,2022-08-01
1,.\data\dry\dry_202208\train_dry_00005.wav,dry,2022-08-01
2,.\data\dry\dry_202208\train_dry_00006.wav,dry,2022-08-01
3,.\data\dry\dry_202208\train_dry_00028.wav,dry,2022-08-01
4,.\data\dry\dry_202208\train_dry_00029.wav,dry,2022-08-01
...,...,...,...
15995,.\data\abdominal\abdominal_202303\train_abdomi...,abdominal,2023-03-01
15996,.\data\abdominal\abdominal_202303\train_abdomi...,abdominal,2023-03-01
15997,.\data\abdominal\abdominal_202303\train_abdomi...,abdominal,2023-03-01
15998,.\data\abdominal\abdominal_202303\train_abdomi...,abdominal,2023-03-01


In [45]:
# # save audio object
# tmp = []
# for i in tqdm(df["data_path"]):
#     tmp.append(librosa.load(i))
# pickleIO(tmp, "./audio_obejct.pkl", "w")
# del tmp; gc.collect()

## Take a look 

In [None]:
import IPython.display as ipd

In [None]:
# for dry cough
wave, sr = librosa.load(df.loc[df["type"] == "dry", "data_path"].iloc[0])

In [None]:
# feature shape
wave.shape

In [None]:
# sampling rate
sr

In [None]:
ipd.Audio(df.loc[df["type"] == "dry", "data_path"].iloc[0])

In [None]:
# Wave visualization
librosa.display.waveshow(wave, sr=sr, color="green")
plt.show()

In [None]:
# for abdominal cough
wave, sr = librosa.load(df.loc[df["type"] == "abdominal", "data_path"].iloc[40])

In [None]:
# feature shape
wave.shape

In [None]:
# sampling rate
sr

In [None]:
# Play the same abdominal sample (position 40) that is loaded and plotted in the neighbouring cells
ipd.Audio(df.loc[df["type"] == "abdominal", "data_path"].iloc[40])

In [None]:
# Wave visualization
librosa.display.waveshow(wave, sr=sr, color="orange")
plt.show()

## Audio Feature Extraction

In [None]:
data_dic = {}
error_dic = {}

### Zero Crossing Rate

* The zero-crossing rate is the rate of sign-changes along with a signal,
* The rate at which the signal changes from positive to negative or back.
* Each time the signal crosses the boundary between positive and negative, it counts as 1

In [None]:
def get_feature_zcr(data):
    """Compute ``librosa.zero_crossings`` for every audio file in ``data``.

    Parameters
    ----------
    data : iterable of str
        Audio file paths; each is decoded with ``librosa.load`` (default arguments).

    Returns
    -------
    output : list
        One ``librosa.zero_crossings(wave)`` result per non-empty file, in input order.
    error_list : list of int
        Positions within ``data`` of files whose decoded waveform is empty. These
        files have no entry in ``output``.
    """
    error_list = []
    output = []
    # Iterate over the argument (previously the global df["data_path"] was used and
    # `data` was ignored), so the function works for any collection of paths.
    for idx, value in enumerate(tqdm(data)):
        wave, _ = librosa.load(value)
        if len(wave) > 0:
            output.append(librosa.zero_crossings(wave))
        else:
            error_list.append(idx)
    return output, error_list

In [None]:
feature, error_list = get_feature_zcr(df["data_path"])
data_dic["zcr"] = feature
error_dic["zcr"] = error_list

In [None]:
feature[0].shape

In [None]:
feature[1].shape

In [None]:
len(error_list)

In [None]:
# Visualization feature
fig, ax1 = plt.subplots(figsize=(6, 4))
tmp = librosa.load(df["data_path"].iloc[0])
librosa.display.waveshow(tmp[0], sr=tmp[1], alpha=0.5, color="green")

# ax2 = ax1.twinx()
# ax2.plot(librosa.frames_to_time(range(len(feature[0]))), feature[0], color='orange', marker="o")
print(feature[0].sum())

### Spectral Centroid

* Spectral centroid indicates where the "center of mass" for a sound is located
* Calculated as the weighted mean of the frequencies present in the sound.
* Consider two songs, one from a blues genre and the other belonging to metal.
* Compared to the blues song, which stays the same throughout its length, the metal song has more frequencies towards the end.
    * For blues song spectral centroid will lie somewhere near the middle of its spectrum
    * For a metal song spectral centroid would be towards its end

In [None]:
def get_feature_sc(data):
    """Compute ``librosa.feature.spectral_centroid`` for every audio file in ``data``.

    Parameters
    ----------
    data : iterable of str
        Audio file paths; each is decoded with ``librosa.load`` (default arguments).

    Returns
    -------
    output : list
        One spectral-centroid result per non-empty file, in input order.
    error_list : list of int
        Positions within ``data`` of files whose decoded waveform is empty. These
        files have no entry in ``output``.
    """
    error_list = []
    output = []
    # Iterate over the argument (previously the global df["data_path"] was used and
    # `data` was ignored), so the function works for any collection of paths.
    for idx, value in enumerate(tqdm(data)):
        wave, sr = librosa.load(value)
        if len(wave) > 0:
            output.append(librosa.feature.spectral_centroid(y=wave, sr=sr))
        else:
            error_list.append(idx)
    return output, error_list

In [None]:
feature, error_list = get_feature_sc(df["data_path"])
data_dic["sc"] = feature
error_dic["sc"] = error_list

In [None]:
feature[0].shape

In [None]:
feature[1].shape

In [None]:
len(error_list)

In [None]:
# Visualization feature
fig, ax1 = plt.subplots(figsize=(6, 4))
tmp = librosa.load(df["data_path"].iloc[0])
librosa.display.waveshow(tmp[0], sr=tmp[1], alpha=0.5, color="green")

ax2 = ax1.twinx()
ax2.plot(librosa.frames_to_time(range(len(feature[0].mean(axis=0)))), feature[0].mean(axis=0), color='orange')

### Spectral Rolloff

* Spectral rolloff is a measure of the shape of the signal.
* It represents the frequency below which a specified percentage of the total spectral energy, e.g., 85%, lies.

In [None]:
def get_feature_srl(data):
    """Compute ``librosa.feature.spectral_rolloff`` for every audio file in ``data``.

    Parameters
    ----------
    data : iterable of str
        Audio file paths; each is decoded with ``librosa.load`` (default arguments).

    Returns
    -------
    output : list
        One spectral-rolloff result per non-empty file, in input order.
    error_list : list of int
        Positions within ``data`` of files whose decoded waveform is empty. These
        files have no entry in ``output``.
    """
    error_list = []
    output = []
    # Iterate over the argument (previously the global df["data_path"] was used and
    # `data` was ignored), so the function works for any collection of paths.
    for idx, value in enumerate(tqdm(data)):
        wave, sr = librosa.load(value)
        if len(wave) > 0:
            output.append(librosa.feature.spectral_rolloff(y=wave, sr=sr))
        else:
            error_list.append(idx)
    return output, error_list

In [None]:
feature, error_list = get_feature_srl(df["data_path"])
data_dic["srl"] = feature
error_dic["srl"] = error_list

In [None]:
feature[0].shape

In [None]:
feature[1].shape

In [None]:
len(error_list)

In [None]:
# Visualization feature
fig, ax1 = plt.subplots(figsize=(6, 4))
tmp = librosa.load(df["data_path"].iloc[0])
librosa.display.waveshow(tmp[0], sr=tmp[1], alpha=0.5, color="green")

ax2 = ax1.twinx()
ax2.plot(librosa.frames_to_time(range(len(feature[0].mean(axis=0)))), feature[0].mean(axis=0), color='orange')

### MFCC (Mel frequency cepstral coefficients)

* The MFCCs of a signal are a small set of features (usually about 10–20) that concisely describe the overall shape of a spectral envelope.
* This small set of 10–20 coefficients packs the spectral information of every audio frame into a compact form.
* It models the characteristics of the human voice.

In [None]:
def get_feature_mfcc(data):
    """Compute ``librosa.feature.mfcc`` for every audio file in ``data``.

    The number of coefficients is taken from ``CFG.n_mfcc``.

    Parameters
    ----------
    data : iterable of str
        Audio file paths; each is decoded with ``librosa.load`` (default arguments).

    Returns
    -------
    output : list
        One MFCC result per non-empty file, in input order.
    error_list : list of int
        Positions within ``data`` of files whose decoded waveform is empty. These
        files have no entry in ``output``.
    """
    error_list = []
    output = []
    # Iterate over the argument (previously the global df["data_path"] was used and
    # `data` was ignored), so the function works for any collection of paths.
    for idx, value in enumerate(tqdm(data)):
        wave, sr = librosa.load(value)
        if len(wave) > 0:
            output.append(librosa.feature.mfcc(y=wave, sr=sr, n_mfcc=CFG.n_mfcc))
        else:
            error_list.append(idx)
    return output, error_list

In [None]:
feature, error_list = get_feature_mfcc(df["data_path"])
data_dic["mfcc"] = feature
error_dic["mfcc"] = error_list

In [None]:
feature[0].shape

In [None]:
feature[1].shape

In [None]:
feature[0].mean(axis=1)

In [None]:
feature[1].mean(axis=1)

In [None]:
len(error_list)

In [None]:
# Visualization feature
fig, ax1 = plt.subplots(figsize=(6, 4))
tmp = librosa.load(df["data_path"].iloc[0])
librosa.display.waveshow(tmp[0], sr=tmp[1], alpha=0.5, color="green")

ax2 = ax1.twinx()
ax2.plot(librosa.frames_to_time(range(len(feature[0].mean(axis=0)))), feature[0].mean(axis=0), color='orange')

### Chroma Frequencies

* Chroma features project the entire spectrum onto 12 bins representing the 12 distinct semitones (or chroma) of the musical octave. Note that this notebook requests `CFG.n_chroma` = 16 bins instead of the conventional 12.

In [None]:
def get_feature_cf(data):
    """Compute ``librosa.feature.chroma_stft`` for every audio file in ``data``.

    The number of chroma bins is taken from ``CFG.n_chroma``.

    Parameters
    ----------
    data : iterable of str
        Audio file paths; each is decoded with ``librosa.load`` (default arguments).

    Returns
    -------
    output : list
        One chroma result per non-empty file, in input order.
    error_list : list of int
        Positions within ``data`` of files whose decoded waveform is empty. These
        files have no entry in ``output``.
    """
    error_list = []
    output = []
    # Iterate over the argument (previously the global df["data_path"] was used and
    # `data` was ignored), so the function works for any collection of paths.
    for idx, value in enumerate(tqdm(data)):
        wave, sr = librosa.load(value)
        if len(wave) > 0:
            output.append(librosa.feature.chroma_stft(y=wave, sr=sr, n_chroma=CFG.n_chroma))
        else:
            error_list.append(idx)
    return output, error_list

In [None]:
feature, error_list = get_feature_cf(df["data_path"])
data_dic["cf"] = feature
error_dic["cf"] = error_list

In [None]:
feature[0].shape

In [None]:
feature[1].shape

In [None]:
feature[0].mean(axis=1)

In [None]:
feature[1].mean(axis=1)

In [None]:
len(error_list)

In [None]:
# Visualization feature
fig, ax1 = plt.subplots(figsize=(6, 4))
tmp = librosa.load(df["data_path"].iloc[0])
librosa.display.waveshow(tmp[0], sr=tmp[1], alpha=0.5, color="green")

ax2 = ax1.twinx()
ax2.plot(librosa.frames_to_time(range(len(feature[0].mean(axis=0)))), feature[0].mean(axis=0), color='orange')

### RMS (Root Mean Square)

* Compute root-mean-square (RMS) value for each frame, either from the audio samples y or from a spectrogram S.

In [None]:
def get_feature_rms(data):
    """Compute ``librosa.feature.rms`` for every audio file in ``data``.

    Parameters
    ----------
    data : iterable of str
        Audio file paths; each is decoded with ``librosa.load`` (default arguments).

    Returns
    -------
    output : list
        One RMS result per non-empty file, in input order.
    error_list : list of int
        Positions within ``data`` of files whose decoded waveform is empty. These
        files have no entry in ``output``.
    """
    error_list = []
    output = []
    # Iterate over the argument (previously the global df["data_path"] was used and
    # `data` was ignored), so the function works for any collection of paths.
    for idx, value in enumerate(tqdm(data)):
        wave, _ = librosa.load(value)
        if len(wave) > 0:
            output.append(librosa.feature.rms(y=wave))
        else:
            error_list.append(idx)
    return output, error_list

In [None]:
feature, error_list = get_feature_rms(df["data_path"])
data_dic["rms"] = feature
error_dic["rms"] = error_list

In [None]:
feature[0].shape

In [None]:
feature[1].shape

In [None]:
feature[0].mean(axis=1)

In [None]:
feature[1].mean(axis=1)

In [None]:
len(error_list)

In [None]:
# Visualization feature
fig, ax1 = plt.subplots(figsize=(6, 4))
tmp = librosa.load(df["data_path"].iloc[0])
librosa.display.waveshow(tmp[0], sr=tmp[1], alpha=0.5, color="green")

ax2 = ax1.twinx()
ax2.plot(librosa.frames_to_time(range(len(feature[0].mean(axis=0)))), feature[0].mean(axis=0), color='orange')

In [None]:
# pickleIO(data_dic, "./data_dic.pkl", "w")
# pickleIO(error_dic, "./error_dic.pkl", "w")

## Preprocessing & Feature Engineering

In [46]:
data_dic = pickleIO(None, "./data_dic.pkl", "r")
error_dic = pickleIO(None, "./error_dic.pkl", "r")

In [47]:
selected_audio_features = ["zcr", "mfcc", "cf", "rms"]

### 1. Remove samples whose waveform length is zero

In [48]:
error_list = []
for k, v in error_dic.items():
    if k in selected_audio_features:
        error_list.extend(v)

In [49]:
error_list = list(sorted(set(error_list)))
len(error_list)

22

In [50]:
print("Droped samples")
df.loc[df.index[error_list]]

Droped samples


Unnamed: 0,data_path,type,timestamp
3039,.\data\dry\dry_202212\train_dry_09247.wav,dry,2022-12-01
3850,.\data\dry\dry_202301\train_dry_11541.wav,dry,2023-01-01
3851,.\data\dry\dry_202301\train_dry_11543.wav,dry,2023-01-01
5070,.\data\dry\dry_202303\train_dry_15017.wav,dry,2023-03-01
5403,.\data\abdominal\abdominal_202208\train_abdomi...,abdominal,2022-08-01
5467,.\data\abdominal\abdominal_202208\train_abdomi...,abdominal,2022-08-01
5512,.\data\abdominal\abdominal_202208\train_abdomi...,abdominal,2022-08-01
6262,.\data\abdominal\abdominal_202208\train_abdomi...,abdominal,2022-08-01
6291,.\data\abdominal\abdominal_202208\train_abdomi...,abdominal,2022-08-01
6676,.\data\abdominal\abdominal_202208\train_abdomi...,abdominal,2022-08-01


In [51]:
df = df.drop(index=df.index[error_list]).reset_index(drop=True)

In [52]:
df.shape

(15978, 3)

### 2. Add audio features

In [53]:
# ZCR
# Reduce every cached zero-crossing array to a single number: its mean
audio_zcr = np.array([crossings.mean() for crossings in data_dic["zcr"]])
print(audio_zcr.shape)

(15978,)


In [54]:
# MFCC
# Average every cached MFCC matrix along axis 1, giving one row per sample
mfcc_means = np.stack([coefs.mean(axis=1) for coefs in data_dic["mfcc"]])
# Add mean & std on embedding dimension (two extra columns appended on the right)
mfcc_row_mean = mfcc_means.mean(axis=1, keepdims=True)
mfcc_row_std = mfcc_means.std(axis=1, keepdims=True)
audio_mfcc = np.concatenate([mfcc_means, mfcc_row_mean, mfcc_row_std], axis=1)
print(audio_mfcc.shape)

(15978, 34)


In [55]:
# Chroma Frequencies
# Average every cached chroma matrix along axis 1, giving one row per sample
chroma_means = np.stack([chroma.mean(axis=1) for chroma in data_dic["cf"]])
# Add mean & std on embedding dimension (two extra columns appended on the right)
chroma_row_mean = chroma_means.mean(axis=1, keepdims=True)
chroma_row_std = chroma_means.std(axis=1, keepdims=True)
audio_cf = np.concatenate([chroma_means, chroma_row_mean, chroma_row_std], axis=1)
print(audio_cf.shape)

(15978, 18)


In [56]:
# RMS
# Six summary statistics of the RMS curve stored in row 0 of each cached RMS array.
# Lower bound for the denominator of the relative-change feature: a sample whose
# minimum RMS is exactly 0 (a silent frame) would otherwise produce inf / NaN.
# Samples with a minimum above this bound are unaffected.
rms_floor = np.finfo(np.float32).eps
audio_rms = []
for i in tqdm(data_dic["rms"]):
    tmp = pd.Series(i[0])
    rms_min, rms_max = tmp.min(), tmp.max()
    rms_range = rms_max - rms_min  # never negative, so no abs() is needed
    audio_rms.append(np.array([
        tmp.mean(),                          # mean
        tmp.std(),                           # standard deviation (pandas default, ddof=1)
        rms_max,                             # max
        rms_min,                             # min
        rms_range,                           # min-max range
        rms_range / max(rms_min, rms_floor)  # min-max pct change, guarded against min == 0
    ]))
audio_rms = np.stack(audio_rms)
print(audio_rms.shape)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15978/15978 [00:07<00:00, 2265.36it/s]

(15978, 6)





### Add to dataframe

In [57]:
# Attach the aggregated audio features as float32 columns.
# zcr is a single column; the 2-D blocks become "<prefix>_<k>" columns, in this order.
df["zcr"] = audio_zcr.astype("float32")
for prefix, block in (("mfcc", audio_mfcc), ("cf", audio_cf), ("rms", audio_rms)):
    column_names = [prefix + "_" + str(k) for k in range(block.shape[1])]
    df[column_names] = block.astype("float32")

In [58]:
df

Unnamed: 0,data_path,type,timestamp,zcr,mfcc_0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,...,cf_14,cf_15,cf_16,cf_17,rms_0,rms_1,rms_2,rms_3,rms_4,rms_5
0,.\data\dry\dry_202208\train_dry_00004.wav,dry,2022-08-01,0.139900,-164.412201,91.956772,-35.305840,14.003232,-12.111714,11.872651,...,0.383683,0.371184,0.526031,0.129426,0.033644,0.011288,0.051737,0.014360,0.037378,2.602967
1,.\data\dry\dry_202208\train_dry_00005.wav,dry,2022-08-01,0.158927,-147.619812,85.897751,-34.192146,10.878366,-13.145910,15.909914,...,0.510290,0.472357,0.566666,0.104708,0.036923,0.013021,0.056774,0.013845,0.042929,3.100740
2,.\data\dry\dry_202208\train_dry_00006.wav,dry,2022-08-01,0.156274,-183.159805,83.018837,-18.152813,4.313260,-13.685164,12.100813,...,0.610446,0.533425,0.581933,0.106541,0.023505,0.006078,0.032167,0.012039,0.020129,1.672029
3,.\data\dry\dry_202208\train_dry_00028.wav,dry,2022-08-01,0.167875,-129.386353,75.861954,-27.340113,11.384302,-12.023940,4.880302,...,0.364151,0.456148,0.520683,0.120454,0.044298,0.013805,0.062053,0.019455,0.042598,2.189502
4,.\data\dry\dry_202208\train_dry_00029.wav,dry,2022-08-01,0.185046,-134.342682,69.627205,-33.352882,12.914204,-13.951382,8.966229,...,0.473140,0.416629,0.559900,0.115355,0.043204,0.019100,0.075789,0.016624,0.059165,3.559124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15973,.\data\abdominal\abdominal_202303\train_abdomi...,abdominal,2023-03-01,0.111602,-149.501144,148.598618,-55.505138,-10.146660,-12.806923,16.677174,...,0.464035,0.433128,0.463365,0.149373,0.069876,0.027308,0.109631,0.027022,0.082609,3.057065
15974,.\data\abdominal\abdominal_202303\train_abdomi...,abdominal,2023-03-01,0.107968,-154.951431,141.261551,-59.844250,-6.186874,-6.261546,17.245977,...,0.510430,0.604079,0.555448,0.139338,0.061831,0.023182,0.094516,0.024802,0.069714,2.810760
15975,.\data\abdominal\abdominal_202303\train_abdomi...,abdominal,2023-03-01,0.110665,-148.315048,145.844757,-62.244270,-9.097380,-9.925684,14.065295,...,0.592941,0.591096,0.544817,0.132261,0.070057,0.030247,0.114065,0.019370,0.094695,4.888852
15976,.\data\abdominal\abdominal_202303\train_abdomi...,abdominal,2023-03-01,0.101207,-203.691879,139.977020,-44.398407,8.527876,-2.497573,10.334782,...,0.620527,0.577988,0.558574,0.094814,0.031462,0.008508,0.042846,0.015440,0.027406,1.775020


### 3. Date feature engineering

In [59]:
# Time feature engineering: only month and season are enabled; the other candidates stay commented out
df["month"] = df["timestamp"].dt.month
# season code per row, derived from the month via get_season
df["season"] = df["month"].map(get_season)
# df["week_of_month"] = df["timestamp"].apply(week_of_month)
# df["day"] = df["timestamp"].dt.day
# df["weekday"] = df["timestamp"].dt.weekday
# df["hour"] = df["timestamp"].dt.hour
# df["office_hour"] = df["hour"].apply(lambda x: 1 if ((x >= 9) & (x < 18)) else 0)
# df["sec_in_day"] = (df["timestamp"] - df["timestamp"].dt.normalize()).dt.total_seconds() / 3600
# df["sin_in_day"] = np.sin(2 * np.pi * df["sec_in_day"].values)
# df["cos_in_day"] = np.cos(2 * np.pi * df["sec_in_day"].values)

In [60]:
df

Unnamed: 0,data_path,type,timestamp,zcr,mfcc_0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,...,cf_16,cf_17,rms_0,rms_1,rms_2,rms_3,rms_4,rms_5,month,season
0,.\data\dry\dry_202208\train_dry_00004.wav,dry,2022-08-01,0.139900,-164.412201,91.956772,-35.305840,14.003232,-12.111714,11.872651,...,0.526031,0.129426,0.033644,0.011288,0.051737,0.014360,0.037378,2.602967,8,1
1,.\data\dry\dry_202208\train_dry_00005.wav,dry,2022-08-01,0.158927,-147.619812,85.897751,-34.192146,10.878366,-13.145910,15.909914,...,0.566666,0.104708,0.036923,0.013021,0.056774,0.013845,0.042929,3.100740,8,1
2,.\data\dry\dry_202208\train_dry_00006.wav,dry,2022-08-01,0.156274,-183.159805,83.018837,-18.152813,4.313260,-13.685164,12.100813,...,0.581933,0.106541,0.023505,0.006078,0.032167,0.012039,0.020129,1.672029,8,1
3,.\data\dry\dry_202208\train_dry_00028.wav,dry,2022-08-01,0.167875,-129.386353,75.861954,-27.340113,11.384302,-12.023940,4.880302,...,0.520683,0.120454,0.044298,0.013805,0.062053,0.019455,0.042598,2.189502,8,1
4,.\data\dry\dry_202208\train_dry_00029.wav,dry,2022-08-01,0.185046,-134.342682,69.627205,-33.352882,12.914204,-13.951382,8.966229,...,0.559900,0.115355,0.043204,0.019100,0.075789,0.016624,0.059165,3.559124,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15973,.\data\abdominal\abdominal_202303\train_abdomi...,abdominal,2023-03-01,0.111602,-149.501144,148.598618,-55.505138,-10.146660,-12.806923,16.677174,...,0.463365,0.149373,0.069876,0.027308,0.109631,0.027022,0.082609,3.057065,3,0
15974,.\data\abdominal\abdominal_202303\train_abdomi...,abdominal,2023-03-01,0.107968,-154.951431,141.261551,-59.844250,-6.186874,-6.261546,17.245977,...,0.555448,0.139338,0.061831,0.023182,0.094516,0.024802,0.069714,2.810760,3,0
15975,.\data\abdominal\abdominal_202303\train_abdomi...,abdominal,2023-03-01,0.110665,-148.315048,145.844757,-62.244270,-9.097380,-9.925684,14.065295,...,0.544817,0.132261,0.070057,0.030247,0.114065,0.019370,0.094695,4.888852,3,0
15976,.\data\abdominal\abdominal_202303\train_abdomi...,abdominal,2023-03-01,0.101207,-203.691879,139.977020,-44.398407,8.527876,-2.497573,10.334782,...,0.558574,0.094814,0.031462,0.008508,0.042846,0.015440,0.027406,1.775020,3,0


### Add external features (climate-related)

In [61]:
# Temperature
seoul_temp = pd.read_csv("./data/seoul_temperature_high.csv", encoding="cp949")
seoul_temp = seoul_temp[["일시", "최고기온(℃)"]]
seoul_temp.columns = ["timestamp", "temp"]
seoul_temp["timestamp"] = pd.to_datetime(seoul_temp["timestamp"])

In [62]:
# Get recent 3-year month average temperature
seoul_temp["year"] = seoul_temp["timestamp"].dt.year
seoul_temp["month"] = seoul_temp["timestamp"].dt.month
seoul_temp = seoul_temp[seoul_temp["year"].isin([2019, 2020, 2021])]
# Select "temp" before aggregating so the datetime column is never averaged (its handling
# by groupby().mean() differs between pandas versions). First average within each
# (year, month), then across years, so every year carries the same weight.
# Result: a Series named "temp" indexed by month.
seoul_temp = seoul_temp.groupby(["year", "month"])["temp"].mean().groupby(level="month").mean()

In [63]:
seoul_temp

month
1      4.069892
2      7.042693
3     13.506452
4     17.907778
5     23.617204
6     28.073333
7     30.084946
8     30.219355
9     26.532222
10    20.590323
11    12.853333
12     4.760215
Name: temp, dtype: float64

In [64]:
# Humidity
busan_hum = pd.read_csv("./data/busan_humidity_mean.csv", encoding="cp949")
busan_hum = busan_hum[["일시", "평균습도(%rh)"]]
busan_hum.columns = ["timestamp", "hum"]
busan_hum["timestamp"] = pd.to_datetime(busan_hum["timestamp"])

In [65]:
# Get recent 3-year month average humidity
busan_hum["year"] = busan_hum["timestamp"].dt.year
busan_hum["month"] = busan_hum["timestamp"].dt.month
busan_hum = busan_hum[busan_hum["year"].isin([2019, 2020, 2021])]
# Select "hum" before aggregating so the datetime column is never averaged (its handling
# by groupby().mean() differs between pandas versions). First average within each
# (year, month), then across years, so every year carries the same weight.
# Result: a Series named "hum" indexed by month.
busan_hum = busan_hum.groupby(["year", "month"])["hum"].mean().groupby(level="month").mean()

In [66]:
busan_hum

month
1     48.419355
2     51.873399
3     58.259140
4     56.134444
5     65.948387
6     74.348889
7     83.955914
8     80.798925
9     78.021111
10    64.034409
11    55.112222
12    47.618280
Name: hum, dtype: float64

In [67]:
# Add recent 3-year average climate feature
df["temp"] = df["month"].map(seoul_temp)
df["hum"] = df["month"].map(busan_hum)

In [68]:
df

Unnamed: 0,data_path,type,timestamp,zcr,mfcc_0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,...,rms_0,rms_1,rms_2,rms_3,rms_4,rms_5,month,season,temp,hum
0,.\data\dry\dry_202208\train_dry_00004.wav,dry,2022-08-01,0.139900,-164.412201,91.956772,-35.305840,14.003232,-12.111714,11.872651,...,0.033644,0.011288,0.051737,0.014360,0.037378,2.602967,8,1,30.219355,80.798925
1,.\data\dry\dry_202208\train_dry_00005.wav,dry,2022-08-01,0.158927,-147.619812,85.897751,-34.192146,10.878366,-13.145910,15.909914,...,0.036923,0.013021,0.056774,0.013845,0.042929,3.100740,8,1,30.219355,80.798925
2,.\data\dry\dry_202208\train_dry_00006.wav,dry,2022-08-01,0.156274,-183.159805,83.018837,-18.152813,4.313260,-13.685164,12.100813,...,0.023505,0.006078,0.032167,0.012039,0.020129,1.672029,8,1,30.219355,80.798925
3,.\data\dry\dry_202208\train_dry_00028.wav,dry,2022-08-01,0.167875,-129.386353,75.861954,-27.340113,11.384302,-12.023940,4.880302,...,0.044298,0.013805,0.062053,0.019455,0.042598,2.189502,8,1,30.219355,80.798925
4,.\data\dry\dry_202208\train_dry_00029.wav,dry,2022-08-01,0.185046,-134.342682,69.627205,-33.352882,12.914204,-13.951382,8.966229,...,0.043204,0.019100,0.075789,0.016624,0.059165,3.559124,8,1,30.219355,80.798925
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15973,.\data\abdominal\abdominal_202303\train_abdomi...,abdominal,2023-03-01,0.111602,-149.501144,148.598618,-55.505138,-10.146660,-12.806923,16.677174,...,0.069876,0.027308,0.109631,0.027022,0.082609,3.057065,3,0,13.506452,58.259140
15974,.\data\abdominal\abdominal_202303\train_abdomi...,abdominal,2023-03-01,0.107968,-154.951431,141.261551,-59.844250,-6.186874,-6.261546,17.245977,...,0.061831,0.023182,0.094516,0.024802,0.069714,2.810760,3,0,13.506452,58.259140
15975,.\data\abdominal\abdominal_202303\train_abdomi...,abdominal,2023-03-01,0.110665,-148.315048,145.844757,-62.244270,-9.097380,-9.925684,14.065295,...,0.070057,0.030247,0.114065,0.019370,0.094695,4.888852,3,0,13.506452,58.259140
15976,.\data\abdominal\abdominal_202303\train_abdomi...,abdominal,2023-03-01,0.101207,-203.691879,139.977020,-44.398407,8.527876,-2.497573,10.334782,...,0.031462,0.008508,0.042846,0.015440,0.027406,1.775020,3,0,13.506452,58.259140


## Split dataset into train, validation and test

In [69]:
df = df.sample(frac=1, random_state=42)

In [70]:
stratVec = df["type"].astype("str")  + "_" + df["month"].astype("str")
df_full, df_test = tts(df, test_size=int(len(df) * 0.1), stratify=stratVec, random_state=42)

In [71]:
df_full.shape, df_test.shape

((14381, 66), (1597, 66))

In [72]:
assert len(df) == len(df_full) + len(df_test)

In [73]:
stratVec = df_full["type"].astype("str")  + "_" + df_full["month"].astype("str")
df_train, df_valid = tts(df_full, test_size=int(len(df) * 0.1), stratify=stratVec, random_state=42)

In [74]:
df_train.shape, df_valid.shape

((12784, 66), (1597, 66))

In [75]:
stratVec = df_test["type"].astype("str")  + "_" + df_test["month"].astype("str")
df_public, df_private = tts(df_test, test_size=int(len(df_test) * 0.5), stratify=stratVec, random_state=42)

In [76]:
df_public.shape, df_private.shape

((799, 66), (798, 66))

In [77]:
# Save datasets
# open(..., "wb") does not create missing parent folders, so make sure the target exists
os.makedirs("./dataset", exist_ok=True)
datasets_to_save = {
    "df": df,
    "df_full": df_full,
    "df_test": df_test,
    "df_train": df_train,
    "df_valid": df_valid,
    "df_public": df_public,
    "df_private": df_private,
}
for dataset_name, dataset in datasets_to_save.items():
    pickleIO(dataset, "./dataset/" + dataset_name + ".pkl", "w")