# cRT (T-F)

In [1]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

import tensorflow as tf
import tensorflow_addons as tfa
from keras import layers

import librosa
from audiomentations import SpecCompose, SpecFrequencyMask

from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

import pickle
import argparse
import wandb
from wandb.keras import WandbCallback
# Open the Weights & Biases run that records this experiment.
wandb.init(project="DACON_235910", name="cRT")

# Hyperparameters as (flag, default, type) rows; each row becomes one argparse option.
_CLI_OPTIONS = (
    ('--feature', "melspec", str),                    # mfcc or melspec
    ('--pretrained_model', "efficientnetb0", str),
    ('--resize_size', 224, int),
    ('--sampling_rate', 16000, int),
    # ('--frequency_mask', 0, float),                 # disabled together with the augmentation
    ('--optimizer', "sgd", str),                      # sgd or adam
    ('--loss', "bc", str),                            # bc or fl
    ('--learning_rate', 0.001, float),
    ('--batch_size', 32, int),
    ('--epochs', 100, int),
    ('--cv', 10, int),
    ('--seed', 1011, int),
)

parser = argparse.ArgumentParser(description="cRT")
for _flag, _default, _kind in _CLI_OPTIONS:
    parser.add_argument(_flag, default=_default, type=_kind)

# Parsing an empty argument string means every option takes its default,
# which is what a notebook kernel (no real command line) needs.
args = parser.parse_args('')

# Mirror the resolved hyperparameters into the W&B run config.
wandb.config.update(args)

# Unpack into the module-level names used by the rest of the notebook.
feature, pretrained_model = args.feature, args.pretrained_model
resize_size, sampling_rate = args.resize_size, args.sampling_rate
# frequency_mask = args.frequency_mask
optimizer, loss = args.optimizer, args.loss
learning_rate = args.learning_rate
BATCH_SIZE, EPOCHS = args.batch_size, args.epochs
cv, seed = args.cv, args.seed

def set_seeds(seed=seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    TensorFlow's global generator, and exports ``PYTHONHASHSEED``.

    Parameters
    ----------
    seed : int
        Seed value; defaults to the ``--seed`` hyperparameter resolved above.
    """
    # NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it
    # here is expected to influence child processes only, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)


# Apply the configured seed right away so that shuffling and weight
# initialisation further down start from a known state.
set_seeds()

# augment = SpecCompose([SpecFrequencyMask(p=frequency_mask)])
    
def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Cached train / test frames (one row per recording).
train_df = _read_pickle('data/train_df.pkl')
test_df = _read_pickle('data/test_df.pkl')

train_df.head()

[34m[1mwandb[0m: Currently logged in as: [33mgnoeyheat[0m (use `wandb login --relogin` to force relogin)


Unnamed: 0,data,id,age,gender,respiratory_condition,fever_or_muscle_pain,covid19
0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,24,female,0,1,0
1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2,51,male,0,0,0
2,"[2.7372453e-09, -1.0615647e-08, 5.2142607e-08,...",3,22,male,0,0,0
3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",4,29,female,1,0,0
4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",5,23,male,0,0,0


In [2]:
# Relative frequency of each target class (0 vs 1) in the training set.
train_df.loc[:, "covid19"].value_counts(normalize=True)

0    0.91958
1    0.08042
Name: covid19, dtype: float64

## Data Generation

In [3]:
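# One-off generation of the pickled frames, kept commented out for reference.
# NOTE: the snippet below writes 'train_df.pkl' / 'test_df.pkl' to the working
# directory, whereas the loading cell reads them from 'data/' - move the files
# (or adjust the paths) after regenerating.
# train_df = pd.read_csv("data/train_data.csv")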
# test_df = pd.read_csv("data/test_data.csv")

# train_folder = "data/train/"
# test_folder = "data/test/"

# def dataset(folder, df):
#     dataset = []
#     for uid in tqdm(df['id']):
#         path = os.path.join(folder, str(uid).zfill(5)+'.wav')
#         y, sr = librosa.load(path, sr=sampling_rate)
#         y = librosa.util.normalize(y)
#         dataset.append([y])
#     dataset = pd.DataFrame(dataset, columns=['data'])
#     dataset = pd.concat([dataset, df], axis=1)
#     return dataset

# train_df = dataset(train_folder, train_df)
# test_df = dataset(test_folder, test_df)

# with open('train_df.pkl', 'wb') as f:
#     pickle.dump(train_df, f, pickle.HIGHEST_PROTOCOL)
# with open('test_df.pkl', 'wb') as f:
#     pickle.dump(test_df, f, pickle.HIGHEST_PROTOCOL)

## Preprocessing

In [4]:
def preprocess_dataset(data, feature_name=None, sr=None, out_size=None):
    """Convert raw waveforms into 3-channel, fixed-size spectral "images".

    For each waveform, three spectral representations are computed with
    different (n_fft, hop_length) pairs: the base window, a 4x longer window,
    and a 4x longer hop. Each one is resized to ``out_size x out_size`` and
    the three are stacked along the last axis.

    Parameters
    ----------
    data : iterable
        Waveforms, each passed as ``y`` to librosa.
    feature_name : {"mfcc", "melspec"}, optional
        Spectral feature to extract. Defaults to the notebook-level ``feature``.
    sr : int, optional
        Sampling rate in Hz. Defaults to the notebook-level ``sampling_rate``.
    out_size : int, optional
        Side length of the resized output. Defaults to the notebook-level
        ``resize_size``.

    Returns
    -------
    list
        One array of shape ``(out_size, out_size, 3)`` per input waveform.

    Raises
    ------
    ValueError
        If ``feature_name`` is neither "mfcc" nor "melspec".
    """
    # Fall back to the notebook-level configuration when no override is given.
    feature_name = feature if feature_name is None else feature_name
    sr = sampling_rate if sr is None else sr
    out_size = resize_size if out_size is None else out_size

    # Fail fast with a clear message instead of reaching an unbound variable
    # inside the loop when the feature type is misspelled.
    if feature_name not in ("mfcc", "melspec"):
        raise ValueError(
            f"Unsupported feature type {feature_name!r}; expected 'mfcc' or 'melspec'."
        )

    frame_length = 0.025  # analysis window, in seconds
    frame_stride = 0.010  # hop between windows, in seconds

    input_nfft = int(round(sr*frame_length))
    input_stride = int(round(sr*frame_stride))

    # One (n_fft, hop_length) pair per output channel; identical for every
    # waveform, so it is built once outside the loop.
    stft_configs = [
        (input_nfft, input_stride),
        (input_nfft*4, input_stride),
        (input_nfft, input_stride*4),
    ]

    extracted_features = []
    for signal in tqdm(data):
        channels = []
        for nfft, stride in stft_configs:
            if feature_name == "mfcc":
                S = librosa.feature.mfcc(y=signal,
                                         sr=sr,
                                         n_mfcc=40,
                                         n_fft=nfft,
                                         hop_length=stride)
            else:  # "melspec"
                S = librosa.feature.melspectrogram(y=signal,
                                                   sr=sr,
                                                   n_mels=128,
                                                   n_fft=nfft,
                                                   hop_length=stride)
                # Power spectrogram -> decibels relative to its own maximum.
                S = librosa.power_to_db(S, ref=np.max)
            # tf.image.resize needs a channel axis; add it, resize, then drop it.
            S = tf.image.resize(S[:, :, np.newaxis], (out_size, out_size))
            channels.append(S[:, :, 0])
        extracted_features.append(np.stack(channels, axis=2))
    return extracted_features

# Spectral image tensors for the train and test splits (train is processed first).
X, X_test = (
    np.array(preprocess_dataset(frame["data"]))
    for frame in (train_df, test_df)
)
# Binary target, kept as a pandas Series.
y = train_df["covid19"]

X.shape, y.shape, X_test.shape

100%|██████████████████████████████████████████████████████████████████████████████| 3805/3805 [04:25<00:00, 14.33it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 5732/5732 [05:44<00:00, 16.64it/s]


((3805, 224, 224, 3), (3805,), (5732, 224, 224, 3))

In [5]:
def preprocess_feature(df):
    """Derive the two symptom features and drop the raw metadata columns.

    ``condition1`` is the sum and ``condition2`` the product of the
    ``respiratory_condition`` and ``fever_or_muscle_pain`` columns. The
    columns ``id``, ``age``, ``gender`` and the two source symptom columns
    are removed. The input frame is left untouched; a new frame is returned.
    """
    respiratory = df["respiratory_condition"]
    fever = df["fever_or_muscle_pain"]
    unused_columns = ["id", "age", "gender", "respiratory_condition", "fever_or_muscle_pain"]
    return (
        df.assign(condition1=respiratory + fever, condition2=respiratory * fever)
          .drop(columns=unused_columns)
    )

# Replace both frames with their feature-engineered versions.
train_df, test_df = (preprocess_feature(frame) for frame in (train_df, test_df))

# Tabular inputs for the test split, as a plain array.
X_test_tab = test_df[["condition1", "condition2"]].to_numpy()

X_test_tab.shape

(5732, 2)

## Training

In [6]:
def _make_dataset(images, tabular, labels, shuffle):
    """Wrap (image, tabular) inputs and labels in a batched, prefetched tf.data pipeline."""
    ds = tf.data.Dataset.from_tensor_slices(((images, tabular), labels))
    if shuffle:
        # Buffer as large as the dataset -> full shuffle on every epoch.
        ds = ds.shuffle(len(images))
    return ds.batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)


def _build_model(n_tab_features):
    """Build the two-input classifier and return ``(encoder, model)``.

    The image branch is the pretrained encoder with global average pooling;
    its output is concatenated with the tabular features and fed to a single
    sigmoid unit.
    """
    if pretrained_model == "efficientnetb0":
        encoder = tf.keras.applications.EfficientNetB0(
            include_top=False,
            weights="imagenet",
            pooling='avg',
        )
    else:
        # Without this guard an unknown name would reuse the previous fold's
        # encoder (or fail with a NameError on the first fold).
        raise ValueError(f"Unsupported pretrained_model: {pretrained_model!r}")

    inp = tf.keras.Input(shape=(resize_size, resize_size, 3))
    tab = tf.keras.Input(shape=(n_tab_features,))
    x = encoder(inp)
    x = layers.Concatenate()([x, tab])
    oup = layers.Dense(1, activation="sigmoid")(x)
    return encoder, tf.keras.Model(inputs=[inp, tab], outputs=oup)


def _make_optimizer(initial_lr):
    """Create the configured optimizer with a cosine-decayed learning rate."""
    # The schedule anneals over a fixed 1000 optimizer steps (not tied to EPOCHS).
    lr = tf.keras.optimizers.schedules.CosineDecay(initial_lr, decay_steps=1000)
    if optimizer == "adam":
        return tf.keras.optimizers.Adam(learning_rate=lr)
    if optimizer == "sgd":
        return tf.keras.optimizers.SGD(learning_rate=lr, momentum=0.9)
    raise ValueError(f"Unsupported optimizer: {optimizer!r}; expected 'sgd' or 'adam'.")


def _fit_stage(model, train_ds, val_ds, initial_lr, checkpoint_filepath):
    """(Re)compile `model` and train it with early stopping and best-weights checkpointing."""
    # NOTE(review): the --loss option ("bc" or "fl") is not consulted here;
    # binary cross-entropy is always used.
    model.compile(
        optimizer=_make_optimizer(initial_lr),
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=0),
    )
    # A single flat list of callback instances.
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=10,
            restore_best_weights=True,
        ),
        tf.keras.callbacks.ModelCheckpoint(
            checkpoint_filepath,
            monitor="val_loss",
            save_best_only=True,
            save_weights_only=True
        ),
        WandbCallback(),
    ]
    return model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=EPOCHS,
        callbacks=callbacks,
    )


validations = []   # best validation macro-F1 per fold
thresholds = []    # decision threshold that achieved it
predictions = []   # test-set probabilities per fold

# Positional (NumPy) views so fold indices never depend on the frames' index labels.
y_all = np.asarray(y)
tab_all = train_df[["condition1", "condition2"]].values
candidate_thresholds = np.linspace(0.05, 0.95, 19)

# Unshuffled stratified folds: the split itself is deterministic.
skf = StratifiedKFold(n_splits=cv)
for idx, (train_index, val_index) in enumerate(tqdm(skf.split(X, y_all), total=cv), start=1):

    # Drop Keras global state left over from the previous fold's model so
    # memory does not keep growing across folds.
    tf.keras.backend.clear_session()

    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y_all[train_index], y_all[val_index]
    X_train_tab, X_val_tab = tab_all[train_index], tab_all[val_index]

    train_ds = _make_dataset(X_train, X_train_tab, y_train, shuffle=True)
    val_ds = _make_dataset(X_val, X_val_tab, y_val, shuffle=False)

    encoder, model = _build_model(X_train_tab.shape[1])

    # ---- Stage 1: train the whole network on the full (imbalanced) fold. ----
    history = _fit_stage(
        model, train_ds, val_ds,
        initial_lr=learning_rate,
        checkpoint_filepath=f"load_model/{parser.description}_{idx}_1stage",
    )

    # ---- Stage 2: retrain the classifier on a class-balanced subset. ----
    # Keep every positive sample and the first len(positives) negatives,
    # negatives first, then positives.
    pos_idx = np.flatnonzero(y_train == 1)
    neg_idx = np.flatnonzero(y_train == 0)[:len(pos_idx)]
    balanced_idx = np.concatenate((neg_idx, pos_idx))

    X_train = X_train[balanced_idx]
    X_train_tab = X_train_tab[balanced_idx]
    y_train = y_train[balanced_idx]

    train_ds = _make_dataset(X_train, X_train_tab, y_train, shuffle=True)

    # Freeze the encoder; only the layers after it keep training.
    encoder.trainable = False

    history = _fit_stage(
        model, train_ds, val_ds,
        initial_lr=learning_rate*0.1,
        checkpoint_filepath=f"load_model/{parser.description}_{idx}_2stage",
    )

    # ---- Pick the decision threshold that maximises validation macro-F1. ----
    # Predict once; the probabilities do not depend on the threshold.
    val_prob = model.predict(val_ds).ravel()

    max_f1 = 0
    threshold = 0
    for temp_threshold in candidate_thresholds:
        temp_f1 = f1_score(y_val, np.where(val_prob > temp_threshold, 1, 0), average="macro")
        if temp_f1 > max_f1:
            max_f1 = temp_f1
            threshold = temp_threshold

    print(f"idx:{idx}, f1:{max_f1}, threshold:{threshold}")

    validations.append(max_f1)
    thresholds.append(threshold)
    predictions.append(model.predict([X_test, X_test_tab]))

val_f1 = np.mean(validations, axis=0)

print("validation_macro-f1: ", val_f1)
wandb.log({'validation_macro-f1': val_f1})

0it [00:00, ?it/s]

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
idx:1, f1:0.5587574085428163, threshold:0.2


1it [08:30, 510.49s/it]

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
idx:2, f1:0.5775560224089636, threshold:0.25


2it [18:19, 556.79s/it]

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
idx:3, f1:0.5355276907001045, threshold:0.2


3it [27:13, 546.34s/it]

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
idx:4, f1:0.5206489349961372, threshold:0.2


4it [35:54, 536.47s/it]

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
idx:5, f1:0.522931031604254, threshold:0.15


5it [43:39, 510.59s/it]

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
idx:6, f1:0.5834855681402996, threshold:0.25


6it [53:26, 536.47s/it]

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
idx:7, f1:0.5657142857142857, threshold:0.3


7it [1:01:52, 526.67s/it]

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
idx:8, f1:0.5211566520758721, threshold:0.3


8it [1:09:49, 510.85s/it]

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
idx:9, f1:0.5353247984826932, threshold:0.25


9it [1:16:48, 482.18s/it]

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
idx:10, f1:0.58260644827809, threshold:0.2


10it [1:25:00, 510.00s/it]

validation_macro-f1:  0.5503708840943518





## Inference

In [7]:
# Ensemble the folds: average their test probabilities and cut at the mean
# of the per-fold thresholds.
threshold = np.mean(thresholds)
ensemble_prob = np.mean(predictions, axis=0)
test_df["covid19"] = np.where(ensemble_prob > threshold, 1, 0)

# Fill the sample submission with the predicted labels and write it out.
submission = pd.read_csv('data/sample_submission.csv').assign(covid19=test_df['covid19'])
submission.to_csv('submission.csv', index=False)

# Predicted class balance on the test set.
test_df['covid19'].value_counts(normalize=True)

0    0.974878
1    0.025122
Name: covid19, dtype: float64