In [None]:
# Enable the `widgetsnbextension` notebook extension through the Jupyter CLI.
# Presumably required so the tqdm progress bars used below render as widgets — TODO confirm.
!jupyter nbextension enable --py widgetsnbextension

In [None]:
import sys

# Put the parent directory on the module search path. Presumably this is what
# lets the `src` package and `settings` module imported below resolve when the
# notebook runs from its own folder — TODO confirm.
sys.path.append('../')

In [None]:
import os
from glob import glob

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from hyperopt import hp
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from tqdm.autonotebook import tqdm

from src.data import TaggingDataset, collate_fn_test, collate_fn_train, NewTaggingDataset
from src.models import (
    BiLSTM,
    ConvBERT,
    BERT,
    CompressBERT,
    single_model,
    predict,
    train_model_early_stopping,
    tune_params_and_fit,
)
from settings import BATCH_SIZE, CUDA_DEV, EMB_PATH, RANDOM_STATE

In [None]:
# Read the train and test tables from CSV files in the current working directory.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Load the per-track embedding arrays matched by EMB_PATH into a dict keyed by
# the integer track id parsed from each file name, and record the largest
# first-axis length seen across the loaded arrays.
track_idx2embeds = {}
max_len = 0
# NOTE(review): only the first 250 paths returned by glob are loaded, and glob
# does not guarantee any ordering — confirm this cap is intentional and not a
# leftover debugging limit.
for fn in tqdm(glob(EMB_PATH)[:250]):
    # os.path.basename strips the directory part using the platform's own
    # separator rules. Splitting on a literal backslash only worked on Windows;
    # on POSIX the whole path reached int() and typically raised ValueError.
    track_idx = int(os.path.basename(fn).split('.')[0])
    embeds = np.load(fn)
    max_len = max(max_len, embeds.shape[0])
    track_idx2embeds[track_idx] = embeds

In [None]:
# Train one CompressBERT per chunk of output columns, checkpoint each best
# model, and write its test-set predictions into the matching column slice of
# `predictions`.
N_CHUNKS = 8                 # number of models trained (one per column chunk)
CHUNK_DIM = 32               # columns produced by each model (its output_dim)
MODELS_DIR = '../models'     # checkpoint folder

# Keep only the rows whose track id has a loaded embedding (dict membership
# is tested against the keys of track_idx2embeds).
df_train = df_train[df_train.track.isin(track_idx2embeds)]
df_test = df_test[df_test.track.isin(track_idx2embeds)]
predictions = np.zeros((len(df_test), N_CHUNKS * CHUNK_DIM))

# Create the checkpoint folder up front so torch.save cannot fail on a missing
# directory after a full training run has already completed.
os.makedirs(MODELS_DIR, exist_ok=True)

for i in range(N_CHUNKS):
    print(f"Fold {i}:")
    # random_state is fixed, so every iteration uses the same train/val split.
    # NOTE(review): RANDOM_STATE is imported from settings but the literal 42
    # is used here — confirm which one is intended.
    train, val = train_test_split(df_train, test_size=0.2, random_state=42)
    model = CompressBERT(hidden_dim=512, attn_heads=4, n_attn_layers=2, dropout=0.1, output_dim=CHUNK_DIM)
    criterion = nn.BCEWithLogitsLoss()
    model = model.to(CUDA_DEV)
    criterion = criterion.to(CUDA_DEV)
    optimizer = torch.optim.AdamW(model.parameters(), lr=6e-4)
    # NOTE(review): this StepLR scheduler is constructed, but `scheduler=None`
    # is what gets passed to train_model_early_stopping below, so it is never
    # handed to the training routine — confirm whether LR decay was meant to
    # be enabled.
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.3)
    # `i` is forwarded to every dataset; presumably it selects the label chunk
    # that matches this model's output columns — TODO confirm in NewTaggingDataset.
    train_dataset = NewTaggingDataset(train, track_idx2embeds=track_idx2embeds, max_len=max_len, i=i)
    val_dataset = NewTaggingDataset(val, track_idx2embeds=track_idx2embeds, max_len=max_len, i=i)
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn_train)
    val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn_train)
    test_dataset = NewTaggingDataset(df_test, testing=True, track_idx2embeds=track_idx2embeds, max_len=max_len, i=i)
    # The test loader is not shuffled, so rows come back in dataset order.
    test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn_test, shuffle=False)
    best_model, metric = train_model_early_stopping(
        model=model,
        train_loader=train_dataloader,
        val_loader=val_dataloader,
        loss_function=criterion,
        optimizer=optimizer,
        scheduler=None,
        device=CUDA_DEV,
        early_stopping=4
    )
    # The checkpoint name embeds the chunk index and the first five characters
    # of the metric's string form.
    torch.save(best_model.state_dict(), f"{MODELS_DIR}/bert_{i}_{str(metric)[:5]}.pt")
    # `track_idxs` from the final iteration is reused by the submission cell.
    track_idxs, preds = predict(best_model, test_dataloader)
    predictions[:, CHUNK_DIM * i:CHUNK_DIM * (i + 1)] = preds

In [None]:
# Assemble the submission table: one record per (track id, prediction row)
# pair, with the row's values serialised into a single comma-separated string.
# `track_idxs` is the value returned by the last predict() call in the
# training loop.
submission_rows = []
for track, probs in zip(track_idxs, predictions):
    joined_values = ','.join(map(str, probs))
    submission_rows.append({'track': track, 'prediction': joined_values})
predictions_df = pd.DataFrame(submission_rows)

In [None]:
# Write the submission table to disk without the DataFrame index column.
submission_path = 'submit.csv'
predictions_df.to_csv(submission_path, index=False)