In [16]:
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm
from glob import glob

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score

from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
import torch
from torch.utils.data import DataLoader
import xgboost as xgb

In [17]:
# Matplotlib settings
import matplotlib
import matplotlib as mp
import matplotlib.pyplot as plt
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.collections import PolyCollection
from matplotlib.colors import colorConverter

titlesize = 20
labelsize = 16
legendsize = labelsize
xticksize = 14
yticksize = xticksize

matplotlib.rcParams['legend.markerscale'] = 1.5     # the relative size of legend markers vs. original
matplotlib.rcParams['legend.handletextpad'] = 0.5
matplotlib.rcParams['legend.labelspacing'] = 0.4    # the vertical space between the legend entries in fraction of fontsize
matplotlib.rcParams['legend.borderpad'] = 0.5       # border whitespace in fontsize units
matplotlib.rcParams['font.size'] = 12
matplotlib.rcParams['font.family'] = 'serif'
matplotlib.rcParams['font.serif'] = 'Times New Roman'
matplotlib.rcParams['axes.labelsize'] = labelsize
matplotlib.rcParams['axes.titlesize'] = titlesize

matplotlib.rc('xtick', labelsize=xticksize)
matplotlib.rc('ytick', labelsize=yticksize)
matplotlib.rc('legend', fontsize=legendsize)

matplotlib.rc('font', **{'family':'serif'})

# Data loading

In [18]:
df_trainval = pd.read_csv('data/train.csv')
df_train, df_val = train_test_split(df_trainval, test_size=0.2)
df_train['tags'] = df_train['tags'].apply(lambda x: np.array(list(map(int, x.split(',')))))
df_val['tags'] = df_val['tags'].apply(lambda x: np.array(list(map(int, x.split(',')))))

df_test = pd.read_csv('data/test.csv')

In [5]:
# global_idx2embeds = {}
# for track_idx in tqdm(df_trainval['track']):
#     fn = f"data/track_embeddings\\{track_idx}.npy"
#     embeds = np.load(fn)
#     global_idx2embeds[track_idx] = embeds
# print(len(global_idx2embeds))

100%|██████████| 10000/10000 [00:11<00:00, 857.54it/s]

10000





In [19]:
global_idx2embeds = {} # {idx: np.ndarray[n, 768])}
for npy_file in tqdm(glob('data/track_embeddings/*')):
    track_idx = int(npy_file.split('\\')[1].split('.')[0])
    embeds = np.load(npy_file)
    global_idx2embeds[track_idx] = embeds[:-2]

100%|██████████| 76714/76714 [01:01<00:00, 1237.74it/s]


In [20]:
emb_train_mean = 0
for track_idx in tqdm(df_train['track']):
    emb_train_mean += global_idx2embeds[track_idx].mean(axis=0)
emb_train_mean /= len(df_train['track'])
emb_train_mean.shape

100%|██████████| 40907/40907 [00:23<00:00, 1751.14it/s]


(768,)

In [23]:
class TrackDataset(torch.utils.data.Dataset):
    def __init__(self, df_tags, test=False):
        self.df_tags = df_tags
        self.test = test

    def __len__(self):
        return len(self.df_tags)

    def __getitem__(self, idx):
        track_idx = self.df_tags.iloc[idx]['track']
        embeds = global_idx2embeds[track_idx]
        if self.test:
            return track_idx, embeds
        labels_onehot = np.zeros(256)
        labels_onehot[self.df_tags.iloc[idx]['tags']] = 1
        return track_idx, embeds, labels_onehot

In [26]:
def collate(batch):
    track_idxs = np.vstack([triplet[0] for triplet in batch])
    embeds = np.array([np.mean(triplet[1], axis=0) - emb_train_mean for triplet in batch])
    labels_onehot = np.vstack([triplet[2] for triplet in batch])
    return track_idxs, embeds, labels_onehot

def collate_test(batch):
    track_idxs = np.vstack([triplet[0] for triplet in batch])
    embeds = np.array([np.mean(triplet[1], axis=0) - emb_train_mean for triplet in batch])
    return track_idxs, embeds

batch_size = 40907

dataset_train = TrackDataset(df_train)
dataloader_train = DataLoader(dataset_train, batch_size, shuffle=True, collate_fn=collate)

dataset_val = TrackDataset(df_val)
dataloader_val = DataLoader(dataset_val, batch_size, shuffle=False, collate_fn=collate)

dataset_test = TrackDataset(df_test, test=True)
dataloader_test = DataLoader(dataset_test, batch_size, shuffle=False, collate_fn=collate_test)

In [None]:
xgb_estimator = xgb.XGBClassifier(objective='binary:logistic')

multilabel_model = MultiOutputClassifier(xgb_estimator)

y_val = np.array([y[2] for y in dataset_val])
X_val = np.array([np.mean(y[1], axis=0) for y in dataset_val])

# fit the model
epoch2valap = dict()
best_model = None
for epoch in range(10):
    for batch_train in dataloader_train:
        track_idxs, embeds, labels_onehot = batch_train
        multilabel_model.fit(embeds, labels_onehot)
        break
    pred_val = multilabel_model.predict(X_val)
    ap = average_precision_score(y_val, pred_val)
    epoch2valap[epoch] = ap
    if ap >= max(epoch2valap.values()):
        best_model = multilabel_model
    print(f"{epoch=}, val_ap={ap}")

In [9]:
pred_val = multilabel_model.predict(X_val)

In [10]:
average_precision_score(y_val, pred_val)

0.02047152887568346

In [34]:
def predict(model, loader):
    model.to(global_device)
    model.eval()
    track_idxs = []
    predictions = []
    with torch.no_grad():
        for data in loader:
            track_idx, embeds = data
            embeds = [x.to(global_device) for x in embeds]
            pred_logits = model(embeds)
            pred_probs = torch.sigmoid(pred_logits)
            predictions.append(pred_probs.cpu().numpy())
            track_idxs.append(track_idx.numpy())
    predictions = np.vstack(predictions)
    track_idxs = np.vstack(track_idxs).ravel()
    return track_idxs, predictions

In [40]:
val_true = np.array([y[2] for y in dataset_val])
track_idxs_val, val_pred = predict(model, dataloader_val)
average_precision_score(val_true[2000:2256], val_pred[2000:2256])

0.3002778632457312

In [56]:
val_true.shape

(10227, 256)

In [54]:
average_precision_score(val_true[:100], val_pred[:100], default="macro")

0.2785028297836928

In [70]:
track_idxs, predictions = predict(model.to(global_device), dataloader_test)

In [72]:
predictions_df = pd.DataFrame([
    {'track': track, 'prediction': ','.join([str(p) for p in probs])}
    for track, probs in zip(track_idxs, predictions)
])
predictions_df.to_csv('prediction.csv', index=False)