In [1]:
import os
import sys
sys.path.remove('/home/jovyan/.imgenv-lm-poly-0/lib/python3.7/site-packages')
os.environ['PYTHONPATH'] = '/home/user/conda/envs/ya/lib/python3.10/site-packages'

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from glob import glob

In [2]:
CUDA_DEV = 0
NUM_TAGS = 256

In [3]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [4]:
from collections import Counter

tags = [[int(i) for i in x.split(',')] for x in df_train.tags.values]
dict_tags = {}
for cls_tags in tags:
    for c in cls_tags:
        if c not in dict_tags.keys():
            dict_tags[c] = Counter(cls_tags)
        else:
            dict_tags[c].update(Counter(cls_tags))
            
for tag in dict_tags.keys():
    del dict_tags[tag][tag]
    n = np.sum(list(dict_tags[tag].values()))
    for t in dict_tags[tag].keys():
        dict_tags[tag][t] = dict_tags[tag][t]/n

In [None]:
track_idx2embeds = {}
for fn in tqdm(glob('track_embeddings/*')):
    name = fn.split('/')[1].split('.')[0]
    if name == "track_embeddings":
        continue
    track_idx = int(name)
    embeds = np.load(fn)
    track_idx2embeds[track_idx] = embeds

  7%|▋         | 5077/76715 [00:09<02:10, 547.52it/s]

In [None]:
class TaggingDataset(Dataset):
    def __init__(self, df, aug=0, testing=False):
        self.df = df
        self.testing = testing
        self.aug = aug
        
    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        track_idx = row.track
        embeds = track_idx2embeds[track_idx]
        if self.testing:
            return track_idx, embeds
        tags = [int(x) for x in row.tags.split(',')]
        target = np.zeros(NUM_TAGS)
        target[tags] = 1
        
        if np.random.choice([0, 1], p=[1 - self.aug, self.aug]):
            s = np.random.uniform(0.0, 0.4)
            e = np.random.uniform(s+0.1, 1)
            s = int(s * embeds.shape[0])
            e = int(e * embeds.shape[0])
            embeds = embeds[s:e]
        
        return track_idx, embeds, target

In [7]:
train_dataset = TaggingDataset(df_train[:-1000], aug=0.6)
val_dataset = TaggingDataset(df_train[-1000:])

test_dataset = TaggingDataset(df_test, testing=True)

In [8]:
class FeedForward(nn.Module):
    def __init__(self, emb_dim=768, mult=4, p=0.0):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(emb_dim, emb_dim * mult),
            nn.Dropout(p),
            nn.GELU(),
            nn.Linear(emb_dim * mult, emb_dim)
        )

    def forward(self, x):
        return self.fc(x)
    
class AttentionPooling(nn.Module):
    def __init__(self, embedding_size):
        super().__init__()
        self.attn = nn.Sequential(
            nn.Linear(embedding_size, embedding_size),
            nn.LayerNorm(embedding_size),
            nn.GELU(),
            nn.Linear(embedding_size, 1)
        )

    def forward(self, x, mask=None):
        attn_logits = self.attn(x)
        if mask is not None:
            attn_logits[mask] = -float('inf')
        attn_weights = torch.softmax(attn_logits, dim=1)
        x = x * attn_weights
        x = x.sum(dim=1)
        return x
    
class Network(nn.Module):
    def __init__(
        self,
        num_classes = NUM_TAGS,
        input_dim = 768,
        hidden_dim = 512
    ):
        super().__init__()
        self.num_classes = num_classes
        self.position_enc = nn.Embedding(128, input_dim, padding_idx=-1) 
        self.proj = FeedForward(input_dim)
        self.bn = nn.BatchNorm1d(input_dim)
        self.ln = nn.LayerNorm(input_dim)
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=768, nhead=12, activation="gelu", batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=6)
        self.poooling = AttentionPooling(input_dim)
        self.fc = nn.Linear(input_dim, num_classes)
               
    def forward(self, embeds):
        embeds = self.proj(embeds)
        src_key_padding_mask = (embeds.mean(-1) == -1)
        embeds = self.ln(embeds)
        x = self.transformer_encoder(embeds, src_key_padding_mask=src_key_padding_mask)
        x = self.bn(self.poooling(x, mask=src_key_padding_mask))
        outs = self.fc(x)
        return outs


In [9]:
from torch.nn.utils.rnn import pad_sequence

def train_epoch(model, loader, criterion, optimizer, scheduler, print_loss=True, iteration_step=100, epoch=0):
    model.train()
    running_loss = None
    alpha = 0.8
    iters = len(loader)
    for iteration,data in enumerate(loader):
        optimizer.zero_grad()
        track_idxs, embeds, target = data
        embeds = [x.to(CUDA_DEV) for x in embeds]
        embeds = pad_sequence(embeds, padding_value=-1, batch_first=True)[:, :64, :]
        target = target.to(CUDA_DEV)
        pred_logits = model(embeds)
        pred_probs = torch.sigmoid(pred_logits)
        ce_loss = criterion(pred_logits, target)
            
        ce_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()
        scheduler.step(epoch + iteration / iters)
        
        if running_loss is None:
            running_loss = ce_loss.item()
        else:
            running_loss = alpha * running_loss + (1 - alpha) * ce_loss.item()
        if (iteration % iteration_step == 0) and print_loss:
            print('   {} batch {} running loss {} loss {}'.format(
                datetime.now(), iteration + 1, running_loss, ce_loss.item()
            ))

In [10]:
def predict(model, loader):
    model.eval()
    track_idxs = []
    predictions = []
    with torch.no_grad():
        for data in loader:
            track_idx, embeds = data
            embeds = [x.to(CUDA_DEV) for x in embeds]
            embeds = pad_sequence(embeds, padding_value=-1, batch_first=True)[:, :64, :]
            pred_logits = model(embeds)
            pred_probs = torch.sigmoid(pred_logits)
            predictions.append(pred_probs.cpu().numpy())
            track_idxs.append(track_idx.numpy())
    predictions = np.vstack(predictions)
    track_idxs = np.vstack(track_idxs).ravel()
    return track_idxs, predictions

In [11]:
from tqdm import tqdm

def predict_train(model, loader):
    model.eval()
    track_idxs = []
    predictions = []
    targets = []
    with torch.no_grad():
        for data in loader:
            track_idx, embeds, target = data
            embeds = [x.to(CUDA_DEV) for x in embeds]
            embeds = pad_sequence(embeds, padding_value=-1, batch_first=True)[:, :64, :]
            pred_logits = model(embeds)
            pred_probs = torch.sigmoid(pred_logits)
            predictions.append(pred_probs.cpu().numpy())
            track_idxs.append(track_idx.numpy())
            targets.append(target.numpy())
    predictions = np.vstack(predictions)
    targets = np.vstack(targets)
    track_idxs = np.vstack(track_idxs).ravel()
    return track_idxs, predictions, targets

In [12]:
def collate_fn(b):
    track_idxs = torch.from_numpy(np.vstack([x[0] for x in b]))
    embeds = [torch.from_numpy(x[1]) for x in b]
    targets = np.vstack([x[2] for x in b])
    targets = torch.from_numpy(targets)
    return track_idxs, embeds, targets

def collate_fn_test(b):
    track_idxs = torch.from_numpy(np.vstack([x[0] for x in b]))
    embeds = [torch.from_numpy(x[1]) for x in b]
    return track_idxs, embeds

In [13]:
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=128, shuffle=False, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle=False, collate_fn=collate_fn_test)

In [14]:
import sklearn.metrics
from timm.loss import AsymmetricLossMultiLabel
from transformers import set_seed
import warnings
warnings.filterwarnings("ignore")

set_seed(42)


model = Network()
criterion = nn.BCEWithLogitsLoss()

epochs = 150
model = model.to(CUDA_DEV)
criterion = criterion.to(CUDA_DEV)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=10, eta_min=1e-7)

best = 0
best_model = None

for epoch in range(epochs):
    train_epoch(model, train_dataloader, criterion, optimizer, scheduler, print_loss=True, iteration_step=700, epoch=epoch)
    track_idxs, predictions, targets = predict_train(model, val_dataloader)
    ap = sklearn.metrics.average_precision_score(targets, predictions)
    print(f"epoch: {epoch}, AP: {ap}")
    if (ap > best):
        best = ap
        best_model = model
        if (epoch > 20):
            track_idxs, predictions = predict(model, test_dataloader)
            for i, c in enumerate(predictions.argmax(-1)):
                probs = np.array([1 + dict_tags[c].get(t, 0) for t in np.arange(predictions.shape[1])])
                probs[c] = 2
                predictions[i] = predictions[i] * probs
                predictions[i] /= predictions[i].sum()
                        
            predictions_df = pd.DataFrame([
                {'track': track, 'prediction': ','.join([str(p) for p in probs])}
                for track, probs in zip(track_idxs, predictions)
            ])
            predictions_df.to_csv(f'subs/prediction_best.csv', index=False)
            torch.save(best_model.state_dict(), f'models/models_best.pt')

  from .autonotebook import tqdm as notebook_tqdm


   2023-10-28 14:50:17.055754 batch 1 running loss 0.7308424547369654 loss 0.7308424547369654
   2023-10-28 14:51:31.536043 batch 701 running loss 0.6817006676839591 loss 0.6817837555747133
epoch: 0, AP: 0.10535884323029973
   2023-10-28 14:51:41.369855 batch 1 running loss 0.6796277644900348 loss 0.6796277644900348
   2023-10-28 14:52:56.811810 batch 701 running loss 0.6450572991579687 loss 0.6453651685236517
epoch: 1, AP: 0.14464805845653567
   2023-10-28 14:53:06.674733 batch 1 running loss 0.6373535739577588 loss 0.6373535739577588
   2023-10-28 14:54:22.696326 batch 701 running loss 0.5720440619285229 loss 0.5733344379245074
epoch: 2, AP: 0.1708249582689514
   2023-10-28 14:54:32.739213 batch 1 running loss 0.5634246307950161 loss 0.5634246307950161
   2023-10-28 14:56:15.106554 batch 701 running loss 0.48979199537589546 loss 0.4899965929767496
epoch: 3, AP: 0.17386765768934992
   2023-10-28 14:56:34.186536 batch 1 running loss 0.483616359162266 loss 0.483616359162266
   2023-10-2


KeyboardInterrupt



In [15]:
track_idxs, predictions, targets = predict_train(best_model, val_dataloader)
ap = sklearn.metrics.average_precision_score(targets, predictions)
print(f"AP: {ap}")

for i, c in enumerate(predictions.argmax(-1)):
    probs = np.array([1 + dict_tags[c].get(t, 0) for t in np.arange(predictions.shape[1])])
    probs[c] = 2
    predictions[i] = predictions[i] * probs
    predictions[i] /= predictions[i].sum()
ap = sklearn.metrics.average_precision_score(targets, predictions)

print(f"POST AP: {ap}")

for i, c in enumerate(predictions.argmax(-1)):
    probs = np.array([1 + dict_tags[c].get(t, 0) for t in np.arange(predictions.shape[1])])
    probs[c] = 2
    predictions[i] = predictions[i] * probs
    predictions[i] /= predictions[i].sum()
ap = sklearn.metrics.average_precision_score(targets, predictions)

print(f"POST-1 AP: {ap}")

AP: 0.3199148484354201
POST AP: 0.32061250757510396
POST-1 AP: 0.3215476797234166


In [17]:
track_idxs, predictions = predict(best_model, test_dataloader)

for i, c in enumerate(predictions.argmax(-1)):
    probs = np.array([1 + dict_tags[c].get(t, 0) for t in np.arange(predictions.shape[1])])
    probs[c] = 2
    predictions[i] = predictions[i] * probs
    predictions[i] /= predictions[i].sum()
    
for i, c in enumerate(predictions.argmax(-1)):
    probs = np.array([1 + dict_tags[c].get(t, 0) for t in np.arange(predictions.shape[1])])
    probs[c] = 2
    predictions[i] = predictions[i] * probs
    predictions[i] /= predictions[i].sum()

predictions_df = pd.DataFrame([
    {'track': track, 'prediction': ','.join([str(p) for p in probs])}
    for track, probs in zip(track_idxs, predictions)
])
predictions_df.to_csv(f'prediction_best_2.csv', index=False)