In [1]:
!pip -q install kagglehub catboost

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import kagglehub
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
import os
import torch
import re
import json
import torch.nn as nn
import gc
from PIL import Image
from tqdm.auto import tqdm
from transformers import (AutoImageProcessor, AutoModel, Trainer, TrainingArguments,
                          AutoModelForImageClassification, AutoTokenizer)
from io import StringIO
from torch.utils.data import Dataset
from itertools import chain
from sklearn.model_selection import train_test_split, KFold
from scipy.optimize import differential_evolution
from catboost import CatBoostClassifier
import lightgbm as lgb
from xgboost import XGBClassifier

In [3]:
# Root folder of the downloaded dataset. The value ends with '/', and read_bad_csv relies on
# that when it builds file paths by plain string concatenation.
init_path = kagglehub.dataset_download('bobbyshmurda31/multi-label-classification-competition2023forcolab') + '/COMP5329S1A2Dataset/'

# Some rows of the CSV files are malformed (stray double quotes), so they must be read this way.
def read_bad_csv(file):
    """Read a CSV from the dataset folder, escaping stray double quotes first.

    Every double quote that is not preceded by a comma and is followed by
    optional whitespace and a non-newline character gets a '/' inserted in
    front of it; pandas is then told to treat '/' as the escape character.

    Args:
        file: file name relative to ``init_path``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    stray_quote = re.compile(r'([^,])"(\s*[^\n])')
    with open(init_path + file) as handle:
        repaired_text = ''.join(stray_quote.sub(r'\1/"\2', raw_line) for raw_line in handle)
    return pd.read_csv(StringIO(repaired_text), escapechar="/")

train_data = read_bad_csv('train.csv')
test_data = read_bad_csv('test.csv')
# Keep the bare image names for the submission before they are turned into paths.
test_ids = test_data['ImageID']


def _to_image_path(image_name):
    """Return the full path of an image inside the dataset's ``data`` folder."""
    return os.path.join(init_path, 'data', image_name)


for frame in (train_data, test_data):
    frame['ImageID'] = frame['ImageID'].apply(_to_image_path)

# Labels are stored as space-separated ids; parse them and shift each id down by one.
train_data['Labels'] = train_data['Labels'].apply(
    lambda raw: [int(token) - 1 for token in raw.split()]
)
unique_labels = sorted(set(chain.from_iterable(train_data['Labels'].tolist())))


def to_ohe_vector(labels):
    """Encode a list of label ids as a 0/1 vector aligned with ``unique_labels``."""
    return [int(label in labels) for label in unique_labels]


train_data['Labels'] = train_data['Labels'].apply(to_ohe_vector)

# For some reason the labels in this dataset:
# - start at 1
# - label 11 is missing entirely
# these are the labels that are present: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18

# train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42, shuffle=True)

Using Colab cache for faster access to the 'multi-label-classification-competition2023forcolab' dataset.


In [10]:
class AsymmetricLoss(nn.Module):
    """Asymmetric focal-style loss for multi-label classification.

    Positive and negative targets get separate focusing exponents, so the
    contribution of negatives can be down-weighted independently of positives.

    Args:
        gamma_neg: focusing exponent applied to negative targets.
        gamma_pos: focusing exponent applied to positive targets.
        clip: probabilities are clamped to ``[clip, 1 - clip]`` before the
            logarithms are taken.
        eps: lower bound for the clamp margin. It keeps ``log`` finite when
            ``clip`` is 0 (or negative) and the sigmoid saturates.
    """

    def __init__(self, gamma_neg=4, gamma_pos=0, clip=0.05, eps=1e-6):
        super().__init__()
        self.gamma_neg = gamma_neg
        self.gamma_pos = gamma_pos
        self.clip = clip
        self.eps = eps

    def forward(self, logits, labels):
        """Return the mean asymmetric loss.

        Args:
            logits: raw scores, one column per label.
            labels: 0/1 targets with the same shape as ``logits`` (any numeric dtype).
        """
        # Half precision cannot represent the eps margin, so do the math in fp32 there.
        if logits.dtype in (torch.float16, torch.bfloat16):
            logits = logits.float()

        probs = torch.sigmoid(logits)
        labels = labels.to(probs.dtype)

        # Always clamp: without it a saturated sigmoid yields 0 * log(0) = NaN.
        margin = max(float(self.clip), float(self.eps))
        probs = torch.clamp(probs, margin, 1 - margin)

        pos_loss = labels * torch.log(probs) * (1 - probs) ** self.gamma_pos
        neg_loss = (1 - labels) * torch.log(1 - probs) * probs ** self.gamma_neg

        return -(pos_loss + neg_loss).mean()

class ASLTrainer(Trainer):
    """``Trainer`` subclass that computes the training loss with ``AsymmetricLoss``.

    The extra keyword arguments ``gamma_neg``, ``gamma_pos`` and ``clip`` are
    forwarded to the loss; everything else goes to ``Trainer`` unchanged.
    """

    def __init__(self, *args, gamma_neg=4, gamma_pos=0, clip=0.05, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fn = AsymmetricLoss(gamma_neg, gamma_pos, clip)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model without labels and score its logits with the asymmetric loss."""
        # Labels are removed from the batch so the model does not compute its own loss.
        targets = inputs.pop('labels')
        model_outputs = model(**inputs)
        asl_value = self.loss_fn(model_outputs['logits'], targets)
        if return_outputs:
            return asl_value, model_outputs
        return asl_value

class TextImgModel(nn.Module):
    """Late-fusion classifier over a text encoder and an image encoder.

    The first-token hidden state of each encoder is concatenated and passed
    through a small MLP head that emits one logit per class.

    Args:
        text_model_checkpoint: checkpoint name/path loaded with ``AutoModel``.
        image_model_checkpoint: checkpoint name/path loaded with ``AutoModel``.
        num_classes: number of output logits.
    """

    def __init__(self, text_model_checkpoint, image_model_checkpoint, num_classes):
        super(TextImgModel, self).__init__()

        self.text_model = AutoModel.from_pretrained(text_model_checkpoint)
        self.image_model = AutoModel.from_pretrained(image_model_checkpoint)
        self.text_shape = self.text_model.config.hidden_size
        self.image_shape = self.image_model.config.hidden_size
        self.embedding_shape = self.text_shape + self.image_shape

        self.classifier = nn.Sequential(
            nn.Linear(self.embedding_shape, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, num_classes)
        )

        # Constructor arguments, persisted by save_pretrained and replayed by from_pretrained.
        self.config = {
            'text_model_checkpoint': text_model_checkpoint,
            'image_model_checkpoint': image_model_checkpoint,
            'num_classes': num_classes
        }

    def forward(self, input_ids, attention_mask, token_type_ids, pixel_values, labels=None):
        """Compute logits and, when ``labels`` is given, a multi-label BCE loss.

        Returns:
            dict with ``'logits'`` and, if ``labels`` is not None, ``'loss'``.
        """
        text_embedding = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        ).last_hidden_state[:, 0]
        image_embedding = self.image_model(pixel_values=pixel_values).last_hidden_state[:, 0]

        embedding = torch.cat([text_embedding, image_embedding], dim=1)
        logits = self.classifier(embedding)

        output = {'logits': logits}
        if labels is not None:
            # The head emits raw logits, so the loss must apply the sigmoid itself;
            # targets are cast because the collator produces integer labels.
            output['loss'] = nn.BCEWithLogitsLoss()(logits, labels.to(logits.dtype))

        return output

    def save_pretrained(self, save_directory):
        """Write ``config.json`` and ``pytorch_model.bin`` into ``save_directory``."""
        os.makedirs(save_directory, exist_ok=True)

        with open(os.path.join(save_directory, 'config.json'), 'w') as f:
            json.dump(self.config, f, indent=2)

        state_dict = {k: v.contiguous() for k, v in self.state_dict().items()}
        torch.save(state_dict, os.path.join(save_directory, 'pytorch_model.bin'))

    @classmethod
    def from_pretrained(cls, load_directory):
        """Rebuild the model from a directory written by ``save_pretrained``."""
        with open(os.path.join(load_directory, 'config.json'), 'r') as f:
            config = json.load(f)

        model = cls(**config)

        # Load onto CPU so a checkpoint saved on GPU can be restored on any machine.
        state_dict = torch.load(
            os.path.join(load_directory, 'pytorch_model.bin'),
            map_location='cpu'
        )
        model.load_state_dict(state_dict)

        return model

class TextImgDataset(Dataset):
    """Dataset of (caption, image, optional labels) samples.

    Args:
        texts: indexable collection of captions.
        image_paths: indexable collection of image file paths.
        labels: optional indexable collection of per-sample targets.
        image_size: size passed to ``Image.resize``.
    """

    def __init__(self, texts, image_paths, labels=None, image_size=(224, 224)):
        self.texts = texts
        self.image_paths = image_paths
        self.labels = labels
        self.image_size = image_size

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, index):
        """Return a dict with ``text``, ``image`` and, when available, ``labels``."""
        # Convert to RGB so grayscale / palette / RGBA files all reach the image
        # processor with three channels; the context manager closes the file handle.
        with Image.open(self.image_paths[index]) as raw_image:
            image = raw_image.convert('RGB').resize(self.image_size)

        sample = {'text': self.texts[index], 'image': image}
        if self.labels is not None:
            sample['labels'] = self.labels[index]
        return sample

class TextImgCollator:
    """Batch builder that tokenizes captions and preprocesses images.

    Args:
        tokenizer: callable used on the list of captions.
        processor: callable used on the list of images.
        max_length: truncation length handed to the tokenizer.
    """

    def __init__(self, tokenizer, processor, max_length=512):
        self.tokenizer = tokenizer
        self.processor = processor
        self.max_length = max_length

    def __call__(self, samples):
        """Merge tokenizer output, processor output and (optional) labels into one dict."""
        captions, pictures = [], []
        for item in samples:
            captions.append(item['text'])
            pictures.append(item['image'])

        # Text: pad to the longest caption in the batch, truncate at max_length.
        text_part = self.tokenizer(
            captions,
            truncation=True,
            padding='longest',
            max_length=self.max_length,
            add_special_tokens=True,
            return_tensors='pt'
        )

        # Images.
        image_part = self.processor(pictures, return_tensors='pt')

        # Labels are present only if the first sample carries them.
        label_part = {}
        if 'labels' in samples[0]:
            targets = [item['labels'] for item in samples]
            label_part['labels'] = torch.tensor(targets, dtype=torch.long)

        # Combine everything into a single batch dictionary.
        return dict(**text_part, **image_part, **label_part)

class TextImageHFModel:
    """Wrapper that fine-tunes ``TextImgModel`` and exposes its features and predictions.

    Builds the tokenizer, image processor, collator and fusion model for the
    ``prajjwal1/bert-tiny`` and ``WinKawaks/vit-small-patch16-224`` checkpoints.
    """

    def __init__(self):
        text_model_checkpoint = 'prajjwal1/bert-tiny'
        image_model_checkpoint = 'WinKawaks/vit-small-patch16-224'

        self.tokenizer = AutoTokenizer.from_pretrained(text_model_checkpoint)
        self.processor = AutoImageProcessor.from_pretrained(image_model_checkpoint)
        self.data_collator = TextImgCollator(tokenizer=self.tokenizer, processor=self.processor)

        self.model = TextImgModel(
            text_model_checkpoint=text_model_checkpoint,
            image_model_checkpoint=image_model_checkpoint,
            num_classes=18
        )

    def fit(self, train_data, val_data):
        """Fine-tune the model with ``ASLTrainer``.

        Args:
            train_data: frame with ``Caption``, ``ImageID`` and ``Labels`` columns.
            val_data: frame with the same columns, used for periodic evaluation.
        """
        train_dataset = TextImgDataset(
            texts=train_data['Caption'].values,
            image_paths=train_data['ImageID'].values,
            labels=train_data['Labels'].values
        )
        val_dataset = TextImgDataset(
            texts=val_data['Caption'].values,
            image_paths=val_data['ImageID'].values,
            labels=val_data['Labels'].values
        )

        # Separate learning rates for the two encoders and the classification head.
        optimizer = torch.optim.AdamW([
            {'params': self.model.text_model.parameters(), 'lr': 8e-5},
            {'params': self.model.image_model.parameters(), 'lr': 1e-4},
            {'params': self.model.classifier.parameters(), 'lr': 2e-4}
        ])

        def compute_metrics(preds):
            """Macro F1 of sigmoid(logits) thresholded at 0.5."""
            logits, labels = preds
            sigmoid = 1 / (1 + np.exp(-logits))
            predictions = (sigmoid > 0.5).astype(int)
            return {'f1': f1_score(labels, predictions, average='macro', zero_division=0)}

        args = TrainingArguments(
            per_device_train_batch_size=128,
            per_device_eval_batch_size=128,
            num_train_epochs=3,
            learning_rate=2e-4,
            save_strategy='steps',
            logging_strategy='steps',
            eval_strategy='steps',
            logging_steps=100,
            eval_steps=100,
            metric_for_best_model='f1',
            load_best_model_at_end=True,
            lr_scheduler_type='cosine',
            save_steps=100,
            warmup_ratio=0.01,
            weight_decay=0.01,
            report_to='none',
            output_dir='./result',
            fp16=torch.cuda.is_available(),
            remove_unused_columns=False,  # otherwise the dict from TextImgDataset.__getitem__ would keep only labels
            save_safetensors=False  # needed for the custom save functions
        )
        self.trainer = ASLTrainer(
            args=args,
            model=self.model,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics,
            data_collator=self.data_collator,
            optimizers=(optimizer, None)
        )
        self.trainer.train()

    def get_embeddings_and_probs(self, data, batch_size=32):
        """Return fused embeddings concatenated with per-label probabilities.

        Args:
            data: frame with ``Caption`` and ``ImageID`` columns.
            batch_size: number of samples processed per forward pass.

        Returns:
            ``numpy`` array of shape ``(len(data), embedding_dim + num_classes)``:
            text embedding, image embedding, then sigmoid probabilities.
        """
        n_features = self.model.embedding_shape + self.model.config['num_classes']
        if len(data) == 0:
            return np.zeros((0, n_features))

        # Use the device the weights actually live on instead of guessing from CUDA availability.
        device = next(self.model.parameters()).device

        all_embeddings = []
        all_probs = []

        dataset = TextImgDataset(data['Caption'].values, data['ImageID'].values)

        # Inference must run in eval mode: otherwise dropout stays active and
        # BatchNorm normalises with per-batch statistics.
        was_training = self.model.training
        self.model.eval()
        try:
            for start in tqdm(range(0, len(data), batch_size)):
                stop = min(start + batch_size, len(data))
                batch = self.data_collator([dataset[index] for index in range(start, stop)])
                batch = {k: v.to(device) for k, v in batch.items()}

                with torch.no_grad():
                    text_embeddings = self.model.text_model(
                        input_ids=batch['input_ids'],
                        attention_mask=batch['attention_mask'],
                        token_type_ids=batch['token_type_ids']
                    ).last_hidden_state[:, 0]
                    image_embeddings = self.model.image_model(
                        pixel_values=batch['pixel_values']
                    ).last_hidden_state[:, 0]

                    embeddings = torch.cat([text_embeddings, image_embeddings], dim=1)
                    logits = self.model.classifier(embeddings)
                    # Multi-label head: each label gets an independent sigmoid probability.
                    probs = torch.sigmoid(logits)

                all_embeddings.append(embeddings.cpu().numpy())
                all_probs.append(probs.cpu().numpy())
        finally:
            self.model.train(was_training)

        all_embeddings = np.concatenate(all_embeddings, axis=0)
        all_probs = np.concatenate(all_probs, axis=0)
        outputs = np.concatenate([all_embeddings, all_probs], axis=1)

        return outputs

    def predict(self, data):
        """Return a 0/1 matrix of predicted labels (requires a prior ``fit``).

        Args:
            data: frame with ``Caption`` and ``ImageID`` columns.
        """
        dataset = TextImgDataset(data['Caption'].values, data['ImageID'].values)
        logits = self.trainer.predict(dataset).predictions
        # Sigmoid, not softmax: labels are not mutually exclusive, and after a
        # softmax at most one label could ever exceed 0.5.
        probs = torch.sigmoid(torch.tensor(logits)).numpy()
        return (probs > 0.5).astype(int)

In [None]:
# Possible future improvement: add TF-IDF features and text statistics
# (text length, number of commas, etc.).

N_SPLITS = 5
kfold = KFold(n_splits=N_SPLITS, random_state=42, shuffle=True)

# Allocated after the first fold, once the feature width reported by the model
# is known, instead of hard-coding it.
oof_hf_train_data = None
oof_hf_test_data = None

for train_idx, val_idx in tqdm(list(kfold.split(train_data))):
    train_fold, val_fold = train_data.iloc[train_idx], train_data.iloc[val_idx]

    model = TextImageHFModel()
    model.fit(train_fold, val_fold)

    val_features = model.get_embeddings_and_probs(val_fold)
    test_features = model.get_embeddings_and_probs(test_data)

    if oof_hf_train_data is None:
        oof_hf_train_data = np.zeros((len(train_data), val_features.shape[1]))
        oof_hf_test_data = np.zeros((len(test_data), test_features.shape[1]))

    # Out-of-fold features for the validation rows; test features are averaged over folds.
    oof_hf_train_data[val_idx] = val_features
    oof_hf_test_data += test_features / N_SPLITS

    # Drop the fold's model before the next one is built so two models never coexist in memory.
    del model, val_features, test_features
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

  0%|          | 0/5 [00:00<?, ?it/s]

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.
Some weights of ViTModel were not initialized from the model checkpoint at WinKawaks/vit-small-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,F1
100,0.0691,0.041307,0.648695
200,0.0389,0.032413,0.724493
300,0.0306,0.030058,0.750032
400,0.0274,0.028832,0.757163
500,0.0234,0.028742,0.755729


  0%|          | 0/188 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.
Some weights of ViTModel were not initialized from the model checkpoint at WinKawaks/vit-small-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,F1
100,0.0684,0.041374,0.646166
200,0.0384,0.032804,0.710348
300,0.0302,0.030772,0.729435
400,0.0273,0.029525,0.734642
500,0.0234,0.029508,0.733676


  0%|          | 0/188 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.
Some weights of ViTModel were not initialized from the model checkpoint at WinKawaks/vit-small-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,F1
100,0.069,0.041606,0.638754
200,0.0385,0.032248,0.730082
300,0.0301,0.03,0.741176
400,0.0275,0.029259,0.744097
500,0.0236,0.029134,0.745403


  0%|          | 0/188 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.
Some weights of ViTModel were not initialized from the model checkpoint at WinKawaks/vit-small-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,F1
100,0.0685,0.041333,0.62645
200,0.0391,0.033891,0.711858
300,0.0304,0.031335,0.726989
400,0.027,0.030149,0.735889
500,0.0232,0.030072,0.735819


  0%|          | 0/188 [00:00<?, ?it/s]

In [None]:
# numpy arrays have no .save() method; persistence goes through np.save(path, array).
np.save('oof_hf_train_data.npy', oof_hf_train_data)
np.save('oof_hf_test_data.npy', oof_hf_test_data)

In [None]:
class MultiGradientBoostingClassifier:
    """Multi-label wrapper that trains one independent binary booster per label.

    Args:
        gradient_boosting_class: classifier class instantiated once per label.
        num_labels: number of label columns.
        *args, **kwargs: forwarded to every ``gradient_boosting_class`` constructor.
    """

    def __init__(self, gradient_boosting_class, num_labels, *args, **kwargs):
        self.models = [gradient_boosting_class(*args, **kwargs) for _ in range(num_labels)]
        self.num_labels = num_labels

    def fit(self, x_train, y_train, x_val, y_val, *args, **kwargs):
        """Fit model ``i`` on column ``i`` of ``y_train``, validating on column ``i`` of ``y_val``.

        Extra positional and keyword arguments are forwarded to each model's ``fit``.
        """
        for i in tqdm(range(self.num_labels)):
            self.models[i].fit(
                x_train, y_train[:, i],
                eval_set=[(x_val, y_val[:, i])],
                *args, **kwargs
            )

    def predict_proba(self, x):
        """Return positive-class probabilities with shape ``(n_samples, num_labels)``.

        Each binary model returns one column per class it has seen. Only the
        column of class ``1`` is kept, so column ``i`` of the result lines up
        with label ``i``. A model that never saw class ``1`` contributes zeros.
        """
        columns = []
        for model in self.models:
            proba = np.asarray(model.predict_proba(x))
            classes = list(getattr(model, 'classes_', (0, 1)))
            if 1 in classes:
                columns.append(proba[:, classes.index(1)])
            else:
                columns.append(np.zeros(proba.shape[0]))
        return np.stack(columns, axis=1)

def f1_multilabel(y_true, y_probs):
    """Macro-averaged F1 of ``y_probs`` thresholded at 0.5 against ``y_true``."""
    hard_labels = (y_probs > 0.5).astype(int)
    macro_f1 = f1_score(y_true, hard_labels, average='macro', zero_division=0)
    return macro_f1

def lgb_f1_multiclass(y_true, y_probs):
    """Custom metric returning the triple ``('f1', value, True)``.

    The last element is the higher-is-better flag.
    """
    score = f1_multilabel(y_true, y_probs)
    higher_is_better = True
    return 'f1', score, higher_is_better

def xgb_f1_multiclass(y_pred, dtrain):
    """Custom metric returning ``('f1', -macro_f1)``.

    Args:
        y_pred: predicted scores, flat or one column per class.
        dtrain: object exposing ``get_label()`` with the true labels.

    The value is negated so that a smaller metric means a better model.
    NOTE(review): this ``(y_pred, dtrain)`` signature matches XGBoost's native
    custom-metric convention; confirm it is what the calling API expects.
    """
    y_true = dtrain.get_label().astype(int)
    y_probs = np.asarray(y_pred)
    if y_probs.size == y_true.size:
        # Binary objective: one probability per row. Reshaping to
        # (n, n_unique_labels) would raise here.
        y_probs = y_probs.reshape(y_true.shape)
    else:
        y_probs = y_probs.reshape((len(y_true), len(np.unique(y_true))))
    return 'f1', -f1_multilabel(y_true, y_probs)

def optimize_weights_f1(predictions, y_true):
    """Search for blending weights that maximise macro F1.

    Args:
        predictions: array whose LAST axis indexes the models being blended,
            e.g. ``(n_samples, n_models)`` or ``(n_samples, n_labels, n_models)``.
        y_true: targets in the layout of the blended prediction, i.e. matching
            ``predictions.shape[:-1]``.

    Returns:
        ``(weights, best_f1)``: weights normalised to sum to 1 and the F1 they reach.
    """
    predictions = np.asarray(predictions)
    y_true = np.asarray(y_true)
    n_models = predictions.shape[-1]

    def normalise(raw_weights):
        total = raw_weights.sum()
        # An all-zero candidate would divide by zero; fall back to a uniform blend.
        if total <= 0:
            return np.full(n_models, 1.0 / n_models)
        return raw_weights / total

    def objective(raw_weights):
        y_probs = np.dot(predictions, normalise(raw_weights))
        return -f1_multilabel(y_true, y_probs)

    bounds = [(0, 1) for _ in range(n_models)]
    result = differential_evolution(objective, bounds, seed=42, maxiter=5000, polish=True)

    weights = normalise(result.x)
    best_f1 = -result.fun

    return weights, best_f1


def as_label_probs(probs, num_labels):
    """Coerce classifier output to shape ``(n_samples, num_labels)``.

    Accepts either one probability column per label, or ``[P(0), P(1)]`` column
    pairs laid side by side (``2 * num_labels`` columns); in the latter case
    only the ``P(1)`` columns are kept. This keeps the cell independent of how
    a per-label wrapper stacks its outputs.
    """
    probs = np.asarray(probs)
    if probs.ndim == 2 and probs.shape[1] == 2 * num_labels:
        return probs[:, 1::2]
    return probs


# Full label matrix, indexed positionally by the fold indices below.
y_all = np.array(train_data['Labels'].tolist())
NUM_LABELS = y_all.shape[1]
N_STACK_MODELS = 3  # LightGBM, XGBoost, CatBoost

# Last axis = model, so np.dot(preds, weights) blends into (n_samples, NUM_LABELS).
oof_train_preds = np.zeros((len(oof_hf_train_data), NUM_LABELS, N_STACK_MODELS))
oof_test_preds = np.zeros((len(oof_hf_test_data), NUM_LABELS, N_STACK_MODELS))

N_SPLITS = 5
kfold = KFold(n_splits=N_SPLITS, random_state=42, shuffle=True)

for train_idx, val_idx in tqdm(list(kfold.split(oof_hf_train_data))):
    x_train_fold, x_val_fold = oof_hf_train_data[train_idx], oof_hf_train_data[val_idx]
    y_train_fold, y_val_fold = y_all[train_idx], y_all[val_idx]

    # --- LightGBM: one binary model per label ---
    model = MultiGradientBoostingClassifier(
        gradient_boosting_class=lgb.LGBMClassifier,
        num_labels=NUM_LABELS,
        n_estimators=1500,
        learning_rate=0.03,
        max_depth=6,
        verbosity=-1,
        device='gpu'
    )
    model.fit(
        x_train=x_train_fold,
        y_train=y_train_fold,
        x_val=x_val_fold,
        y_val=y_val_fold,
        eval_metric=lgb_f1_multiclass,
        callbacks=[
            lgb.log_evaluation(20),
            lgb.early_stopping(100, verbose=False)
        ]
    )
    oof_train_preds[val_idx, :, 0] = as_label_probs(model.predict_proba(x_val_fold), NUM_LABELS)
    # Accumulate (+=) so the test prediction is the average over folds, not the last fold / N.
    oof_test_preds[:, :, 0] += as_label_probs(model.predict_proba(oof_hf_test_data), NUM_LABELS) / N_SPLITS

    # --- XGBoost: one binary model per label ---
    # Early stopping and the metric are configured on the estimator; LightGBM
    # callbacks are not valid XGBoost callbacks, and verbosity must be in 0..3.
    model = MultiGradientBoostingClassifier(
        gradient_boosting_class=XGBClassifier,
        num_labels=NUM_LABELS,
        n_estimators=1500,
        learning_rate=0.03,
        max_depth=6,
        verbosity=0,
        device='gpu',
        eval_metric='logloss',
        early_stopping_rounds=100
    )
    model.fit(
        x_train=x_train_fold,
        y_train=y_train_fold,
        x_val=x_val_fold,
        y_val=y_val_fold,
        verbose=20
    )
    oof_train_preds[val_idx, :, 1] = as_label_probs(model.predict_proba(x_val_fold), NUM_LABELS)
    oof_test_preds[:, :, 1] += as_label_probs(model.predict_proba(oof_hf_test_data), NUM_LABELS) / N_SPLITS

    # --- CatBoost: a single multi-label model ---
    # NOTE(review): confirm that eval_metric='TotalF1' is accepted together with
    # loss_function='MultiLogloss' in the installed CatBoost version.
    model = CatBoostClassifier(
        loss_function='MultiLogloss',
        iterations=1500,
        learning_rate=0.03,
        verbose=50,
        eval_metric='TotalF1',
        early_stopping_rounds=100,
        task_type='GPU',
        random_state=42
    )
    model.fit(
        x_train_fold, y_train_fold,
        eval_set=(x_val_fold, y_val_fold)
    )
    oof_train_preds[val_idx, :, 2] = as_label_probs(model.predict_proba(x_val_fold), NUM_LABELS)
    oof_test_preds[:, :, 2] += as_label_probs(model.predict_proba(oof_hf_test_data), NUM_LABELS) / N_SPLITS

# Optimise against ALL out-of-fold rows, not just the last fold's training indices.
weights, score = optimize_weights_f1(oof_train_preds, y_all)
print(weights, score)

In [None]:
# Blend the three models' test probabilities and threshold them.
blended_probs = np.dot(oof_test_preds, weights)
hard_preds = (blended_probs > 0.5).astype(int)

# One-hot column i corresponds to unique_labels[i]; adding 1 undoes the shift
# applied when the training labels were parsed. Going through unique_labels
# also restores any gap in the label numbering without special-casing it.
column_to_label = [label + 1 for label in unique_labels]
assert hard_preds.shape[1] == len(column_to_label), 'prediction width does not match the label set'

# Convert each binary row into a space-separated string of original label ids.
preds = pd.Series([
    ' '.join(str(column_to_label[i]) for i, flag in enumerate(row) if flag == 1)
    for row in hard_preds
])

submission = pd.DataFrame({'ImageID': test_ids.values, 'Labels': preds.values})
submission.to_csv('submission.csv', index=False)
submission.head()