In [1]:
import pandas as pd
from torch.utils.data import Dataset
import PIL
import albumentations as A
import torch.nn as nn
import os
from transformers import Trainer, TrainingArguments, EfficientNetImageProcessor, EfficientNetForImageClassification
from transformers import AutoImageProcessor, Swinv2ForImageClassification
import torch
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, WeightedRandomSampler




In [2]:
# `torch.cuda.is_available` is a function and has to be *called*: the bare
# function object is always truthy, which selected "cuda" even on CPU-only hosts.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
# Model identifier passed to AutoImageProcessor.from_pretrained.
model_name = "microsoft/swinv2-base-patch4-window8-256"
# Directory handed to TrainingArguments(output_dir=...).
run_name = "./swin2-base-2"

In [4]:
# Metadata tables, read via paths relative to the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [5]:
# Impute missing metadata. The mean age is computed on the training frame only
# and reused for the test frame; missing categorical values become 'UNK'.
age_mean = train['age_approx'].mean()

for frame in (train, test):
    frame['age_approx'] = frame['age_approx'].fillna(age_mean)
    for column in ('sex', 'anatom_site_general_challenge'):
        frame[column] = frame[column].fillna('UNK')


In [6]:
from sklearn.preprocessing import LabelEncoder

# One encoder per categorical column. Each is fitted on the training frame and
# then applied unchanged to the test frame.
sex_le, site_le = LabelEncoder(), LabelEncoder()

for encoder, source_col, encoded_col in (
    (sex_le, 'sex', 'sex_enc'),
    (site_le, 'anatom_site_general_challenge', 'site_enc'),
):
    train[encoded_col] = encoder.fit_transform(train[source_col])
    test[encoded_col] = encoder.transform(test[source_col])

# Standardise age with statistics taken from the training frame only.
mean_age = train['age_approx'].mean()
std_age = train['age_approx'].std()

for frame in (train, test):
    frame['age_norm'] = (frame['age_approx'] - mean_age) / std_age

In [7]:
# Image processor loaded for `model_name`; the dataset calls it on every image
# to produce the `pixel_values` tensor.
preprocessor = AutoImageProcessor.from_pretrained(model_name)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [8]:
# Training-time augmentation pipeline (applied to HWC uint8 arrays by the dataset).
train_transform = A.Compose([
    # Orientation changes.
    A.Transpose(p=0.5),
    A.VerticalFlip(p=0.5),
    A.HorizontalFlip(p=0.5),

    # Mild photometric jitter.
    A.RandomBrightnessContrast(brightness_limit=0.1, contrast_limit=0.1),

    # Exactly one blur/noise degradation, 70% of the time.
    A.OneOf([
        A.MotionBlur(blur_limit=5),
        A.MedianBlur(blur_limit=5),
        A.GaussianBlur(blur_limit=5),
        # `var_limit` was flagged by the installed albumentations (see the
        # warning recorded under this cell); newer releases take `std_range`,
        # the noise standard deviation as a fraction of the maximum pixel value.
        # The former variance range (5, 30) on the 0-255 scale therefore maps
        # to sqrt(var) / 255, roughly (0.0088, 0.0215).
        A.GaussNoise(std_range=(5.0 ** 0.5 / 255, 30.0 ** 0.5 / 255)),
    ], p=0.7),

    # Exactly one non-rigid distortion, 70% of the time.
    A.OneOf([
        A.OpticalDistortion(distort_limit=1.0),
        A.GridDistortion(num_steps=5, distort_limit=1.),
        A.ElasticTransform(alpha=3),
    ], p=0.7),

    A.CLAHE(clip_limit=4.0, p=0.7),
    A.HueSaturationValue(hue_shift_limit=10, sat_shift_limit=20, val_shift_limit=10, p=0.5),
    A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=15, border_mode=0, p=0.85),
])


  A.GaussNoise(var_limit=(5.0, 30.0)),
  original_init(self, **validated_kwargs)


In [9]:
# train_transform = A.Compose([
#     A.RandomResizedCrop(size=(224, 224), scale=(0.8, 1.0), p=1.0),
#     A.HorizontalFlip(p=0.5),
#     A.VerticalFlip(p=0.5),
#     A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.15, 
#                        rotate_limit=25, p=0.7, border_mode=0),
#     A.RandomBrightnessContrast(brightness_limit=0.25, 
#                                contrast_limit=0.25, p=0.5),
#     A.HueSaturationValue(hue_shift_limit=10, 
#                          sat_shift_limit=15, 
#                          val_shift_limit=10, p=0.4),
#     A.CLAHE(clip_limit=2.0, p=0.2),
#     A.GaussianBlur(blur_limit=(3, 5), p=0.2),
#     A.CoarseDropout(max_holes=1, max_height=40, max_width=40, 
#                     min_holes=1, fill_value=0, p=0.3),
# ])


In [10]:
class ISICDataset_train(Dataset):
    """Dataset pairing each image with its label and tabular metadata.

    Args:
        image_dir: Directory containing ``<image_name>.jpg`` files.
        df: Frame with the columns ``image_name``, ``target``, ``sex_enc``,
            ``site_enc`` and ``age_norm`` (falling back to ``age_approx`` when
            ``age_norm`` is absent). Its index is reset so positional lookups
            are stable.
        preprocessor: Callable invoked as ``preprocessor(image, return_tensors="pt")``
            that returns a mapping with a ``"pixel_values"`` entry.
        transform: Optional callable invoked as ``transform(image=ndarray)``
            that returns a mapping with an ``"image"`` entry.
    """

    def __init__(self, image_dir, df, preprocessor, transform=None):
        self.image_dir = image_dir
        self.preprocessor = preprocessor
        self.df = df.reset_index(drop=True)
        self.transform = transform

    def _read_rgb(self, image_name):
        """Open ``<image_dir>/<image_name>.jpg`` and return it converted to RGB."""
        image_path = os.path.join(self.image_dir, image_name + '.jpg')
        # convert() returns a new image, so the file handle can be closed here.
        with PIL.Image.open(image_path) as img:
            return img.convert("RGB")

    def _augment(self, image):
        """Apply ``self.transform`` and return the result as an HWC numpy array."""
        augmented = self.transform(image=np.array(image))['image']
        if isinstance(augmented, torch.Tensor):
            # Tensor output is treated as CHW and moved back to HWC.
            augmented = augmented.permute(1, 2, 0).cpu().numpy()
        return augmented

    def __getitem__(self, idx):
        """Return the model inputs for row ``idx`` as a dict of tensors plus an int label."""
        row = self.df.iloc[idx]

        image = self._read_rgb(row['image_name'])
        if self.transform:
            image = self._augment(image)

        inputs = self.preprocessor(image, return_tensors="pt")
        pixel_values = inputs["pixel_values"].squeeze(0)  # drop the batch axis

        label = int(row['target'])

        # Tabular features. Prefer the standardised age computed during
        # preprocessing; the raw age in years is only a fallback for frames
        # that do not carry `age_norm`.
        age_column = 'age_norm' if 'age_norm' in row.index else 'age_approx'
        sex = torch.tensor(row['sex_enc'], dtype=torch.long)
        age = torch.tensor(row[age_column], dtype=torch.float32)
        site = torch.tensor(row['site_enc'], dtype=torch.long)

        return {
            "pixel_values": pixel_values,
            "labels": label,
            "sex": sex,
            "age": age,
            "site": site
        }

    def __len__(self):
        return len(self.df)

    def show_image(self, idx):
        """Plot image ``idx`` (after ``self.transform``, if any) with its label."""
        row = self.df.iloc[idx]
        image = self._read_rgb(row['image_name'])
        image = self._augment(image) if self.transform else np.array(image)

        plt.imshow(image)
        plt.axis('off')
        plt.title(f"Index: {idx}, Label: {row['target']}")
        plt.show()



In [11]:
# Stratified 95/5 split so both parts keep the class ratio of `target`.
train_data, val_data = train_test_split(
    train,
    test_size=0.05,
    stratify=train['target'],
    shuffle=True,
    random_state=42,
)

# Augmentation is applied to the training split only.
train_dataset = ISICDataset_train(
    image_dir="train",
    df=train_data,
    preprocessor=preprocessor,
    transform=train_transform,
)
val_dataset = ISICDataset_train(image_dir="train", df=val_data, preprocessor=preprocessor)

In [12]:
# model = EfficientNetForImageClassification.from_pretrained(
#     model_name, 
#     use_safetensors=True,
#     num_labels=2,
#     ignore_mismatched_sizes=True
# )

In [13]:
# model = EfficientNetForImageClassification.from_pretrained(
#     "efficient-net-b3/checkpoint-3110", 
#     num_labels=2,
#     ignore_mismatched_sizes=True
# )

In [14]:
# model = Swinv2ForImageClassification.from_pretrained(
#     model_name,
#     num_labels=2,
#     use_safetensors=True,
#     ignore_mismatched_sizes=True
# )

In [15]:
# Image backbone, restored from a locally saved checkpoint directory and
# configured for two output labels.
base_model = Swinv2ForImageClassification.from_pretrained(
    "swin2-base/checkpoint-1230",
    ignore_mismatched_sizes=True,
    use_safetensors=True,
    num_labels=2,
)

In [16]:
class TabularEncoder(nn.Module):
    """Turn (sex, age, site) into one concatenated feature vector.

    Sex and site are looked up in embedding tables, age goes through a
    1 -> ``embed_dim`` linear layer. The output has ``3 * embed_dim`` features,
    ordered as sex, age, site.

    Args:
        n_sex: Number of distinct sex codes.
        n_site: Number of distinct anatomical-site codes.
        embed_dim: Width of each of the three feature groups.
    """

    def __init__(self, n_sex, n_site, embed_dim=16):
        super().__init__()
        self.embed_dim = embed_dim
        # Sub-module creation order is kept (sex, site, age).
        self.sex_emb = nn.Embedding(n_sex, embed_dim)
        self.site_emb = nn.Embedding(n_site, embed_dim)
        self.age_lin = nn.Linear(1, embed_dim)

    def forward(self, sex, age, site):
        """Return the concatenated tabular features along dim 1."""
        age_column = age[:, None]  # add a feature axis for the linear layer
        pieces = (
            self.sex_emb(sex),
            self.age_lin(age_column),
            self.site_emb(site),
        )
        return torch.cat(pieces, dim=1)

class MultiModalClassifier(nn.Module):
    """Classifier over concatenated image and tabular features.

    Args:
        base_model: Image backbone. It must expose ``config.hidden_size`` and,
            when called with ``output_hidden_states=True``, return an object
            with a ``hidden_states`` sequence.
        tabular_encoder: Module with an ``embed_dim`` attribute whose forward
            returns ``3 * embed_dim`` features (sex, age, site).
        hidden: Width of the hidden layer in the classification head.
        num_labels: Number of output classes.
    """

    def __init__(self, base_model: nn.Module, tabular_encoder: "TabularEncoder", hidden=256, num_labels=2):
        super().__init__()
        self.base = base_model
        self.tab = tabular_encoder
        # Derive the tabular width from the encoder instead of hard-coding 48,
        # which was only correct for embed_dim=16.
        tab_dim = 3 * tabular_encoder.embed_dim
        self.classifier = nn.Sequential(
            nn.Linear(base_model.config.hidden_size + tab_dim, hidden),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden, num_labels)
        )
        self.num_labels = num_labels

    def forward(self, pixel_values, sex, age, site, labels=None):
        """Return ``{"logits": ..., "loss": ...}``; ``loss`` is None without labels."""
        img_out = self.base(pixel_values, output_hidden_states=True)
        # Assumes hidden_states[-1] is (batch, tokens, hidden_size), as for
        # Swin V2 -- TODO confirm for other backbones.
        last_hidden = img_out.hidden_states[-1]
        # Swin has no class token, so token 0 is just one patch of the image.
        # Average over all tokens to get a whole-image descriptor.
        img_feat = last_hidden.mean(dim=1)

        tab_feat = self.tab(sex, age, site)
        feat = torch.cat([img_feat, tab_feat], dim=1)

        logits = self.classifier(feat)
        loss = None
        if labels is not None:
            loss = nn.CrossEntropyLoss()(logits, labels)
        return {"logits": logits, "loss": loss}

In [17]:
# Embedding-table sizes: one slot per distinct category in the training frame.
n_sex = train["sex"].nunique()
n_site = train["anatom_site_general_challenge"].nunique()

tab_encoder = TabularEncoder(n_sex=n_sex, n_site=n_site)
model = MultiModalClassifier(base_model=base_model, tabular_encoder=tab_encoder)

In [18]:
def compute_metrics(pred):
    """Compute weighted F1/precision/recall and ROC AUC from raw logits.

    Args:
        pred: Object with ``predictions`` (logits, shape ``(n, num_labels)``)
            and ``label_ids`` (integer class labels, shape ``(n,)``).

    Returns:
        Dict with the keys ``f1``, ``precision``, ``recall`` and ``auc_roc``.
        ``auc_roc`` is NaN when ``label_ids`` holds fewer than two classes
        (for example when predicting on data with placeholder labels), because
        ROC AUC is undefined in that case.
    """
    logits = torch.tensor(pred.predictions)
    labels = np.asarray(pred.label_ids)

    probs = torch.softmax(logits, dim=1)
    pred_classes = torch.argmax(probs, dim=1).numpy()
    positive_scores = probs[:, 1].numpy()

    # Guard the single-class case instead of letting the metric fail or warn.
    if np.unique(labels).size < 2:
        auc = float("nan")
    else:
        auc = roc_auc_score(labels, positive_scores)

    # zero_division=0 gives the same value scikit-learn reports for an
    # undefined score, but without the UndefinedMetricWarning.
    f1 = f1_score(labels, pred_classes, average="weighted", zero_division=0)
    precision = precision_score(labels, pred_classes, average="weighted", zero_division=0)
    recall = recall_score(labels, pred_classes, average="weighted", zero_division=0)

    return {
        "f1": f1,
        "precision": precision,
        "recall": recall,
        "auc_roc": auc
    }

training_args = TrainingArguments(
    # --- Output and reproducibility ---
    output_dir=run_name,                    # where checkpoints are written
    seed=42,                                # seed for reproducibility

    # --- Optimisation ---
    num_train_epochs=6,                     # number of epochs
    per_device_train_batch_size=16,         # training batch size
    per_device_eval_batch_size=16,          # evaluation batch size
    gradient_accumulation_steps=8,          # 16 x 8 = 128 samples per optimiser step per device
    learning_rate=2e-5,                     # peak learning rate
    warmup_ratio=0.05,                      # 5% of all training steps are used for warm-up
    lr_scheduler_type='cosine',             # scheduler overview:
                                            # https://www.kaggle.com/code/snnclsr/learning-rate-schedulers
                                            # (this one corresponds to get_cosine_schedule_with_warmup)

    # --- Logging ---
    logging_dir='./logs',                   # directory for logs
    logging_steps=20,                       # logging frequency, in steps

    # --- Checkpointing ---
    save_strategy='steps',                  # save on a step schedule
    save_steps=294,                         # checkpoint frequency, in steps
    save_total_limit=4,                     # keep at most four checkpoints

    # --- Evaluation and model selection ---
    eval_strategy='steps',                  # evaluate on a step schedule
    eval_steps=294,                         # evaluation frequency, in steps
    load_best_model_at_end=True,            # reload the best checkpoint after training
    metric_for_best_model='auc_roc',        # key returned by compute_metrics
    greater_is_better=True,                 # higher value = better
)

In [19]:
# num_pos = len(train_dataset.df[train_dataset.df['target'] == 1])
# num_neg = len(train_dataset.df[train_dataset.df['target'] == 0])
# weight = torch.tensor([1.0, num_neg / num_pos]).to("cuda")  # [w0, w1]
# class WeightedTrainer(Trainer):
#     def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
#         labels = inputs.pop("labels")
#         outputs = model(**inputs)
#         logits = outputs.get("logits")  # [B, num_labels]

#         loss_fct = CrossEntropyLoss(weight=weight)
#         loss = loss_fct(logits, labels.long())

#         return (loss, outputs) if return_outputs else loss

# Inverse-class-frequency weight for every training row, in `train_data` row
# order, so both classes are drawn with equal total probability.
class_counts = train_data['target'].value_counts().to_dict()
weights = (1.0 / train_data['target'].map(class_counts)).tolist()


class RandomSamplerTrainer(Trainer):
    """Trainer whose training batches are drawn by a class-balancing sampler."""

    def get_train_dataloader(self):
        """Build the training DataLoader around a WeightedRandomSampler.

        Uses the module-level ``weights`` list and draws ``len(dataset)``
        samples with replacement.
        """
        train_set = self.train_dataset
        balanced_sampler = WeightedRandomSampler(
            weights=weights,
            num_samples=len(train_set),
            replacement=True,
        )
        loader_kwargs = dict(
            batch_size=self.args.train_batch_size,
            sampler=balanced_sampler,
            collate_fn=self.data_collator,
            num_workers=self.args.dataloader_num_workers,
        )
        return DataLoader(train_set, **loader_kwargs)

In [20]:
# Wire the model, data and metrics into the class-balancing trainer.
trainer = RandomSamplerTrainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
# Run training; evaluation and checkpointing are both scheduled every 294 steps
# by `training_args`.
trainer.train()

Step,Training Loss,Validation Loss


In [None]:
# The test frame has no labels, but ISICDataset_train.__getitem__ reads
# row['target'], so a placeholder column of zeros is added.
test['target'] = 0
test_dataset = ISICDataset_train("test", test, preprocessor)

In [None]:
# Inference on the test set; `predictions.predictions` is consumed as raw logits.
predictions = trainer.predict(test_dataset)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [None]:
# Softmax over the class axis; column 1 is kept as the score for class index 1.
logits = torch.tensor(predictions.predictions)
probs = logits.softmax(dim=1)[:, 1]

In [None]:
# One row per test image: its name and the predicted score for class index 1.
submission_df = pd.DataFrame(dict(image_name=test['image_name'], target=probs))
submission_df.to_csv("submission3.csv", index=False)

In [None]:
# Score every training image (no augmentation) and export the class-1
# probabilities next to the image names.
full_train_dataset = ISICDataset_train(image_dir="train", df=train, preprocessor=preprocessor)
full_train_predictions = trainer.predict(full_train_dataset)

full_train_logits = torch.tensor(full_train_predictions.predictions)
full_train_probs = full_train_logits.softmax(dim=1)[:, 1]

submission_df = pd.DataFrame(dict(image_name=train['image_name'], target=full_train_probs))
submission_df.to_csv("train_submission4.csv", index=False)


ValueError: array length 10982 does not match index length 33126