## Let`s begin!

At first, to get things going, we need to import all the libraries we will use:

In [80]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from transformers import BertTokenizer, BertModel
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import cohen_kappa_score
import optuna

To exclude the randomness in computing, we use the random_seed parameter:

In [81]:
random_seed = 42
np.random.seed(random_seed)
torch.manual_seed(random_seed)
torch.cuda.manual_seed_all(random_seed)

Let`s determine input and output paths and device we will work on:

In [82]:
train_csv = "/kaggle/input/deep-learning-for-computer-vision-and-nlp-2024-12/train.csv"
test_csv = "/kaggle/input/deep-learning-for-computer-vision-and-nlp-2024-12/test.csv"
images_dir = "/kaggle/input/deep-learning-for-computer-vision-and-nlp-2024-12/images"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Creating dataframes and filling all NA values in "Description" feature:

In [83]:
# Read data
train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

# Fill missing descriptions
train_df['Description'].fillna('', inplace=True)
test_df['Description'].fillna('', inplace=True)

## Data processing

Let`s use resampling to balance the dataset:

In [84]:
# Balance the dataset by resampling
class_counts = train_df['AdoptionSpeed'].value_counts()
max_count = class_counts.max()

balanced_train_df = pd.concat([
    train_df[train_df['AdoptionSpeed'] == cls].sample(max_count, replace=True, random_state=random_seed)
    for cls in class_counts.index
]).reset_index(drop=True)

As tokenizer we are going to use BERT:

In [85]:
# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



Let`s create a Dataset class we will work with:

In [86]:
class PetDataset(Dataset):
    def __init__(self, dataframe, images_dir, tokenizer, max_text_length=128, transform=None):
        self.dataframe = dataframe
        self.images_dir = images_dir
        self.tokenizer = tokenizer
        self.max_text_length = max_text_length
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        pet_id = row['PetID']
        description = row['Description']
        label = row.get('AdoptionSpeed', -1)

        # Tokenize text
        text_inputs = self.tokenizer(
            description,
            max_length=self.max_text_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        # Process images
        image_files = [f for f in os.listdir(self.images_dir) if f.startswith(pet_id)]
        images = []
        for image_file in image_files:
            image_path = os.path.join(self.images_dir, image_file)
            image = Image.open(image_path).convert("RGB")
            if self.transform:
                image = self.transform(image)
            images.append(image)

        if len(images) == 0:
            images = [torch.zeros(3, 224, 224)]  # Default empty image tensor

        images = torch.stack(images[:4])  # Take up to 4 images per ID

        return {
            'pet_id': pet_id,
            'text_inputs': {key: val.squeeze(0) for key, val in text_inputs.items()},
            'images': images,
            'label': torch.tensor(label, dtype=torch.long)
        }

collate_fn function:

In [87]:
def custom_collate_fn(batch):
    text_inputs = {
        'input_ids': torch.stack([item['text_inputs']['input_ids'] for item in batch]),
        'attention_mask': torch.stack([item['text_inputs']['attention_mask'] for item in batch])
    }

    max_images = max([item['images'].size(0) for item in batch])
    images = torch.stack([
        torch.cat([item['images'], torch.zeros(max_images - item['images'].size(0), 3, 224, 224)])
        if item['images'].size(0) < max_images else item['images']
        for item in batch
    ])

    labels = torch.stack([item['label'] for item in batch])
    pet_ids = [item['pet_id'] for item in batch]

    return {'pet_ids': pet_ids, 'text_inputs': text_inputs, 'images': images, 'labels': labels}

Let`s determine a way we will transform our images before the learning:

In [88]:
# Transform
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

Creating datasets and dataloaders:

In [89]:
from torch.utils.data import random_split, DataLoader

train_size = int(0.8 * len(balanced_train_df))
val_size = len(balanced_train_df) - train_size

train_indices, val_indices = random_split(range(len(balanced_train_df)), [train_size, val_size])

train_df = balanced_train_df.iloc[train_indices.indices]
val_df = balanced_train_df.iloc[val_indices.indices]

train_dataset = PetDataset(train_df, images_dir, tokenizer, transform=transform)
val_dataset = PetDataset(val_df, images_dir, tokenizer, transform=transform)
test_dataset = PetDataset(test_df, images_dir, tokenizer, transform=transform)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=custom_collate_fn)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=custom_collate_fn)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=custom_collate_fn)

## Model class and optuna

In this case we are going to use Resnet50 to process the images and BERT to process the text:

In [90]:
class PetModel(nn.Module):
    def __init__(self):
        super(PetModel, self).__init__()
        self.text_model = BertModel.from_pretrained('bert-base-uncased')
        self.image_model = models.resnet50(pretrained=True)
        self.image_model.fc = nn.Identity()  # Remove classification head

        self.fc = nn.Sequential(
            nn.Linear(768 + 2048, 512),  # Combining text (768) and image (2048) features
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, 5),  # Output layer with 5 units (for 5 classes)
            nn.Softmax(dim=1)    # Softmax activation on the output
        )

    def forward(self, text_inputs, images):
        # Process text inputs
        text_features = self.text_model(**text_inputs).pooler_output

        # Process image inputs
        batch_size, num_images, channels, height, width = images.size()
        image_features = []

        # Loop over each sample in the batch
        for i in range(batch_size):
            sample_images = images[i]  # Shape: (num_images, 3, 224, 224)
            sample_features = []

            # Loop over each image for the current sample
            for j in range(num_images):
                image = sample_images[j].unsqueeze(0)  # Shape: (1, 3, 224, 224)
                feature = self.image_model(image)  # Feature shape: (1, 2048)
                sample_features.append(feature)

            # Stack the features and average them across images
            sample_features = torch.stack(sample_features).mean(dim=0)  # Shape: (1, 2048)
            image_features.append(sample_features)

        # Stack all image features (shape: (batch_size, 2048))
        image_features = torch.stack(image_features)  

        # Flatten the text features (batch_size, 768) and image features (batch_size, 2048)
        text_features = text_features.view(batch_size, -1)  # Ensure text features are 2D
        image_features = image_features.view(batch_size, -1)  # Ensure image features are 2D

        # Concatenate text and image features
        combined_features = torch.cat((text_features, image_features), dim=1)

        # Final classification layer with softmax
        output = self.fc(combined_features)
        return output


Let`s create an Optuna objective function:

In [93]:
def objective(trial):
    # Hyperparameters for optimization
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
    lr = trial.suggest_float('lr', 1e-5, 1e-3, log=True)
    batch_size = trial.suggest_categorical('batch_size', [32, 64])

    # Update DataLoader with new batch_size
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate_fn, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=custom_collate_fn, num_workers=4)

    # Initialize model
    model = PetModel()
    model = nn.DataParallel(model)  # Wrap for multi-GPU
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
    criterion = nn.CrossEntropyLoss()

    # Training loop for 3 epochs
    num_epochs = 3
    for epoch in range(num_epochs):
        model.train()
        for batch in train_loader:
            input_ids = batch['text_inputs']['input_ids'].to(device)
            attention_mask = batch['text_inputs']['attention_mask'].to(device)
            images = batch['images'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(
                text_inputs={'input_ids': input_ids, 'attention_mask': attention_mask},
                images=images
            )
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

    # Validation
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batc8device)

            outputs = model(
                text_inputs={'input_ids': input_ids, 'attention_mask': attention_mask},
                images=images
            )
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    val_kappa = cohen_kappa_score(all_labels, all_preds, weights='quadratic')
    return val_kappa


And a study:

In [94]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

print('Найкраще значення Kappa:', study.best_value)
print('Найкращі гіперпараметри:')
for key, value in study.best_params.items():
    print(f'    {key}: {value}')

[I 2024-12-23 17:25:07,096] A new study created in memory with name: no-name-b38254cc-94d5-4e8a-a560-8963af689f0c
[I 2024-12-23 17:34:10,252] Trial 0 finished with value: 0.23752293442274786 and parameters: {'dropout_rate': 0.4391976263774061, 'lr': 7.694906509391704e-05, 'batch_size': 64}. Best is trial 0 with value: 0.23752293442274786.
[I 2024-12-23 17:43:10,624] Trial 1 finished with value: 0.4675050788585333 and parameters: {'dropout_rate': 0.20981693394453257, 'lr': 3.85392872666055e-05, 'batch_size': 64}. Best is trial 1 with value: 0.4675050788585333.
[I 2024-12-23 17:52:05,332] Trial 2 finished with value: 0.0 and parameters: {'dropout_rate': 0.2502427679044994, 'lr': 0.0009360564909188658, 'batch_size': 64}. Best is trial 1 with value: 0.4675050788585333.
[I 2024-12-23 18:01:01,443] Trial 3 finished with value: 0.19545676841874682 and parameters: {'dropout_rate': 0.4539803826193035, 'lr': 2.4590587389685526e-05, 'batch_size': 64}. Best is trial 1 with value: 0.467505078858533

Найкраще значення Kappa: 0.4675050788585333
Найкращі гіперпараметри:
    dropout_rate: 0.20981693394453257
    lr: 3.85392872666055e-05
    batch_size: 64


## Training and evaluation

Now that we have the best model parameters for training, let`s determine a function to train and evaluate our model:

In [101]:
# Training and evaluation functions
def train_one_epoch(model, loader, optimizer, criterion):
    model.train()
    total_loss = 0
    for batch in tqdm(loader, desc="Training", leave=False):
        text_inputs = {key: val.to(device) for key, val in batch['text_inputs'].items()}
        images = batch['images'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(text_inputs, images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(loader)

def evaluate(model, loader, criterion):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating", leave=False):
            text_inputs = {key: val.to(device) for key, val in batch['text_inputs'].items()}
            images = batch['images'].to(device)
            labels = batch['label'].to(device)

            outputs = model(text_inputs, images)
            loss = criterion(outputs, labels)

            total_loss += loss.item()

    return total_loss / len(loader)



Creating new dataloaders with new batch_size and starting the training:

In [95]:
best_params = study.best_params
batch_size = best_params['batch_size']
dropout_rate = best_params['dropout_rate']
learning_rate = best_params['lr']

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

model = PetModel() 
model = nn.DataParallel(model)
model = model.to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()



In [103]:
num_epochs = 15
for epoch in range(num_epochs):
    train_loss = train_one_epoch(model, train_loader, optimizer, criterion)
    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}")

                                                           

Epoch 1/15, Train Loss: 1.5722


                                                           

Epoch 2/15, Train Loss: 1.5284


                                                           

Epoch 3/15, Train Loss: 1.4339


                                                           

Epoch 4/15, Train Loss: 1.3443


                                                           

Epoch 5/15, Train Loss: 1.2584


                                                           

Epoch 6/15, Train Loss: 1.1928


                                                           

Epoch 7/15, Train Loss: 1.1467


                                                           

Epoch 8/15, Train Loss: 1.1155


                                                           

Epoch 9/15, Train Loss: 1.0835


                                                           

Epoch 10/15, Train Loss: 1.0717


                                                           

Epoch 11/15, Train Loss: 1.0523


                                                           

Epoch 12/15, Train Loss: 1.0392


                                                           

Epoch 13/15, Train Loss: 1.0400


                                                           

Epoch 14/15, Train Loss: 1.0266


                                                           

Epoch 15/15, Train Loss: 1.0151




In this notebook we will use some functions to evaluate our model with quadratic_weighted_kappa:

In [104]:
# The following 3 functions have been taken from Ben Hamner's github repository
# https://github.com/benhamner/Metrics
def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(rater_a + rater_b)
    if max_rating is None:
        max_rating = max(rater_a + rater_b)
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0 for i in range(num_ratings)]
                for j in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat


def histogram(ratings, min_rating=None, max_rating=None):
    if min_rating is None:
        min_rating = min(ratings)
    if max_rating is None:
        max_rating = max(ratings)
    num_ratings = int(max_rating - min_rating + 1)
    hist_ratings = [0 for x in range(num_ratings)]
    for r in ratings:
        hist_ratings[r - min_rating] += 1
    return hist_ratings


def quadratic_weighted_kappa(y, y_pred):
    rater_a = y
    rater_b = y_pred
    min_rating=None
    max_rating=None
    rater_a = np.array(rater_a, dtype=int)
    rater_b = np.array(rater_b, dtype=int)
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    numerator = 0.0
    denominator = 0.0

    for i in range(num_ratings):
        for j in range(num_ratings):
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items

    return (1.0 - numerator / denominator)

Let`s validate the model!

In [106]:
from tqdm import tqdm
import numpy as np
import torch

def validate_model(model, loader, criterion, device):
    model.eval()
    total_loss = 0
    all_targets = []
    all_predictions = []
    
    with torch.no_grad():
        for batch in tqdm(loader, desc="Validating", leave=False):
            text_inputs = {key: val.to(device) for key, val in batch['text_inputs'].items()}
            images = batch['images'].to(device)
            labels = batch['label'].to(device)

            outputs = model(text_inputs, images)

            loss = criterion(outputs, labels)
            total_loss += loss.item()

            all_targets.extend(labels.cpu().numpy())
            all_predictions.extend(outputs.argmax(dim=1).cpu().numpy())

    avg_loss = total_loss / len(loader)

    conf_mat = confusion_matrix(all_targets, all_predictions)
    print("\nConfusion Matrix:")
    for row in conf_mat:
        print(row)

    hist_targets = histogram(all_targets)
    hist_predictions = histogram(all_predictions)
    print("\nHistogram of Targets:", hist_targets)
    print("Histogram of Predictions:", hist_predictions)

    kappa_score = quadratic_weighted_kappa(all_targets, all_predictions)

    return avg_loss, kappa_score

val_loss, val_kappa = validate_model(model, val_loader, criterion, device)
print(f"\nValidation Loss: {val_loss:.4f}")
print(f"Kappa Score: {val_kappa:.4f}")


                                                           


Confusion Matrix:
[329, 43, 13, 27]
[66, 291, 34, 51]
[33, 46, 286, 45]
[49, 51, 26, 317]

Histogram of Targets: [412, 442, 410, 443]
Histogram of Predictions: [477, 431, 359, 440]

Validation Loss: 1.1878
Kappa Score: 0.6510




## Saving the predictions

And save the predictions to submit:

In [112]:
def save_predictions(model, loader, device, output_csv):
    model.eval()
    pet_ids = []
    predictions = []
    
    with torch.no_grad():
        for batch in tqdm(loader, desc="Predicting"):
            # Move data to device
            text_inputs = {key: val.to(device) for key, val in batch['text_inputs'].items()}
            images = batch['images'].to(device)

            # Get predictions
            outputs = model(text_inputs, images)
            preds = outputs.argmax(dim=1).cpu().numpy()  # Predicted classes

            pet_ids.extend(batch['pet_id'])
            predictions.extend(preds)

    # Save to CSV
    prediction_df = pd.DataFrame({'PetID': pet_ids, 'AdoptionSpeed': predictions})
    prediction_df.to_csv(output_csv, index=False)
    print(f"Predictions saved to {output_csv}")

In [113]:
test_output_csv = "test_predictions.csv"
save_predictions(model, test_loader, device, test_output_csv)

Predicting: 100%|██████████| 30/30 [00:18<00:00,  1.66it/s]

Predictions saved to test_predictions.csv



