In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
print(os.listdir("/kaggle/input/training/train/"))


In [2]:
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, random_split
from torchvision import models, transforms
from PIL import Image
from transformers import BertModel, BertTokenizer

In [3]:
train_data = pd.read_csv('/kaggle/input/training-2/subtask_a_train (2).csv')
target_data = pd.read_csv('/kaggle/input/training-2/target_t (1).csv')

In [4]:
image_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])


In [5]:
class IdiomImageDataset(Dataset):
    def __init__(self, dataframe, target_df, image_dir):
        self.dataframe = dataframe
        self.target_df = target_df
        self.image_dir = image_dir
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        row = self.dataframe.iloc[index]
        target_row = self.target_df.iloc[index]
        sentence = row['sentence']
        idiom_name = row['compound'].replace("'", "_")
        image_names = [row[f'image{i}_name'] for i in range(1, 6)]

        expected_order = eval(target_row['target'])
        expected_order = [x - 1 for x in expected_order]

        inputs = self.tokenizer(sentence, return_tensors='pt', padding='max_length', truncation=True, max_length=128)
        images = []
        for img_name in image_names:
            img_path = os.path.join(self.image_dir, idiom_name, img_name)
            img = Image.open(img_path).convert('RGB')
            img = image_transforms(img)
            images.append(img)
        images_tensor = torch.stack(images)
        expected_order_tensor = torch.tensor(expected_order, dtype=torch.long)
        return inputs['input_ids'].squeeze(0), inputs['attention_mask'].squeeze(0), images_tensor, expected_order_tensor


In [9]:
class MultimodalRankingModel(nn.Module):
    def __init__(self):
        super(MultimodalRankingModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.resnet = models.resnet50(weights='DEFAULT')
        self.resnet.fc = nn.Identity()
        self.fc1 = nn.Linear(768 + 2048, 512)
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(512, 5)

    def forward(self, input_ids, attention_mask, images):
        text_features = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        text_features = text_features.mean(dim=1)
        
        batch_size, num_images, channels, height, width = images.size()
        images = images.view(batch_size * num_images, channels, height, width)
        image_features = self.resnet(images)
        image_features = image_features.view(batch_size, num_images, -1)
        
        combined_features = torch.cat((text_features.unsqueeze(1).expand(-1, num_images, -1), image_features), dim=2)
        x = torch.relu(self.fc1(combined_features))
        x = self.dropout(x)
        rankings = self.fc2(x).squeeze(-1)
        
        return rankings


In [10]:
# Set a seed for reproducibility
torch.manual_seed(42)

# Define image folder and initialize dataset
image_folder = '/kaggle/input/training/train'
dataset = IdiomImageDataset(train_data, target_data, image_folder)

# Define static train-test split
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Initialize data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [13]:
model = MultimodalRankingModel()
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()


In [17]:
def train_model(model, data_loader, criterion, optimizer, epochs=10, checkpoint_path='/kaggle/working/checkpoint.pth'):
    start_epoch = 0

    # Load checkpoint if it exists
    if os.path.exists(checkpoint_path):
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch'] + 1
        print(f"Resuming training from epoch {start_epoch}...")
    
    model.train()
    for epoch in range(start_epoch, epochs):
        total_loss = 0
        for input_ids, attention_mask, images, expected_order in data_loader:
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask, images)
            loss = 0
            for i in range(outputs.size(1)):  # Loop over each image's rank
                loss += criterion(outputs[:, i], expected_order[:, i])
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Print loss for the current epoch
        print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(data_loader)}')
        
        # Save checkpoint after each epoch
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': total_loss / len(data_loader),
        }, checkpoint_path)




In [18]:
train_model(model, train_loader, criterion, optimizer, epochs=50)
torch.save(model.state_dict(), 'multimodal_ranking_model.pth')

Epoch 1/50, Loss: 8.057949542999268
Epoch 2/50, Loss: 7.840265274047852
Epoch 3/50, Loss: 7.626566648483276
Epoch 4/50, Loss: 7.390778064727783
Epoch 5/50, Loss: 7.169898509979248
Epoch 6/50, Loss: 6.949058532714844
Epoch 7/50, Loss: 6.63524866104126
Epoch 8/50, Loss: 6.338412761688232
Epoch 9/50, Loss: 6.097182750701904
Epoch 10/50, Loss: 5.80884313583374
Epoch 11/50, Loss: 5.323753833770752
Epoch 12/50, Loss: 4.891294956207275
Epoch 13/50, Loss: 4.54177713394165
Epoch 14/50, Loss: 4.0530465841293335
Epoch 15/50, Loss: 3.6593425273895264
Epoch 16/50, Loss: 3.1793283224105835
Epoch 17/50, Loss: 2.7587671279907227
Epoch 18/50, Loss: 2.315897226333618
Epoch 19/50, Loss: 1.9299173951148987
Epoch 20/50, Loss: 1.592940330505371
Epoch 21/50, Loss: 1.3674546480178833
Epoch 22/50, Loss: 1.1155396103858948
Epoch 23/50, Loss: 0.9315878450870514
Epoch 24/50, Loss: 0.7360503077507019
Epoch 25/50, Loss: 0.573390007019043
Epoch 26/50, Loss: 0.48188231885433197
Epoch 27/50, Loss: 0.3742774426937103
E

In [71]:
def evaluate_model(model, data_loader):
    model.eval()
    all_predictions = []
    with torch.no_grad():
        for input_ids, attention_mask, images, _ in data_loader:
            outputs = model(input_ids, attention_mask, images)
            rankings = torch.argsort(outputs, dim=1)
            all_predictions.extend(rankings.cpu().numpy())
    return all_predictions


In [72]:
predicted_rankings = evaluate_model(model, test_loader)

# Convert each (5x5) prediction array to a rank order
final_predicted_rankings = []
for prediction_matrix in predicted_rankings:
    image_scores = prediction_matrix.sum(axis=1)
    ranked_order = np.argsort(image_scores)[::-1] + 1
    final_predicted_rankings.append(ranked_order.tolist())

# Get the test indices
test_indices = test_dataset.indices
print("Test Indices:", test_indices)

# True rankings for comparison
true_test_rankings = [eval(target_data.iloc[idx]['target']) for idx in test_indices]


Test Indices: [46, 1, 35, 4, 40, 11, 8, 44, 34, 52, 21, 48, 53, 67]


In [91]:
final_predicted_rankings

[[1, 2, 3, 4, 5],
 [2, 1, 3, 4, 5],
 [1, 3, 2, 5, 4],
 [1, 2, 4, 3, 5],
 [1, 2, 3, 5, 4]]

In [92]:
true_test_rankings 

[[1, 2, 3, 4, 5],
 [1, 2, 3, 4, 5],
 [1, 2, 3, 4, 5],
 [1, 2, 3, 4, 5],
 [1, 2, 3, 4, 5]]

In [101]:
def mean_reciprocal_rank(true_rankings, predicted_rankings):
    reciprocal_ranks = []
    for true, pred in zip(true_rankings, predicted_rankings):
        for i, p in enumerate(pred):
            if p == true[i]:
                reciprocal_ranks.append(1 / (i + 1))
                break
        else:
            reciprocal_ranks.append(0)
    return np.mean(reciprocal_ranks)
print(mean_reciprocal_rank(true_test_rankings,final_predicted_rankings))

Mean Reciprocal Rank (MRR): 0.5278 (52.78%)


In [107]:
def calculate_ranking_accuracy(predicted_rankings, true_rankings):
    """
    Calculate the percentage of images ranked correctly.
    
    Parameters:
    predicted_rankings (list of list of int): The predicted ranking for each image set.
    true_rankings (list of list of int): The true ranking for each image set.
    
    Returns:
    float: The average percentage of images ranked correctly.
    """
    assert len(predicted_rankings) == len(true_rankings), "Predicted and true rankings must have the same length."
    
    total_correct = 0
    total_images = 0
    
    for pred_ranking, true_ranking in zip(predicted_rankings, true_rankings):
        # Count correctly ranked images
        correct = sum(1 for p, t in zip(pred_ranking, true_ranking) if p == t)
        total_correct += correct
        total_images += len(true_ranking)
    
    accuracy = (total_correct / total_images) * 100
    return accuracy

accuracy = calculate_ranking_accuracy(true_test_rankings,final_predicted_rankings)
print(f"Ranking Accuracy: {accuracy:.2f}%")


Ranking Accuracy: 0.793
