In [1]:
# Download a file by id with the gdown CLI — presumably the contest archive
# that is unzipped two cells below; confirm the id still resolves.
!gdown 1Ua3QAi0oNMwM4-bmygY-HTDHx_ApwZZW

In [2]:
# List the current directory (run right after the download).
%ls

In [3]:
# Extract SharifML_Contest_NLP.zip into the current directory.
!unzip SharifML_Contest_NLP.zip

In [4]:
# List the current directory again (run right after the extraction).
%ls

In [5]:
import os
import pandas as pd
from PIL import Image
from transformers import ViltProcessor, ViltForQuestionAnswering
import torch
from torch.utils.data import Dataset, DataLoader

# Load the labels
# The annotation file is spelled 'lables.csv' (sic): every other cell of this
# notebook reads it under that name, so 'labels.csv' pointed at a file name
# that nothing else in the notebook uses.
labels_df = pd.read_csv('lables.csv')

# Define a custom dataset class
class VQADataset(Dataset):
    """Map-style dataset that turns one annotation row into processor inputs.

    Each row of ``dataframe`` must provide ``file_name`` (joined onto
    ``image_dir``) and ``question``. When the frame also has an ``answer``
    column, its integer value is attached to the sample as ``labels``; frames
    without that column (e.g. unlabeled test data) yield samples without
    ``labels`` instead of raising ``KeyError``.

    Args:
        dataframe: pandas DataFrame holding the annotation rows.
        image_dir: Directory that the ``file_name`` values are joined onto.
        processor: Callable invoked as ``processor(images=..., text=...,
            padding="max_length", truncation=True, return_tensors="pt")``.
    """

    def __init__(self, dataframe, image_dir, processor):
        self.dataframe = dataframe
        self.image_dir = image_dir
        self.processor = processor

    def __len__(self):
        """Return the number of annotation rows."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Build the processor encoding (plus ``labels`` if available) for row ``idx``."""
        item = self.dataframe.iloc[idx]
        image_path = os.path.join(self.image_dir, item['file_name'])
        # Always hand the processor an RGB image, whatever mode the file uses.
        image = Image.open(image_path).convert("RGB")
        encoding = self.processor(
            images=image,
            text=item['question'],
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # Only labelled frames carry an 'answer' column; skip the target otherwise.
        if 'answer' in item.index:
            encoding['labels'] = torch.tensor(int(item['answer']), dtype=torch.long)
        return encoding

# Initialize the processor and model
# Both are loaded from the same "dandelin/vilt-b32-finetuned-vqa" checkpoint.
processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

# Create the dataset and dataloader
# Image paths are resolved against the 'animals' directory; the loader yields
# shuffled batches of 4 samples.
train_dataset = VQADataset(labels_df, 'animals', processor)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

In [6]:
import os
import pandas as pd
from PIL import Image
from transformers import ViltProcessor, ViltForQuestionAnswering
import torch
from torch.utils.data import Dataset, DataLoader

# Load the labels
# The file name is spelled 'lables.csv' (sic). VQADataset below reads the
# 'file_name', 'question' and 'answer' columns of this frame.
labels_df = pd.read_csv('lables.csv')

# Define a custom dataset class
class VQADataset(Dataset):
    """Map-style dataset that turns one annotation row into processor inputs.

    Each row of ``dataframe`` must provide ``file_name`` (joined onto
    ``image_dir``) and ``question``. When the frame also has an ``answer``
    column, its integer value is attached to the sample as ``labels``; frames
    without that column (e.g. unlabeled test data) yield samples without
    ``labels`` instead of raising ``KeyError``.

    Args:
        dataframe: pandas DataFrame holding the annotation rows.
        image_dir: Directory that the ``file_name`` values are joined onto.
        processor: Callable invoked as ``processor(images=..., text=...,
            padding="max_length", truncation=True, return_tensors="pt")``.
    """

    def __init__(self, dataframe, image_dir, processor):
        self.dataframe = dataframe
        self.image_dir = image_dir
        self.processor = processor

    def __len__(self):
        """Return the number of annotation rows."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Build the processor encoding (plus ``labels`` if available) for row ``idx``."""
        item = self.dataframe.iloc[idx]
        image_path = os.path.join(self.image_dir, item['file_name'])
        # Always hand the processor an RGB image, whatever mode the file uses.
        image = Image.open(image_path).convert("RGB")
        encoding = self.processor(
            images=image,
            text=item['question'],
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # Only labelled frames carry an 'answer' column; skip the target otherwise.
        if 'answer' in item.index:
            encoding['labels'] = torch.tensor(int(item['answer']), dtype=torch.long)
        return encoding

# Initialize the processor and model
# Both are loaded from the same "dandelin/vilt-b32-finetuned-vqa" checkpoint.
processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

# Create the dataset and dataloader
# Image paths are resolved against the 'animals' directory; the loader yields
# shuffled batches of 4 samples.
train_dataset = VQADataset(labels_df, 'animals', processor)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

In [7]:
from transformers import AdamW

# Define the optimizer
# AdamW over every model parameter with a learning rate of 5e-5.
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_loss = 0  # running sum of the per-batch losses of this epoch
    for batch in train_loader:
        optimizer.zero_grad()
        # squeeze(1) drops a size-1 axis at position 1 — presumably the extra
        # leading axis each sample gets from return_tensors="pt"; TODO confirm.
        input_ids = batch['input_ids'].squeeze(1).to(device)
        attention_mask = batch['attention_mask'].squeeze(1).to(device)
        pixel_values = batch['pixel_values'].squeeze(1).to(device)
        labels = batch['labels'].to(device)

        # NOTE(review): the dataset supplies one integer label per sample, while
        # the ViltForQuestionAnswering docs describe `labels` as a float tensor
        # of shape (batch_size, num_labels) — confirm this call accepts them.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, pixel_values=pixel_values, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Report the mean per-batch loss over the whole loader.
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")

In [8]:
# Load the test info
test_df = pd.read_csv('test_info.csv')

# Create the test dataset and dataloader
# NOTE(review): VQADataset.__getitem__ reads item['answer']; if test_info.csv
# has no 'answer' column this raises KeyError on the first sample — the next
# cell switches to a label-free ValVQADataset, which suggests it did. Confirm.
test_dataset = VQADataset(test_df, 'test_images', processor)
# shuffle=False keeps predictions in the same order as the rows of test_df.
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Prediction loop
model.eval()
predictions = []

with torch.no_grad():
    for batch in test_loader:
        # squeeze(1) drops a size-1 axis at position 1 — presumably the extra
        # leading axis each sample gets from return_tensors="pt"; TODO confirm.
        input_ids = batch['input_ids'].squeeze(1).to(device)
        attention_mask = batch['attention_mask'].squeeze(1).to(device)
        pixel_values = batch['pixel_values'].squeeze(1).to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, pixel_values=pixel_values)
        logits = outputs.logits
        # Index of the largest logit along the last axis, one per sample.
        predicted_ids = torch.argmax(logits, dim=-1)
        predictions.extend(predicted_ids.cpu().numpy())

# Update the test dataframe with predictions
test_df['answer'] = predictions

# Save the predictions to a CSV file
submission = test_df[['file_name', 'question', 'answer']]
submission.to_csv('submission.csv', index=False)

In [9]:
# Define a custom dataset class
class ValVQADataset(Dataset):
    """Label-free dataset: one row of ``dataframe`` -> processor inputs.

    Rows must provide ``file_name`` (joined onto ``image_dir``) and
    ``question``; no ``answer`` column is read and no ``labels`` entry is
    produced.
    """

    def __init__(self, dataframe, image_dir, processor):
        self.dataframe, self.image_dir, self.processor = dataframe, image_dir, processor

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Open the row's image as RGB and encode it together with its question."""
        row = self.dataframe.iloc[idx]
        rgb_image = Image.open(os.path.join(self.image_dir, row['file_name'])).convert("RGB")
        return self.processor(
            images=rgb_image,
            text=row['question'],
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

# Load the test info
test_df = pd.read_csv('test_info.csv')

# Create the test dataset and dataloader
# shuffle=False keeps predictions in the same order as the rows of test_df.
test_dataset = ValVQADataset(test_df, 'test_images', processor)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Prediction loop
model.eval()
predictions = []

with torch.no_grad():
    for batch in test_loader:
        # squeeze(1) drops a size-1 axis at position 1 — presumably the extra
        # leading axis each sample gets from return_tensors="pt"; TODO confirm.
        input_ids = batch['input_ids'].squeeze(1).to(device)
        attention_mask = batch['attention_mask'].squeeze(1).to(device)
        pixel_values = batch['pixel_values'].squeeze(1).to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, pixel_values=pixel_values)
        logits = outputs.logits
        # Index of the largest logit along the last axis, one per sample.
        # NOTE(review): this is an index into the model's output classes, not
        # necessarily the integer scale used in the 'answer' labels — confirm.
        predicted_ids = torch.argmax(logits, dim=-1)
        predictions.extend(predicted_ids.cpu().numpy())

# Update the test dataframe with predictions
test_df['answer'] = predictions

# Save the predictions to a CSV file
submission = test_df[['file_name', 'question', 'answer']]
submission.to_csv('submission.csv', index=False)

In [10]:
import zipfile

# Create SharifML_Contest_NLP.ipynb with the IPython %notebook magic unless a
# file of that name already exists in the working directory; the archive built
# at the end of this cell lists that file.
if not os.path.exists(os.path.join(os.getcwd(), 'SharifML_Contest_NLP.ipynb')):
    %notebook -e SharifML_Contest_NLP.ipynb

def compress(file_names):
    """Bundle the given files into ``result.zip`` in the working directory.

    Args:
        file_names: Names of files located in the current directory. Each is
            stored in the archive under the same name, DEFLATE-compressed.

    The list is echoed to stdout before the archive is (re)created.
    """
    print("File Paths:", file_names, sep="\n")
    with zipfile.ZipFile("result.zip", mode="w") as archive:
        for member in file_names:
            archive.write("./" + member, arcname=member, compress_type=zipfile.ZIP_DEFLATED)

# Write the current `submission` frame (built by an earlier prediction cell)
# to disk again so the archive picks up its latest contents.
submission.to_csv('submission.csv', index=False)

# Package the notebook and the predictions into result.zip.
file_names = ['SharifML_Contest_NLP.ipynb', 'submission.csv']
compress(file_names)

In [11]:
import os
import pandas as pd
from PIL import Image
from transformers import ViltProcessor, ViltForQuestionAnswering
import torch
from torch.utils.data import Dataset, DataLoader

# Load the labels
# The file name is spelled 'lables.csv' (sic). VQADataset below reads the
# 'file_name', 'question' and 'answer' columns of this frame.
labels_df = pd.read_csv('lables.csv')

# Define a custom dataset class
class VQADataset(Dataset):
    """Map-style dataset that turns one annotation row into processor inputs.

    Each row of ``dataframe`` must provide ``file_name`` (joined onto
    ``image_dir``) and ``question``. When the frame also has an ``answer``
    column, its integer value is attached to the sample as ``labels``; frames
    without that column (e.g. unlabeled test data) yield samples without
    ``labels`` instead of raising ``KeyError``.

    Args:
        dataframe: pandas DataFrame holding the annotation rows.
        image_dir: Directory that the ``file_name`` values are joined onto.
        processor: Callable invoked as ``processor(images=..., text=...,
            padding="max_length", truncation=True, return_tensors="pt")``.
    """

    def __init__(self, dataframe, image_dir, processor):
        self.dataframe = dataframe
        self.image_dir = image_dir
        self.processor = processor

    def __len__(self):
        """Return the number of annotation rows."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Build the processor encoding (plus ``labels`` if available) for row ``idx``."""
        item = self.dataframe.iloc[idx]
        image_path = os.path.join(self.image_dir, item['file_name'])
        # Always hand the processor an RGB image, whatever mode the file uses.
        image = Image.open(image_path).convert("RGB")
        encoding = self.processor(
            images=image,
            text=item['question'],
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # Only labelled frames carry an 'answer' column; skip the target otherwise.
        if 'answer' in item.index:
            encoding['labels'] = torch.tensor(int(item['answer']), dtype=torch.long)
        return encoding

# Initialize the processor and model
# Both are loaded from the same "dandelin/vilt-b32-finetuned-vqa" checkpoint,
# so any earlier fine-tuning held in `model` is discarded here.
processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

# Create the dataset and dataloader
# NOTE(review): training images are now read from 'test_images' instead of
# 'animals' while the labels still come from labels_df — confirm that this
# directory really holds the files named in the label CSV.
# train_dataset = VQADataset(labels_df, 'animals', processor)
train_dataset = VQADataset(labels_df, 'test_images', processor)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

In [12]:
from transformers import AdamW

# Define the optimizer
# AdamW over every model parameter with a learning rate of 5e-5.
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_loss = 0  # running sum of the per-batch losses of this epoch
    for batch in train_loader:
        optimizer.zero_grad()
        # squeeze(1) drops a size-1 axis at position 1 — presumably the extra
        # leading axis each sample gets from return_tensors="pt"; TODO confirm.
        input_ids = batch['input_ids'].squeeze(1).to(device)
        attention_mask = batch['attention_mask'].squeeze(1).to(device)
        pixel_values = batch['pixel_values'].squeeze(1).to(device)
        labels = batch['labels'].to(device)

        # NOTE(review): the dataset supplies one integer label per sample, while
        # the ViltForQuestionAnswering docs describe `labels` as a float tensor
        # of shape (batch_size, num_labels) — confirm this call accepts them.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, pixel_values=pixel_values, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Report the mean per-batch loss over the whole loader.
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")

In [13]:
import os
import pandas as pd
from PIL import Image
from transformers import ViltProcessor, ViltForQuestionAnswering
import torch
from torch.utils.data import Dataset, DataLoader

# Load the labels
# The file name is spelled 'lables.csv' (sic). VQADataset below reads the
# 'file_name', 'question' and 'answer' columns of this frame.
labels_df = pd.read_csv('lables.csv')

# Define a custom dataset class
class VQADataset(Dataset):
    """Map-style dataset that turns one annotation row into processor inputs.

    Each row of ``dataframe`` must provide ``file_name`` (joined onto
    ``image_dir``) and ``question``. When the frame also has an ``answer``
    column, its integer value is attached to the sample as ``labels``; frames
    without that column (e.g. unlabeled test data) yield samples without
    ``labels`` instead of raising ``KeyError``.

    Args:
        dataframe: pandas DataFrame holding the annotation rows.
        image_dir: Directory that the ``file_name`` values are joined onto.
        processor: Callable invoked as ``processor(images=..., text=...,
            padding="max_length", truncation=True, return_tensors="pt")``.
    """

    def __init__(self, dataframe, image_dir, processor):
        self.dataframe = dataframe
        self.image_dir = image_dir
        self.processor = processor

    def __len__(self):
        """Return the number of annotation rows."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Build the processor encoding (plus ``labels`` if available) for row ``idx``."""
        item = self.dataframe.iloc[idx]
        image_path = os.path.join(self.image_dir, item['file_name'])
        # Always hand the processor an RGB image, whatever mode the file uses.
        image = Image.open(image_path).convert("RGB")
        encoding = self.processor(
            images=image,
            text=item['question'],
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # Only labelled frames carry an 'answer' column; skip the target otherwise.
        if 'answer' in item.index:
            encoding['labels'] = torch.tensor(int(item['answer']), dtype=torch.long)
        return encoding

# Initialize the processor and model
# Both are loaded from the same "dandelin/vilt-b32-finetuned-vqa" checkpoint,
# so any earlier fine-tuning held in `model` is discarded here.
processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

# Create the dataset and dataloader
# NOTE(review): training images are now read from 'test_images' instead of
# 'animals' while the labels still come from labels_df — confirm that this
# directory really holds the files named in the label CSV.
# train_dataset = VQADataset(labels_df, 'animals', processor)
train_dataset = VQADataset(labels_df, 'test_images', processor)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

In [14]:
from transformers import AdamW

# Define the optimizer
# AdamW over every model parameter with a learning rate of 5e-5.
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_loss = 0  # running sum of the per-batch losses of this epoch
    for batch in train_loader:
        optimizer.zero_grad()
        # squeeze(1) drops a size-1 axis at position 1 — presumably the extra
        # leading axis each sample gets from return_tensors="pt"; TODO confirm.
        input_ids = batch['input_ids'].squeeze(1).to(device)
        attention_mask = batch['attention_mask'].squeeze(1).to(device)
        pixel_values = batch['pixel_values'].squeeze(1).to(device)
        labels = batch['labels'].to(device)

        # NOTE(review): the dataset supplies one integer label per sample, while
        # the ViltForQuestionAnswering docs describe `labels` as a float tensor
        # of shape (batch_size, num_labels) — confirm this call accepts them.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, pixel_values=pixel_values, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Report the mean per-batch loss over the whole loader.
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")

In [15]:
import os
import pandas as pd
from PIL import Image
from transformers import ViltProcessor, ViltForQuestionAnswering
import torch
from torch.utils.data import Dataset, DataLoader

# Load the labels
# The file name is spelled 'lables.csv' (sic). VQADataset below reads the
# 'file_name', 'question' and 'answer' columns of this frame.
labels_df = pd.read_csv('lables.csv')

# Define a custom dataset class
class VQADataset(Dataset):
    """Map-style dataset producing processor inputs with a float regression target.

    Each row of ``dataframe`` must provide ``file_name`` (joined onto
    ``image_dir``) and ``question``. When the frame also has an ``answer``
    column, its value is truncated to an integer and attached to the sample
    as a float ``labels`` scalar; frames without that column (e.g. unlabeled
    test data) yield samples without ``labels`` instead of raising
    ``KeyError``.

    Args:
        dataframe: pandas DataFrame holding the annotation rows.
        image_dir: Directory that the ``file_name`` values are joined onto.
        processor: Callable invoked as ``processor(images=..., text=...,
            padding="max_length", truncation=True, return_tensors="pt")``.
    """

    def __init__(self, dataframe, image_dir, processor):
        self.dataframe = dataframe
        self.image_dir = image_dir
        self.processor = processor

    def __len__(self):
        """Return the number of annotation rows."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Build the processor encoding (plus ``labels`` if available) for row ``idx``."""
        item = self.dataframe.iloc[idx]
        image_path = os.path.join(self.image_dir, item['file_name'])
        # Always hand the processor an RGB image, whatever mode the file uses.
        image = Image.open(image_path).convert("RGB")
        encoding = self.processor(
            images=image,
            text=item['question'],
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # Only labelled frames carry an 'answer' column; skip the target otherwise.
        if 'answer' in item.index:
            # Use float for regression
            encoding['labels'] = torch.tensor(int(item['answer']), dtype=torch.float)
        return encoding

# Initialize the processor and model
# Both are loaded from the same "dandelin/vilt-b32-finetuned-vqa" checkpoint.
processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

# Modify the model's output layer for regression
# `model.classifier` is a stack of layers (an nn.Sequential in
# ViltForQuestionAnswering), so it has no `in_features` attribute of its own.
# Swap only its last Linear for a single-output layer and keep the rest.
model.classifier[-1] = torch.nn.Linear(model.classifier[-1].in_features, 1)

# Create the dataset and dataloader
# NOTE(review): training images are read from 'test_images' while the labels
# come from labels_df — confirm this directory holds the labelled files.
train_dataset = VQADataset(labels_df, 'test_images', processor)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

In [16]:
import os
import pandas as pd
from PIL import Image
from transformers import ViltProcessor, ViltForQuestionAnswering
import torch
from torch.utils.data import Dataset, DataLoader

# Load the labels
# The file name is spelled 'lables.csv' (sic). VQADataset below reads the
# 'file_name', 'question' and 'answer' columns of this frame.
labels_df = pd.read_csv('lables.csv')

# Define a custom dataset class
class VQADataset(Dataset):
    """Map-style dataset producing processor inputs with a float regression target.

    Each row of ``dataframe`` must provide ``file_name`` (joined onto
    ``image_dir``) and ``question``. When the frame also has an ``answer``
    column, its value is truncated to an integer and attached to the sample
    as a float ``labels`` scalar; frames without that column (e.g. unlabeled
    test data) yield samples without ``labels`` instead of raising
    ``KeyError``.

    Args:
        dataframe: pandas DataFrame holding the annotation rows.
        image_dir: Directory that the ``file_name`` values are joined onto.
        processor: Callable invoked as ``processor(images=..., text=...,
            padding="max_length", truncation=True, return_tensors="pt")``.
    """

    def __init__(self, dataframe, image_dir, processor):
        self.dataframe = dataframe
        self.image_dir = image_dir
        self.processor = processor

    def __len__(self):
        """Return the number of annotation rows."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Build the processor encoding (plus ``labels`` if available) for row ``idx``."""
        item = self.dataframe.iloc[idx]
        image_path = os.path.join(self.image_dir, item['file_name'])
        # Always hand the processor an RGB image, whatever mode the file uses.
        image = Image.open(image_path).convert("RGB")
        encoding = self.processor(
            images=image,
            text=item['question'],
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # Only labelled frames carry an 'answer' column; skip the target otherwise.
        if 'answer' in item.index:
            # Use float for regression
            encoding['labels'] = torch.tensor(int(item['answer']), dtype=torch.float)
        return encoding

# Initialize the processor and model
# Both are loaded from the same "dandelin/vilt-b32-finetuned-vqa" checkpoint.
processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

# Modify the model's output layer for regression
# Only the last layer of the classifier stack is replaced, by a Linear with a
# single output; the new layer starts from fresh (untrained) weights.
# model.classifier = torch.nn.Linear(model.classifier.in_features, 1)
model.classifier[-1] = torch.nn.Linear(model.classifier[-1].in_features, 1)

# Create the dataset and dataloader
# NOTE(review): training images are read from 'test_images' while the labels
# come from labels_df — confirm this directory holds the labelled files.
# train_dataset = VQADataset(labels_df, 'animals', processor)
train_dataset = VQADataset(labels_df, 'test_images', processor)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

In [17]:
from transformers import AdamW

# Define the optimizer
# AdamW over every model parameter with a learning rate of 5e-5.
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Use MSE loss for regression. It is reached through `torch.nn` because no
# `nn` alias has been imported at this point in the notebook, and it is built
# once here rather than on every batch.
criterion = torch.nn.MSELoss()

num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_loss = 0  # running sum of the per-batch losses of this epoch
    for batch in train_loader:
        optimizer.zero_grad()
        # squeeze(1) drops a size-1 axis at position 1 — presumably the extra
        # leading axis each sample gets from return_tensors="pt"; TODO confirm.
        input_ids = batch['input_ids'].squeeze(1).to(device)
        attention_mask = batch['attention_mask'].squeeze(1).to(device)
        pixel_values = batch['pixel_values'].squeeze(1).to(device)
        labels = batch['labels'].to(device)

        # The loss is computed here rather than by the model, so no `labels`
        # argument is passed to the forward call.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, pixel_values=pixel_values)
        # Drop the trailing size-1 output axis so predictions line up with `labels`.
        logits = outputs.logits.squeeze(-1)
        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Report the mean per-batch loss over the whole loader.
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")

In [18]:
from transformers import AdamW
import torch
import torch.nn as nn

# Define the optimizer
# AdamW over every model parameter with a learning rate of 5e-5.
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_loss = 0  # running sum of the per-batch losses of this epoch
    for batch in train_loader:
        optimizer.zero_grad()
        # squeeze(1) drops a size-1 axis at position 1 — presumably the extra
        # leading axis each sample gets from return_tensors="pt"; TODO confirm.
        input_ids = batch['input_ids'].squeeze(1).to(device)
        attention_mask = batch['attention_mask'].squeeze(1).to(device)
        pixel_values = batch['pixel_values'].squeeze(1).to(device)
        labels = batch['labels'].to(device)

        # Previous approach (loss computed by the model), kept for reference:
        # outputs = model(input_ids=input_ids, attention_mask=attention_mask, pixel_values=pixel_values, labels=labels)
        # loss = outputs.loss
        
        # The loss is computed below instead, so no `labels` are passed here.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, pixel_values=pixel_values)
        # Drop the trailing output axis so predictions line up with `labels`.
        logits = outputs.logits.squeeze(-1)
        loss = nn.MSELoss()(logits, labels)  # Use MSE loss for regression

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Report the mean per-batch loss over the whole loader.
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")

In [19]:
# Define a custom dataset class
class ValVQADataset(Dataset):
    """Label-free dataset: one row of ``dataframe`` -> processor inputs.

    Rows must provide ``file_name`` (joined onto ``image_dir``) and
    ``question``; no ``answer`` column is read and no ``labels`` entry is
    produced.
    """

    def __init__(self, dataframe, image_dir, processor):
        self.dataframe, self.image_dir, self.processor = dataframe, image_dir, processor

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Open the row's image as RGB and encode it together with its question."""
        row = self.dataframe.iloc[idx]
        rgb_image = Image.open(os.path.join(self.image_dir, row['file_name'])).convert("RGB")
        return self.processor(
            images=rgb_image,
            text=row['question'],
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

# Load the test info
test_df = pd.read_csv('test_info.csv')

# Create the test dataset and dataloader
# shuffle=False keeps predictions in the same order as the rows of test_df.
test_dataset = ValVQADataset(test_df, 'test_images', processor)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Prediction loop
model.eval()
predictions = []

with torch.no_grad():
    for batch in test_loader:
        # squeeze(1) drops a size-1 axis at position 1 — presumably the extra
        # leading axis each sample gets from return_tensors="pt"; TODO confirm.
        input_ids = batch['input_ids'].squeeze(1).to(device)
        attention_mask = batch['attention_mask'].squeeze(1).to(device)
        pixel_values = batch['pixel_values'].squeeze(1).to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, pixel_values=pixel_values)
        logits = outputs.logits
        if logits.shape[-1] == 1:
            # Regression head (the classifier's last layer was replaced by a
            # single-output Linear earlier in the notebook): argmax over one
            # value is always 0, so round the predicted value instead.
            predicted_ids = torch.round(logits.squeeze(-1)).long()
        else:
            # Multi-class head: pick the index of the largest logit.
            predicted_ids = torch.argmax(logits, dim=-1)
        predictions.extend(predicted_ids.cpu().numpy())

# Update the test dataframe with predictions
test_df['answer'] = predictions

# Save the predictions to a CSV file
submission = test_df[['file_name', 'question', 'answer']]
submission.to_csv('submission.csv', index=False)

In [20]:
# # Predict and save the predictions
# # TODO
# # submission =
# def predict(test_csv_path):
#     test_df = pd.read_csv(test_csv_path)
#     submission = []
    
#     for _, row in test_df.iterrows():
#         image = Image.open(os.path.join('test_images', row['file_name']))
#         question = row['question']
        
#         encoding = processor(
#             image, question, 
#             return_tensors="pt", 
#             padding="max_length", 
#             truncation=True
#         ).to(device)
        
#         with torch.no_grad():
#             outputs = model(**encoding)
        
#         pred = torch.argmax(outputs.logits).item() + 1  # Convert back to 1-4
#         submission.append({
#             'file_name': row['file_name'],
#             'question': question,
#             'answer': pred
#         })
    
#     return pd.DataFrame(submission)

# # Generate submission
# submission = predict('test_info.csv')
# submission.to_csv('submission.csv', index=False)

# ****************

# Prepare the submission dataframe
# `predictions` is the list filled by the preceding prediction loop, one entry
# per row of test_df. The name `test_predictions` used here before is never
# assigned anywhere in this notebook and raised NameError.
submission = pd.DataFrame({
    'file_name': test_df['file_name'],
    'question': test_df['question'],
    'answer': predictions
})

# Save the submission dataframe to a CSV file
submission.to_csv('submission.csv', index=False)

In [21]:
# Define a custom dataset class
class ValVQADataset(Dataset):
    """Label-free dataset: one row of ``dataframe`` -> processor inputs.

    Rows must provide ``file_name`` (joined onto ``image_dir``) and
    ``question``; no ``answer`` column is read and no ``labels`` entry is
    produced.
    """

    def __init__(self, dataframe, image_dir, processor):
        self.dataframe, self.image_dir, self.processor = dataframe, image_dir, processor

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Open the row's image as RGB and encode it together with its question."""
        row = self.dataframe.iloc[idx]
        rgb_image = Image.open(os.path.join(self.image_dir, row['file_name'])).convert("RGB")
        return self.processor(
            images=rgb_image,
            text=row['question'],
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

# Load the test info
test_df = pd.read_csv('test_info.csv')

# Create the test dataset and dataloader
# shuffle=False keeps predictions in the same order as the rows of test_df.
test_dataset = ValVQADataset(test_df, 'test_images', processor)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Prediction loop
model.eval()
predictions = []

with torch.no_grad():
    for batch in test_loader:
        # squeeze(1) drops a size-1 axis at position 1 — presumably the extra
        # leading axis each sample gets from return_tensors="pt"; TODO confirm.
        input_ids = batch['input_ids'].squeeze(1).to(device)
        attention_mask = batch['attention_mask'].squeeze(1).to(device)
        pixel_values = batch['pixel_values'].squeeze(1).to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, pixel_values=pixel_values)
        logits = outputs.logits
        if logits.shape[-1] == 1:
            # Regression head (the classifier's last layer was replaced by a
            # single-output Linear earlier in the notebook): argmax over one
            # value is always 0, so round the predicted value instead.
            predicted_ids = torch.round(logits.squeeze(-1)).long()
        else:
            # Multi-class head: pick the index of the largest logit.
            predicted_ids = torch.argmax(logits, dim=-1)
        predictions.extend(predicted_ids.cpu().numpy())

# Update the test dataframe with predictions
test_df['answer'] = predictions

# Save the predictions to a CSV file
submission = test_df[['file_name', 'question', 'answer']]
submission.to_csv('submission.csv', index=False)

In [22]:
import zipfile

# Create SharifML_Contest_NLP.ipynb with the IPython %notebook magic unless a
# file of that name already exists in the working directory; the archive built
# at the end of this cell lists that file.
if not os.path.exists(os.path.join(os.getcwd(), 'SharifML_Contest_NLP.ipynb')):
    %notebook -e SharifML_Contest_NLP.ipynb

def compress(file_names):
    """Bundle the given files into ``result.zip`` in the working directory.

    Args:
        file_names: Names of files located in the current directory. Each is
            stored in the archive under the same name, DEFLATE-compressed.

    The list is echoed to stdout before the archive is (re)created.
    """
    print("File Paths:", file_names, sep="\n")
    with zipfile.ZipFile("result.zip", mode="w") as archive:
        for member in file_names:
            archive.write("./" + member, arcname=member, compress_type=zipfile.ZIP_DEFLATED)

# Write the current `submission` frame (built by an earlier prediction cell)
# to disk again so the archive picks up its latest contents.
submission.to_csv('submission.csv', index=False)

# Package the notebook and the predictions into result.zip.
file_names = ['SharifML_Contest_NLP.ipynb', 'submission.csv']
compress(file_names)

In [23]:
# Define a custom dataset class
class ValVQADataset(Dataset):
    """Label-free dataset: one row of ``dataframe`` -> processor inputs.

    Rows must provide ``file_name`` (joined onto ``image_dir``) and
    ``question``; no ``answer`` column is read and no ``labels`` entry is
    produced.
    """

    def __init__(self, dataframe, image_dir, processor):
        self.dataframe, self.image_dir, self.processor = dataframe, image_dir, processor

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Open the row's image as RGB and encode it together with its question."""
        row = self.dataframe.iloc[idx]
        rgb_image = Image.open(os.path.join(self.image_dir, row['file_name'])).convert("RGB")
        return self.processor(
            images=rgb_image,
            text=row['question'],
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

# Load the test info
test_df = pd.read_csv('test_info.csv')

# Create the test dataset and dataloader
# shuffle=False keeps predictions in the same order as the rows of test_df.
test_dataset = ValVQADataset(test_df, 'test_images', processor)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Prediction loop
model.eval()
predictions = []

with torch.no_grad():
    # Every batch is predicted. Stopping after the first one left only a
    # single batch of predictions, which cannot be assigned to the full-length
    # 'answer' column below.
    for batch_idx, batch in enumerate(test_loader):
        # squeeze(1) drops a size-1 axis at position 1 — presumably the extra
        # leading axis each sample gets from return_tensors="pt"; TODO confirm.
        input_ids = batch['input_ids'].squeeze(1).to(device)
        attention_mask = batch['attention_mask'].squeeze(1).to(device)
        pixel_values = batch['pixel_values'].squeeze(1).to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, pixel_values=pixel_values)
        logits = outputs.logits
        if batch_idx == 0:
            # Debug aid: show the raw model output for the first batch only.
            print(outputs)
            print(logits)
        if logits.shape[-1] == 1:
            # Regression head (single output): argmax over one value is always
            # 0, so round the predicted value instead.
            predicted_ids = torch.round(logits.squeeze(-1)).long()
        else:
            # Multi-class head: pick the index of the largest logit.
            predicted_ids = torch.argmax(logits, dim=-1)
        predictions.extend(predicted_ids.cpu().numpy())

# Update the test dataframe with predictions
test_df['answer'] = predictions

# Save the predictions to a CSV file
submission = test_df[['file_name', 'question', 'answer']]
submission.to_csv('submission.csv', index=False)

In [24]:
from transformers import AdamW
import torch
import torch.nn as nn

# Define the optimizer
# AdamW over every model parameter with a learning rate of 5e-5.
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Use MSE loss for regression; built once instead of on every batch.
criterion = nn.MSELoss()

num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_loss = 0  # running sum of the per-batch losses of this epoch
    batches_seen = 0  # batches actually processed (the loop stops early)
    for batch in train_loader:
        optimizer.zero_grad()
        # squeeze(1) drops a size-1 axis at position 1 — presumably the extra
        # leading axis each sample gets from return_tensors="pt"; TODO confirm.
        input_ids = batch['input_ids'].squeeze(1).to(device)
        attention_mask = batch['attention_mask'].squeeze(1).to(device)
        pixel_values = batch['pixel_values'].squeeze(1).to(device)
        labels = batch['labels'].to(device)
        print(labels)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, pixel_values=pixel_values)
        print(outputs)
        # Drop the trailing output axis so predictions line up with `labels`.
        logits = outputs.logits.squeeze(-1)
        print(logits)
        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        batches_seen += 1

        # Debug run: inspect a single batch per epoch only.
        break

    # Average over the batches that actually ran; dividing by
    # len(train_loader) under-reported the loss because of the early break.
    print(f"Epoch {epoch + 1}, Loss: {total_loss / max(batches_seen, 1)}")

In [25]:
# Define a custom dataset class
class ValVQADataset(Dataset):
    """Label-free dataset: one row of ``dataframe`` -> processor inputs.

    Rows must provide ``file_name`` (joined onto ``image_dir``) and
    ``question``; no ``answer`` column is read and no ``labels`` entry is
    produced.
    """

    def __init__(self, dataframe, image_dir, processor):
        self.dataframe, self.image_dir, self.processor = dataframe, image_dir, processor

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Open the row's image as RGB and encode it together with its question."""
        row = self.dataframe.iloc[idx]
        rgb_image = Image.open(os.path.join(self.image_dir, row['file_name'])).convert("RGB")
        return self.processor(
            images=rgb_image,
            text=row['question'],
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

# Load the test info
test_df = pd.read_csv('test_info.csv')

# Create the test dataset and dataloader
# shuffle=False keeps predictions in the same order as the rows of test_df.
test_dataset = ValVQADataset(test_df, 'test_images', processor)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Prediction loop
model.eval()
predictions = []

with torch.no_grad():
    # Every batch is predicted. Stopping after the first one left only a
    # single batch of predictions, which cannot be assigned to the full-length
    # 'answer' column below.
    for batch_idx, batch in enumerate(test_loader):
        # squeeze(1) drops a size-1 axis at position 1 — presumably the extra
        # leading axis each sample gets from return_tensors="pt"; TODO confirm.
        input_ids = batch['input_ids'].squeeze(1).to(device)
        attention_mask = batch['attention_mask'].squeeze(1).to(device)
        pixel_values = batch['pixel_values'].squeeze(1).to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, pixel_values=pixel_values)
        logits = outputs.logits
        # Round the regression output to the nearest integer and flatten it to
        # one value per sample, so the column holds plain ints, not 1-element arrays.
        predicted_ids = torch.round(logits).int().reshape(-1)
        if batch_idx == 0:
            # Debug aid: show raw outputs and decoded answers for the first batch only.
            print(outputs)
            print(logits)
            print(predicted_ids)

        predictions.extend(predicted_ids.cpu().tolist())

# Update the test dataframe with predictions
test_df['answer'] = predictions

# Save the predictions to a CSV file
submission = test_df[['file_name', 'question', 'answer']]
submission.to_csv('submission.csv', index=False)

In [26]:
# Define a custom dataset class
class ValVQADataset(Dataset):
    """Label-free dataset: one row of ``dataframe`` -> processor inputs.

    Rows must provide ``file_name`` (joined onto ``image_dir``) and
    ``question``; no ``answer`` column is read and no ``labels`` entry is
    produced.
    """

    def __init__(self, dataframe, image_dir, processor):
        self.dataframe, self.image_dir, self.processor = dataframe, image_dir, processor

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Open the row's image as RGB and encode it together with its question."""
        row = self.dataframe.iloc[idx]
        rgb_image = Image.open(os.path.join(self.image_dir, row['file_name'])).convert("RGB")
        return self.processor(
            images=rgb_image,
            text=row['question'],
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

# Load the test info
test_df = pd.read_csv('test_info.csv')

# Create the test dataset and dataloader
# shuffle=False keeps predictions in the same order as the rows of test_df.
test_dataset = ValVQADataset(test_df, 'test_images', processor)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Prediction loop
model.eval()
predictions = []

with torch.no_grad():
    for batch in test_loader:
        # squeeze(1) drops a size-1 axis at position 1 — presumably the extra
        # leading axis each sample gets from return_tensors="pt"; TODO confirm.
        input_ids = batch['input_ids'].squeeze(1).to(device)
        attention_mask = batch['attention_mask'].squeeze(1).to(device)
        pixel_values = batch['pixel_values'].squeeze(1).to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, pixel_values=pixel_values)
        logits = outputs.logits
        # Round the regression output to the nearest integer, then flatten it
        # to one value per sample. Without the flatten each prediction is kept
        # as a 1-element array and ends up in the CSV as "[n]" instead of n.
        predicted_ids = torch.round(logits).int().reshape(-1)

        predictions.extend(predicted_ids.cpu().tolist())

# Update the test dataframe with predictions
test_df['answer'] = predictions

# Save the predictions to a CSV file
submission = test_df[['file_name', 'question', 'answer']]
submission.to_csv('submission.csv', index=False)