In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Step 1: Install necessary libraries
!pip install transformers torchvision pandas scikit-learn tqdm



# Step 3: Import libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel
from torchvision import models, transforms
from PIL import Image
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from tqdm import tqdm
import os

# Set device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Step 4: Define paths (update these paths based on your Google Drive structure)
base_dir = "/content/drive/MyDrive/GDG_project/Multimodal_dataset_assignment3"  # Update this path
train_csv = os.path.join(base_dir, "labels.csv")
val_csv = os.path.join(base_dir, "reference.csv")
image_dir = os.path.join(base_dir, "images")

# Step 5: Dataset Class
class MemotionDataset(Dataset):
    def __init__(self, csv_file, image_dir, text_column="text_ocr", text_max_length=64, image_size=224):
        self.data = pd.read_csv(csv_file)
        self.image_dir = image_dir
        self.text_column = text_column  # Use the correct column name for text
        self.text_max_length = text_max_length
        self.image_size = image_size
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.transform = transforms.Compose([
            transforms.Resize((image_size, image_size)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        text = row[self.text_column]  # Use the correct column name for text
        image_path = os.path.join(self.image_dir, row["image_name"])  # Ensure 'image_name' column exists
        image = Image.open(image_path).convert("RGB")
        image = self.transform(image)

        # Tokenize text
        inputs = self.tokenizer(
            text, max_length=self.text_max_length, padding="max_length", truncation=True, return_tensors="pt"
        )
        input_ids = inputs["input_ids"].squeeze(0)
        attention_mask = inputs["attention_mask"].squeeze(0)

        # Labels
        sentiment = row["overall_sentiment"]  # Use the correct column name for sentiment
        humour = row["humour"]  # Use the correct column name for humour
        sarcasm = row["sarcasm"]  # Use the correct column name for sarcasm
        offense = row["offensive"]  # Use the correct column name for offense
        motivation = row["motivational"]  # Use the correct column name for motivation

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "image": image,
            "sentiment": sentiment,
            "humour": humour,
            "sarcasm": sarcasm,
            "offense": offense,
            "motivation": motivation,
        }

# Step 6: Multimodal Model Architecture
class MultimodalModel(nn.Module):
    def __init__(self, text_model_name="bert-base-uncased", image_model_name="resnet50", num_classes=3):
        super(MultimodalModel, self).__init__()
        # Text model (BERT)
        self.text_model = BertModel.from_pretrained(text_model_name)
        self.text_fc = nn.Linear(self.text_model.config.hidden_size, 128)

        # Image model (ResNet)
        self.image_model = models.resnet50(pretrained=True)
        self.image_model.fc = nn.Linear(self.image_model.fc.in_features, 128)

        # Multimodal fusion
        self.fusion_fc = nn.Linear(128 + 128, 256)
        self.classifier = nn.Linear(256, num_classes)

    def forward(self, input_ids, attention_mask, image):
        # Text features
        text_outputs = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        text_features = text_outputs.last_hidden_state[:, 0, :]  # CLS token
        text_features = self.text_fc(text_features)

        # Image features
        image_features = self.image_model(image)
        image_features = image_features.view(image_features.size(0), -1)

        # Fusion
        combined_features = torch.cat((text_features, image_features), dim=1)
        combined_features = self.fusion_fc(combined_features)
        logits = self.classifier(combined_features)

        return logits

# Step 7: Training function
def train(model, dataloader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        images = batch["image"].to(device)
        labels = batch["sentiment"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(dataloader)

# Step 8: Evaluation function
def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            images = batch["image"].to(device)
            labels = batch["sentiment"].to(device)

            outputs = model(input_ids, attention_mask, images)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            preds = torch.argmax(outputs, dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    f1 = f1_score(true_labels, predictions, average="macro")
    return total_loss / len(dataloader), f1

# Step 9: Main function
def main():
    # Hyperparameters
    batch_size = 32
    num_epochs = 5
    learning_rate = 2e-5

    # Load dataset
    train_dataset = MemotionDataset(csv_file=train_csv, image_dir=image_dir, text_column="text_ocr")
    val_dataset = MemotionDataset(csv_file=val_csv, image_dir=image_dir, text_column="text_ocr")
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Initialize model, loss, and optimizer
    model = MultimodalModel(num_classes=3).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop
    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        train_loss = train(model, train_loader, criterion, optimizer, device)
        val_loss, val_f1 = evaluate(model, val_loader, criterion, device)
        print(f"Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val F1: {val_f1:.4f}")

    # Save the model
    model_save_path = os.path.join(base_dir, "multimodal_sentiment_model.pth")
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to {model_save_path}")

# Step 10: Run the main function
if __name__ == "__main__":
    main()

Using device: cuda




Epoch 1/5


  0%|          | 0/219 [00:04<?, ?it/s]


ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).