<a href="https://colab.research.google.com/github/Aryan-Kamboj11/Multimodal-Hate-Speech-Detection/blob/main/MultiModalModelDevelopment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [43]:
import pandas as pd
import numpy as np
import ast
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
from torch.optim import AdamW # Import AdamW from torch.optim
from torchvision import transforms
from torchvision.models import ResNet50_Weights # Import weights for ResNet50
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from tqdm import tqdm

In [2]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    Device = torch.device("cuda")
else:
    Device = torch.device("cpu")
print(f"the available device is {Device}")

the available device is cuda


In [44]:
# Shared hyperparameters and paths used by the cells below.
# NOTE(review): the recorded output of the training cell ends in a CUDA
# OutOfMemoryError on a 14.74 GiB GPU. MAX_LEN and BATCH_SIZE are presumably the
# settings that matter most for memory here — confirm before re-running.
TEXT_MODEL = "bert-base-uncased"  # checkpoint name passed to BertTokenizer / BertModel.from_pretrained
IMAGE_MODEL = "resnet50"  # NOTE(review): not read by any code visible here — confirm it is still needed
MAX_LEN = 512  # max_length given to the tokenizer; every text is padded/truncated to this length
BATCH_SIZE = 32 # Reverting to original batch size (used for both the train and validation loaders)
EPOCHS = 5  # number of training epochs; also scales the scheduler's total step count
LEARNING_RATE = 2e-5  # lr passed to AdamW
NUM_LABELS = 6 # Updated for multi-label classification (output size of the classifier head)
SAVED_MODEL_PATH = '/content/drive/MyDrive/Dataset/multimodal_hate_speech_model.bin'  # destination of the best state_dict

In [37]:
def load_processed_data(file_path, base_path='/content/drive/MyDrive/'):
    """Load the preprocessed CSV and turn its image paths into full paths.

    Rows missing any of ``cleaned_text``, ``image_path`` or ``label`` are
    dropped, then ``base_path`` is prepended to every remaining ``image_path``.

    Args:
        file_path: Path of the CSV file to read.
        base_path: Prefix placed in front of each image path. It is joined by
            plain string concatenation (no separator is inserted), so it
            should end with ``/``. Defaults to the Google Drive mount root.

    Returns:
        The cleaned ``pandas.DataFrame``, or ``None`` when ``file_path`` does
        not exist (an error message is printed in that case).

    Raises:
        KeyError: If one of the required columns is absent from the CSV.
    """
    # Only the read itself can raise FileNotFoundError, so keep the try narrow.
    try:
        raw_df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: Data file not found at '{file_path}'.")
        return None

    # Drop rows with missing text, image path or label. Build a new frame
    # rather than mutating in place so the step is safe to re-run.
    df = raw_df.dropna(subset=['cleaned_text', 'image_path', 'label']).copy()

    # Construct full image paths; astype(str) guards against a non-string column.
    df['image_path'] = base_path + df['image_path'].astype(str)

    print(f"Successfully loaded and cleaned data from '{file_path}'.")
    return df

In [38]:
class MultiModalHateSpeechDataset(Dataset):
    """Dataset pairing a text, an image and a multi-label target per sample.

    Each item is a dict with the keys ``input_ids``, ``attention_mask``,
    ``image`` and ``label``.
    """

    def __init__(self, texts, image_paths, labels, tokenizer, max_len, transform):
        """Store the parallel sample sequences and the preprocessing objects.

        Args:
            texts: Indexable sequence of texts (each is converted with ``str``).
            image_paths: Indexable sequence of image file paths.
            labels: Indexable sequence of labels. A string entry is parsed as a
                Python literal (e.g. ``"[1, 0, 0, 1, 0, 0]"``); any other
                entry is handed to ``torch.tensor`` unchanged.
            tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
            max_len: Length every text is padded/truncated to.
            transform: Callable applied to the loaded RGB image.
        """
        self.texts = texts
        self.image_paths = image_paths
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.transform = transform

    def __len__(self):
        """Return the number of samples (length of ``texts``)."""
        return len(self.texts)

    @staticmethod
    def _parse_label(label):
        """Turn a stored label into something ``torch.tensor`` accepts.

        Raises:
            ValueError: If ``label`` is a string that is not a valid literal.
        """
        # Already-decoded labels (lists, arrays, ...) need no parsing.
        if not isinstance(label, str):
            return label
        try:
            return ast.literal_eval(label)
        except (ValueError, SyntaxError) as err:
            # literal_eval raises SyntaxError for truncated/garbled text as
            # well as ValueError; surface both as one clear error instead of
            # letting torch.tensor fail later on a raw string.
            raise ValueError(f"Could not evaluate label string: {label}") from err

    def __getitem__(self, item):
        """Return the tokenized text, transformed image and label tensor for ``item``."""
        text = str(self.texts[item])
        image_path = self.image_paths[item]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        try:
            # The context manager closes the underlying file handle promptly.
            with Image.open(image_path) as raw_image:
                image = self.transform(raw_image.convert("RGB"))
        except Exception as e:
            # Best effort: an unreadable image is replaced by an all-zero
            # 3x224x224 tensor so one bad file does not abort the whole run.
            print(f"Error loading image from {image_path}: {e}")
            image = torch.zeros((3, 224, 224))

        label = self._parse_label(self.labels[item])

        return {
            # encode_plus returns (1, max_len) tensors; flatten to 1-D per sample.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'image': image,
            'label': torch.tensor(label, dtype=torch.long)
        }

In [45]:
class MultiModalClassifier(nn.Module):
    """Late-fusion classifier over BERT text features and ResNet-50 image features.

    The pooled BERT output and the ResNet-50 features (taken before its final
    fully connected layer) are concatenated, passed through dropout and mapped
    to ``n_classes`` raw logits by a single linear layer.
    """

    def __init__(self, n_classes):
        """Build both pretrained backbones and the fusion head.

        Args:
            n_classes: Number of output logits.
        """
        super().__init__()
        # Imported here so this cell stays self-contained. Calling torchvision
        # directly (instead of torch.hub with the 'pytorch/vision:v0.10.0' tag)
        # avoids pairing an old pinned hub checkout with the newer `weights=`
        # enum API, and removes the extra GitHub fetch.
        from torchvision.models import resnet50

        self.bert = BertModel.from_pretrained(TEXT_MODEL)
        self.resnet = resnet50(weights=ResNet50_Weights.DEFAULT)

        # Remember the width of the features feeding the original FC layer,
        # then replace that layer so the network returns the features directly.
        resnet_output_dim = self.resnet.fc.in_features
        self.resnet.fc = nn.Identity()

        bert_output_dim = self.bert.config.hidden_size
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(bert_output_dim + resnet_output_dim, n_classes)

    def forward(self, input_ids, attention_mask, image):
        """Return the classification logits for a batch.

        Args:
            input_ids: Token ids forwarded to BERT.
            attention_mask: Attention mask forwarded to BERT.
            image: Image batch forwarded to ResNet-50.

        Returns:
            Output of the linear head (no sigmoid is applied here).
        """
        # Text branch: use BERT's pooled output.
        text_features = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        ).pooler_output

        # Image branch.
        image_features = self.resnet(image)

        # Fuse both modalities along the feature dimension.
        combined_features = torch.cat((text_features, image_features), dim=1)

        # Classification head.
        output = self.dropout(combined_features)
        return self.classifier(output)

In [46]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler):
    """Train ``model`` for one pass over ``data_loader``.

    For every batch the loss is computed against the float-cast labels,
    gradients are clipped to a max norm of 1.0, and the optimizer and
    scheduler are each stepped once before gradients are cleared.

    Returns:
        The sum of the per-batch losses divided by ``len(data_loader)``.
    """
    model.train()
    batch_losses = []

    for batch in tqdm(data_loader, desc="Training"):
        # Move the four tensors of the batch onto the target device.
        input_ids, attention_mask, images, targets = (
            batch[key].to(device)
            for key in ("input_ids", "attention_mask", "image", "label")
        )

        logits = model(input_ids=input_ids, attention_mask=attention_mask, image=images)

        batch_loss = loss_fn(logits, targets.float())
        batch_losses.append(batch_loss.item())

        # Backward pass, clip, update weights and learning rate, then reset grads.
        batch_loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return sum(batch_losses) / len(data_loader)

In [47]:
def eval_model(model, data_loader, loss_fn, device):
    """Evaluate the model on a validation set.

    Predictions are obtained by thresholding ``sigmoid(logits)`` at 0.5.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and ``image``.
        data_loader: Iterable of batches holding the keys ``input_ids``,
            ``attention_mask``, ``image`` and ``label``.
        loss_fn: Loss called as ``loss_fn(logits, labels.float())``.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(accuracy, f1, mean_loss)`` where ``accuracy`` is computed
        element-wise over every (sample, label) decision, ``f1`` is the
        sample-averaged F1 score, and ``mean_loss`` is the summed batch loss
        divided by ``len(data_loader)``.
    """
    model = model.eval()
    total_loss = 0
    pred_batches = []
    label_batches = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            images = batch["image"].to(device)
            labels = batch["label"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                image=images
            )

            loss = loss_fn(outputs, labels.float())
            total_loss += loss.item()

            # Get predictions (using a 0.5 threshold after sigmoid)
            preds = (torch.sigmoid(outputs) > 0.5).int()

            # Keep each batch as a (batch, n_labels) array. Flattening here
            # would destroy the per-sample rows that average='samples' needs.
            pred_batches.append(preds.cpu().numpy())
            label_batches.append(labels.cpu().numpy())

    y_pred = np.concatenate(pred_batches, axis=0)
    y_true = np.concatenate(label_batches, axis=0)

    # Sample-averaged F1 is only defined for 2-D multilabel indicator input.
    f1 = f1_score(y_true, y_pred, average='samples', zero_division=0)
    # Accuracy is deliberately label-wise: the fraction of individual
    # (sample, label) decisions that are correct.
    accuracy = accuracy_score(y_true.ravel(), y_pred.ravel())

    return accuracy, f1, total_loss / len(data_loader)

In [48]:
def _build_split(frame, tokenizer, transform, shuffle=False):
    """Wrap one data split in a dataset and a DataLoader; returns ``(dataset, loader)``."""
    dataset = MultiModalHateSpeechDataset(
        texts=frame.cleaned_text.to_numpy(),
        image_paths=frame.image_path.to_numpy(),
        labels=frame.label.to_numpy(),
        tokenizer=tokenizer,
        max_len=MAX_LEN,
        transform=transform,
    )
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=4, shuffle=shuffle)
    return dataset, loader


df = load_processed_data('/content/drive/MyDrive/Dataset/data_transformed.csv')
if df is not None:
    # 80/20 train/validation split with a fixed seed (no stratification).
    df_train, df_val = train_test_split(df, test_size=0.2, random_state=42)

    # Text and image preprocessing shared by both splits.
    tokenizer = BertTokenizer.from_pretrained(TEXT_MODEL)
    image_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # Only the training loader shuffles.
    train_dataset, train_data_loader = _build_split(
        df_train, tokenizer, image_transform, shuffle=True
    )
    val_dataset, val_data_loader = _build_split(df_val, tokenizer, image_transform)

    # Model, optimizer, linear-decay schedule (no warmup) and multi-label loss.
    model = MultiModalClassifier(n_classes=NUM_LABELS).to(Device)
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    total_steps = len(train_data_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps,
    )
    loss_fn = nn.BCEWithLogitsLoss().to(Device)

    best_f1 = 0
    for epoch in range(EPOCHS):
        print(f'\n--- Epoch {epoch + 1}/{EPOCHS} ---')

        train_loss = train_epoch(model, train_data_loader, loss_fn, optimizer, Device, scheduler)
        print(f'Train loss: {train_loss:.4f}')

        val_acc, val_f1, val_loss = eval_model(model, val_data_loader, loss_fn, Device)
        print(f'Val loss: {val_loss:.4f} | Val accuracy: {val_acc:.4f} | Val F1: {val_f1:.4f}')

        # Checkpoint whenever the validation F1 beats the best seen so far.
        if val_f1 > best_f1:
            torch.save(model.state_dict(), SAVED_MODEL_PATH)
            best_f1 = val_f1
            print(f"Best model saved to {SAVED_MODEL_PATH} (F1-score: {best_f1:.4f})")

Successfully loaded and cleaned data from '/content/drive/MyDrive/Dataset/data_transformed.csv'.


Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0
Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to /root/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 169MB/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 90.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 82.12 MiB is free. Process 2221 has 14.66 GiB memory in use. Of the allocated memory 14.42 GiB is allocated by PyTorch, and 118.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)