In [None]:
print("Contents of /content/mvsa_dataset:")
print(os.listdir("/content/mvsa_dataset"))


Contents of /content/mvsa_dataset:
['MVSA_Single']


In [None]:
print("Contents of /content/mvsa_dataset/MVSA_Single:")
print(os.listdir("/content/mvsa_dataset/MVSA_Single"))


Contents of /content/mvsa_dataset/MVSA_Single:
['data', 'labelResultAll.txt']


In [None]:
import os
import torch
from torch.utils.data import Dataset
from PIL import Image
import pandas as pd

class MVSADataset(Dataset):
    def __init__(self, data_dir, label_file, transform=None, tokenizer=None, max_length=128):
        self.data_dir = data_dir
        self.transform = transform
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Read TSV (Tab-Separated Values) without header
        df = pd.read_csv(label_file, sep="\t", header=None, names=["id", "text_label", "image_label"])

        self.samples = []
        for _, row in df.iterrows():
            img_path = os.path.join(data_dir, "data", f"{row['id']}.jpg")
            txt_path = os.path.join(data_dir, "data", f"{row['id']}.txt")

            if os.path.exists(img_path) and os.path.exists(txt_path):
                with open(txt_path, "r", encoding="utf-8") as f:
                    text = f.read().strip()

                self.samples.append({
                    "image_path": img_path,
                    "text": text,
                    "label": row["text_label"]
                })

        self.label2idx = {"positive": 0, "neutral": 1, "negative": 2}

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]

        # Image
        image = Image.open(sample["image_path"]).convert("RGB")
        if self.transform:
            image = self.transform(image)

        # Text
        encoded = self.tokenizer(
            sample["text"],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        input_ids = encoded["input_ids"].squeeze()
        attention_mask = encoded["attention_mask"].squeeze()
        label = self.label2idx[sample["label"]]

        return image, input_ids, attention_mask, torch.tensor(label)


In [None]:
with open("/content/mvsa_dataset/MVSA_Single/labelResultAll.txt", "r", encoding="utf-8", errors="ignore") as f:
    text = f.read().strip()


In [None]:
import os
import pandas as pd
from torch.utils.data import Dataset
from PIL import Image

class MVSADataset(Dataset):
    def __init__(self, data_dir, label_file, transform=None, tokenizer=None, max_length=128):
        self.data_dir = data_dir
        self.transform = transform
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.samples = []

        # Load labels from the correct label file path
        label_path = label_file
        df = pd.read_csv(label_path, sep="\t", header=None, names=["id", "text_label", "image_label"])

        for _, row in df.iterrows():
            img_name = f"{row['id']}.jpg"
            txt_name = f"{row['id']}.txt"

            img_path = os.path.join(data_dir, "data", img_name)
            txt_path = os.path.join(data_dir, "data", txt_name)

            if os.path.exists(img_path) and os.path.exists(txt_path):
                # Safely read text with encoding fix
                with open(txt_path, "r", encoding="utf-8", errors="ignore") as f:
                    text = f.read().strip()

                self.samples.append({
                    "image": img_path,
                    "text": text,
                    "label": row["text_label"].strip().lower()  # You can also use "image_label"
                })

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]

        # Load and transform image
        image = Image.open(sample["image"]).convert("RGB")
        if self.transform:
            image = self.transform(image)

        # Tokenize text
        text = sample["text"]
        encoded_text = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        label = self._label_to_int(sample["label"])

        return image, encoded_text["input_ids"].squeeze(0), encoded_text["attention_mask"].squeeze(0), label

    def _label_to_int(self, label):
        mapping = {"positive": 0, "neutral": 1, "negative": 2}
        return mapping.get(label, 1)  # Default to neutral if unknown


In [None]:
import os
import pandas as pd
from torch.utils.data import Dataset
from PIL import Image

class MVSADataset(Dataset):
    def __init__(self, data_dir, label_file, transform=None, tokenizer=None, max_length=128):
        self.data_dir = data_dir
        self.transform = transform
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.samples = []
        self.missing_samples = 0  # Track missing samples

        # Load labels from the correct label file path
        label_path = label_file
        df = pd.read_csv(label_path, sep="\t", header=None, names=["id", "text_label", "image_label"])

        for _, row in df.iterrows():
            img_name = f"{row['id']}.jpg"
            txt_name = f"{row['id']}.txt"

            img_path = os.path.join(data_dir, "data", img_name)
            txt_path = os.path.join(data_dir, "data", txt_name)

            # Check if both the image and text file exist
            if os.path.exists(img_path) and os.path.exists(txt_path):
                # Safely read text with encoding fix
                with open(txt_path, "r", encoding="utf-8", errors="ignore") as f:
                    text = f.read().strip()

                self.samples.append({
                    "image": img_path,
                    "text": text,
                    "label": row["text_label"].strip().lower()  # You can also use "image_label"
                })
            else:
                self.missing_samples += 1  # Count missing samples

        print(f"Total missing samples: {self.missing_samples}")

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]

        # Load and transform image
        image = Image.open(sample["image"]).convert("RGB")
        if self.transform:
            image = self.transform(image)

        # Tokenize text
        text = sample["text"]
        encoded_text = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        label = self._label_to_int(sample["label"])

        return image, encoded_text["input_ids"].squeeze(0), encoded_text["attention_mask"].squeeze(0), label

    def _label_to_int(self, label):
        mapping = {"positive": 0, "neutral": 1, "negative": 2}
        return mapping.get(label, 1)  # Default to neutral if unknown


In [None]:
import os
import pandas as pd
from torch.utils.data import Dataset
from PIL import Image

class MVSADataset(Dataset):
    def __init__(self, data_dir, label_file, transform=None, tokenizer=None, max_length=128):
        self.data_dir = data_dir
        self.transform = transform
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.samples = []
        self.missing_samples = 0  # Track missing samples
        self.missing_files = []   # To log missing files

        # Load labels from the correct label file path
        label_path = label_file
        df = pd.read_csv(label_path, sep="\t", header=None, names=["id", "text_label", "image_label"])

        for _, row in df.iterrows():
            img_name = f"{row['id']}.jpg"
            txt_name = f"{row['id']}.txt"

            img_path = os.path.join(data_dir, "data", img_name)
            txt_path = os.path.join(data_dir, "data", txt_name)

            # Check if both the image and text file exist
            if os.path.exists(img_path) and os.path.exists(txt_path):
                # Safely read text with encoding fix
                with open(txt_path, "r", encoding="utf-8", errors="ignore") as f:
                    text = f.read().strip()

                self.samples.append({
                    "image": img_path,
                    "text": text,
                    "label": row["text_label"].strip().lower()  # You can also use "image_label"
                })
            else:
                self.missing_samples += 1  # Count missing samples
                self.missing_files.append({"image": img_path, "text": txt_path})  # Log missing files

        print(f"Total missing samples: {self.missing_samples}")
        if self.missing_samples > 0:
            print(f"Missing files: {self.missing_files[:10]}")  # Show a sample of the missing files for debugging

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]

        # Load and transform image
        image = Image.open(sample["image"]).convert("RGB")
        if self.transform:
            image = self.transform(image)

        # Tokenize text
        text = sample["text"]
        encoded_text = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        label = self._label_to_int(sample["label"])

        return image, encoded_text["input_ids"].squeeze(0), encoded_text["attention_mask"].squeeze(0), label

    def _label_to_int(self, label):
        mapping = {"positive": 0, "neutral": 1, "negative": 2}
        return mapping.get(label, 1)  # Default to neutral if unknown


In [None]:
import os
import pandas as pd
from torch.utils.data import Dataset
from PIL import Image

class MVSADataset(Dataset):
    def __init__(self, data_dir, label_file, transform=None, tokenizer=None, max_length=128):
        self.data_dir = data_dir
        self.transform = transform
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.samples = []
        self.missing_samples = 0  # Track missing samples
        self.missing_files = []   # To log missing files

        # Load labels from the correct label file path
        label_path = label_file
        df = pd.read_csv(label_path, sep="\t", header=None, names=["id", "text_label", "image_label"])

        for _, row in df.iterrows():
            img_name = f"{row['id']}.jpg"
            txt_name = f"{row['id']}.txt"

            img_path = os.path.join(data_dir, "data", img_name)
            txt_path = os.path.join(data_dir, "data", txt_name)

            # Check if both the image and text file exist
            if os.path.exists(img_path) and os.path.exists(txt_path):
                # Safely read text with encoding fix
                with open(txt_path, "r", encoding="utf-8", errors="ignore") as f:
                    text = f.read().strip()

                self.samples.append({
                    "image": img_path,
                    "text": text,
                    "label": row["text_label"].strip().lower()  # You can also use "image_label"
                })
            else:
                self.missing_samples += 1  # Count missing samples
                self.missing_files.append({"image": img_path, "text": txt_path})  # Log missing files

        print(f"Total missing samples: {self.missing_samples}")
        if self.missing_samples > 0:
            print(f"Missing files: {self.missing_files[:10]}")  # Show a sample of the missing files for debugging

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]

        # Load and transform image
        image = Image.open(sample["image"]).convert("RGB")
        if self.transform:
            image = self.transform(image)

        # Tokenize text
        text = sample["text"]
        encoded_text = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        label = self._label_to_int(sample["label"])

        return image, encoded_text["input_ids"].squeeze(0), encoded_text["attention_mask"].squeeze(0), label

    def _label_to_int(self, label):
        mapping = {"positive": 0, "neutral": 1, "negative": 2}
        return mapping.get(label, 1)  # Default to neutral if unknown


In [None]:
class MVSADataset(torch.utils.data.Dataset):
    def __init__(self, data_dir, label_file, transform=None, tokenizer=None, max_length=256):
        self.data_dir = data_dir
        self.label_file = label_file
        self.transform = transform
        self.tokenizer = tokenizer
        self.max_length = max_length

        self.samples = []

        # Read CSV or TXT file
        with open(self.label_file, "r", encoding="utf-8") as f:
            lines = f.readlines()

        for line in lines:
            name, sentiment = line.strip().split(",")  # Assuming name and sentiment are comma-separated

            # Define paths for image and text files
            img_path = os.path.join(self.data_dir, "data", f"{name}.jpg")
            txt_path = os.path.join(self.data_dir, "data", f"{name}.txt")

            # Skip missing pairs
            if not os.path.exists(img_path) or not os.path.exists(txt_path):
                continue

            # If both files exist, add sample to the list
            with open(txt_path, "r", encoding="utf-8") as txt_file:
                text = txt_file.read().strip()

            # Add sample dictionary to the list
            self.samples.append({
                'id': name,
                'text': text,
                'image': img_path,
                'label': sentiment
            })

        print(f"Total samples after cleaning: {len(self.samples)}")

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        img = Image.open(sample['image']).convert("RGB")
        text = sample['text']
        label = int(sample['label'])  # Assuming sentiment is an integer (0, 1, 2, etc.)

        # Apply transformation if provided
        if self.transform:
            img = self.transform(img)

        # Tokenize the text if a tokenizer is provided
        if self.tokenizer:
            encoding = self.tokenizer(text, padding='max_length', truncation=True, max_length=self.max_length, return_tensors="pt")
            input_ids = encoding['input_ids'].squeeze(0)  # Remove the batch dimension
            attention_mask = encoding['attention_mask'].squeeze(0)  # Remove the batch dimension
        else:
            input_ids = None
            attention_mask = None

        return img, input_ids, attention_mask, label


In [None]:
dataset = MVSADataset(
    data_dir="/content/mvsa_dataset/MVSA_Single",
    label_file="/content/mvsa_dataset/MVSA_Single/labelResultAll.txt",
    transform=transform,
    tokenizer=tokenizer
)

# Now you can proceed with the train/validation split and DataLoader creation as before


Total samples after cleaning: 0


In [None]:
with open("/content/mvsa_dataset/MVSA_Single/labelResultAll.txt", "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i < 5:  # Print the first 5 lines for inspection
            print(line.strip())



ID	text,image
1	neutral,positive
2	neutral,positive
3	neutral,positive
4	positive,positive


In [None]:
class MVSADataset(torch.utils.data.Dataset):
    def __init__(self, data_dir, label_file, transform=None, tokenizer=None, max_length=256):
        self.data_dir = data_dir
        self.label_file = label_file
        self.transform = transform
        self.tokenizer = tokenizer
        self.max_length = max_length

        self.samples = []

        # Read the label file
        with open(self.label_file, "r", encoding="utf-8", errors='ignore') as f:
            lines = f.readlines()

        for line in lines:
            # Split the line into the ID and labels (text_label, image_label)
            parts = line.strip().split("\t")
            sample_id = parts[0]
            text_label, image_label = parts[1].split(",")  # Split the labels on comma

            # Define paths for image and text files
            img_path = os.path.join(self.data_dir, "data", f"{sample_id}.jpg")
            txt_path = os.path.join(self.data_dir, "data", f"{sample_id}.txt")

            # Check if the image and text files exist
            if not os.path.exists(img_path) or not os.path.exists(txt_path):
                print(f"Skipping missing pair -> {img_path} or {txt_path}")
                continue

            # Read the text content, using 'ignore' to bypass encoding issues
            with open(txt_path, "r", encoding="utf-8", errors='ignore') as txt_file:
                text = txt_file.read().strip()

            # Add the sample to the list
            self.samples.append({
                'id': sample_id,
                'text': text,
                'image': img_path,
                'text_label': text_label,
                'image_label': image_label
            })

        print(f"Total samples after cleaning: {len(self.samples)}")


In [None]:
dataset = MVSADataset(
    data_dir="/content/mvsa_dataset/MVSA_Single",
    label_file="/content/mvsa_dataset/MVSA_Single/labelResultAll.txt",
    transform=transform,
    tokenizer=tokenizer
)

# Now you can proceed with the train/validation split and DataLoader creation as before


Skipping missing pair -> /content/mvsa_dataset/MVSA_Single/data/ID.jpg or /content/mvsa_dataset/MVSA_Single/data/ID.txt
Total samples after cleaning: 4869


In [None]:
# Inspect the structure of a sample
print(dataset.samples[:5])  # Print the first 5 samples to check their structure



[{'id': '1', 'text': 'How I feel today #legday #jelly #aching #gym', 'image': '/content/mvsa_dataset/MVSA_Single/data/1.jpg', 'text_label': 'neutral', 'image_label': 'positive'}, {'id': '2', 'text': 'grattis min griskulting!!!???? va bara tvungen oki s? sch ? @ingenkommeratttrodig #pig #happybday #wow #lovely #cut', 'image': '/content/mvsa_dataset/MVSA_Single/data/2.jpg', 'text_label': 'neutral', 'image_label': 'positive'}, {'id': '3', 'text': 'RT @polynminion: The moment I found my favourite tV character. #PROFOUNDLOVE', 'image': '/content/mvsa_dataset/MVSA_Single/data/3.jpg', 'text_label': 'neutral', 'image_label': 'positive'}, {'id': '4', 'text': '#escort We have a young and energetic team and we pride ourselves on offering the highes #hoer', 'image': '/content/mvsa_dataset/MVSA_Single/data/4.jpg', 'text_label': 'positive', 'image_label': 'positive'}, {'id': '5', 'text': 'RT @chrisashaffer: Went to SSC today to be a "movie star" to rep for the Deaf and got to meet an energetic great

In [None]:
missing = 0
for sample in dataset.samples:
    name = sample['id']  # Access the 'id' from the sample dictionary
    img = os.path.join("/content/mvsa_dataset/MVSA_Single/data", f"{name}.jpg")
    txt = os.path.join("/content/mvsa_dataset/MVSA_Single/data", f"{name}.txt")
    if not os.path.exists(img) or not os.path.exists(txt):
        print(f"Missing pair -> {img} or {txt}")
        missing += 1

print(f"Total missing: {missing}")


Total missing: 0


In [None]:
import os

data_folder = "/content/mvsa_dataset/MVSA_Single/data"

# List all image and text files
img_files = [f for f in os.listdir(data_folder) if f.endswith(".jpg")]
txt_files = [f for f in os.listdir(data_folder) if f.endswith(".txt")]

print(f"Number of images: {len(img_files)}")
print(f"Number of text files: {len(txt_files)}")

# Optionally: show a few filenames
print("Sample image files:", img_files[:5])
print("Sample text files:", txt_files[:5])


Number of images: 4869
Number of text files: 4869
Sample image files: ['2597.jpg', '2619.jpg', '1469.jpg', '2445.jpg', '4345.jpg']
Sample text files: ['4.txt', '3991.txt', '2072.txt', '3752.txt', '4355.txt']


In [None]:
label_file = "/content/mvsa_dataset/MVSA_Single/labelResultAll.txt"
data_dir = "/content/mvsa_dataset/MVSA_Single/data"

valid_samples = []

with open(label_file, "r", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split()
        if len(parts) < 2:
            continue

        name, label = parts[0], parts[1]
        img_path = os.path.join(data_dir, f"{name}.jpg")
        txt_path = os.path.join(data_dir, f"{name}.txt")

        if os.path.exists(img_path) and os.path.exists(txt_path):
            valid_samples.append((name, label))

print(f"✅ Valid pairs: {len(valid_samples)}")


✅ Valid pairs: 4869


In [None]:
import os

for root, dirs, files in os.walk("/content/MVSA_Single"):
    print(f"{root} -> {len(files)} files")


In [None]:
with open("/content/mvsa_dataset/MVSA_Single/labelResultAll.txt", "r", encoding="utf-8") as f:
    for _ in range(10):
        print(f.readline())


ID	text,image

1	neutral,positive

2	neutral,positive

3	neutral,positive

4	positive,positive

5	positive,positive

6	positive,positive

7	positive,positive

8	neutral,positive

9	positive,positive



In [None]:
label_file = "/content/mvsa_dataset/MVSA_Single/labelResultAll.txt"
data_dir = "/content/mvsa_dataset/MVSA_Single/data"

valid_samples = []

with open(label_file, "r", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split()
        if len(parts) < 2:
            continue

        name, label = parts[0], parts[1]
        img_path = os.path.join(data_dir, f"{name}.jpg")
        txt_path = os.path.join(data_dir, f"{name}.txt")

        if os.path.exists(img_path) and os.path.exists(txt_path):
            valid_samples.append((name, label))

print(f"✅ Valid pairs: {len(valid_samples)}")


✅ Valid pairs: 4869


In [None]:
import os

label_file = "/content/MVSA_Single/MVSA_Single/labelResultAll.txt"
data_dir = "/content/MVSA_Single/MVSA_Single"

image_ext = ".jpg"
valid_samples = []

with open(label_file, "r", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split()
        if len(parts) < 2:
            continue

        name, label = parts[0], parts[1]

        img_path = os.path.join(data_dir, "data", f"{name}{image_ext}")
        txt_path = os.path.join(data_dir, "data", f"{name}.txt")

        if os.path.exists(img_path) and os.path.exists(txt_path):
            valid_samples.append((name, label))

print(f"✅ Valid pairs found: {len(valid_samples)}")


✅ Valid pairs found: 4869


In [None]:
class MVSADataset(Dataset):
    def __init__(self, samples, data_dir, transform=None, tokenizer=None):
        self.samples = samples
        self.data_dir = data_dir
        self.transform = transform
        self.tokenizer = tokenizer
        self.label_map = {"positive": 0, "neutral": 1, "negative": 2}

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        img_name, label_str = self.samples[idx]
        label_str = label_str.split(',')[0].strip().lower()
        label = self.label_map.get(label_str, 1)  # Default to neutral if unknown

        img_path = os.path.join(self.data_dir, "data", f"{img_name}.jpg")
        txt_path = os.path.join(self.data_dir, "data", f"{img_name}.txt")

        image = Image.open(img_path).convert("RGB")
        text = open(txt_path, "r", encoding="utf-8").read().strip()

        if self.transform:
            image = self.transform(image)

        text_inputs = self.tokenizer(text, padding="max_length", truncation=True, return_tensors="pt", max_length=128)

        return image, text_inputs["input_ids"].squeeze(0), text_inputs["attention_mask"].squeeze(0), torch.tensor(label)


In [None]:
from torchvision import transforms
from transformers import BertTokenizer

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

dataset = MVSADataset(valid_samples, data_dir="/content/MVSA_Single/MVSA_Single", transform=transform, tokenizer=tokenizer)
train_loader = DataLoader(dataset, batch_size=16, shuffle=True)


In [None]:
label_file = "/content/MVSA_Single/MVSA_Single/labelResultAll.txt"
data_dir = "/content/MVSA_Single/data"

valid_samples = []

with open(label_file, "r", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split()
        if len(parts) < 2:
            continue

        name, label = parts[0], parts[1]
        img_path = os.path.join(data_dir, f"{name}.jpg")
        txt_path = os.path.join(data_dir, f"{name}.txt")

        if os.path.exists(img_path) and os.path.exists(txt_path):
            valid_samples.append((name, label))

print(f"✅ Valid pairs: {len(valid_samples)}")


✅ Valid pairs: 0


In [None]:
# Show a few lines from the label file
label_file_path = "/content/MVSA_Single/MVSA_Single/labelResultAll.txt"

with open(label_file_path, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        print(f"[{i}] {line.strip()}")
        if i == 9:
            break


[0] ID	text,image
[1] 1	neutral,positive
[2] 2	neutral,positive
[3] 3	neutral,positive
[4] 4	positive,positive
[5] 5	positive,positive
[6] 6	positive,positive
[7] 7	positive,positive
[8] 8	neutral,positive
[9] 9	positive,positive


In [None]:
import os

sample_id = "123"  # <-- Replace this with a real number from step 1 output

img_path = f"/content/MVSA_Single/MVSA_Single/data/{sample_id}.jpg"
txt_path = f"/content/MVSA_Single/MVSA_Single/data/{sample_id}.txt"

print("Image exists:", os.path.exists(img_path))
print("Text exists:", os.path.exists(txt_path))


Image exists: True
Text exists: True


In [None]:
import os

data_path = "/content/MVSA_Single/MVSA_Single/data"
files = os.listdir(data_path)

print("Sample files:")
print(files[:10])


Sample files:
['2785.jpg', '416.jpg', '2615.jpg', '3292.txt', '3829.txt', '2574.jpg', '230.txt', '1366.txt', '750.txt', '2869.jpg']


In [None]:
class MVSADataset(Dataset):
    def __init__(self, data_dir, label_file, transform=None, tokenizer=None):
        self.data_dir = data_dir
        self.label_file = label_file
        self.transform = transform
        self.tokenizer = tokenizer
        self.samples = []
        self.label_map = {"positive": 0, "neutral": 1, "negative": 2}

        with open(self.label_file, "r", encoding="utf-8") as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) < 2:
                    continue

                file_id = parts[0]
                label_str = parts[1].split(',')[0].strip().lower()  # handle things like "positive,neutral"

                img_path = os.path.join(self.data_dir, f"{file_id}.jpg")
                text_path = os.path.join(self.data_dir, f"{file_id}.txt")

                if os.path.exists(img_path) and os.path.exists(text_path):
                    label = self.label_map.get(label_str)
                    if label is not None:
                        self.samples.append((img_path, text_path, label))

        print(f"✅ Total valid samples loaded: {len(self.samples)}")

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        img_path, text_path, label = self.samples[idx]

        image = Image.open(img_path).convert("RGB")
        text = open(text_path, "r", encoding="utf-8").read().strip()

        if self.transform:
            image = self.transform(image)

        text_inputs = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )

        return (
            image,
            text_inputs["input_ids"].squeeze(0),
            text_inputs["attention_mask"].squeeze(0),
            label
        )


In [None]:
dataset = MVSADataset(
    data_dir="/content/MVSA_Single/MVSA_Single/data",
    label_file="/content/MVSA_Single/MVSA_Single/labelResultAll.txt",
    transform=your_transforms,
    tokenizer=tokenizer
)


✅ Total valid samples loaded: 4869


In [None]:
from torch.utils.data import DataLoader, random_split

# Split dataset
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)



In [None]:
import torch.nn as nn
import torchvision.models as models
from transformers import BertModel

class MultimodalClassifier(nn.Module):
    def __init__(self):
        super(MultimodalClassifier, self).__init__()

        # Image feature extractor
        self.cnn = models.resnet18(pretrained=True)
        self.cnn.fc = nn.Linear(self.cnn.fc.in_features, 256)

        # Text feature extractor
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.text_fc = nn.Linear(self.bert.config.hidden_size, 256)

        # Combined classifier
        self.classifier = nn.Sequential(
            nn.ReLU(),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, 3)  # 3 sentiment classes
        )

    def forward(self, image, input_ids, attention_mask):
        img_feat = self.cnn(image)
        text_feat = self.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output
        text_feat = self.text_fc(text_feat)

        combined = torch.cat((img_feat, text_feat), dim=1)
        out = self.classifier(combined)
        return out


In [None]:
import torch
from torch import nn, optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultimodalClassifier().to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)


In [None]:
# Safe text loading with fallback encoding
try:
    with open("/content/MVSA_Single/MVSA_Single/labelResultAll.txt", "r", encoding="utf-8") as f:
        text = f.read().strip()
except UnicodeDecodeError:
    with open("/content/MVSA_Single/MVSA_Single/labelResultAll.txt", "r", encoding="latin1") as f:
        text = f.read().strip()


In [None]:
from PIL import Image

def __getitem__(self, idx):
    img_name, sentiment = self.samples[idx]
    img_path = os.path.join(self.data_dir, "data", f"{img_name}.jpg")
    text_path = os.path.join(self.data_dir, "data", f"{img_name}.txt")

    # Load image
    image = Image.open(img_path).convert("RGB")

    # Try loading text with utf-8, fallback to latin1 if it fails
    try:
        with open(text_path, "r", encoding="utf-8") as f:
            text = f.read().strip()
    except UnicodeDecodeError:
        with open(text_path, "r", encoding="latin1") as f:
            text = f.read().strip()

    # Apply transforms
    if self.transform:
        image = self.transform(image)

    # Tokenize text
    tokens = self.tokenizer(
        text, padding="max_length", max_length=128, truncation=True, return_tensors="pt"
    )

    label = self.label2idx[sentiment]

    return (
        image,
        tokens["input_ids"].squeeze(0),
        tokens["attention_mask"].squeeze(0),
        torch.tensor(label),
    )


In [None]:
import torch.nn as nn
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)


In [None]:
from chardet import detect  # Optional: for auto-encoding detection (needs `pip install chardet`)

def __getitem__(self, idx):
    img_path, text_path, label = self.valid_pairs[idx]

    image = Image.open(img_path).convert("RGB")

    try:
        with open(text_path, "r", encoding="utf-8") as f:
            text = f.read().strip()
    except UnicodeDecodeError:
        # Option 1: Try ISO-8859-1 (Latin-1)
        with open(text_path, "r", encoding="ISO-8859-1") as f:
            text = f.read().strip()

        # Option 2 (better): Use chardet to detect encoding
        # with open(text_path, "rb") as f:
        #     raw = f.read()
        #     encoding = detect(raw)["encoding"]
        #     text = raw.decode(encoding).strip()

    if self.transform:
        image = self.transform(image)

    encoding = self.tokenizer(text, padding="max_length", truncation=True, return_tensors="pt")
    input_ids = encoding["input_ids"].squeeze(0)
    attention_mask = encoding["attention_mask"].squeeze(0)

    return image, input_ids, attention_mask, label


In [None]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)  # Adjust learning rate if needed


In [None]:
try:
    with open("/content/MVSA_Single/MVSA_Single/labelResultAll.txt", "r", encoding="utf-8") as f:
        text = f.read().strip()
except UnicodeDecodeError:
    with open("/content/MVSA_Single/MVSA_Single/labelResultAll.txt", "r", encoding="latin-1") as f:
        text = f.read().strip()


In [None]:
import os
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms
from transformers import BertTokenizer

class MultimodalDataset(Dataset):
    def __init__(self, data_dir, labels_dict, transform=None, tokenizer=None, max_length=128):
        self.data_dir = data_dir
        self.labels_dict = labels_dict
        self.transform = transform
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.samples = self._get_valid_samples()

    def _get_valid_samples(self):
        samples = []
        for key, label in self.labels_dict.items():
            img_path = os.path.join(self.data_dir, f"{key}.jpg")
            txt_path = os.path.join(self.data_dir, f"{key}.txt")
            if os.path.isfile(img_path) and os.path.isfile(txt_path):
                samples.append((img_path, txt_path, label))
        return samples

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        img_path, txt_path, label = self.samples[idx]

        # Load image
        image = Image.open(img_path).convert("RGB")
        if self.transform:
            image = self.transform(image)

        # Load text (handle encoding errors)
        with open(txt_path, "r", encoding="utf-8", errors="ignore") as f:
            text = f.read().strip()

        encoding = self.tokenizer(text, return_tensors="pt", padding="max_length",
                                  truncation=True, max_length=self.max_length)
        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)

        # Map labels to int
        label_map = {"positive": 0, "neutral": 1, "negative": 2}
        label = label_map[label.lower()]

        return image, input_ids, attention_mask, label


In [None]:
def load_labels(label_file_path):
    labels_dict = {}
    with open(label_file_path, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) >= 2:
                key = parts[0]
                label = parts[1].lower()
                if label in ["positive", "neutral", "negative"]:
                    labels_dict[key] = label
    return labels_dict


In [None]:
import os

print("Images & Text files:", len(os.listdir("/content/MVSA_Single/MVSA_Single/data")))
!head -n 5 /content/MVSA_Single/MVSA_Single/labelResultAll.txt


Images & Text files: 9738
ID	text,image
1	neutral,positive
2	neutral,positive
3	neutral,positive
4	positive,positive


In [None]:
import csv

def load_labels(label_file_path):
    labels_dict = {}
    with open(label_file_path, "r", encoding="utf-8", errors="ignore") as f:
        reader = csv.reader(f, delimiter='\t')  # Tab-separated
        next(reader)  # Skip the header line
        for row in reader:
            if len(row) == 2:
                sample_id = row[0].strip()
                label_str = row[1].strip()  # e.g. "neutral,positive"
                text_label = label_str.split(',')[0].lower()  # Get the text label only
                if text_label in ["positive", "neutral", "negative"]:
                    labels_dict[sample_id] = text_label
    return labels_dict


In [None]:
import os
import csv

def load_labels(label_file_path):
    labels_dict = {}
    with open(label_file_path, "r", encoding="utf-8", errors="ignore") as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader)  # Skip header
        for row in reader:
            if len(row) == 2:
                sample_id = row[0].strip()
                label_str = row[1].strip()
                text_label = label_str.split(',')[0].lower()  # Use only text label
                if text_label in ["positive", "neutral", "negative"]:
                    labels_dict[sample_id] = text_label
    return labels_dict


In [None]:
from torch.utils.data import Dataset
from PIL import Image
import torch
from torchvision import transforms
from transformers import BertTokenizer

class MVSADataset(Dataset):
    def __init__(self, data_dir, labels_dict, transform=None, tokenizer=None, max_length=128):
        self.data_dir = data_dir
        self.labels_dict = labels_dict
        self.samples = []
        self.transform = transform
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label2id = {"positive": 0, "neutral": 1, "negative": 2}

        for id_, label in labels_dict.items():
            img_path = os.path.join(data_dir, "data", f"{id_}.jpg")
            txt_path = os.path.join(data_dir, "data", f"{id_}.txt")
            if os.path.exists(img_path) and os.path.exists(txt_path):
                self.samples.append((img_path, txt_path, label))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        img_path, txt_path, label = self.samples[idx]

        image = Image.open(img_path).convert("RGB")
        if self.transform:
            image = self.transform(image)

        with open(txt_path, "r", encoding="utf-8", errors="ignore") as f:
            text = f.read().strip()

        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        return (
            image,
            encoding["input_ids"].squeeze(0),
            encoding["attention_mask"].squeeze(0),
            torch.tensor(self.label2id[label])
        )


In [None]:
from transformers import BertTokenizer
from torchvision import transforms

# Paths
label_file_path = "/content/MVSA_Single/MVSA_Single/labelResultAll.txt"
data_dir = "/content/MVSA_Single/MVSA_Single"

# Load labels
labels_dict = load_labels(label_file_path)

# Tokenizer & Transforms
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

# Create dataset
dataset = MVSADataset(
    data_dir=data_dir,
    labels_dict=labels_dict,
    transform=transform,
    tokenizer=tokenizer
)

print(f"Total samples: {len(dataset)}")


Total samples: 4869


In [None]:
from torch.utils.data import random_split, DataLoader

# Split dataset (80% train, 20% val)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_set, val_set = random_split(dataset, [train_size, val_size])

# Dataloaders
train_loader = DataLoader(train_set, batch_size=16, shuffle=True)
val_loader = DataLoader(val_set, batch_size=16)

print(f"Train: {len(train_set)}, Val: {len(val_set)}")


Train: 3895, Val: 974


In [None]:
import torch.nn as nn
from torchvision.models import resnet18
from transformers import BertModel

class MultiModalSentimentModel(nn.Module):
    def __init__(self):
        super(MultiModalSentimentModel, self).__init__()
        # Image encoder
        self.cnn = resnet18(pretrained=True)
        self.cnn.fc = nn.Identity()  # Output 512-d features

        # Text encoder
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.text_fc = nn.Linear(self.bert.config.hidden_size, 256)

        # Combined classifier
        self.classifier = nn.Sequential(
            nn.Linear(512 + 256, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 3)  # 3 classes
        )

    def forward(self, image, input_ids, attention_mask):
        img_feat = self.cnn(image)
        text_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        text_feat = self.text_fc(text_outputs.pooler_output)

        combined = torch.cat((img_feat, text_feat), dim=1)
        output = self.classifier(combined)
        return output


In [None]:
import torch
from torch import nn, optim
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = MultiModalSentimentModel().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)

num_epochs = 5


In [None]:
for epoch in range(num_epochs):
    model.train()
    total_loss, correct = 0, 0

    loop = tqdm(train_loader, desc=f"Epoch [{epoch+1}/{num_epochs}]")
    for images, input_ids, attention_mask, labels in loop:
        images = images.to(device)
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(images, input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        _, preds = torch.max(outputs, 1)
        correct += (preds == labels).sum().item()
        total_loss += loss.item()

        loop.set_postfix(loss=loss.item(), acc=100*correct/len(train_loader.dataset))

    # Validation after each epoch
    model.eval()
    val_correct = 0
    with torch.no_grad():
        for images, input_ids, attention_mask, labels in val_loader:
            images = images.to(device)
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(images, input_ids, attention_mask)
            _, preds = torch.max(outputs, 1)
            val_correct += (preds == labels).sum().item()

    val_acc = 100 * val_correct / len(val_loader.dataset)
    print(f"Validation Accuracy: {val_acc:.2f}%\n")


Epoch [1/5]: 100%|██████████| 244/244 [01:55<00:00,  2.11it/s, acc=53.5, loss=1.27]


Validation Accuracy: 72.07%



Epoch [2/5]: 100%|██████████| 244/244 [01:53<00:00,  2.14it/s, acc=76.2, loss=1.22]


Validation Accuracy: 73.41%



Epoch [3/5]: 100%|██████████| 244/244 [01:53<00:00,  2.15it/s, acc=85.7, loss=0.282]


Validation Accuracy: 76.18%



Epoch [4/5]: 100%|██████████| 244/244 [01:53<00:00,  2.15it/s, acc=92, loss=0.404]


Validation Accuracy: 74.85%



Epoch [5/5]: 100%|██████████| 244/244 [01:52<00:00,  2.16it/s, acc=96.4, loss=0.056]


Validation Accuracy: 74.74%



In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Define the path in Google Drive
model_save_path = "/content/drive/MyDrive/Colab Notebooks/multimodal_sentiment_model.pth"

# Save the model
torch.save(model.state_dict(), model_save_path)

print(f"Model saved to: {model_save_path}")

In [None]:
from PIL import Image
from transformers import BertTokenizer
import torch
import torch.nn.functional as F

# Load tokenizer (same as used during training)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define label mapping (adjust if your order is different)
idx2label = {0: "negative", 1: "neutral", 2: "positive"}

# Prediction function
def predict(image_path, text, model, transform, device='cuda' if torch.cuda.is_available() else 'cpu'):
    model.eval()
    model.to(device)

    # Image preprocessing
    image = Image.open(image_path).convert("RGB")
    if transform:
        image = transform(image).unsqueeze(0).to(device)  # Add batch dim

    # Text preprocessing
    encoding = tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Forward pass
    with torch.no_grad():
        outputs = model(image, input_ids, attention_mask)
        probs = F.softmax(outputs, dim=1)
        pred_label = torch.argmax(probs, dim=1).item()

    return idx2label[pred_label], probs.cpu().numpy().squeeze()


In [None]:
from sklearn.metrics import accuracy_score, f1_score

In [None]:
import torch.nn as nn
from transformers import BertModel

class MultimodalSentimentModel(nn.Module):
    def __init__(self, text_model_name='bert-base-uncased', num_classes=3):
        super(MultimodalSentimentModel, self).__init__()

        # Text branch (BERT)
        self.text_model = BertModel.from_pretrained(text_model_name)
        self.text_fc = nn.Linear(self.text_model.config.hidden_size, 256)

        # Image branch (ResNet)
        resnet = torchvision.models.resnet18(pretrained=True)
        resnet.fc = nn.Identity()  # remove final FC layer
        self.image_model = resnet
        self.image_fc = nn.Linear(512, 256)

        # Combined
        self.classifier = nn.Sequential(
            nn.Linear(256 + 256, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes)
        )

    def forward(self, image, input_ids, attention_mask):
        # Text encoding
        text_outputs = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        text_features = self.text_fc(text_outputs.pooler_output)

        # Image encoding
        image_features = self.image_model(image)
        image_features = self.image_fc(image_features)

        # Concatenate
        combined = torch.cat((image_features, text_features), dim=1)
        output = self.classifier(combined)
        return output


In [None]:
resnet = torchvision.models.resnet18(pretrained=True)


NameError: name 'torchvision' is not defined

In [None]:
import torchvision

resnet = torchvision.models.resnet18(pretrained=True)


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 125MB/s]


In [None]:
import torch.nn as nn
from transformers import BertModel

class MultimodalSentimentModel(nn.Module):
    def __init__(self, text_model_name='bert-base-uncased', num_classes=3):
        super(MultimodalSentimentModel, self).__init__()

        # Text branch (BERT)
        self.text_model = BertModel.from_pretrained(text_model_name)
        self.text_fc = nn.Linear(self.text_model.config.hidden_size, 256)

        # Image branch (ResNet)
        resnet = torchvision.models.resnet18(pretrained=True)
        resnet.fc = nn.Identity()  # remove final FC layer
        self.image_model = resnet
        self.image_fc = nn.Linear(512, 256)

        # Combined
        self.classifier = nn.Sequential(
            nn.Linear(256 + 256, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes)
        )

    def forward(self, image, input_ids, attention_mask):
        # Text encoding
        text_outputs = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        text_features = self.text_fc(text_outputs.pooler_output)

        # Image encoding
        image_features = self.image_model(image)
        image_features = self.image_fc(image_features)

        # Concatenate
        combined = torch.cat((image_features, text_features), dim=1)
        output = self.classifier(combined)
        return output


In [None]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultimodalSentimentModel(num_classes=3).to(device)


In [None]:
from sklearn.metrics import accuracy_score, f1_score


In [None]:
# Imports
import torch
from torch.utils.data import DataLoader, random_split
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters
batch_size = 16
num_epochs = 5
learning_rate = 2e-5

# Assuming 'dataset' is already created using your custom Dataset class
dataset_size = len(dataset)
train_size = int(0.8 * dataset_size)
val_size = dataset_size - train_size

# Split the dataset
train_set, val_set = random_split(dataset, [train_size, val_size])

# Dataloaders
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)

# Optimizer & Loss
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = torch.nn.CrossEntropyLoss()

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    true_labels = []
    pred_labels = []

    loop = tqdm(train_loader, desc=f"Epoch [{epoch+1}/{num_epochs}]")
    for images, input_ids, attention_mask, labels in loop:
        images = images.to(device)
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        outputs = model(images, input_ids, attention_mask)
        loss = criterion(outputs, labels)
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        preds = torch.argmax(outputs, dim=1)
        true_labels.extend(labels.cpu().numpy())
        pred_labels.extend(preds.cpu().numpy())

        loop.set_postfix(loss=loss.item())

    train_accuracy = accuracy_score(true_labels, pred_labels)
    train_f1 = f1_score(true_labels, pred_labels, average='weighted')

    print(f"\nEpoch [{epoch+1}/{num_epochs}]")
    print(f"Train Loss: {total_loss/len(train_loader):.4f}")
    print(f"Train Accuracy: {train_accuracy:.4f}")
    print(f"Train F1 Score: {train_f1:.4f}")


NameError: name 'dataset' is not defined

In [None]:
import os
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
from PIL import Image
import pandas as pd
from transformers import BertTokenizer
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm

# Paths
data_dir = "/content/MVSA_Single/MVSA_Single/data"
labels_file = "/content/MVSA_Single/MVSA_Single/labels.csv"

# Load labels
df = pd.read_csv(labels_file)

# Label mapping
label_map = {"positive": 0, "neutral": 1, "negative": 2}

# Define Dataset
class MVSADataset(Dataset):
    def __init__(self, dataframe, data_dir, transform=None):
        self.dataframe = dataframe
        self.data_dir = data_dir
        self.transform = transform
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        id = str(row["ID"])

        img_path = os.path.join(self.data_dir, f"{id}.jpg")
        text_path = os.path.join(self.data_dir, f"{id}.txt")

        image = Image.open(img_path).convert("RGB")
        if self.transform:
            image = self.transform(image)

        try:
            with open(text_path, 'r', encoding='utf-8') as f:
                text = f.read().strip()
        except UnicodeDecodeError:
            with open(text_path, 'r', encoding='latin-1') as f:
                text = f.read().strip()

        encoding = self.tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        # Use the first label only (image label)
        label = label_map[row["image"]]

        return image, input_ids, attention_mask, torch.tensor(label)

# Image Transform
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

# Create dataset
dataset = MVSADataset(df, data_dir, transform)

# Split dataset
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_set, val_set = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_set, batch_size=16, shuffle=True)
val_loader = DataLoader(val_set, batch_size=16)

# Dummy model (replace this with your actual model)
import torch.nn as nn
class DummyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.image_branch = nn.Sequential(
            nn.Flatten(),
            nn.Linear(3 * 224 * 224, 128)
        )
        self.text_branch = nn.Sequential(
            nn.Linear(768, 128)
        )
        self.classifier = nn.Sequential(
            nn.ReLU(),
            nn.Linear(256, 3)
        )

    def forward(self, image, input_ids, attention_mask):
        img_feat = self.image_branch(image)
        text_feat = self.text_branch(input_ids.float())  # Dummy placeholder
        combined = torch.cat((img_feat, text_feat), dim=1)
        return self.classifier(combined)

# Train
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DummyModel().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

for epoch in range(5):
    model.train()
    total_preds, total_labels = [], []
    loop = tqdm(train_loader, desc=f"Epoch [{epoch+1}/5]")

    for images, input_ids, attention_mask, labels in loop:
        images, input_ids, attention_mask, labels = images.to(device), input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images, input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        preds = torch.argmax(outputs, dim=1)
        total_preds.extend(preds.cpu().numpy())
        total_labels.extend(labels.cpu().numpy())

        loop.set_postfix(loss=loss.item())

    acc = accuracy_score(total_labels, total_preds)
    f1 = f1_score(total_labels, total_preds, average='weighted')
    print(f"Train Accuracy: {acc:.4f}, F1 Score: {f1:.4f}")


FileNotFoundError: [Errno 2] No such file or directory: '/content/MVSA_Single/MVSA_Single/labels.csv'

In [None]:
# Imports
import torch
from torch.utils.data import DataLoader, random_split
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters
batch_size = 16
num_epochs = 5
learning_rate = 2e-5

# Assuming 'dataset' is already created using your custom Dataset class
dataset_size = len(dataset)
train_size = int(0.8 * dataset_size)
val_size = dataset_size - train_size

# Split the dataset
train_set, val_set = random_split(dataset, [train_size, val_size])

# Dataloaders
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)

# Optimizer & Loss
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = torch.nn.CrossEntropyLoss()

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    true_labels = []
    pred_labels = []

    loop = tqdm(train_loader, desc=f"Epoch [{epoch+1}/{num_epochs}]")
    for images, input_ids, attention_mask, labels in loop:
        images = images.to(device)
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        outputs = model(images, input_ids, attention_mask)
        loss = criterion(outputs, labels)
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        preds = torch.argmax(outputs, dim=1)
        true_labels.extend(labels.cpu().numpy())
        pred_labels.extend(preds.cpu().numpy())

        loop.set_postfix(loss=loss.item())

    train_accuracy = accuracy_score(true_labels, pred_labels)
    train_f1 = f1_score(true_labels, pred_labels, average='weighted')

    print(f"\nEpoch [{epoch+1}/{num_epochs}]")
    print(f"Train Loss: {total_loss/len(train_loader):.4f}")
    print(f"Train Accuracy: {train_accuracy:.4f}")
    print(f"Train F1 Score: {train_f1:.4f}")


NameError: name 'dataset' is not defined