In [1]:
from google.colab import files

uploaded = files.upload()  # Upload your multimodal_only_samples.zip

Saving multimodal_only_samples.zip to multimodal_only_samples.zip


In [3]:
import zipfile
import os

zip_path = "multimodal_only_samples.zip"  # Replace with your actual file name if different
extract_path = "/content/fakeddit_data"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print("Unzipped files:")
os.listdir(extract_path)

Unzipped files:


['multimodal_only_samples']

In [4]:
import pandas as pd

# Update path to the correct directory
train_df = pd.read_csv('/content/fakeddit_data/multimodal_only_samples/multimodal_train.tsv', sep='\t')
val_df = pd.read_csv('/content/fakeddit_data/multimodal_only_samples/multimodal_validate.tsv', sep='\t')
test_df = pd.read_csv('/content/fakeddit_data/multimodal_only_samples/multimodal_test_public.tsv', sep='\t')

In [5]:
columns = ["clean_title", "image_url", "2_way_label"]
train_df = train_df[columns].dropna()
val_df   = val_df[columns].dropna()
test_df  = test_df[columns].dropna()

In [6]:
for df in [train_df, val_df, test_df]:
    df.rename(columns={"clean_title": "text", "image_url": "image_url", "2_way_label": "label"}, inplace=True)

In [7]:
train_df.head()

Unnamed: 0,text,image_url,label
0,my walgreens offbrand mucinex was engraved wit...,https://external-preview.redd.it/WylDbZrnbvZdB...,1
1,this concerned sink with a tiny hat,https://preview.redd.it/wsfx0gp0f5h11.jpg?widt...,0
2,hackers leak emails from uae ambassador to us,https://external-preview.redd.it/6fNhdbc6K1vFA...,1
3,puppy taking in the view,https://external-preview.redd.it/HLtVNhTR6wtYt...,1
4,i found a face in my sheet music too,https://preview.redd.it/ri7ut2wn8kv01.jpg?widt...,0


In [8]:
# Print original dataset sizes
print("Original dataset sizes:")
print(f"Training samples:   {len(train_df)}")
print(f"Validation samples: {len(val_df)}")
print(f"Test samples:       {len(test_df)}")

Original dataset sizes:
Training samples:   562466
Validation samples: 59169
Test samples:       59163


In [9]:
train_df = train_df.sample(n=5000, random_state=42)
val_df = val_df.sample(n=1000, random_state=42)
test_df = test_df.sample(n=1000, random_state=42)

In [10]:
import warnings
warnings.filterwarnings("ignore")

In [11]:
!pip install git+https://github.com/openai/CLIP.git

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-6ehxi9og
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-6ehxi9og
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->clip==1.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->clip==1.0)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->clip==1.0)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting 

In [12]:
from torch.utils.data import Dataset
from PIL import Image
import requests
from io import BytesIO
import clip
from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")  # or another model

class FakedditURLDataset(Dataset):
    def __init__(self, dataframe, preprocess):
        self.df = dataframe
        self.preprocess = preprocess

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        url = row["image_url"]
        text = row["text"]
        label = int(row["label"])

        # Load image from URL
        try:
            response = requests.get(url, timeout=5)
            image = Image.open(BytesIO(response.content)).convert("RGB")
            image = self.preprocess(image)
        except:
            # Use a blank image if load fails
            image = Image.new("RGB", (224, 224), (255, 255, 255))
            image = self.preprocess(image)

        try:
            if len(text.split()) > 75:
                summary = summarizer(text, max_length=60, min_length=30, do_sample=False)[0]["summary_text"]
            else:
                summary = text
        except Exception as e:
            print(f"Summarization failed for sample {idx}: {e}")
            summary = text[:300]  # fallback

        tokenized_text = clip.tokenize([summary], truncate=True)[0]

        return image, tokenized_text, torch.tensor(label)

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


In [13]:
from torch.utils.data import DataLoader
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, preprocess = clip.load("ViT-B/32", device=device)

train_dataset = FakedditURLDataset(train_df, preprocess)
val_dataset   = FakedditURLDataset(val_df, preprocess)
test_dataset  = FakedditURLDataset(test_df, preprocess)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=128)
test_loader  = DataLoader(test_dataset, batch_size=128)

100%|███████████████████████████████████████| 338M/338M [00:11<00:00, 31.8MiB/s]


In [14]:
import torch.nn as nn
from sklearn.metrics import accuracy_score

In [15]:
class FakeNewsClassifier(nn.Module):
    def __init__(self, embed_dim=512):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(embed_dim * 2, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 2)  # Binary output
        )

    def forward(self, image_feat, text_feat):
        x = torch.cat((image_feat, text_feat), dim=1)
        return self.fc(x)

In [16]:
print(f"Number of batches in train_loader: {len(train_loader)}")
print(f"Number of batches in val_loader: {len(val_loader)}")

Number of batches in train_loader: 40
Number of batches in val_loader: 8


In [17]:
!pip install tqdm



In [18]:
from tqdm import tqdm

print("Starting training...")

model = FakeNewsClassifier().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)

print(f"Number of batches in train_loader: {len(train_loader)}")

# Training loop
for epoch in range(5):  # Change epochs as needed
    print(f"\n=== Epoch {epoch+1} ===")

    model.train()
    total_loss = 0
    correct = 0
    total = 0

    # tqdm progress bar wrapper
    progress_bar = tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch+1}")

    for batch_idx, (images, texts, labels) in progress_bar:
        print(f"\nBatch {batch_idx+1}")

        # Check batch contents
        print("Raw batch shapes:")
        print("Images:", images.shape)
        print("Texts:", texts.shape)
        print("Labels:", labels.shape)

        images, texts, labels = images.to(device), texts.to(device), labels.to(device)
        print("Moved batch to device")

        # Extract features using CLIP
        with torch.no_grad():
            print("Encoding image features...")
            image_features = clip_model.encode_image(images).float()
            print("Encoding text features...")
            text_features = clip_model.encode_text(texts).float()
            print("Image features:", image_features.shape)
            print("Text features:", text_features.shape)

        # Forward pass
        outputs = model(image_features, text_features)

        # Check output shapes for debugging
        print(f"Outputs shape: {outputs.shape}, Labels shape: {labels.shape}")

        # Compute loss
        loss = criterion(outputs, labels)
        print(f"Loss: {loss.item():.4f}")

        # Backpropagation and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        # Accuracy calculation
        _, predicted = torch.max(outputs.data, 1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)
        accuracy = 100 * correct / total

        # Update progress bar
        progress_bar.set_postfix(loss=loss.item())
        avg_loss = total_loss / len(train_loader)
        epoch_accuracy = 100 * correct / total
        print(f"Epoch {epoch+1} Complete - Avg Loss = {avg_loss:.4f} - Accuracy = {epoch_accuracy:.2f}%")

Starting training...
Number of batches in train_loader: 40

=== Epoch 1 ===


Epoch 1:   0%|          | 0/40 [00:00<?, ?it/s]


Batch 1
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:   2%|▎         | 1/40 [00:40<26:18, 40.47s/it, loss=0.698]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.6982
Epoch 1 Complete - Avg Loss = 0.0175 - Accuracy = 43.75%

Batch 2
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:   5%|▌         | 2/40 [01:18<24:41, 39.00s/it, loss=0.684]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.6838
Epoch 1 Complete - Avg Loss = 0.0346 - Accuracy = 50.78%

Batch 3
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:   8%|▊         | 3/40 [01:51<22:26, 36.39s/it, loss=0.671]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.6705
Epoch 1 Complete - Avg Loss = 0.0513 - Accuracy = 52.86%

Batch 4
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  10%|█         | 4/40 [02:25<21:15, 35.42s/it, loss=0.647]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.6473
Epoch 1 Complete - Avg Loss = 0.0675 - Accuracy = 55.27%

Batch 5
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  12%|█▎        | 5/40 [02:59<20:14, 34.69s/it, loss=0.632]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.6319
Epoch 1 Complete - Avg Loss = 0.0833 - Accuracy = 56.88%

Batch 6
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  15%|█▌        | 6/40 [03:33<19:32, 34.49s/it, loss=0.62]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.6202
Epoch 1 Complete - Avg Loss = 0.0988 - Accuracy = 58.85%

Batch 7
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  18%|█▊        | 7/40 [04:08<19:08, 34.79s/it, loss=0.601]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.6013
Epoch 1 Complete - Avg Loss = 0.1138 - Accuracy = 59.49%

Batch 8
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  20%|██        | 8/40 [04:52<20:03, 37.62s/it, loss=0.586]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.5860
Epoch 1 Complete - Avg Loss = 0.1285 - Accuracy = 60.16%

Batch 9
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  22%|██▎       | 9/40 [05:29<19:20, 37.43s/it, loss=0.607]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.6075
Epoch 1 Complete - Avg Loss = 0.1437 - Accuracy = 59.81%

Batch 10
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  25%|██▌       | 10/40 [06:05<18:36, 37.21s/it, loss=0.598]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.5979
Epoch 1 Complete - Avg Loss = 0.1586 - Accuracy = 59.61%

Batch 11
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  28%|██▊       | 11/40 [06:42<17:53, 37.01s/it, loss=0.523]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.5228
Epoch 1 Complete - Avg Loss = 0.1717 - Accuracy = 60.23%

Batch 12
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  30%|███       | 12/40 [07:16<16:53, 36.18s/it, loss=0.574]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.5741
Epoch 1 Complete - Avg Loss = 0.1860 - Accuracy = 60.61%

Batch 13
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  32%|███▎      | 13/40 [07:50<15:56, 35.44s/it, loss=0.53]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.5296
Epoch 1 Complete - Avg Loss = 0.1993 - Accuracy = 60.76%

Batch 14
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  35%|███▌      | 14/40 [08:24<15:11, 35.06s/it, loss=0.534]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.5341
Epoch 1 Complete - Avg Loss = 0.2126 - Accuracy = 61.38%

Batch 15
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  38%|███▊      | 15/40 [09:00<14:39, 35.18s/it, loss=0.56]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.5595
Epoch 1 Complete - Avg Loss = 0.2266 - Accuracy = 61.46%

Batch 16
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  40%|████      | 16/40 [09:34<13:57, 34.91s/it, loss=0.529]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.5291
Epoch 1 Complete - Avg Loss = 0.2398 - Accuracy = 61.72%

Batch 17
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  42%|████▎     | 17/40 [10:09<13:25, 35.00s/it, loss=0.488]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.4876
Epoch 1 Complete - Avg Loss = 0.2520 - Accuracy = 62.32%

Batch 18
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  45%|████▌     | 18/40 [10:43<12:44, 34.77s/it, loss=0.459]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.4594
Epoch 1 Complete - Avg Loss = 0.2635 - Accuracy = 63.19%

Batch 19
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  48%|████▊     | 19/40 [11:18<12:11, 34.82s/it, loss=0.479]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.4789
Epoch 1 Complete - Avg Loss = 0.2755 - Accuracy = 63.65%

Batch 20
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  50%|█████     | 20/40 [11:52<11:27, 34.40s/it, loss=0.479]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.4791
Epoch 1 Complete - Avg Loss = 0.2875 - Accuracy = 64.14%

Batch 21
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  52%|█████▎    | 21/40 [12:27<10:58, 34.65s/it, loss=0.469]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.4688
Epoch 1 Complete - Avg Loss = 0.2992 - Accuracy = 64.69%

Batch 22
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  55%|█████▌    | 22/40 [13:01<10:22, 34.59s/it, loss=0.456]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.4562
Epoch 1 Complete - Avg Loss = 0.3106 - Accuracy = 65.13%

Batch 23
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  57%|█████▊    | 23/40 [13:36<09:48, 34.61s/it, loss=0.406]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.4059
Epoch 1 Complete - Avg Loss = 0.3207 - Accuracy = 65.93%

Batch 24
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  60%|██████    | 24/40 [14:09<09:07, 34.22s/it, loss=0.472]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.4718
Epoch 1 Complete - Avg Loss = 0.3325 - Accuracy = 66.37%

Batch 25
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  62%|██████▎   | 25/40 [14:45<08:37, 34.49s/it, loss=0.46]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.4601
Epoch 1 Complete - Avg Loss = 0.3440 - Accuracy = 66.91%

Batch 26
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  65%|██████▌   | 26/40 [15:23<08:19, 35.65s/it, loss=0.457]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.4567
Epoch 1 Complete - Avg Loss = 0.3555 - Accuracy = 67.37%

Batch 27
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  68%|██████▊   | 27/40 [16:00<07:48, 36.02s/it, loss=0.431]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.4309
Epoch 1 Complete - Avg Loss = 0.3662 - Accuracy = 67.97%

Batch 28
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  70%|███████   | 28/40 [16:35<07:09, 35.75s/it, loss=0.393]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3932
Epoch 1 Complete - Avg Loss = 0.3761 - Accuracy = 68.50%

Batch 29
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  72%|███████▎  | 29/40 [17:09<06:28, 35.29s/it, loss=0.451]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.4509
Epoch 1 Complete - Avg Loss = 0.3873 - Accuracy = 68.75%

Batch 30
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  75%|███████▌  | 30/40 [17:45<05:55, 35.58s/it, loss=0.422]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.4224
Epoch 1 Complete - Avg Loss = 0.3979 - Accuracy = 69.30%

Batch 31
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  78%|███████▊  | 31/40 [18:20<05:16, 35.21s/it, loss=0.452]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.4516
Epoch 1 Complete - Avg Loss = 0.4092 - Accuracy = 69.53%

Batch 32
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  80%|████████  | 32/40 [18:56<04:44, 35.52s/it, loss=0.401]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.4010
Epoch 1 Complete - Avg Loss = 0.4192 - Accuracy = 70.02%

Batch 33
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  82%|████████▎ | 33/40 [19:32<04:09, 35.60s/it, loss=0.401]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.4009
Epoch 1 Complete - Avg Loss = 0.4292 - Accuracy = 70.45%

Batch 34
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  85%|████████▌ | 34/40 [20:06<03:31, 35.30s/it, loss=0.369]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3689
Epoch 1 Complete - Avg Loss = 0.4385 - Accuracy = 70.80%

Batch 35
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  88%|████████▊ | 35/40 [20:42<02:57, 35.47s/it, loss=0.428]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.4276
Epoch 1 Complete - Avg Loss = 0.4491 - Accuracy = 71.07%

Batch 36
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  90%|█████████ | 36/40 [21:18<02:21, 35.43s/it, loss=0.46]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.4598
Epoch 1 Complete - Avg Loss = 0.4606 - Accuracy = 71.29%

Batch 37
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  92%|█████████▎| 37/40 [21:54<01:47, 35.75s/it, loss=0.422]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.4222
Epoch 1 Complete - Avg Loss = 0.4712 - Accuracy = 71.60%

Batch 38
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  95%|█████████▌| 38/40 [22:36<01:15, 37.57s/it, loss=0.39]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3903
Epoch 1 Complete - Avg Loss = 0.4809 - Accuracy = 71.79%

Batch 39
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1:  98%|█████████▊| 39/40 [23:12<00:37, 37.19s/it, loss=0.383]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3828
Epoch 1 Complete - Avg Loss = 0.4905 - Accuracy = 72.10%

Batch 40
Raw batch shapes:
Images: torch.Size([8, 3, 224, 224])
Texts: torch.Size([8, 77])
Labels: torch.Size([8])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 1: 100%|██████████| 40/40 [23:14<00:00, 34.87s/it, loss=0.434]


Image features: torch.Size([8, 512])
Text features: torch.Size([8, 512])
Outputs shape: torch.Size([8, 2]), Labels shape: torch.Size([8])
Loss: 0.4342
Epoch 1 Complete - Avg Loss = 0.5014 - Accuracy = 72.10%

=== Epoch 2 ===


Epoch 2:   0%|          | 0/40 [00:00<?, ?it/s]


Batch 1
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:   2%|▎         | 1/40 [00:33<21:50, 33.61s/it, loss=0.382]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3824
Epoch 2 Complete - Avg Loss = 0.0096 - Accuracy = 81.25%

Batch 2
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:   5%|▌         | 2/40 [01:09<22:15, 35.13s/it, loss=0.35]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3505
Epoch 2 Complete - Avg Loss = 0.0183 - Accuracy = 82.42%

Batch 3
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:   8%|▊         | 3/40 [01:43<21:23, 34.70s/it, loss=0.312]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3122
Epoch 2 Complete - Avg Loss = 0.0261 - Accuracy = 85.16%

Batch 4
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  10%|█         | 4/40 [02:21<21:34, 35.95s/it, loss=0.327]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3273
Epoch 2 Complete - Avg Loss = 0.0343 - Accuracy = 85.74%

Batch 5
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  12%|█▎        | 5/40 [02:55<20:27, 35.06s/it, loss=0.327]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3268
Epoch 2 Complete - Avg Loss = 0.0425 - Accuracy = 86.25%

Batch 6
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  15%|█▌        | 6/40 [03:29<19:40, 34.73s/it, loss=0.358]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3576
Epoch 2 Complete - Avg Loss = 0.0514 - Accuracy = 86.07%

Batch 7
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  18%|█▊        | 7/40 [04:02<18:48, 34.20s/it, loss=0.321]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3207
Epoch 2 Complete - Avg Loss = 0.0594 - Accuracy = 86.16%

Batch 8
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  20%|██        | 8/40 [04:36<18:11, 34.11s/it, loss=0.309]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3092
Epoch 2 Complete - Avg Loss = 0.0672 - Accuracy = 86.62%

Batch 9
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  22%|██▎       | 9/40 [05:09<17:28, 33.81s/it, loss=0.325]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3246
Epoch 2 Complete - Avg Loss = 0.0753 - Accuracy = 86.72%

Batch 10
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  25%|██▌       | 10/40 [05:43<16:51, 33.72s/it, loss=0.372]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3721
Epoch 2 Complete - Avg Loss = 0.0846 - Accuracy = 86.48%

Batch 11
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  28%|██▊       | 11/40 [06:17<16:20, 33.80s/it, loss=0.353]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3531
Epoch 2 Complete - Avg Loss = 0.0934 - Accuracy = 86.01%

Batch 12
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  30%|███       | 12/40 [06:51<15:53, 34.04s/it, loss=0.361]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3615
Epoch 2 Complete - Avg Loss = 0.1024 - Accuracy = 85.87%

Batch 13
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  32%|███▎      | 13/40 [07:25<15:17, 33.99s/it, loss=0.313]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3135
Epoch 2 Complete - Avg Loss = 0.1103 - Accuracy = 86.06%

Batch 14
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  35%|███▌      | 14/40 [08:01<15:00, 34.63s/it, loss=0.286]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2865
Epoch 2 Complete - Avg Loss = 0.1174 - Accuracy = 86.22%

Batch 15
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  38%|███▊      | 15/40 [08:38<14:39, 35.16s/it, loss=0.317]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3169
Epoch 2 Complete - Avg Loss = 0.1254 - Accuracy = 86.09%

Batch 16
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  40%|████      | 16/40 [09:11<13:54, 34.75s/it, loss=0.292]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2921
Epoch 2 Complete - Avg Loss = 0.1327 - Accuracy = 86.28%

Batch 17
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  42%|████▎     | 17/40 [09:47<13:22, 34.90s/it, loss=0.343]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3429
Epoch 2 Complete - Avg Loss = 0.1412 - Accuracy = 86.35%

Batch 18
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  45%|████▌     | 18/40 [10:22<12:48, 34.95s/it, loss=0.325]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3254
Epoch 2 Complete - Avg Loss = 0.1494 - Accuracy = 86.11%

Batch 19
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  48%|████▊     | 19/40 [10:54<11:55, 34.08s/it, loss=0.384]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3842
Epoch 2 Complete - Avg Loss = 0.1590 - Accuracy = 85.98%

Batch 20
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  50%|█████     | 20/40 [11:27<11:18, 33.94s/it, loss=0.369]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3691
Epoch 2 Complete - Avg Loss = 0.1682 - Accuracy = 85.82%

Batch 21
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  52%|█████▎    | 21/40 [12:01<10:40, 33.71s/it, loss=0.282]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2823
Epoch 2 Complete - Avg Loss = 0.1753 - Accuracy = 85.94%

Batch 22
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  55%|█████▌    | 22/40 [12:37<10:18, 34.39s/it, loss=0.245]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2447
Epoch 2 Complete - Avg Loss = 0.1814 - Accuracy = 86.19%

Batch 23
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  57%|█████▊    | 23/40 [13:10<09:40, 34.17s/it, loss=0.261]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2605
Epoch 2 Complete - Avg Loss = 0.1879 - Accuracy = 86.35%

Batch 24
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  60%|██████    | 24/40 [13:44<09:06, 34.17s/it, loss=0.335]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3351
Epoch 2 Complete - Avg Loss = 0.1963 - Accuracy = 86.23%

Batch 25
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  62%|██████▎   | 25/40 [14:18<08:29, 33.95s/it, loss=0.372]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3725
Epoch 2 Complete - Avg Loss = 0.2056 - Accuracy = 86.03%

Batch 26
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  65%|██████▌   | 26/40 [14:52<07:55, 34.00s/it, loss=0.3]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2999
Epoch 2 Complete - Avg Loss = 0.2131 - Accuracy = 86.00%

Batch 27
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  68%|██████▊   | 27/40 [15:27<07:24, 34.19s/it, loss=0.369]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3686
Epoch 2 Complete - Avg Loss = 0.2223 - Accuracy = 85.82%

Batch 28
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  70%|███████   | 28/40 [16:01<06:51, 34.27s/it, loss=0.235]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2351
Epoch 2 Complete - Avg Loss = 0.2282 - Accuracy = 85.99%

Batch 29
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  72%|███████▎  | 29/40 [16:35<06:17, 34.27s/it, loss=0.319]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3186
Epoch 2 Complete - Avg Loss = 0.2361 - Accuracy = 86.02%

Batch 30
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  75%|███████▌  | 30/40 [17:09<05:42, 34.21s/it, loss=0.297]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2974
Epoch 2 Complete - Avg Loss = 0.2436 - Accuracy = 85.96%

Batch 31
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  78%|███████▊  | 31/40 [17:43<05:07, 34.13s/it, loss=0.365]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3649
Epoch 2 Complete - Avg Loss = 0.2527 - Accuracy = 85.94%

Batch 32
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  80%|████████  | 32/40 [18:18<04:34, 34.25s/it, loss=0.281]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2813
Epoch 2 Complete - Avg Loss = 0.2597 - Accuracy = 86.06%

Batch 33
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  82%|████████▎ | 33/40 [18:52<03:59, 34.26s/it, loss=0.336]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3360
Epoch 2 Complete - Avg Loss = 0.2681 - Accuracy = 86.03%

Batch 34
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  85%|████████▌ | 34/40 [19:26<03:24, 34.16s/it, loss=0.25]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2498
Epoch 2 Complete - Avg Loss = 0.2744 - Accuracy = 86.12%

Batch 35
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  88%|████████▊ | 35/40 [20:00<02:50, 34.08s/it, loss=0.38]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3804
Epoch 2 Complete - Avg Loss = 0.2839 - Accuracy = 86.00%

Batch 36
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  90%|█████████ | 36/40 [20:34<02:16, 34.09s/it, loss=0.304]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3045
Epoch 2 Complete - Avg Loss = 0.2915 - Accuracy = 86.09%

Batch 37
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  92%|█████████▎| 37/40 [21:07<01:41, 33.87s/it, loss=0.294]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2944
Epoch 2 Complete - Avg Loss = 0.2989 - Accuracy = 86.11%

Batch 38
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  95%|█████████▌| 38/40 [21:42<01:08, 34.05s/it, loss=0.295]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2954
Epoch 2 Complete - Avg Loss = 0.3062 - Accuracy = 86.16%

Batch 39
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2:  98%|█████████▊| 39/40 [22:16<00:33, 33.97s/it, loss=0.361]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3606
Epoch 2 Complete - Avg Loss = 0.3153 - Accuracy = 86.08%

Batch 40
Raw batch shapes:
Images: torch.Size([8, 3, 224, 224])
Texts: torch.Size([8, 77])
Labels: torch.Size([8])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 2: 100%|██████████| 40/40 [22:18<00:00, 33.46s/it, loss=0.441]


Image features: torch.Size([8, 512])
Text features: torch.Size([8, 512])
Outputs shape: torch.Size([8, 2]), Labels shape: torch.Size([8])
Loss: 0.4412
Epoch 2 Complete - Avg Loss = 0.3263 - Accuracy = 86.04%

=== Epoch 3 ===


Epoch 3:   0%|          | 0/40 [00:00<?, ?it/s]


Batch 1
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:   2%|▎         | 1/40 [00:33<21:36, 33.24s/it, loss=0.239]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2390
Epoch 3 Complete - Avg Loss = 0.0060 - Accuracy = 89.84%

Batch 2
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:   5%|▌         | 2/40 [01:06<21:14, 33.53s/it, loss=0.228]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2279
Epoch 3 Complete - Avg Loss = 0.0117 - Accuracy = 90.23%

Batch 3
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:   8%|▊         | 3/40 [01:42<21:15, 34.48s/it, loss=0.266]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2656
Epoch 3 Complete - Avg Loss = 0.0183 - Accuracy = 89.58%

Batch 4
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  10%|█         | 4/40 [02:17<20:54, 34.84s/it, loss=0.256]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2560
Epoch 3 Complete - Avg Loss = 0.0247 - Accuracy = 89.45%

Batch 5
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  12%|█▎        | 5/40 [02:51<20:00, 34.31s/it, loss=0.238]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2376
Epoch 3 Complete - Avg Loss = 0.0306 - Accuracy = 89.38%

Batch 6
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  15%|█▌        | 6/40 [03:25<19:20, 34.13s/it, loss=0.256]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2557
Epoch 3 Complete - Avg Loss = 0.0370 - Accuracy = 89.45%

Batch 7
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  18%|█▊        | 7/40 [03:59<18:51, 34.28s/it, loss=0.371]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3707
Epoch 3 Complete - Avg Loss = 0.0463 - Accuracy = 88.62%

Batch 8
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  20%|██        | 8/40 [04:33<18:09, 34.06s/it, loss=0.376]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3763
Epoch 3 Complete - Avg Loss = 0.0557 - Accuracy = 87.79%

Batch 9
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  22%|██▎       | 9/40 [05:06<17:32, 33.94s/it, loss=0.249]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2488
Epoch 3 Complete - Avg Loss = 0.0619 - Accuracy = 87.93%

Batch 10
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  25%|██▌       | 10/40 [05:41<17:03, 34.11s/it, loss=0.193]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.1933
Epoch 3 Complete - Avg Loss = 0.0668 - Accuracy = 88.52%

Batch 11
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  28%|██▊       | 11/40 [06:16<16:34, 34.29s/it, loss=0.295]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2948
Epoch 3 Complete - Avg Loss = 0.0741 - Accuracy = 88.49%

Batch 12
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  30%|███       | 12/40 [06:50<16:03, 34.39s/it, loss=0.163]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.1630
Epoch 3 Complete - Avg Loss = 0.0782 - Accuracy = 88.87%

Batch 13
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  32%|███▎      | 13/40 [07:23<15:18, 34.00s/it, loss=0.244]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2440
Epoch 3 Complete - Avg Loss = 0.0843 - Accuracy = 89.00%

Batch 14
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  35%|███▌      | 14/40 [07:57<14:41, 33.89s/it, loss=0.208]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2080
Epoch 3 Complete - Avg Loss = 0.0895 - Accuracy = 89.29%

Batch 15
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  38%|███▊      | 15/40 [08:30<13:59, 33.59s/it, loss=0.297]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2975
Epoch 3 Complete - Avg Loss = 0.0970 - Accuracy = 89.43%

Batch 16
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  40%|████      | 16/40 [09:04<13:26, 33.59s/it, loss=0.247]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2467
Epoch 3 Complete - Avg Loss = 0.1031 - Accuracy = 89.40%

Batch 17
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  42%|████▎     | 17/40 [09:37<12:54, 33.66s/it, loss=0.273]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2734
Epoch 3 Complete - Avg Loss = 0.1100 - Accuracy = 89.29%

Batch 18
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  45%|████▌     | 18/40 [10:11<12:21, 33.69s/it, loss=0.277]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2770
Epoch 3 Complete - Avg Loss = 0.1169 - Accuracy = 89.15%

Batch 19
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  48%|████▊     | 19/40 [10:45<11:49, 33.77s/it, loss=0.36]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3599
Epoch 3 Complete - Avg Loss = 0.1259 - Accuracy = 88.77%

Batch 20
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  50%|█████     | 20/40 [11:19<11:17, 33.87s/it, loss=0.273]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2734
Epoch 3 Complete - Avg Loss = 0.1327 - Accuracy = 88.75%

Batch 21
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  52%|█████▎    | 21/40 [11:53<10:40, 33.72s/it, loss=0.256]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2556
Epoch 3 Complete - Avg Loss = 0.1391 - Accuracy = 88.76%

Batch 22
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  55%|█████▌    | 22/40 [12:27<10:09, 33.88s/it, loss=0.238]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2382
Epoch 3 Complete - Avg Loss = 0.1451 - Accuracy = 88.96%

Batch 23
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  57%|█████▊    | 23/40 [13:01<09:36, 33.93s/it, loss=0.156]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.1565
Epoch 3 Complete - Avg Loss = 0.1490 - Accuracy = 89.27%

Batch 24
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  60%|██████    | 24/40 [13:35<09:06, 34.14s/it, loss=0.21]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2103
Epoch 3 Complete - Avg Loss = 0.1542 - Accuracy = 89.39%

Batch 25
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  62%|██████▎   | 25/40 [14:08<08:26, 33.79s/it, loss=0.313]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3134
Epoch 3 Complete - Avg Loss = 0.1621 - Accuracy = 89.28%

Batch 26
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  65%|██████▌   | 26/40 [14:43<07:55, 33.98s/it, loss=0.24]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2404
Epoch 3 Complete - Avg Loss = 0.1681 - Accuracy = 89.36%

Batch 27
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  68%|██████▊   | 27/40 [15:17<07:22, 34.05s/it, loss=0.257]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2565
Epoch 3 Complete - Avg Loss = 0.1745 - Accuracy = 89.38%

Batch 28
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  70%|███████   | 28/40 [15:51<06:48, 34.01s/it, loss=0.231]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2313
Epoch 3 Complete - Avg Loss = 0.1803 - Accuracy = 89.51%

Batch 29
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  72%|███████▎  | 29/40 [16:26<06:17, 34.33s/it, loss=0.213]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2131
Epoch 3 Complete - Avg Loss = 0.1856 - Accuracy = 89.52%

Batch 30
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  75%|███████▌  | 30/40 [17:01<05:44, 34.42s/it, loss=0.322]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3224
Epoch 3 Complete - Avg Loss = 0.1937 - Accuracy = 89.43%

Batch 31
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  78%|███████▊  | 31/40 [17:36<05:12, 34.70s/it, loss=0.27]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2704
Epoch 3 Complete - Avg Loss = 0.2004 - Accuracy = 89.49%

Batch 32
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  80%|████████  | 32/40 [18:10<04:36, 34.54s/it, loss=0.245]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2455
Epoch 3 Complete - Avg Loss = 0.2065 - Accuracy = 89.45%

Batch 33
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  82%|████████▎ | 33/40 [18:46<04:03, 34.82s/it, loss=0.254]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2537
Epoch 3 Complete - Avg Loss = 0.2129 - Accuracy = 89.44%

Batch 34
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  85%|████████▌ | 34/40 [19:21<03:29, 34.89s/it, loss=0.281]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2808
Epoch 3 Complete - Avg Loss = 0.2199 - Accuracy = 89.45%

Batch 35
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  88%|████████▊ | 35/40 [19:55<02:53, 34.74s/it, loss=0.281]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2811
Epoch 3 Complete - Avg Loss = 0.2269 - Accuracy = 89.46%

Batch 36
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  90%|█████████ | 36/40 [20:29<02:18, 34.51s/it, loss=0.289]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2889
Epoch 3 Complete - Avg Loss = 0.2342 - Accuracy = 89.50%

Batch 37
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  92%|█████████▎| 37/40 [21:04<01:44, 34.71s/it, loss=0.244]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2436
Epoch 3 Complete - Avg Loss = 0.2402 - Accuracy = 89.57%

Batch 38
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  95%|█████████▌| 38/40 [21:39<01:09, 34.65s/it, loss=0.406]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.4062
Epoch 3 Complete - Avg Loss = 0.2504 - Accuracy = 89.31%

Batch 39
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3:  98%|█████████▊| 39/40 [22:13<00:34, 34.47s/it, loss=0.244]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2437
Epoch 3 Complete - Avg Loss = 0.2565 - Accuracy = 89.26%

Batch 40
Raw batch shapes:
Images: torch.Size([8, 3, 224, 224])
Texts: torch.Size([8, 77])
Labels: torch.Size([8])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 3: 100%|██████████| 40/40 [22:15<00:00, 33.38s/it, loss=0.0359]


Image features: torch.Size([8, 512])
Text features: torch.Size([8, 512])
Outputs shape: torch.Size([8, 2]), Labels shape: torch.Size([8])
Loss: 0.0359
Epoch 3 Complete - Avg Loss = 0.2574 - Accuracy = 89.28%

=== Epoch 4 ===


Epoch 4:   0%|          | 0/40 [00:00<?, ?it/s]


Batch 1
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:   2%|▎         | 1/40 [00:34<22:13, 34.20s/it, loss=0.253]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2534
Epoch 4 Complete - Avg Loss = 0.0063 - Accuracy = 86.72%

Batch 2
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:   5%|▌         | 2/40 [01:08<21:37, 34.14s/it, loss=0.19]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.1904
Epoch 4 Complete - Avg Loss = 0.0111 - Accuracy = 89.45%

Batch 3
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:   8%|▊         | 3/40 [01:42<21:00, 34.06s/it, loss=0.249]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2493
Epoch 4 Complete - Avg Loss = 0.0173 - Accuracy = 89.32%

Batch 4
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  10%|█         | 4/40 [02:15<20:16, 33.80s/it, loss=0.272]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2724
Epoch 4 Complete - Avg Loss = 0.0241 - Accuracy = 89.45%

Batch 5
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  12%|█▎        | 5/40 [02:48<19:37, 33.63s/it, loss=0.206]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2063
Epoch 4 Complete - Avg Loss = 0.0293 - Accuracy = 90.00%

Batch 6
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  15%|█▌        | 6/40 [03:23<19:12, 33.88s/it, loss=0.296]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2965
Epoch 4 Complete - Avg Loss = 0.0367 - Accuracy = 89.32%

Batch 7
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  18%|█▊        | 7/40 [03:57<18:37, 33.86s/it, loss=0.259]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2587
Epoch 4 Complete - Avg Loss = 0.0432 - Accuracy = 89.17%

Batch 8
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  20%|██        | 8/40 [04:31<18:05, 33.92s/it, loss=0.286]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2858
Epoch 4 Complete - Avg Loss = 0.0503 - Accuracy = 89.45%

Batch 9
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  22%|██▎       | 9/40 [05:04<17:25, 33.72s/it, loss=0.247]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2475
Epoch 4 Complete - Avg Loss = 0.0565 - Accuracy = 89.50%

Batch 10
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  25%|██▌       | 10/40 [05:37<16:46, 33.54s/it, loss=0.166]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.1661
Epoch 4 Complete - Avg Loss = 0.0607 - Accuracy = 89.92%

Batch 11
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  28%|██▊       | 11/40 [06:10<16:07, 33.36s/it, loss=0.22]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2198
Epoch 4 Complete - Avg Loss = 0.0662 - Accuracy = 90.13%

Batch 12
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  30%|███       | 12/40 [06:44<15:39, 33.57s/it, loss=0.242]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2423
Epoch 4 Complete - Avg Loss = 0.0722 - Accuracy = 89.97%

Batch 13
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  32%|███▎      | 13/40 [07:18<15:08, 33.67s/it, loss=0.2]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2004
Epoch 4 Complete - Avg Loss = 0.0772 - Accuracy = 90.14%

Batch 14
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  35%|███▌      | 14/40 [07:52<14:38, 33.80s/it, loss=0.237]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2371
Epoch 4 Complete - Avg Loss = 0.0831 - Accuracy = 90.07%

Batch 15
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  38%|███▊      | 15/40 [08:26<14:07, 33.90s/it, loss=0.203]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2027
Epoch 4 Complete - Avg Loss = 0.0882 - Accuracy = 90.36%

Batch 16
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  40%|████      | 16/40 [09:00<13:34, 33.95s/it, loss=0.204]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2039
Epoch 4 Complete - Avg Loss = 0.0933 - Accuracy = 90.53%

Batch 17
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  42%|████▎     | 17/40 [09:34<12:57, 33.82s/it, loss=0.165]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.1654
Epoch 4 Complete - Avg Loss = 0.0974 - Accuracy = 90.85%

Batch 18
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  45%|████▌     | 18/40 [10:08<12:23, 33.81s/it, loss=0.262]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2616
Epoch 4 Complete - Avg Loss = 0.1040 - Accuracy = 90.76%

Batch 19
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  48%|████▊     | 19/40 [10:42<11:50, 33.84s/it, loss=0.253]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2526
Epoch 4 Complete - Avg Loss = 0.1103 - Accuracy = 90.62%

Batch 20
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  50%|█████     | 20/40 [11:15<11:15, 33.77s/it, loss=0.173]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.1730
Epoch 4 Complete - Avg Loss = 0.1146 - Accuracy = 90.70%

Batch 21
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  52%|█████▎    | 21/40 [11:50<10:50, 34.23s/it, loss=0.214]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2143
Epoch 4 Complete - Avg Loss = 0.1200 - Accuracy = 90.70%

Batch 22
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  55%|█████▌    | 22/40 [12:24<10:13, 34.09s/it, loss=0.303]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3034
Epoch 4 Complete - Avg Loss = 0.1276 - Accuracy = 90.52%

Batch 23
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  57%|█████▊    | 23/40 [12:57<09:34, 33.81s/it, loss=0.259]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2594
Epoch 4 Complete - Avg Loss = 0.1341 - Accuracy = 90.49%

Batch 24
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  60%|██████    | 24/40 [13:31<09:00, 33.77s/it, loss=0.305]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3054
Epoch 4 Complete - Avg Loss = 0.1417 - Accuracy = 90.40%

Batch 25
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  62%|██████▎   | 25/40 [14:09<08:43, 34.87s/it, loss=0.149]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.1486
Epoch 4 Complete - Avg Loss = 0.1454 - Accuracy = 90.59%

Batch 26
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  65%|██████▌   | 26/40 [14:47<08:24, 36.05s/it, loss=0.165]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.1653
Epoch 4 Complete - Avg Loss = 0.1495 - Accuracy = 90.72%

Batch 27
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  68%|██████▊   | 27/40 [15:25<07:53, 36.41s/it, loss=0.265]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2655
Epoch 4 Complete - Avg Loss = 0.1562 - Accuracy = 90.65%

Batch 28
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  70%|███████   | 28/40 [15:59<07:09, 35.83s/it, loss=0.221]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2208
Epoch 4 Complete - Avg Loss = 0.1617 - Accuracy = 90.74%

Batch 29
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  72%|███████▎  | 29/40 [16:33<06:29, 35.42s/it, loss=0.216]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2160
Epoch 4 Complete - Avg Loss = 0.1671 - Accuracy = 90.79%

Batch 30
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  75%|███████▌  | 30/40 [17:09<05:53, 35.35s/it, loss=0.172]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.1723
Epoch 4 Complete - Avg Loss = 0.1714 - Accuracy = 90.89%

Batch 31
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  78%|███████▊  | 31/40 [17:42<05:13, 34.85s/it, loss=0.225]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2251
Epoch 4 Complete - Avg Loss = 0.1770 - Accuracy = 90.83%

Batch 32
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  80%|████████  | 32/40 [18:18<04:40, 35.02s/it, loss=0.217]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2168
Epoch 4 Complete - Avg Loss = 0.1825 - Accuracy = 90.84%

Batch 33
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  82%|████████▎ | 33/40 [18:52<04:03, 34.72s/it, loss=0.259]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2592
Epoch 4 Complete - Avg Loss = 0.1889 - Accuracy = 90.84%

Batch 34
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  85%|████████▌ | 34/40 [19:27<03:29, 34.97s/it, loss=0.23]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2304
Epoch 4 Complete - Avg Loss = 0.1947 - Accuracy = 90.88%

Batch 35
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  88%|████████▊ | 35/40 [20:01<02:53, 34.68s/it, loss=0.248]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2478
Epoch 4 Complete - Avg Loss = 0.2009 - Accuracy = 90.83%

Batch 36
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  90%|█████████ | 36/40 [20:36<02:18, 34.66s/it, loss=0.182]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.1818
Epoch 4 Complete - Avg Loss = 0.2054 - Accuracy = 90.91%

Batch 37
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  92%|█████████▎| 37/40 [21:11<01:44, 34.86s/it, loss=0.201]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2007
Epoch 4 Complete - Avg Loss = 0.2105 - Accuracy = 90.96%

Batch 38
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  95%|█████████▌| 38/40 [21:46<01:09, 34.72s/it, loss=0.194]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.1937
Epoch 4 Complete - Avg Loss = 0.2153 - Accuracy = 91.04%

Batch 39
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4:  98%|█████████▊| 39/40 [22:19<00:34, 34.36s/it, loss=0.236]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2356
Epoch 4 Complete - Avg Loss = 0.2212 - Accuracy = 90.99%

Batch 40
Raw batch shapes:
Images: torch.Size([8, 3, 224, 224])
Texts: torch.Size([8, 77])
Labels: torch.Size([8])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 4: 100%|██████████| 40/40 [22:21<00:00, 33.55s/it, loss=0.263]


Image features: torch.Size([8, 512])
Text features: torch.Size([8, 512])
Outputs shape: torch.Size([8, 2]), Labels shape: torch.Size([8])
Loss: 0.2634
Epoch 4 Complete - Avg Loss = 0.2278 - Accuracy = 90.98%

=== Epoch 5 ===


Epoch 5:   0%|          | 0/40 [00:00<?, ?it/s]


Batch 1
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:   2%|▎         | 1/40 [00:34<22:11, 34.14s/it, loss=0.215]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2147
Epoch 5 Complete - Avg Loss = 0.0054 - Accuracy = 91.41%

Batch 2
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:   5%|▌         | 2/40 [01:07<21:26, 33.86s/it, loss=0.177]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.1775
Epoch 5 Complete - Avg Loss = 0.0098 - Accuracy = 91.41%

Batch 3
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:   8%|▊         | 3/40 [01:42<21:10, 34.34s/it, loss=0.311]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.3107
Epoch 5 Complete - Avg Loss = 0.0176 - Accuracy = 89.58%

Batch 4
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  10%|█         | 4/40 [02:18<20:56, 34.90s/it, loss=0.27]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2702
Epoch 5 Complete - Avg Loss = 0.0243 - Accuracy = 89.84%

Batch 5
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  12%|█▎        | 5/40 [02:52<20:06, 34.48s/it, loss=0.16]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.1595
Epoch 5 Complete - Avg Loss = 0.0283 - Accuracy = 90.62%

Batch 6
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  15%|█▌        | 6/40 [03:25<19:16, 34.02s/it, loss=0.219]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2189
Epoch 5 Complete - Avg Loss = 0.0338 - Accuracy = 90.89%

Batch 7
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  18%|█▊        | 7/40 [03:59<18:41, 33.97s/it, loss=0.228]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2278
Epoch 5 Complete - Avg Loss = 0.0395 - Accuracy = 90.85%

Batch 8
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  20%|██        | 8/40 [04:33<18:08, 34.02s/it, loss=0.255]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2545
Epoch 5 Complete - Avg Loss = 0.0458 - Accuracy = 91.02%

Batch 9
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  22%|██▎       | 9/40 [05:07<17:31, 33.91s/it, loss=0.2]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2002
Epoch 5 Complete - Avg Loss = 0.0509 - Accuracy = 91.32%

Batch 10
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  25%|██▌       | 10/40 [05:41<17:05, 34.19s/it, loss=0.193]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.1928
Epoch 5 Complete - Avg Loss = 0.0557 - Accuracy = 91.33%

Batch 11
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  28%|██▊       | 11/40 [06:15<16:29, 34.14s/it, loss=0.253]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2527
Epoch 5 Complete - Avg Loss = 0.0620 - Accuracy = 91.34%

Batch 12
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  30%|███       | 12/40 [06:49<15:53, 34.06s/it, loss=0.26]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2601
Epoch 5 Complete - Avg Loss = 0.0685 - Accuracy = 91.15%

Batch 13
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  32%|███▎      | 13/40 [07:23<15:13, 33.85s/it, loss=0.228]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2279
Epoch 5 Complete - Avg Loss = 0.0742 - Accuracy = 91.11%

Batch 14
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  35%|███▌      | 14/40 [07:57<14:46, 34.10s/it, loss=0.185]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.1849
Epoch 5 Complete - Avg Loss = 0.0788 - Accuracy = 91.35%

Batch 15
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  38%|███▊      | 15/40 [08:31<14:08, 33.93s/it, loss=0.154]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.1542
Epoch 5 Complete - Avg Loss = 0.0827 - Accuracy = 91.51%

Batch 16
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  40%|████      | 16/40 [09:05<13:33, 33.88s/it, loss=0.168]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.1679
Epoch 5 Complete - Avg Loss = 0.0869 - Accuracy = 91.85%

Batch 17
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  42%|████▎     | 17/40 [09:38<12:58, 33.84s/it, loss=0.153]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.1533
Epoch 5 Complete - Avg Loss = 0.0907 - Accuracy = 92.05%

Batch 18
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  45%|████▌     | 18/40 [10:12<12:22, 33.74s/it, loss=0.165]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.1648
Epoch 5 Complete - Avg Loss = 0.0948 - Accuracy = 92.10%

Batch 19
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  48%|████▊     | 19/40 [10:46<11:50, 33.84s/it, loss=0.243]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2430
Epoch 5 Complete - Avg Loss = 0.1009 - Accuracy = 91.86%

Batch 20
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  50%|█████     | 20/40 [11:20<11:17, 33.88s/it, loss=0.174]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.1739
Epoch 5 Complete - Avg Loss = 0.1052 - Accuracy = 91.91%

Batch 21
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  52%|█████▎    | 21/40 [11:53<10:40, 33.70s/it, loss=0.2]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2002
Epoch 5 Complete - Avg Loss = 0.1102 - Accuracy = 92.11%

Batch 22
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  55%|█████▌    | 22/40 [12:29<10:16, 34.25s/it, loss=0.147]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.1474
Epoch 5 Complete - Avg Loss = 0.1139 - Accuracy = 92.22%

Batch 23
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  57%|█████▊    | 23/40 [13:02<09:38, 34.01s/it, loss=0.146]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.1456
Epoch 5 Complete - Avg Loss = 0.1176 - Accuracy = 92.29%

Batch 24
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  60%|██████    | 24/40 [13:36<09:05, 34.12s/it, loss=0.172]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.1723
Epoch 5 Complete - Avg Loss = 0.1219 - Accuracy = 92.35%

Batch 25
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  62%|██████▎   | 25/40 [14:11<08:33, 34.25s/it, loss=0.208]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2079
Epoch 5 Complete - Avg Loss = 0.1271 - Accuracy = 92.34%

Batch 26
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  65%|██████▌   | 26/40 [14:45<08:00, 34.31s/it, loss=0.209]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2088
Epoch 5 Complete - Avg Loss = 0.1323 - Accuracy = 92.28%

Batch 27
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  68%|██████▊   | 27/40 [15:19<07:22, 34.03s/it, loss=0.17]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.1698
Epoch 5 Complete - Avg Loss = 0.1365 - Accuracy = 92.33%

Batch 28
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  70%|███████   | 28/40 [15:53<06:46, 33.91s/it, loss=0.16]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.1604
Epoch 5 Complete - Avg Loss = 0.1406 - Accuracy = 92.30%

Batch 29
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  72%|███████▎  | 29/40 [16:27<06:14, 34.08s/it, loss=0.206]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2065
Epoch 5 Complete - Avg Loss = 0.1457 - Accuracy = 92.27%

Batch 30
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  75%|███████▌  | 30/40 [17:02<05:42, 34.25s/it, loss=0.184]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.1843
Epoch 5 Complete - Avg Loss = 0.1503 - Accuracy = 92.27%

Batch 31
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  78%|███████▊  | 31/40 [17:38<05:12, 34.78s/it, loss=0.2]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2001
Epoch 5 Complete - Avg Loss = 0.1553 - Accuracy = 92.29%

Batch 32
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  80%|████████  | 32/40 [18:12<04:37, 34.74s/it, loss=0.211]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2110
Epoch 5 Complete - Avg Loss = 0.1606 - Accuracy = 92.24%

Batch 33
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  82%|████████▎ | 33/40 [18:47<04:02, 34.69s/it, loss=0.176]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.1764
Epoch 5 Complete - Avg Loss = 0.1650 - Accuracy = 92.23%

Batch 34
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  85%|████████▌ | 34/40 [19:22<03:28, 34.68s/it, loss=0.133]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.1331
Epoch 5 Complete - Avg Loss = 0.1683 - Accuracy = 92.30%

Batch 35
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  88%|████████▊ | 35/40 [19:55<02:50, 34.19s/it, loss=0.184]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.1843
Epoch 5 Complete - Avg Loss = 0.1729 - Accuracy = 92.30%

Batch 36
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  90%|█████████ | 36/40 [20:29<02:16, 34.15s/it, loss=0.142]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.1418
Epoch 5 Complete - Avg Loss = 0.1765 - Accuracy = 92.38%

Batch 37
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  92%|█████████▎| 37/40 [21:01<01:41, 33.72s/it, loss=0.273]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.2727
Epoch 5 Complete - Avg Loss = 0.1833 - Accuracy = 92.31%

Batch 38
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  95%|█████████▌| 38/40 [21:34<01:06, 33.38s/it, loss=0.147]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.1468
Epoch 5 Complete - Avg Loss = 0.1870 - Accuracy = 92.37%

Batch 39
Raw batch shapes:
Images: torch.Size([128, 3, 224, 224])
Texts: torch.Size([128, 77])
Labels: torch.Size([128])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5:  98%|█████████▊| 39/40 [22:07<00:33, 33.29s/it, loss=0.172]

Image features: torch.Size([128, 512])
Text features: torch.Size([128, 512])
Outputs shape: torch.Size([128, 2]), Labels shape: torch.Size([128])
Loss: 0.1716
Epoch 5 Complete - Avg Loss = 0.1913 - Accuracy = 92.39%

Batch 40
Raw batch shapes:
Images: torch.Size([8, 3, 224, 224])
Texts: torch.Size([8, 77])
Labels: torch.Size([8])
Moved batch to device
Encoding image features...
Encoding text features...


Epoch 5: 100%|██████████| 40/40 [22:09<00:00, 33.23s/it, loss=0.368]

Image features: torch.Size([8, 512])
Text features: torch.Size([8, 512])
Outputs shape: torch.Size([8, 2]), Labels shape: torch.Size([8])
Loss: 0.3682
Epoch 5 Complete - Avg Loss = 0.2005 - Accuracy = 92.38%





In [23]:
from sklearn.metrics import classification_report
def evaluate_model(model, data_loader, clip_model, device):
    model.eval()
    all_preds = []
    all_labels = []
    total_loss = 0

    criterion = nn.CrossEntropyLoss()

    with torch.no_grad():
        for images, texts, labels in tqdm(data_loader, desc="Evaluating"):
            images = images.to(device)
            texts = texts.to(device)
            labels = labels.to(device)

            # Extract CLIP features
            image_features = clip_model.encode_image(images).float()
            text_features = clip_model.encode_text(texts).float()

            # Forward pass through classifier
            outputs = model(image_features, text_features)

            # Compute loss
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            # Get predictions
            _, predicted = torch.max(outputs, 1)

            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Compute accuracy
    accuracy = accuracy_score(all_labels, all_preds)
    print(f"\n✅ Evaluation Complete — Accuracy: {accuracy * 100:.2f}%")
    print(f"📉 Average Loss: {total_loss / len(data_loader):.4f}")
    print("\n📊 Classification Report:\n", classification_report(all_labels, all_preds, target_names=["Real", "Fake"]))

    return accuracy

In [24]:
print("\n=== Evaluating on Validation Set ===")
evaluate_model(model, val_loader, clip_model, device)

print("\n=== Evaluating on Test Set ===")
evaluate_model(model, test_loader, clip_model, device)


=== Evaluating on Validation Set ===


Evaluating: 100%|██████████| 8/8 [04:21<00:00, 32.65s/it]



✅ Evaluation Complete — Accuracy: 88.60%
📉 Average Loss: 0.2715

📊 Classification Report:
               precision    recall  f1-score   support

        Real       0.92      0.88      0.90       579
        Fake       0.84      0.90      0.87       421

    accuracy                           0.89      1000
   macro avg       0.88      0.89      0.88      1000
weighted avg       0.89      0.89      0.89      1000


=== Evaluating on Test Set ===


Evaluating: 100%|██████████| 8/8 [04:32<00:00, 34.03s/it]


✅ Evaluation Complete — Accuracy: 88.60%
📉 Average Loss: 0.2801

📊 Classification Report:
               precision    recall  f1-score   support

        Real       0.95      0.86      0.90       608
        Fake       0.81      0.93      0.86       392

    accuracy                           0.89      1000
   macro avg       0.88      0.89      0.88      1000
weighted avg       0.89      0.89      0.89      1000






0.886

In [29]:
torch.save(model.state_dict(), "clip_fakenews_classifier.pth")
print(" Model saved.")

 Model saved.


In [26]:
from google.colab import files
files.download("clip_fakenews_classifier.pth")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [27]:
model = FakeNewsClassifier().to(device)
model.load_state_dict(torch.load("clip_fakenews_classifier.pth"))
model.eval()

FakeNewsClassifier(
  (fc): Sequential(
    (0): Linear(in_features=1024, out_features=512, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=512, out_features=128, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.3, inplace=False)
    (6): Linear(in_features=128, out_features=2, bias=True)
  )
)

In [28]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.25.2-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.5-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 (

In [30]:
import gradio as gr

def predict_from_gradio(text, image):
    model.eval()

    try:
        image_input = preprocess(image).unsqueeze(0).to(device)
    except:
        image_input = preprocess(Image.new("RGB", (224, 224), (255, 255, 255))).unsqueeze(0).to(device)

    text_input = clip.tokenize([text]).to(device)

    with torch.no_grad():
        image_features = clip_model.encode_image(image_input).float()
        text_features = clip_model.encode_text(text_input).float()
        outputs = model(image_features, text_features)
        _, predicted = torch.max(outputs, 1)

    label = predicted.item()
    return "Real News" if label == 0 else "Fake News"

In [33]:
interface = gr.Interface(
    fn=predict_from_gradio,
    inputs=[
        gr.Textbox(label="News Headline / Description", placeholder="Enter the news content here..."),
        gr.Image(type="pil", label="Associated Image", sources=["upload", "clipboard"])
    ],
    outputs=gr.Textbox(label="Result"),
    title="📰 Fake News Detection",
    description="This app uses OpenAI's CLIP model to detect fake news based on text + image."
)

interface.launch()

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://2e816617e35b9c9ccb.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


