# function to make text + image dataset

In [None]:
import json
from pathlib import Path
from PIL import Image
from torch.utils.data import Dataset
import torchvision.transforms as T

class PDEImageTextDataset(Dataset):
    def __init__(self, jsonl_path):
        self.jsonl_path = Path(jsonl_path)
        self.root = self.jsonl_path.parent  # ← IMPORTANT

        with open(self.jsonl_path, "r") as f:
            self.samples = [json.loads(l) for l in f]

        self.transform = T.Compose([
            T.Resize((224, 224)),
            T.ToTensor(),
        ])

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]

        image_path = self.root / sample["image"]  # ← FIX
        image = Image.open(image_path).convert("RGB")

        text = sample["text"]

        return self.transform(image), text



# Vision Only CLIP

In [None]:
from torch.utils.data import Dataset
import torch.nn as nn
import torchvision.models as models
import torch

class VisionOnlyDataset(Dataset):
    def __init__(self, base_dataset):
        self.base = base_dataset

    def __len__(self):
        return len(self.base)

    def __getitem__(self, idx):
        image, _ = self.base[idx]
        return image


class VisionOnlyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone = models.resnet18(weights="IMAGENET1K_V1")
        self.backbone.fc = nn.Linear(512, 1)

    def forward(self, x):
        return self.backbone(x)

def train_vision_only(model, dataloader, optimizer, device, epochs=20):
    model.train()
    criterion = nn.MSELoss()

    for epoch in range(epochs):
        total_loss = 0.0

        for images in dataloader:
            images = images.to(device)
            targets = torch.zeros(images.size(0), 1, device=device)

            preds = model(images)
            loss = criterion(preds, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1} | Loss {total_loss/len(dataloader):.4f}")


In [None]:
from torch.utils.data import DataLoader

base_dataset = PDEImageTextDataset(
    "/Users/divyam/Course/Project Arbeit/pde_solver/vl_dataset/annotations.jsonl"
)

vision_dataset = VisionOnlyDataset(base_dataset)
vision_loader = DataLoader(vision_dataset, batch_size=32, shuffle=True)

device = "mps"

vision_model = VisionOnlyModel().to(device)
optimizer = torch.optim.AdamW(vision_model.parameters(), lr=3e-4)

train_vision_only(
    vision_model,
    vision_loader,
    optimizer,
    device=device,
    epochs=20
)


Epoch 1 | Loss 0.1344
Epoch 2 | Loss 0.0014
Epoch 3 | Loss 0.0003
Epoch 4 | Loss 0.0001
Epoch 5 | Loss 0.0001
Epoch 6 | Loss 0.0001
Epoch 7 | Loss 0.0001
Epoch 8 | Loss 0.0000
Epoch 9 | Loss 0.0000
Epoch 10 | Loss 0.0000
Epoch 11 | Loss 0.0000
Epoch 12 | Loss 0.0000
Epoch 13 | Loss 0.0000
Epoch 14 | Loss 0.0000
Epoch 15 | Loss 0.0000
Epoch 16 | Loss 0.0000
Epoch 17 | Loss 0.0000
Epoch 18 | Loss 0.0000
Epoch 19 | Loss 0.0000
Epoch 20 | Loss 0.0000



# Vision Text CLIP

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class VisionTextCLIP(nn.Module):
    def __init__(self, vision_model, text_dim, embed_dim=256):
        super().__init__()
        self.vision = vision_model
        self.vision.fc = nn.Identity()
        self.image_proj = nn.Linear(512, embed_dim)

        self.text_proj = nn.Linear(text_dim, embed_dim)

    def forward(self, images, text_emb):
        img_feat = self.image_proj(self.vision(images))
        txt_feat = self.text_proj(text_emb)

        img_feat = F.normalize(img_feat, dim=1)
        txt_feat = F.normalize(txt_feat, dim=1)

        return img_feat, txt_feat
    
def clip_loss(img_emb, txt_emb, temperature=0.07):
    logits = img_emb @ txt_emb.T / temperature
    labels = torch.arange(len(img_emb)).to(img_emb.device)

    loss_i = nn.CrossEntropyLoss()(logits, labels)
    loss_t = nn.CrossEntropyLoss()(logits.T, labels)
    return (loss_i + loss_t) / 2

In [4]:
import torch
from torch.utils.data import DataLoader

def train_clip(
    model,
    dataloader,
    text_encoder,
    optimizer,
    device,
    epochs=10
):
    model.train()
    for epoch in range(epochs):
        total_loss = 0.0

        for images, texts in dataloader:
            images = images.to(device)

            with torch.no_grad():
                text_emb = torch.from_numpy(
                text_encoder.encode(texts)
                ).to(device)

            img_emb, txt_emb = model(images, text_emb)
            loss = clip_loss(img_emb, txt_emb)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1} | Loss {total_loss/len(dataloader):.4f}")


In [5]:
import torch
import torch.nn as nn
import torchvision.models as models
import torch.nn.functional as F

class SimpleCLIP(nn.Module):
    def __init__(self, embed_dim=512):
        super().__init__()

        backbone = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
        self.image_encoder = nn.Sequential(
            *list(backbone.children())[:-1]
        )

        self.image_proj = nn.Linear(512, embed_dim)
        self.text_proj = nn.Linear(384, embed_dim)  # adjust to your text encoder

    def forward(self, images, text_emb):
        img_feat = self.image_encoder(images).squeeze(-1).squeeze(-1)
        img_emb = F.normalize(self.image_proj(img_feat), dim=-1)
        txt_emb = F.normalize(self.text_proj(text_emb), dim=-1)
        return img_emb, txt_emb


In [None]:
from sentence_transformers import SentenceTransformer

device = "mps"

text_encoder = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2",
    device=device
)

In [None]:
# vision + correct text

dataset = PDEImageTextDataset("/Users/divyam/Course/Project Arbeit/pde_solver/vl_dataset/annotations.jsonl")
loader = DataLoader(dataset, batch_size=32, shuffle=True)

model = SimpleCLIP(embed_dim=512).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

train_clip(
    model,
    loader,
    text_encoder,
    optimizer,
    device="mps",
    epochs=20
)


Epoch 1 | Loss 3.4788
Epoch 2 | Loss 3.3713
Epoch 3 | Loss 3.2135
Epoch 4 | Loss 3.0028
Epoch 5 | Loss 2.5931
Epoch 6 | Loss 2.0965
Epoch 7 | Loss 1.7331
Epoch 8 | Loss 1.2601
Epoch 9 | Loss 1.0609
Epoch 10 | Loss 0.9560
Epoch 11 | Loss 0.8178
Epoch 12 | Loss 0.7361
Epoch 13 | Loss 0.7432
Epoch 14 | Loss 0.6881
Epoch 15 | Loss 0.6874
Epoch 16 | Loss 0.6179
Epoch 17 | Loss 0.6154
Epoch 18 | Loss 0.6362
Epoch 19 | Loss 0.6069
Epoch 20 | Loss 0.6248


In [None]:
# vision + correct text

import torchvision.models as models

import torchvision.models as models
import torch.nn as nn

vision = models.resnet18(weights="IMAGENET1K_V1")
vision.fc = nn.Linear(512, 1)  # regression

model = vision

device = "mps"

model = VisionTextCLIP(
    vision_model=model,
    text_dim=384,      
    embed_dim=512
).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

train_clip(
    model,
    loader,
    text_encoder,
    optimizer,
    device="mps",
    epochs=20
)

Epoch 1 | Loss 3.4668
Epoch 2 | Loss 3.3695
Epoch 3 | Loss 3.3011
Epoch 4 | Loss 3.0453
Epoch 5 | Loss 2.6769
Epoch 6 | Loss 2.1905
Epoch 7 | Loss 1.6729
Epoch 8 | Loss 1.4171
Epoch 9 | Loss 1.0982
Epoch 10 | Loss 0.9219
Epoch 11 | Loss 0.7607
Epoch 12 | Loss 0.7543
Epoch 13 | Loss 0.7205
Epoch 14 | Loss 0.6921
Epoch 15 | Loss 0.6217
Epoch 16 | Loss 0.6441
Epoch 17 | Loss 0.6111
Epoch 18 | Loss 0.5942
Epoch 19 | Loss 0.5734
Epoch 20 | Loss 0.6029


# Vision + Shuffled text

In [19]:
import random

class ShuffledTextDataset(PDEImageTextDataset):
    def __init__(self, jsonl_path):
        super().__init__(jsonl_path)
        self.texts = [s["text"] for s in self.samples]

    def __getitem__(self, idx):
        image, _ = super().__getitem__(idx)
        random_text = random.choice(self.texts)
        return image, random_text


In [None]:
dataset = ShuffledTextDataset("/Users/divyam/Course/Project Arbeit/pde_solver/vl_dataset/annotations.jsonl")
loader = DataLoader(dataset, batch_size=32, shuffle=True)

train_clip(
    model,
    loader,
    text_encoder,
    optimizer,
    device="mps",
    epochs=20
)


Epoch 1 | Loss 4.2607
Epoch 2 | Loss 3.4420
Epoch 3 | Loss 3.4373
Epoch 4 | Loss 3.4344
Epoch 5 | Loss 3.4357
Epoch 6 | Loss 3.4329
Epoch 7 | Loss 3.4372
Epoch 8 | Loss 3.4346
Epoch 9 | Loss 3.4327
Epoch 10 | Loss 3.4369
Epoch 11 | Loss 3.4340
Epoch 12 | Loss 3.4345
Epoch 13 | Loss 3.4349
Epoch 14 | Loss 3.4345
Epoch 15 | Loss 3.4355
Epoch 16 | Loss 3.4371
Epoch 17 | Loss 3.4334
Epoch 18 | Loss 3.4377
Epoch 19 | Loss 3.4350
Epoch 20 | Loss 3.4378
