In [3]:
import os
import re
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
from PIL import Image

# Dataset class without metadata, labels from filename
class LipDataset(Dataset):
    def __init__(self, img_dir="./images", transform=None):
        self.img_dir = img_dir
        self.transform = transform
        self.image_files = [f for f in os.listdir(img_dir) if f.lower().endswith(('.jpg', '.jpeg', '.heic'))]

    def __len__(self):
        return len(self.image_files)

    def extract_hgb_from_filename(self, filename):
        # Extract HgB value from pattern: HgB_17.3gdl_...
        match = re.search(r"_(\d+\.\d+)gdl", filename)
        if match:
            return float(match.group(1))
        else:
            raise ValueError(f"Could not extract HgB from filename {filename}")

    def __getitem__(self, idx):
        filename = self.image_files[idx]
        filepath = os.path.join(self.img_dir, filename)
        image = Image.open(filepath).convert('RGB')
        if self.transform:
            image = self.transform(image)
        label = self.extract_hgb_from_filename(filename)
        return image, torch.tensor(label, dtype=torch.float32), filename

# Model definition updated to only take images as input
class HgBRegressor(nn.Module):
    def __init__(self):
        super().__init__()
        from torchvision.models import resnet18
        self.backbone = resnet18(pretrained=True)
        self.backbone.fc = nn.Linear(self.backbone.fc.in_features, 1)

    def forward(self, image):
        return self.backbone(image).squeeze()

def main():
    img_dir = "./images"
    batch_size = 16
    epochs = 15
    lr = 1e-4
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    dataset = LipDataset(img_dir=img_dir, transform=transform)

    # Train/validation split 80/20
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_ds, val_ds = random_split(dataset, [train_size, val_size])

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

    model = HgBRegressor().to(device)
    criterion = nn.L1Loss()  # MAE Loss
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        running_train_loss = 0.0
        for images, labels, _ in train_loader:
            images, labels = images.to(device), labels.to(device)
            preds = model(images)
            loss = criterion(preds, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_train_loss += loss.item() * images.size(0)
        train_loss = running_train_loss / train_size

        model.eval()
        running_val_loss = 0.0
        with torch.no_grad():
            for images, labels, _ in val_loader:
                images, labels = images.to(device), labels.to(device)
                preds = model(images)
                loss = criterion(preds, labels)
                running_val_loss += loss.item() * images.size(0)
        val_loss = running_val_loss / val_size

        print(f"Epoch {epoch+1}/{epochs} Train MAE: {train_loss:.4f} Val MAE: {val_loss:.4f}")

    torch.save(model.state_dict(), "hgb_regressor.pth")

    # Inference on all images and save predictions with filenames
    model.eval()
    all_filenames = []
    all_preds = []
    with torch.no_grad():
        loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)
        for images, _, filenames in loader:
            images = images.to(device)
            outputs = model(images)
            all_preds.extend(outputs.cpu().numpy())
            all_filenames.extend(filenames)

    pred_df = pd.DataFrame({
        "filename": all_filenames,
        "predicted_hgb": all_preds
    })
    pred_df.to_csv("predictions.csv", index=False)
    print("Predictions saved to predictions.csv")

if __name__ == "__main__":
    main()




RuntimeError: DataLoader worker (pid(s) 21960, 20436, 36052, 5940) exited unexpectedly