<a href="https://colab.research.google.com/github/ChamillaTerp/AML2025_FinalProject/blob/main/Pierre/MultimodalUniverse.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
%pip install -U datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency r

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms.v2 as transforms
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset, Image
from tqdm.auto import tqdm

In [4]:
dr3_dataset = load_dataset("MultimodalUniverse/gaia")
dr3_dataset = dr3_dataset["train"]

README.md:   0%|          | 0.00/21.6k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/168M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/100000 [00:00<?, ? examples/s]

In [5]:
class DR3Dataset(Dataset):
  def __init__(self, dataset, spectrum_size=128):
    self.dataset = dataset
    self.spectrum_size = spectrum_size

  def __len__(self):
    return len(self.dataset)

  def _pad_spectrum(self, spectrum) -> torch.Tensor:
    return F.pad(spectrum, (0, self.spectrum_size - spectrum.shape[-1]), "constant", 0)

  def __getitem__(self, index):
    item = self.dataset[index]

    spectrum = item["spectral_coefficients"]["coeff"]
    spectrum = (torch.Tensor(spectrum) - 61.6579) / 6810.9331
    spectrum = self._pad_spectrum(spectrum)
    spectrum_err = item["spectral_coefficients"]["coeff_error"]
    spectrum_err = (torch.Tensor(spectrum_err) - 1.5306) / 24.7735
    spectrum_err = self._pad_spectrum(spectrum_err)

    mh = item["gspphot"]["mh_gspphot"]
    mh_lower = item["gspphot"]["mh_gspphot_lower"]
    mh_upper = item["gspphot"]["mh_gspphot_upper"]

    X = torch.stack([spectrum, spectrum_err])
    Y = torch.Tensor([mh, mh_lower, mh_upper])

    return X, Y

dr3 = DR3Dataset(dr3_dataset)

In [6]:
def compute_stats(dr3):
  """
  Compute the mean and standard deviation for each channel of X.
  """
  batch_size = 1024
  X_mean = torch.zeros(2)
  X_std = torch.zeros(2)
  dr3 = DataLoader(dr3, batch_size=batch_size)

  for X, _ in tqdm(dr3):
    X_mean += X.mean(dim=(0, 2))
    X_std += X.std(dim=(0, 2))

  X_mean /= len(dr3)
  X_std /= len(dr3)

  return X_mean, X_std

compute_stats(dr3)

  0%|          | 0/98 [00:00<?, ?it/s]

(tensor([0.0013, 0.0087]), tensor([1.0000, 0.9997]))

In [7]:
dr3 = DR3Dataset(dr3_dataset, scaler_coeff=scaler_coeff, scaler_coeff_err=scaler_coeff_err)
dr3[0]

NameError: name 'scaler_coeff' is not defined

In [None]:
class DR3Regressor(nn.Module):
  def __init__(self, spectrum_width=128, act_fn=nn.SELU, pool_fn=nn.MaxPool1d):
    super().__init__()

    self.spectrum_width = spectrum_width
    self.act_fn = act_fn
    self.pool_fn = pool_fn

    self.features = nn.Sequential(
        nn.Conv1d(2, 64, 3, padding=1),
        self.act_fn(inplace=True),
        nn.Conv1d(64, 64, 3, padding=1),
        self.act_fn(inplace=True),
        self.pool_fn(2),
        nn.Conv1d(64, 128, 3, padding=1),
        self.act_fn(inplace=True),
        nn.Conv1d(128, 128, 3, padding=1),
        self.act_fn(inplace=True),
        self.pool_fn(2),
    )

    self.linear = nn.Sequential(
        nn.Linear(256, 128),
        act_fn(inplace=True),
        nn.Linear(128, 64),
        act_fn(inplace=True),
        nn.Linear(64, 3)
    )

  def forward(self, x):
    x = self.features(x)
    x = self.linear(x)
    return x

dr3_regressor = DR3Regressor()
test_batch = torch.rand(32, 2, 128)
test_batch.shape
dr3_regressor(dr3[0][0].unsqueeze(0))

In [None]:
dr3[0][1]

In [None]:
class DR3Trainer:
  def __init__(self, lr=5e-4, device=None):
    self.device = device if device else "cuda" if torch.cuda.is_available() else "cpu"
    self.model = DR3Regressor().to(self.device)

    self.lr = lr
    self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=lr)
    self.loss_fn = nn.MSELoss()

  def train_step(self, X, Y):
    self.model.train()
    self.optimizer.zero_grad()

    preds = self.model(X)
    loss = self.loss_fn(preds, Y)
    loss.backward()
    self.optimizer.step()

    return loss.item()

  def test_step(self, X, Y):
    self.model.eval()

    with torch.inference_mode():
      preds = self.model(X)
      loss = self.loss_fn(preds, Y)

    return loss.item()

  def train(self, train_loader, test_loader, epochs=10):
    train_losses = []
    test_losses = []

    for epoch in tqdm(range(epochs)):
      train_loss = 0
      test_loss = 0

      progress_bar = tqdm(train_loader)

      for X, Y in progress_bar:
        X = X.to(self.device)
        Y = Y.to(self.device)
        loss = self.train_step(X, Y)
        train_loss += loss
        progress_bar.set_postfix({"train_loss": loss})

      for X, Y in test_loader:
        X = X.to(self.device)
        Y = Y.to(self.device)
        test_loss += self.test_step(X, Y)

      train_loss /= len(train_loader)
      test_loss /= len(test_loader)

      train_losses.append(train_loss)
      test_losses.append(test_loss)

      print(f"Epoch {epoch+1}/{epochs}: Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}")

    return train_losses, test_losses

In [None]:
dr3_train, dr3_test = torch.utils.data.random_split(dr3, [0.8, 0.2])
train_loader = DataLoader(dr3_train, batch_size=32, shuffle=True)
test_loader = DataLoader(dr3_test, batch_size=32)

trainer = DR3Trainer(lr=1e-10, device="cuda")
trainer.train(train_loader, test_loader, epochs=10)