Merged
Changes from all commits
Commits
49 commits
3ab691f
add fingerprint dataset, logistic regression model
sfluegel05 Aug 20, 2025
bb73784
add batched pubchem dataset
sfluegel05 Aug 20, 2025
db5434f
update file name system for pubchem batched
sfluegel05 Aug 21, 2025
abffcae
fix k
sfluegel05 Aug 21, 2025
6d7ca43
fix k
sfluegel05 Aug 21, 2025
bf97527
add error handling for smiles tokenisation
sfluegel05 Aug 22, 2025
7f9e28d
add default model
sfluegel05 Aug 22, 2025
f76fe1c
fix lstm
sfluegel05 Aug 22, 2025
03d5e55
fix lstm
sfluegel05 Aug 22, 2025
e1256b0
fix lstm
sfluegel05 Aug 22, 2025
abc9b53
fix lstm
sfluegel05 Aug 22, 2025
3b233d6
streamline classic ml
sfluegel05 Sep 16, 2025
8c0454c
fix batched pubchem
sfluegel05 Sep 17, 2025
04abe66
fix pubchem batching
sfluegel05 Sep 17, 2025
97079c3
fix batch tokenisation
sfluegel05 Sep 17, 2025
5e6c508
fix batch tokenisation
sfluegel05 Sep 17, 2025
03cb212
fix batch tokenisation
sfluegel05 Sep 17, 2025
0f1e7c0
run n epochs with n different training files
sfluegel05 Sep 17, 2025
9eebad2
add logging
sfluegel05 Sep 18, 2025
faa3a72
lstm error logging
sfluegel05 Sep 18, 2025
7f92917
add more logging to find out if pubchemBatched actually works
sfluegel05 Sep 18, 2025
69908ba
fix print statement for fixing epoch issue
sfluegel05 Sep 19, 2025
6df484d
reformatting
sfluegel05 Sep 19, 2025
4288689
reformatting
sfluegel05 Sep 19, 2025
0e6afe2
add num_layers and dropout parameters, make lstm bidirectional
sfluegel05 Sep 22, 2025
940ce9d
multi-layer lstm
sfluegel05 Sep 22, 2025
1e68032
increase vocab_size for PubChem
sfluegel05 Sep 23, 2025
8ee5c4b
streamline batch size in PubchemBatched
sfluegel05 Sep 23, 2025
33de8f3
update tokens (full pubchem)
Sep 23, 2025
c73b0fb
fix number of expected pubchem batches
sfluegel05 Sep 24, 2025
03635ff
Merge branch 'feature/new-ensemble-models' of https://github.com/ChEB…
sfluegel05 Sep 24, 2025
905ffc2
more options for LR
sfluegel05 Sep 24, 2025
c1da092
reformat
sfluegel05 Sep 24, 2025
078bfb6
add subset parameter for chebi data
sfluegel05 Sep 24, 2025
2911f92
fix merge conflict
sfluegel05 Sep 24, 2025
85656da
add token (chebi_v243)
sfluegel05 Sep 24, 2025
5c84ec7
add custom fit loop for custom hook handling
sfluegel05 Sep 24, 2025
182a3b1
fix typo
sfluegel05 Sep 25, 2025
86044af
set subset before using it
sfluegel05 Sep 25, 2025
4c58dcb
add electra freeze option
sfluegel05 Sep 30, 2025
4f506b1
make processing label rows safe if input is numpy array
sfluegel05 Oct 1, 2025
eb86e3f
cast to model device
sfluegel05 Oct 8, 2025
4ab760e
add label filter
sfluegel05 Oct 14, 2025
dfc4db9
add id filter
sfluegel05 Nov 1, 2025
bcf96f6
add id filter
sfluegel05 Nov 1, 2025
abe9e2a
Merge branch 'dev' into feature/new-ensemble-models
sfluegel05 Nov 5, 2025
aba03da
fix term callback for clause without subset
sfluegel05 Nov 5, 2025
9015381
adapt reader test to fit bf97527477f84d6ac0752a196120ce424e6f9a9a
sfluegel05 Nov 5, 2025
08f6071
adapt test for subset
sfluegel05 Nov 5, 2025
3 changes: 2 additions & 1 deletion chebai/callbacks/epoch_metrics.py
@@ -62,7 +62,8 @@ def update(self, preds: torch.Tensor, labels: torch.Tensor) -> None:
             labels (torch.Tensor): Ground truth labels.
         """
         tps = torch.sum(
-            torch.logical_and(preds > self.threshold, labels.to(torch.bool)), dim=0
+            torch.logical_and(preds > self.threshold, labels.to(torch.bool)),
+            dim=0,
         )
         self.true_positives += tps
         self.positive_predictions += torch.sum(preds > self.threshold, dim=0)
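Note: the hunk above only re-wraps the tps expression; the counting logic is unchanged. For reference, a minimal standalone sketch of what update() accumulates per class (the threshold of 0.5 and the toy tensors are illustrative):

import torch

threshold = 0.5
preds = torch.tensor([[0.9, 0.2], [0.7, 0.6]])
labels = torch.tensor([[1, 0], [0, 1]])

# same per-class counting as the update() shown above
tps = torch.sum(
    torch.logical_and(preds > threshold, labels.to(torch.bool)),
    dim=0,
)
positive_predictions = torch.sum(preds > threshold, dim=0)
print(tps)                   # tensor([1, 1]): true positives per class
print(positive_predictions)  # tensor([2, 1]): predicted positives per class
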
13 changes: 12 additions & 1 deletion chebai/models/base.py
@@ -4,6 +4,7 @@
 
 import torch
 from lightning.pytorch.core.module import LightningModule
+from lightning.pytorch.utilities.rank_zero import rank_zero_info
 
 from chebai.preprocessing.structures import XYData

@@ -106,7 +107,8 @@ def _get_prediction_and_labels(
         Returns:
             Tuple[torch.Tensor, torch.Tensor]: Predictions and labels.
         """
-        return output, labels
+        # cast labels to int
+        return output, labels.to(torch.int) if labels is not None else labels
 
     def _process_labels_in_batch(self, batch: XYData) -> torch.Tensor:
         """
@@ -158,6 +160,13 @@ def _process_for_loss(
         """
         return model_output, labels, loss_kwargs
 
+    def on_train_epoch_start(self) -> None:
+        # pass current epoch to datamodule if it has the attribute curr_epoch (for PubChemBatched dataset)
+        rank_zero_info(f"Starting epoch {self.current_epoch}")
+        if hasattr(self.trainer.datamodule, "curr_epoch"):
+            rank_zero_info(f"Setting datamodule.curr_epoch to {self.current_epoch}")
+            self.trainer.datamodule.curr_epoch = self.current_epoch
+
     def training_step(
         self, batch: XYData, batch_idx: int
     ) -> Dict[str, Union[torch.Tensor, Any]]:
@@ -310,6 +319,8 @@ def _execute(
                 for metric_name, metric in metrics.items():
                     metric.update(pr, tar)
                 self._log_metrics(prefix, metrics, len(batch))
+        if isinstance(d, dict) and "loss" not in d:
+            print(f"d has keys {d.keys()}, log={log}, criterion={self.criterion}")
         return d
 
     def _log_metrics(self, prefix: str, metrics: torch.nn.Module, batch_size: int):
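Note: the on_train_epoch_start hook added above only forwards self.current_epoch to the datamodule when it exposes a curr_epoch attribute. Below is a minimal sketch of a datamodule that consumes it; the class name, the round-robin shard selection and the random tensors are illustrative assumptions, not the actual PubChemBatched implementation, and the trainer would need to rebuild dataloaders each epoch (e.g. reload_dataloaders_every_n_epochs=1) for the switch to take effect.

import lightning.pytorch as pl
import torch
from torch.utils.data import DataLoader, TensorDataset


class ShardedDataModuleSketch(pl.LightningDataModule):
    """Illustrative datamodule that switches its training shard every epoch."""

    def __init__(self, n_shards: int = 5):
        super().__init__()
        self.n_shards = n_shards
        self.curr_epoch = 0  # set from the hook above at the start of every epoch

    def train_dataloader(self) -> DataLoader:
        shard = self.curr_epoch % self.n_shards  # pick this epoch's shard
        features = torch.randn(32, 16)           # stand-in for the shard's features
        labels = torch.randint(0, 2, (32, 4)).float()
        return DataLoader(TensorDataset(features, labels), batch_size=8)
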
97 changes: 97 additions & 0 deletions chebai/models/classic_ml.py
@@ -0,0 +1,97 @@
+import os
+import pickle as pkl
+from typing import Any, Dict, List, Optional
+
+import numpy as np
+import torch
+import tqdm
+from sklearn.exceptions import NotFittedError
+from sklearn.linear_model import LogisticRegression as SklearnLogisticRegression
+
+from chebai.models.base import ChebaiBaseNet
+
+LR_MODEL_PATH = os.path.join("models", "LR")
+
+
+class LogisticRegression(ChebaiBaseNet):
+    """
+    Logistic Regression model using scikit-learn, wrapped to fit the ChebaiBaseNet interface.
+    """
+
+    def __init__(
+        self,
+        out_dim: int,
+        input_dim: int,
+        only_predict_classes: Optional[List] = None,
+        n_classes=1528,
+        **kwargs,
+    ):
+        super().__init__(out_dim=out_dim, input_dim=input_dim, **kwargs)
+        self.models = [
+            SklearnLogisticRegression(solver="liblinear") for _ in range(n_classes)
+        ]
+        # indices of classes (in the dataset used for training) where a model should be trained
+        self.only_predict_classes = only_predict_classes
+
+    def forward(self, x: Dict[str, Any], **kwargs) -> torch.Tensor:
+        print(
+            f"forward called with x[features].shape {x['features'].shape}, self.training {self.training}"
+        )
+        if self.training:
+            self.fit_sklearn(x["features"], x["labels"])
+        preds = []
+        for model in self.models:
+            try:
+                p = torch.from_numpy(model.predict(x["features"])).float()
+                p = p.to(x["features"].device)
+                preds.append(p)
+            except NotFittedError:
+                preds.append(
+                    torch.zeros((x["features"].shape[0]), device=(x["features"].device))
+                )
+            except AttributeError:
+                preds.append(
+                    torch.zeros((x["features"].shape[0]), device=(x["features"].device))
+                )
+        preds = torch.stack(preds, dim=1)
+        print(f"preds shape {preds.shape}")
+        return preds.squeeze(-1)
+
+    def fit_sklearn(self, X, y):
+        """
+        Fit the underlying sklearn model. X and y should be numpy arrays.
+        """
+        for i, model in tqdm.tqdm(enumerate(self.models), desc="Fitting models"):
+            import os
+
+            if os.path.exists(os.path.join(LR_MODEL_PATH, f"LR_model_{i}.pkl")):
+                print(f"Loading model {i} from file")
+                self.models[i] = pkl.load(
+                    open(os.path.join(LR_MODEL_PATH, f"LR_model_{i}.pkl"), "rb")
+                )
+            else:
+                if (
+                    self.only_predict_classes and i not in self.only_predict_classes
+                ):  # only try these classes
+                    continue
+                try:
+                    model.fit(X, y[:, i])
+                except ValueError:
+                    self.models[i] = PlaceholderModel()
+                # dump
+                pkl.dump(
+                    model, open(os.path.join(LR_MODEL_PATH, f"LR_model_{i}.pkl"), "wb")
+                )
+
+    def configure_optimizers(self, **kwargs):
+        pass
+
+
+class PlaceholderModel:
+    """Acts like a trained model, but isn't. Use this if training fails and you need a placeholder."""
+
+    def __init__(self, default_prediction=1):
+        self.default_prediction = default_prediction
+
+    def predict(self, preds):
+        return np.ones(preds.shape[0]) * self.default_prediction
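Note: a minimal usage sketch for the LogisticRegression wrapper above, assuming binary fingerprint features and multi-label targets. The sizes, the two-class setup and the directory creation are illustrative; the wrapper caches fitted per-class models under models/LR, so that directory must exist.

import os

import torch

from chebai.models.classic_ml import LogisticRegression

os.makedirs(os.path.join("models", "LR"), exist_ok=True)  # cache dir used by fit_sklearn

# illustrative sizes: 1024-bit fingerprints, 2 target classes
model = LogisticRegression(out_dim=2, input_dim=1024, n_classes=2)
model.train()  # forward() fits the per-class sklearn models while in training mode

batch = {
    "features": torch.randint(0, 2, (16, 1024)).float(),
    "labels": torch.randint(0, 2, (16, 2)).numpy(),
}
preds = model(batch)
print(preds.shape)  # torch.Size([16, 2])
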
5 changes: 5 additions & 0 deletions chebai/models/electra.py
@@ -224,6 +224,7 @@ def __init__(
         config: Optional[Dict[str, Any]] = None,
         pretrained_checkpoint: Optional[str] = None,
         load_prefix: Optional[str] = None,
+        freeze_electra: bool = False,
         **kwargs: Any,
     ):
         # Remove this property in order to prevent it from being stored as a
@@ -262,6 +263,10 @@
         else:
             self.electra = ElectraModel(config=self.config)
 
+        if freeze_electra:
+            for param in self.electra.parameters():
+                param.requires_grad = False
+
     def _process_for_loss(
         self,
         model_output: Dict[str, Tensor],
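Note: freeze_electra above simply disables gradients for the wrapped encoder so that only the remaining layers keep training. The same pattern on a generic module, with illustrative stand-in layers:

import torch
from torch import nn

encoder = nn.Linear(8, 8)  # stand-in for the pretrained ElectraModel
head = nn.Linear(8, 2)     # stand-in for the task-specific output layers

# mirror of the freeze_electra branch: turn off gradients for the encoder only
for param in encoder.parameters():
    param.requires_grad = False

trainable = [
    p for p in list(encoder.parameters()) + list(head.parameters()) if p.requires_grad
]
optimizer = torch.optim.Adam(trainable, lr=1e-3)
print(len(trainable))  # 2: only the head's weight and bias remain trainable
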
50 changes: 37 additions & 13 deletions chebai/models/lstm.py
@@ -1,31 +1,55 @@
 import logging
 
 from torch import nn
-from torch.nn.utils.rnn import pack_padded_sequence
+from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
 
 from chebai.models.base import ChebaiBaseNet
 
 logging.getLogger("pysmiles").setLevel(logging.CRITICAL)
 
 
 class ChemLSTM(ChebaiBaseNet):
-    def __init__(self, in_d, out_d, num_classes, **kwargs):
-        super().__init__(num_classes, **kwargs)
-        self.lstm = nn.LSTM(in_d, out_d, batch_first=True)
-        self.embedding = nn.Embedding(800, 100)
+    def __init__(
+        self,
+        out_d,
+        in_d,
+        num_classes,
+        criterion: nn.Module = None,
+        num_layers=6,
+        dropout=0.2,
+        **kwargs,
+    ):
+        super().__init__(
+            out_dim=out_d,
+            input_dim=in_d,
+            criterion=criterion,
+            num_classes=num_classes,
+            **kwargs,
+        )
+        self.lstm = nn.LSTM(
+            in_d,
+            out_d,
+            batch_first=True,
+            dropout=dropout,
+            bidirectional=True,
+            num_layers=num_layers,
+        )
+        self.embedding = nn.Embedding(1400, in_d)
         self.output = nn.Sequential(
-            nn.Linear(out_d, in_d),
+            nn.Linear(out_d * 2, out_d),
             nn.ReLU(),
             nn.Dropout(0.2),
-            nn.Linear(in_d, num_classes),
+            nn.Linear(out_d, num_classes),
         )
 
-    def forward(self, data):
-        x = data.x
-        x_lens = data.lens
+    def forward(self, data, *args, **kwargs):
+        x = data["features"]
+        x_lens = data["model_kwargs"]["lens"]
         x = self.embedding(x)
         x = pack_padded_sequence(x, x_lens, batch_first=True, enforce_sorted=False)
-        x = self.lstm(x)[1][0]
-        # = pad_packed_sequence(x, batch_first=True)[0]
+        x = self.lstm(x)[0]
+        x = pad_packed_sequence(x, batch_first=True)[0][
+            :, 0
+        ]  # reduce sequence dimension to first element
         x = self.output(x)
-        return x.squeeze(0)
+        return x
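Note: shape-wise, the rewritten forward embeds the padded token batch, packs it, runs the bidirectional stack, unpacks, and keeps only the first time step before the output head (hence the out_d * 2 input of the first linear layer). A standalone sketch with illustrative sizes:

import torch
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

in_d, out_d, num_classes, num_layers = 100, 128, 1528, 6  # illustrative sizes

embedding = nn.Embedding(1400, in_d)
lstm = nn.LSTM(
    in_d, out_d, batch_first=True, dropout=0.2, bidirectional=True, num_layers=num_layers
)
output = nn.Sequential(
    nn.Linear(out_d * 2, out_d),  # 2 * out_d: forward and backward directions
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(out_d, num_classes),
)

tokens = torch.randint(0, 1400, (4, 20))  # 4 sequences, padded to length 20
lens = torch.tensor([20, 17, 12, 5])      # true lengths before padding

x = embedding(tokens)                                  # (4, 20, in_d)
x = pack_padded_sequence(x, lens, batch_first=True, enforce_sorted=False)
x = lstm(x)[0]                                         # packed bidirectional outputs
x = pad_packed_sequence(x, batch_first=True)[0][:, 0]  # (4, out_d * 2), first time step
print(output(x).shape)                                 # torch.Size([4, 1528])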