In [10]:
!pip install transformers datasets scikit-learn torch --quiet

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel, RobertaPreTrainedModel, Trainer, TrainingArguments, RobertaConfig

In [11]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

targets = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']

tokenizer = RobertaTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")

In [12]:
class SMILESDataset(torch.utils.data.Dataset):
    def __init__(self, smiles_list, labels=None, masks=None):
        self.encodings = tokenizer(smiles_list, truncation=True, padding='max_length', max_length=128)
        self.labels = labels
        self.masks = masks

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        if self.labels is not None:
            labels = np.nan_to_num(self.labels[idx], nan=0.0)
            item['labels'] = torch.tensor(labels, dtype=torch.float32)
            if self.masks is not None:
                item['label_mask'] = torch.tensor(self.masks[idx], dtype=torch.float32)
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

In [13]:
class ChemBERTaRegressor(RobertaPreTrainedModel):
    def __init__(self, config, num_targets=5):
        super().__init__(config)
        self.roberta = RobertaModel(config)
        self.dropout = nn.Dropout(0.1)
        self.regressor = nn.Linear(config.hidden_size, num_targets)
        self.loss_fn = nn.MSELoss()

    def forward(self, input_ids, attention_mask=None, labels=None):
      outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
      pooled_output = outputs.pooler_output
      pooled_output = self.dropout(pooled_output)
      logits = self.regressor(pooled_output)

      if labels is not None:
          if logits.shape != labels.shape:
              raise ValueError(f"Shape mismatch: logits {logits.shape}, labels {labels.shape}")
          if torch.isnan(logits).any():
              print("Warning: NaN detected in predictions")
          if torch.isnan(labels).any():
              print("Warning: NaN detected in labels")

          loss = self.loss_fn(logits, labels)
          return {"loss": loss, "logits": logits}
      else:
          return {"logits": logits}

In [14]:
class CustomTrainer(Trainer):
    def training_step(self, model, inputs):
        model.train()
        inputs = self._prepare_inputs(inputs)

        loss = self.compute_loss(model, inputs)

        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        return loss.detach()

In [15]:
X = train_df['SMILES'].tolist()
y = train_df[targets].values

valid_mask = ~np.isnan(y)
y_for_scaling = np.nan_to_num(y, nan=0.0)

scaler = StandardScaler()
y_scaled = np.zeros_like(y_for_scaling)

for i, target in enumerate(targets):
    valid_indices = valid_mask[:, i]
    if valid_indices.sum() > 1:
        target_values = y[valid_indices, i:i+1]
        scaler_target = StandardScaler()
        scaler_target.fit(target_values)

        y_scaled[:, i] = scaler_target.transform(y_for_scaling[:, i:i+1]).flatten()

        if not hasattr(scaler, 'target_scalers'):
            scaler.target_scalers = {}
        scaler.target_scalers[i] = scaler_target
    else:
        print(f"Warning: Not enough valid values for target {target}")
        y_scaled[:, i] = y_for_scaling[:, i]

for i, target in enumerate(targets):
    valid_indices = valid_mask[:, i]
    if valid_indices.sum() > 0:
        orig_mean = np.nanmean(y[:, i])
        orig_std = np.nanstd(y[:, i])
        scaled_mean = np.mean(y_scaled[valid_indices, i])
        scaled_std = np.std(y_scaled[valid_indices, i])
        print(f"{target}: orig_mean={orig_mean:.3f}, orig_std={orig_std:.3f} -> scaled_mean={scaled_mean:.3f}, scaled_std={scaled_std:.3f}")

kf = KFold(n_splits=5, shuffle=True, random_state=42)
test_dataset = SMILESDataset(test_df['SMILES'].tolist())
test_loader = DataLoader(test_dataset, batch_size=8)

Tg: orig_mean=96.452, orig_std=111.119 -> scaled_mean=0.000, scaled_std=1.000
FFV: orig_mean=0.367, orig_std=0.030 -> scaled_mean=0.000, scaled_std=1.000
Tc: orig_mean=0.256, orig_std=0.089 -> scaled_mean=0.000, scaled_std=1.000
Density: orig_mean=0.985, orig_std=0.146 -> scaled_mean=0.000, scaled_std=1.000
Rg: orig_mean=16.420, orig_std=4.605 -> scaled_mean=-0.000, scaled_std=1.000


In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
fold_maes = []
test_preds_folds = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"--- Fold {fold+1} ---")

    X_train = [X[i] for i in train_idx]
    X_val = [X[i] for i in val_idx]
    y_train = y_scaled[train_idx]
    y_val = y_scaled[val_idx]
    mask_train = valid_mask[train_idx]
    mask_val = valid_mask[val_idx]

    train_dataset = SMILESDataset(X_train, y_train, mask_train)
    val_dataset = SMILESDataset(X_val, y_val, mask_val)

    config = RobertaConfig.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
    model = ChemBERTaRegressor.from_pretrained("seyonec/ChemBERTa-zinc-base-v1", config=config).to(device)

    training_args = TrainingArguments(
        output_dir=f"./results_fold_{fold}",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        eval_strategy="steps",
        eval_steps=50,
        logging_strategy="steps",
        logging_steps=50,
        save_strategy="no",
        learning_rate=1e-5,
        warmup_steps=100,
        weight_decay=0.01,
        dataloader_num_workers=0,
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    try:
        trainer.train()
    except Exception as e:
        print(f"Training failed for fold {fold+1}: {e}")
        continue

    val_preds_scaled = trainer.predict(val_dataset).predictions

    val_preds = np.zeros_like(val_preds_scaled)
    y_val_original = np.zeros_like(y_val)

    for i, target in enumerate(targets):
        if hasattr(scaler, 'target_scalers') and i in scaler.target_scalers:
            val_preds[:, i] = scaler.target_scalers[i].inverse_transform(
                val_preds_scaled[:, i:i+1]).flatten()
            y_val_original[:, i] = scaler.target_scalers[i].inverse_transform(
                y_val[:, i:i+1]).flatten()
        else:
            val_preds[:, i] = val_preds_scaled[:, i]
            y_val_original[:, i] = y_val[:, i]

    target_maes = []
    for i, target in enumerate(targets):
        valid_indices = mask_val[:, i]
        if valid_indices.sum() > 0:
            target_mae = mean_absolute_error(
                y_val_original[valid_indices, i],
                val_preds[valid_indices, i]
            )
            target_maes.append(target_mae)
            print(f"  {target} MAE: {target_mae:.4f} (n={valid_indices.sum()})")
        else:
            print(f"  {target}: No valid samples")

    if target_maes:
        val_mae = np.mean(target_maes)
        fold_maes.append(val_mae)
        print(f"Fold {fold+1} Average MAE: {val_mae:.4f}")

    test_preds_scaled = trainer.predict(test_dataset).predictions
    test_preds = np.zeros_like(test_preds_scaled)

    for i, target in enumerate(targets):
        if hasattr(scaler, 'target_scalers') and i in scaler.target_scalers:
            test_preds[:, i] = scaler.target_scalers[i].inverse_transform(
                test_preds_scaled[:, i:i+1]).flatten()
        else:
            test_preds[:, i] = test_preds_scaled[:, i]

    test_preds_folds.append(test_preds)

Using device: cuda
--- Fold 1 ---


Some weights of ChemBERTaRegressor were not initialized from the model checkpoint at seyonec/ChemBERTa-zinc-base-v1 and are newly initialized: ['regressor.bias', 'regressor.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
50,13.7004,10.627661
100,8.1321,4.385361
150,3.9723,3.781864
200,4.3923,3.64717
250,4.131,3.596702
300,3.7036,3.596913
350,4.0301,3.491379
400,3.7171,3.37757
450,4.0123,3.332675
500,3.7206,3.315939


  Tg MAE: 96.3588 (n=87)
  FFV MAE: 0.0347 (n=1419)
  Tc MAE: 0.1415 (n=145)
  Density MAE: 0.6198 (n=123)
  Rg MAE: 10.5548 (n=124)
Fold 1 Average MAE: 21.5419


--- Fold 2 ---


Some weights of ChemBERTaRegressor were not initialized from the model checkpoint at seyonec/ChemBERTa-zinc-base-v1 and are newly initialized: ['regressor.bias', 'regressor.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
50,14.1366,10.941912
100,6.9272,4.668564
150,4.0469,4.143402
200,4.1348,4.067165
250,3.921,3.98085
300,3.8768,3.936046
350,3.4452,3.932721
400,3.72,3.806639
450,3.5029,3.772038
500,3.3511,3.707113


  Tg MAE: 112.6584 (n=112)
  FFV MAE: 0.0356 (n=1393)
  Tc MAE: 0.1249 (n=144)
  Density MAE: 0.6285 (n=117)
  Rg MAE: 10.4588 (n=119)
Fold 2 Average MAE: 24.7812


--- Fold 3 ---


Some weights of ChemBERTaRegressor were not initialized from the model checkpoint at seyonec/ChemBERTa-zinc-base-v1 and are newly initialized: ['regressor.bias', 'regressor.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
50,14.0866,10.600896
100,6.982,4.404428
150,4.2971,3.96167
200,3.8396,3.810129
250,3.7609,3.771698
300,4.2179,3.691284
350,4.274,3.660346
400,3.7359,3.565757
450,4.0554,3.547144
500,3.4893,3.507576


  Tg MAE: 111.4069 (n=113)
  FFV MAE: 0.0348 (n=1409)
  Tc MAE: 0.1346 (n=138)
  Density MAE: 0.6520 (n=113)
  Rg MAE: 11.1525 (n=112)
Fold 3 Average MAE: 24.6762


--- Fold 4 ---


Some weights of ChemBERTaRegressor were not initialized from the model checkpoint at seyonec/ChemBERTa-zinc-base-v1 and are newly initialized: ['regressor.bias', 'regressor.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
50,14.5372,10.583584
100,7.1236,4.561779
150,4.4179,4.010751
200,3.8845,3.867895
250,4.0465,3.791548
300,3.1487,3.849463
350,3.8488,3.694699
400,4.0314,3.616822
450,3.8159,3.607808
500,3.6892,3.566598


  Tg MAE: 105.5811 (n=104)
  FFV MAE: 0.0348 (n=1402)
  Tc MAE: 0.1317 (n=153)
  Density MAE: 0.6840 (n=130)
  Rg MAE: 11.3570 (n=130)
Fold 4 Average MAE: 23.5577


--- Fold 5 ---


Some weights of ChemBERTaRegressor were not initialized from the model checkpoint at seyonec/ChemBERTa-zinc-base-v1 and are newly initialized: ['regressor.bias', 'regressor.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
50,14.5102,10.606109
100,7.2507,4.486927
150,3.0978,4.061602
200,4.4009,3.929353
250,4.3348,3.865764
300,4.0198,3.761167
350,3.648,3.75086
400,3.7269,3.65039
450,3.293,3.647531
500,3.3569,3.677191


  Tg MAE: 101.5340 (n=95)
  FFV MAE: 0.0351 (n=1407)
  Tc MAE: 0.1418 (n=157)
  Density MAE: 0.7080 (n=130)
  Rg MAE: 11.5285 (n=129)
Fold 5 Average MAE: 22.7895


In [17]:
final_test_preds = np.mean(test_preds_folds, axis=0)
print(f"\nAverage CV MAE: {np.mean(fold_maes):.4f}")

submission = pd.DataFrame(final_test_preds, columns=targets)
submission.insert(0, 'id', test_df['id'])
submission.to_csv("submission.csv", index=False)
print("submission.csv file has been created woohoo! abhinav, shaan, and sahil are the best.")


Average CV MAE: 23.4693
submission.csv file has been created woohoo! abhinav, shaan, and sahil are the best.
