# Polaris Leaderboard Submission Pipeline

This notebook:
1. Loads the pretrained 15M small dataset model
2. Loads a Polaris benchmark dataset
3. Performs 5-fold cross-validation with early stopping to determine the optimal number of training epochs
4. Trains the final model on the complete training set
5. Makes predictions on the test set
6. Evaluates the predictions with the benchmark and submits the results to the Polaris leaderboard


In [None]:
import polaris as po
import numpy as np
import pandas as pd
import tempfile
import logging
from pathlib import Path
from typing import Dict, List, Tuple
import os

# Deep learning imports
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
)
from sklearn.model_selection import KFold

# Process-wide environment settings, applied together:
#   PYTORCH_CUDA_ALLOC_CONF -> request expandable segments from the CUDA allocator
#   TOKENIZERS_PARALLELISM  -> turn off the tokenizers library's internal parallelism
os.environ.update(
    {
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
        "TOKENIZERS_PARALLELISM": "false",
    }
)

# Root logging at INFO level, plus the logger used throughout this notebook
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


In [None]:
# Import custom modules
from molencoder.utils.callbacks import BestEpochTracker
from molencoder.finetune.dataset_preprocessor import preprocess_polaris_benchmark
from molencoder.finetune.label_scaler import LabelScaler


## Configuration


In [None]:
# --- What to run ---------------------------------------------------------
MODEL_NAME = "fabikru/model_15M_small_ds_masking_30_predicted_hparams"
BENCHMARK_NAME = "tdcommons/caco2-wang"  # any other Polaris benchmark can be substituted here
OWNER_NAME = "fabikru"  # Polaris username; replace with your own

# --- Data handling -------------------------------------------------------
N_SPLITS = 5  # number of cross-validation folds
SMILES_COLUMN = "smiles"
MAX_LENGTH = 500  # passed to the tokenizer as max_length (sequences are truncated)

# --- Hyperparameters -----------------------------------------------------
# Held fixed for every cross-validation fold and for the final training run.
# The two early_stopping_* entries configure EarlyStoppingCallback; all other
# entries are forwarded to TrainingArguments under the same name.
TRAINING_CONFIG = dict(
    # batching
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    # optimisation
    learning_rate=8e-4,
    weight_decay=1e-5,
    warmup_steps=100,
    optim="schedule_free_adamw",
    lr_scheduler_type="constant",
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    # numerical precision
    fp16=False,
    bf16=True,
    # data loading
    dataloader_num_workers=8,
    dataloader_pin_memory=True,
    tf32=True,
    max_grad_norm=1.0,
    # early stopping (cross-validation only)
    early_stopping_patience=3,
    early_stopping_threshold=0.001,
)

print(f"Model: {MODEL_NAME}")
print(f"Benchmark: {BENCHMARK_NAME}")
print(f"Cross-validation folds: {N_SPLITS}")


## Load and Preprocess Dataset


In [None]:
# Load the benchmark named in the configuration cell
logger.info(f"Loading benchmark: {BENCHMARK_NAME}")
benchmark = po.load_benchmark(BENCHMARK_NAME)

# The benchmark object provides its own train/test partition
train_data, test_data = benchmark.get_train_test_split()

# View both partitions as pandas DataFrames (turned into Hugging Face datasets in the next cell)
train_df, test_df = (subset.as_dataframe() for subset in (train_data, test_data))

# Quick sanity overview of what was loaded
print(f"Train set size: {len(train_df)}")
print(f"Test set size: {len(test_df)}")
print(f"Train columns: {train_df.columns.tolist()}")
print("Train data sample:")
print(train_df.head())


In [None]:
# Wrap each pandas frame in a Hugging Face Dataset (train first, then test)
train_dataset, test_dataset = map(Dataset.from_pandas, (train_df, test_df))

# Show which columns each split exposes before any renaming/pruning
print(f"Train dataset columns: {train_dataset.column_names}")
print(f"Test dataset columns: {test_dataset.column_names}")

# Determine column names from the train dataset (test might not have labels)
def determine_columns(dataset):
    """Guess which columns hold the SMILES strings and the labels.

    Only the first row of ``dataset`` is inspected.

    SMILES column, in order of preference:
      1. a string column whose name contains "smiles" (case-insensitive),
      2. a string column whose name contains "mol",
      3. the first string column.
    A column that merely contains "mol" (e.g. an ID column) therefore never
    overrides a genuine SMILES column. When several columns tie within rule 1
    or 2, the last one is used.

    Label column: the last column whose first-row value is an ``int`` or
    ``float``, ignoring the pandas index column ``__index_level_0__``.

    Args:
        dataset: Sized, indexable collection whose rows are mappings from
            column name to value (for example a Hugging Face ``Dataset``).

    Returns:
        Tuple ``(smiles_col, label_col)``. Either element is ``None`` when no
        suitable column exists.

    Raises:
        ValueError: If ``dataset`` has no rows.
    """
    if len(dataset) == 0:
        raise ValueError("Cannot determine SMILES/label columns from an empty dataset")

    row = dataset[0]

    string_cols = [col for col, value in row.items() if isinstance(value, str)]
    numeric_cols = [
        col
        for col, value in row.items()
        if isinstance(value, (int, float)) and col != '__index_level_0__'
    ]

    # Name-based candidates, strongest hint first
    smiles_named = [col for col in string_cols if 'smiles' in col.lower()]
    mol_named = [col for col in string_cols if 'mol' in col.lower()]

    if smiles_named:
        smiles_col = smiles_named[-1]
    elif mol_named:
        smiles_col = mol_named[-1]
    elif string_cols:
        # Fallback: no name hint at all, assume the first string column is SMILES
        smiles_col = string_cols[0]
    else:
        smiles_col = None

    label_col = numeric_cols[-1] if numeric_cols else None

    return smiles_col, label_col

smiles_col, label_col = determine_columns(train_dataset)
print(f"SMILES column: {smiles_col}")
print(f"Label column: {label_col}")

# Fail fast: without a SMILES column and a label column the tokenization and
# training cells below cannot work, so stop here with a clear message.
if smiles_col is None or label_col is None:
    raise ValueError(
        f"Could not identify the required columns (SMILES: {smiles_col!r}, label: {label_col!r}) "
        f"among train columns {train_dataset.column_names}"
    )
if smiles_col not in test_dataset.column_names:
    raise ValueError(
        f"SMILES column {smiles_col!r} is missing from the test dataset "
        f"(columns: {test_dataset.column_names})"
    )

# Rename the SMILES column to "smiles" in both datasets
if smiles_col != "smiles":
    train_dataset = train_dataset.rename_column(smiles_col, "smiles")
    test_dataset = test_dataset.rename_column(smiles_col, "smiles")

# Rename the label column to "labels" (train only - the test split might not have labels)
if label_col != "labels":
    train_dataset = train_dataset.rename_column(label_col, "labels")

# Keep only the SMILES and label columns in the train dataset; drop the rest in a single call
wanted_columns_train = ["smiles", "labels"]
extra_train_columns = [col for col in train_dataset.column_names if col not in wanted_columns_train]
if extra_train_columns:
    train_dataset = train_dataset.remove_columns(extra_train_columns)

# Keep only the SMILES column in the test dataset
wanted_columns_test = ["smiles"]
extra_test_columns = [col for col in test_dataset.column_names if col not in wanted_columns_test]
if extra_test_columns:
    test_dataset = test_dataset.remove_columns(extra_test_columns)

print(f"Final train dataset columns: {train_dataset.column_names}")
print(f"Final test dataset columns: {test_dataset.column_names}")
print(f"Sample train data: {train_dataset[0]}")
print(f"Sample test data: {test_dataset[0]}")


## Setup Label Scaling and Tokenization


In [None]:
# Build the label scaler from the training dataset and report its statistics
label_scaler = LabelScaler(train_dataset)
print(f"Label scaling - Median: {label_scaler.median:.4f}, IQR: {label_scaler.iqr:.4f}")

# Only the train split carries labels, so only it is passed through the scaler
train_dataset_scaled = label_scaler.scale_labels(train_dataset)

# Tokenizer that ships with the pretrained checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize_function(examples):
    """Tokenize the "smiles" entries of a batch, truncating to MAX_LENGTH tokens."""
    smiles_batch = examples["smiles"]
    return tokenizer(smiles_batch, truncation=True, max_length=MAX_LENGTH)


# Tokenize both splits batch-wise; the test split is tokenized unscaled
train_dataset_tokenized = train_dataset_scaled.map(tokenize_function, batched=True)
test_dataset_tokenized = test_dataset.map(tokenize_function, batched=True)

print("Tokenization completed.")
print(f"Sample tokenized train data: {train_dataset_tokenized[0]}")
print(f"Sample tokenized test data: {test_dataset_tokenized[0]}")


## Cross-Validation to Determine Optimal Epochs


In [None]:
def train_fold(train_fold_dataset, val_fold_dataset, fold_num):
    """Fine-tune a fresh copy of the pretrained model on one cross-validation fold.

    Training runs for up to 100 epochs with evaluation and checkpointing after
    every epoch; EarlyStoppingCallback ends the run once eval_loss stops
    improving, and a BestEpochTracker callback records the best epoch.

    Args:
        train_fold_dataset: Tokenized training portion of the fold.
        val_fold_dataset: Tokenized validation portion of the fold.
        fold_num: Zero-based fold index (only used for log messages).

    Returns:
        Tuple ``(best_epoch, best_loss)`` as reported by the BestEpochTracker.
    """
    logger.info(f"Training fold {fold_num + 1}/{N_SPLITS}")

    # Every fold starts from the same pretrained weights with a single-output head
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=1
    )

    # Dynamic padding per batch, rounded up to a multiple of 8
    data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

    # Callbacks: one records the best epoch, the other stops training early
    epoch_tracker = BestEpochTracker()
    stopper = EarlyStoppingCallback(
        early_stopping_patience=TRAINING_CONFIG["early_stopping_patience"],
        early_stopping_threshold=TRAINING_CONFIG["early_stopping_threshold"],
    )

    # All TRAINING_CONFIG entries except the early_stopping_* pair are
    # TrainingArguments parameters and are forwarded under the same name.
    shared_args = {
        key: value
        for key, value in TRAINING_CONFIG.items()
        if not key.startswith("early_stopping_")
    }

    # Checkpoints live in a throwaway directory that is removed when the fold is done
    with tempfile.TemporaryDirectory() as temp_dir:
        training_args = TrainingArguments(
            output_dir=temp_dir,
            logging_dir=temp_dir,
            num_train_epochs=100,  # upper bound only; early stopping decides the real length
            eval_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            eval_on_start=False,
            save_total_limit=2,
            torch_compile=False,  # not worth compiling for short CV runs
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            **shared_args,
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_fold_dataset,
            eval_dataset=val_fold_dataset,
            data_collator=data_collator,
            callbacks=[stopper, epoch_tracker],
        )
        trainer.train()

        # Read the outcome back from the tracking callback
        best_epoch, best_loss = epoch_tracker.best_epoch, epoch_tracker.best_eval_loss

        logger.info(f"Fold {fold_num + 1} - Best epoch: {best_epoch}, Best loss: {best_loss:.4f}")

        return best_epoch, best_loss


In [None]:
# Perform cross-validation: each fold reports the epoch at which its validation
# loss was best; the rounded mean of those epochs becomes the epoch budget for
# the final training run on the complete training set.
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
best_epochs = []
cv_losses = []

# KFold only needs positions, so split an index array rather than the dataset itself
indices = np.arange(len(train_dataset_tokenized))

for fold_num, (train_idx, val_idx) in enumerate(kf.split(indices)):
    # Create fold datasets
    train_fold_dataset = train_dataset_tokenized.select(train_idx.tolist())
    val_fold_dataset = train_dataset_tokenized.select(val_idx.tolist())

    logger.info(f"Fold {fold_num + 1}: Train size: {len(train_fold_dataset)}, Val size: {len(val_fold_dataset)}")

    # Train this fold
    best_epoch, best_loss = train_fold(train_fold_dataset, val_fold_dataset, fold_num)

    best_epochs.append(best_epoch)
    cv_losses.append(best_loss)

# Calculate average optimal epochs
avg_best_epochs = np.mean(best_epochs)
std_best_epochs = np.std(best_epochs)
avg_cv_loss = np.mean(cv_losses)

print("\n=== Cross-Validation Results ===")
print(f"Best epochs per fold: {best_epochs}")
print(f"CV losses per fold: {[f'{loss:.4f}' for loss in cv_losses]}")
print(f"Average best epochs: {avg_best_epochs:.1f} ± {std_best_epochs:.1f}")
print(f"Average CV loss: {avg_cv_loss:.4f}")

# Use the rounded average as the target epochs for final training.
# Clamp to at least 1: a value of 0 would make the final Trainer run no
# epochs at all and the untrained model would be used for the submission.
final_epochs = max(1, int(np.round(avg_best_epochs)))
print(f"\nUsing {final_epochs} epochs for final training")


## Train Final Model on Complete Training Set


In [None]:
logger.info("Training final model on complete training dataset")

# Load fresh model for final training (same checkpoint and single-output head as in the CV folds)
final_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=1
)

# Setup data collator: dynamic padding per batch, rounded up to a multiple of 8
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

# All TRAINING_CONFIG entries except the early_stopping_* pair are
# TrainingArguments parameters; forward them under the same name so the final
# run uses exactly the hyperparameters listed in the configuration cell.
final_shared_args = {
    key: value
    for key, value in TRAINING_CONFIG.items()
    if not key.startswith("early_stopping_")
}

with tempfile.TemporaryDirectory() as temp_dir:
    final_training_args = TrainingArguments(
        output_dir=temp_dir,
        logging_dir=temp_dir,
        num_train_epochs=final_epochs,  # epoch budget chosen by cross-validation
        eval_strategy="no",  # No evaluation during final training
        save_strategy="no",  # No saving during final training
        torch_compile=False,  # Disable compilation for small dataset
        eval_on_start=False,
        logging_steps=50,
        **final_shared_args,
    )

    final_trainer = Trainer(
        model=final_model,
        args=final_training_args,
        train_dataset=train_dataset_tokenized,
        data_collator=data_collator,
    )

    # Train the final model
    final_trainer.train()

    logger.info("Final training completed")

    # Make predictions on test set
    logger.info("Making predictions on test set")
    test_results = final_trainer.predict(test_dataset_tokenized)

    # Flatten the model outputs to one value per test molecule
    scaled_predictions = np.array(test_results.predictions).flatten()
    # NOTE(review): scale_predictions presumably maps outputs from the scaled
    # label space back to the original label units - confirm in LabelScaler.
    rescaled_predictions = label_scaler.scale_predictions(scaled_predictions.tolist())

    print(f"Generated {len(rescaled_predictions)} predictions")
    print(f"Sample predictions: {rescaled_predictions[:5]}")
    print(f"Prediction stats - Mean: {np.mean(rescaled_predictions):.4f}, Std: {np.std(rescaled_predictions):.4f}")


## Evaluate and Submit to Polaris

In [None]:
# Evaluate predictions using the benchmark
# The rescaled test-set predictions are handed to the benchmark object loaded
# earlier; the returned results object is named and uploaded in the cells below.
logger.info("Evaluating predictions")
results = benchmark.evaluate(rescaled_predictions)

# Show the evaluation results in the cell output
print(results)

In [None]:
!polaris login

In [None]:
# Name attached to the results object before it is uploaded
results.name = "MolEncoder"

In [None]:
# Upload under the account set in OWNER_NAME, so the configuration cell is the
# single place to change the Polaris username.
results.upload_to_hub(owner=OWNER_NAME)