
The pipeline implemented in the notebook typically follows these steps:

1. **Data loading**  
   Load `train.csv` and `test.csv` into pandas DataFrames. SMILES strings are read from the `SMILES` column for each polymer.

2. **Molecule parsing and descriptor computation**  
   - Use `safe_mol_from_smiles` to safely parse SMILES into RDKit molecule objects.  
   - Compute molecular descriptors with `compute_rdkit_descriptors`.  
   - Generate Morgan fingerprints with `mol_to_morgan_fp_array`.  
   - Combine descriptors and fingerprints into a numeric feature matrix using `build_feature_matrix`.

3. **Data cleaning & preprocessing**  
   - Handle infinite values and drop columns with too many missing values using `data_cleaning`.  
   - Fill remaining missing values with `KNNImputer` and clip extreme values.  
   - Scale features before modeling using `StandardScaler` when required.

4. **Feature selection**  
   - Remove low-variance features and highly correlated columns.  
   - Perform model-based feature selection using `feature_selection_pipeline`, which trains a LightGBM on a target (Tg) and selects top features by importance.

5. **Modeling per target**  
   - Train a LightGBM regressor for each property using `train_lgb_per_target`.  
   - Use K-Fold cross-validation to generate out-of-fold predictions.  
   - Scale test data with the same scaler, and average predictions across folds.

6. **Ensembling & postprocessing**  
   - Combine predictions from multiple models or folds to reduce variance and improve robustness.

7. **Submission generation**  
   - Create `submission.csv` containing predicted values for all target properties, ready for Kaggle submission.


In [1]:
import numpy as np
import pandas as pd
import warnings
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')
warnings.filterwarnings("ignore", category=UserWarning)
import os
from sklearn.model_selection import KFold
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
import lightgbm as lgb

from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem
from rdkit.ML.Descriptors import MoleculeDescriptors

In [2]:
INPUT_DIR = "./neurips-open-polymer-prediction-2025"
train = pd.read_csv(os.path.join(INPUT_DIR, "train.csv"))
test  = pd.read_csv(os.path.join(INPUT_DIR, "test.csv"))
train.head()

Unnamed: 0,id,SMILES,Tg,FFV,Tc,Density,Rg
0,87817,*CC(*)c1ccccc1C(=O)OCCCCCC,,0.374645,0.205667,,
1,106919,*Nc1ccc([C@H](CCC)c2ccc(C3(c4ccc([C@@H](CCC)c5...,,0.37041,,,
2,388772,*Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(Oc6ccc(...,,0.37886,,,
3,519416,*Nc1ccc(-c2c(-c3ccc(C)cc3)c(-c3ccc(C)cc3)c(N*)...,,0.387324,,,
4,539187,*Oc1ccc(OC(=O)c2cc(OCCCCCCCCCOCC3CCCN3c3ccc([N...,,0.35547,,,


In [None]:
targets = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']

# Na counts
na_counts = train[targets].isna().sum()
print(na_counts, train.shape)


Tg         7462
FFV         943
Tc         7236
Density    7360
Rg         7359
dtype: int64 (7973, 7)


In [None]:
def safe_mol_from_smiles(smi):
    """Safely convert a SMILES string into an RDKit Mol object.
    Returns None if parsing fails."""
    try:
        m = Chem.MolFromSmiles(smi)
        return m
    except Exception:
        return None


def compute_rdkit_descriptors(df_smiles, desc_names=None):
    """Compute RDKit molecular descriptors for a Series of SMILES.
    Returns a DataFrame with one row per molecule."""
    if desc_names is None:
        # Use all standard RDKit descriptors
        desc_names = [d[0] for d in Descriptors._descList]

    calc = MoleculeDescriptors.MolecularDescriptorCalculator(desc_names)

    # Convert SMILES to Mol objects
    mols = [safe_mol_from_smiles(s) for s in df_smiles]

    rows = []
    for m in mols:
        if m is None:
            # If conversion failed, fill with NaN values
            rows.append([np.nan] * len(desc_names))
        else:
            try:
                vals = calc.CalcDescriptors(m)
                # Convert descriptor values to floats, fallback to NaN
                vals = [float(v) if v is not None else np.nan for v in vals]
                rows.append(vals)
            except Exception:
                rows.append([np.nan] * len(desc_names))

    df_desc = pd.DataFrame(rows, columns=desc_names)
    df_desc.index = df_smiles.index
    return df_desc


def mol_to_morgan_fp_array(smiles, radius=2, nBits=1024):
    """Compute a Morgan fingerprint (as a 0/1 numpy array) from a SMILES string."""
    try:
        m = Chem.MolFromSmiles(smiles)
        if m is None:
            return np.zeros(nBits, dtype=np.uint8)

        bv = AllChem.GetMorganFingerprintAsBitVect(m, radius, nBits)

        # Convert RDKit ExplicitBitVect to numpy array
        arr = np.zeros((nBits,), dtype=np.uint8)
        for i, bit in enumerate(bv):
            arr[i] = int(bit)

        return arr

    except Exception:
        return np.zeros(nBits, dtype=np.uint8)


In [None]:
def build_feature_matrix(df, df_desc, fp_bits=1024):
    """
    Build feature matrix by combining RDKit descriptors and Morgan fingerprints.
    Returns the full DataFrame and the list of feature names.
    """

    # Descriptor features
    X_desc = df_desc.copy()

    # Morgan fingerprints as a stacked array
    fps = np.vstack(df['SMILES'].apply(lambda s: mol_to_morgan_fp_array(s, nBits=fp_bits)).values)

    # Convert fingerprints to DataFrame
    df_fp = pd.DataFrame(fps, columns=[f'FP_{i}' for i in range(fp_bits)], index=df.index)

    # Merge descriptors + fingerprints
    X_full = pd.concat([X_desc, df_fp], axis=1)

    # Ensure numeric output
    X_full = X_full.apply(pd.to_numeric, errors='coerce')

    return X_full, X_full.columns.tolist()


In [None]:
def data_cleaning(X, max_na_frac=0.3, clip_value=1e8, n_neighbors=5):
    """
    Clean feature matrix by removing high-NA columns, imputing missing values,
    and clipping extreme outliers.
    """
        
    X = X.replace([np.inf, -np.inf], np.nan)

    # Drop columns with too many NaNs
    na_frac = X.isnull().mean()
    keep_cols = na_frac[na_frac < max_na_frac].index.tolist()
    X = X[keep_cols]

    # KNN imputer for remaining NaNs
    imputer = KNNImputer(n_neighbors=n_neighbors)
    X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)

    # Clip extreme values
    X_imputed = X_imputed.clip(-clip_value, clip_value)

    return X_imputed


def weighted_mae(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    targets = y_true.columns
    K = len(targets)

    r = {}
    n = {}
    for t in targets:
        y_valid = y_true[t].dropna()
        r[t] = float(y_valid.max() - y_valid.min()) if len(y_valid) > 0 else 1.0
        n[t] = max(1, len(y_valid))

    sqrt_inv_n = np.array([np.sqrt(1 / n[t]) for t in targets])
    weight_norm = K * sqrt_inv_n / np.sum(sqrt_inv_n)

    w = {}
    for i, t in enumerate(targets):
        w[t] = (1.0 / r[t]) * weight_norm[i]

    abs_err = 0.0
    count = 0
    for t in targets:
        valid = ~y_true[t].isna()
        abs_err += np.sum(w[t] * np.abs(y_pred.loc[valid, t] - y_true.loc[valid, t]))
        count += valid.sum()

    wmae = abs_err / max(1, count)
    return float(wmae)


In [None]:
def feature_selection_pipeline(X, y_for_fs, corr_thresh=0.95, top_k=300):
    """
    Perform feature selection using:
    1) Variance filter
    2) Correlation filter
    3) LightGBM feature importance (keep top_k)
    """

    print("Starting feature selection...")
    print("Initial features:", X.shape[1])

    # 1) Remove zero-variance features
    vt = VarianceThreshold(threshold=0.0)
    X_vt = vt.fit_transform(X)
    kept_vt = X.columns[vt.get_support()]
    X = pd.DataFrame(X_vt, columns=kept_vt)
    print("After variance filter:", X.shape[1])

    # 2) Remove highly correlated features
    corr = X.corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    to_drop = [col for col in upper.columns if any(upper[col] > corr_thresh)]
    X = X.drop(columns=to_drop)
    print(f"After correlation filter (th={corr_thresh}): {X.shape[1]}")

    # 3) LightGBM-based feature importance
    dtrain = lgb.Dataset(X, label=y_for_fs)
    params = {
        "objective": "regression",
        "metric": "mae",
        "learning_rate": 0.05,
        "num_leaves": 31,
        "max_depth": -1,
        "verbose": -1,
        "seed": 42
    }

    model = lgb.train(params, dtrain, num_boost_round=150)

    # Rank features by importance
    importance = model.feature_importance()
    importance_df = pd.DataFrame({
        "feature": X.columns,
        "importance": importance
    }).sort_values("importance", ascending=False)

    # Keep top-k features
    selected = importance_df["feature"].iloc[:top_k].tolist()
    X = X[selected]

    print(f"After LGBM importance selection (top {top_k}): {X.shape[1]}")

    return X, selected



In [None]:
def train_lgb_per_target(X, y, X_test=None, n_splits=5, params=None, num_boost_round=2000, es_rounds=100):
    """
    Train LightGBM using K-fold CV and return OOF predictions, test predictions, and models.
    Uses callbacks for early stopping to support different LightGBM builds.
    """

    # Default params
    if params is None:
        params = {
            'objective': 'regression',
            'metric': 'mae',
            'learning_rate': 0.05,
            'num_leaves': 64,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'bagging_freq': 5,
            'seed': 42,
            'verbosity': -1
        }

    # Convert to numpy
    X_np = X.values if isinstance(X, pd.DataFrame) else np.asarray(X)
    X_test_np = X_test.values if isinstance(X_test, pd.DataFrame) else (np.asarray(X_test) if X_test is not None else None)

    # Init result arrays
    oof = np.zeros(X_np.shape[0])
    preds_test = np.zeros(X_test_np.shape[0]) if X_test_np is not None else None

    # K-fold CV
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    models = []

    for fold, (tr_idx, val_idx) in enumerate(kf.split(X_np)):
        X_tr, X_val = X_np[tr_idx], X_np[val_idx]
        y_tr, y_val = np.asarray(y)[tr_idx], np.asarray(y)[val_idx]

        dtrain = lgb.Dataset(X_tr, label=y_tr)
        dval = lgb.Dataset(X_val, label=y_val)

        # Early stopping + silent logging
        callbacks = [
            lgb.early_stopping(es_rounds, verbose=False),
            lgb.log_evaluation(-1)
        ]

        model = lgb.train(params, dtrain, valid_sets=[dval],
                          num_boost_round=num_boost_round,
                          callbacks=callbacks)

        # Store predictions
        oof[val_idx] = model.predict(X_val, num_iteration=model.best_iteration)
        if X_test_np is not None:
            preds_test += model.predict(X_test_np, num_iteration=model.best_iteration) / n_splits

        models.append(model)
        print(f"  Fold {fold} done. best_iter={model.best_iteration}")

    return oof, preds_test, models




In [None]:
# Reset indices for consistency
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

# Compute RDKit descriptors
print("Computing RDKit descriptors for train...")
train_desc = compute_rdkit_descriptors(train["SMILES"])
print("Computing RDKit descriptors for test...")
test_desc = compute_rdkit_descriptors(test["SMILES"])

# Build feature matrices (descriptors + fingerprints)
print("Building feature matrices...")
FP_BITS = 1024
X_train_full, feat_names = build_feature_matrix(train, train_desc, fp_bits=FP_BITS)
X_test_full, _ = build_feature_matrix(test, test_desc, fp_bits=FP_BITS)

# Clean and align features
X_train_full = data_cleaning(X_train_full)
X_test_full = X_test_full.reindex(columns=X_train_full.columns, fill_value=np.nan)
X_test_full = X_test_full.fillna(X_test_full.median())

# Global feature selection using Tg (if available)
if "Tg" in train.columns:
    df_fs = train.dropna(subset=["Tg"]).reset_index(drop=True)
    X_fs = X_train_full.loc[df_fs.index]
    y_fs = df_fs["Tg"].values

    print("\nRunning global feature selection using Tg...")
    X_fs_new, selected_features = feature_selection_pipeline(X_fs, y_fs,
                                                             corr_thresh=0.93, top_k=300)

    # Apply selected features
    X_train_full = X_train_full[selected_features]
    X_test_full = X_test_full[selected_features]
else:
    print("Tg not found — skipping feature selection.")

# Prediction output container
predictions = pd.DataFrame({"id": test["id"]})
models_in_memory = {}
results = []

# Train a separate model per target
for target in targets:
    print("\n========================================")
    print(f"Training target: {target}")

    if target not in train.columns:
        continue

    df_t = train.dropna(subset=[target]).reset_index(drop=True)
    if len(df_t) < 30:  # skip if too few samples
        print("Too few samples — skipped.")
        continue

    # Extract aligned training data
    idx = df_t.index
    X_t = X_train_full.loc[idx].reset_index(drop=True)
    y_t = df_t[target].values

    # Standard scaling
    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X_t), columns=X_t.columns)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test_full), columns=X_test_full.columns)

    # LightGBM training
    oof, test_preds, model_list = train_lgb_per_target(X_scaled, y_t, X_test_scaled)

    # Store predictions and model objects
    predictions[target] = test_preds
    models_in_memory[target] = {
        "models": model_list,
        "scaler": scaler,
        "features": X_scaled.columns.tolist(),
        "oof": oof,
    }

    # Compute OOF MAE
    mae = np.mean(np.abs(oof - y_t))
    results.append(mae)
    print(f"OOF MAE for {target}: {mae:.6f}")

# Fill targets missing from predictions
for t in targets:
    if t not in predictions.columns:
        predictions[t] = train[t].median()

# Save submission file
'''
OUT_PATH = "/kaggle/working/submission.csv"
predictions.to_csv(OUT_PATH, index=False)
print("\nSaved submission to:", OUT_PATH)
'''

'''

Computing RDKit descriptors for train...
Computing RDKit descriptors for test...
Building feature matrices...

Running global feature selection using Tg...
Starting feature selection...
Initial features: 1229
After variance filter: 1148
After correlation filter (th=0.93): 1079
After LGBM importance selection (top 300): 300

Training target: Tg
  Fold 0 done. best_iter=6
  Fold 1 done. best_iter=1
  Fold 2 done. best_iter=1
  Fold 3 done. best_iter=5
  Fold 4 done. best_iter=11
OOF MAE for Tg: 88.069889

Training target: FFV
  Fold 0 done. best_iter=1
  Fold 1 done. best_iter=1
  Fold 2 done. best_iter=2
  Fold 3 done. best_iter=2
  Fold 4 done. best_iter=2
OOF MAE for FFV: 0.020938

Training target: Tc
  Fold 0 done. best_iter=3
  Fold 1 done. best_iter=8
  Fold 2 done. best_iter=2
  Fold 3 done. best_iter=9
  Fold 4 done. best_iter=3
OOF MAE for Tc: 0.076767

Training target: Density
  Fold 0 done. best_iter=27
  Fold 1 done. best_iter=74
  Fold 2 done. best_iter=1
  Fold 3 done. best

'\nOUT_PATH = "/kaggle/working/submission.csv"\npredictions.to_csv(OUT_PATH, index=False)\nprint("\nSaved submission to:", OUT_PATH)\n'