In [1]:
# === STEP 1: Install RDKit (Kaggle offline) ===
import os
import subprocess
import sys

# Wheel shipped as a Kaggle dataset; --no-index keeps pip from touching the network.
RDKIT_WHEEL = '/kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl'

# Run pip through the interpreter executing this script so the package is
# installed into the environment that will import it, and pass an argument
# list (no shell) so the wheel path is never re-parsed by a shell.
_install = subprocess.run(
    [sys.executable, '-m', 'pip', 'install', '--no-index', RDKIT_WHEEL],
    stdout=subprocess.DEVNULL,
    stderr=subprocess.DEVNULL,
    check=False,
)
if _install.returncode != 0:
    # Best effort: RDKit may already be available, so report and carry on
    # instead of aborting; a genuinely missing package fails at import time.
    print(
        f"WARNING: RDKit wheel install exited with code {_install.returncode}",
        file=sys.stderr,
    )

# === STEP 2: Imports & Configuration ===
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from rdkit import Chem, RDLogger
from rdkit.Chem import AllChem, Descriptors
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor, early_stopping
from sklearn.metrics import mean_absolute_error
from tqdm import tqdm

RDLogger.DisableLog('rdApp.*')

# All competition inputs are read from the same Kaggle dataset directory.
_COMPETITION_DIR = "/kaggle/input/neurips-open-polymer-prediction-2025"
TRAIN_PATH = _COMPETITION_DIR + "/train.csv"
TEST_PATH = _COMPETITION_DIR + "/test.csv"
SUB_PATH = _COMPETITION_DIR + "/sample_submission.csv"

# Target columns: read from the training CSV and written, in this order,
# to the submission file.
TARGETS = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']

# Seed passed to the train/validation split and to both model types.
RANDOM_SEED = 42

# === STEP 3: Load Data ===
# Read the three CSVs in a fixed order: train, test, sample submission.
train_df, test_df, sub_df = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH, SUB_PATH)
)

# === STEP 4: Featurization Helpers ===
def featurize_morgan(smiles_series, radius=2, n_bits=2048):
    """Encode SMILES strings as Morgan fingerprint bit vectors.

    Args:
        smiles_series: Iterable of SMILES strings.
        radius: Morgan fingerprint radius handed to RDKit.
        n_bits: Length of each bit vector, i.e. the number of output columns.

    Returns:
        A DataFrame with one row per input and columns ``FP_0`` through
        ``FP_{n_bits - 1}``. An input that RDKit cannot parse, or that
        raises while being parsed or fingerprinted, yields an all-zero row.
    """
    fps = []
    for smi in smiles_series:
        bits = None
        try:
            mol = Chem.MolFromSmiles(smi)
            if mol is not None:
                bits = list(AllChem.GetMorganFingerprintAsBitVect(mol, radius, n_bits))
        except Exception:
            # Best effort per molecule. Catching Exception (not a bare
            # except) lets KeyboardInterrupt / SystemExit still stop the run.
            bits = None
        # Single fallback path for both "unparseable" and "raised".
        fps.append(bits if bits is not None else [0] * n_bits)
    return pd.DataFrame(fps, columns=[f'FP_{i}' for i in range(n_bits)])

# Split RDKit's (name, function) descriptor pairs into two parallel lists;
# position i in one list corresponds to position i in the other.
desc_names = [name for name, _ in Descriptors._descList]
desc_funcs = [func for _, func in Descriptors._descList]

def _safe_descriptor(fn, mol):
    """Return ``fn(mol)``, or NaN if that single descriptor raises."""
    try:
        return fn(mol)
    except Exception:
        return np.nan


def featurize_descriptors(smiles_series):
    """Compute the module-level descriptor functions for each SMILES string.

    Args:
        smiles_series: Iterable of SMILES strings.

    Returns:
        A DataFrame with one row per input and one column per entry of
        ``desc_names``. An input that cannot be parsed (or whose parsing
        raises) yields an all-NaN row. A descriptor that raises for an
        otherwise valid molecule yields NaN in that cell only, so one
        failing descriptor no longer discards the rest of the row.
    """
    n_desc = len(desc_funcs)
    feats = []
    for smi in smiles_series:
        try:
            mol = Chem.MolFromSmiles(smi)
        except Exception:
            # e.g. a non-string value; treated the same as an unparseable
            # SMILES. Exception (not bare except) keeps Ctrl-C working.
            mol = None
        if mol is None:
            feats.append([np.nan] * n_desc)
            continue
        feats.append([_safe_descriptor(fn, mol) for fn in desc_funcs])
    return pd.DataFrame(feats, columns=desc_names)

# === STEP 5: Generate Features ===
# Each feature matrix is the fingerprint columns followed by the descriptor
# columns, built the same way for train and test.
print("🔬 Featurizing training data...")
train_smiles = train_df['SMILES']
X_train_fp, X_train_desc = featurize_morgan(train_smiles), featurize_descriptors(train_smiles)
X_train = pd.concat([X_train_fp, X_train_desc], axis=1)

print("🔬 Featurizing test data...")
test_smiles = test_df['SMILES']
X_test_fp, X_test_desc = featurize_morgan(test_smiles), featurize_descriptors(test_smiles)
X_test = pd.concat([X_test_fp, X_test_desc], axis=1)

# Copies, so later steps do not alias the loaded frames.
y_train, test_ids = train_df[TARGETS].copy(), test_df['id'].copy()

# === STEP 6: Clean infinities & drop all-NaN columns ===
# Turn +/-inf into NaN in both matrices so they are treated as missing.
X_train = X_train.replace([np.inf, -np.inf], np.nan)
X_test = X_test.replace([np.inf, -np.inf], np.nan)

# Columns with no observed training value are removed from both matrices;
# the test-side drop tolerates a column that is not present there.
all_nan = X_train.columns[X_train.isna().all()]
if len(all_nan) > 0:
    X_train = X_train.drop(columns=all_nan)
    X_test = X_test.drop(columns=all_nan, errors='ignore')

# === STEP 7: Impute missing features ===
# Column means are fitted on the training matrix only and reused for test.
feature_cols = X_train.columns
imp = SimpleImputer(strategy='mean')
imp.fit(X_train)
X_train = pd.DataFrame(imp.transform(X_train), columns=feature_cols)
X_test = pd.DataFrame(imp.transform(X_test), columns=feature_cols)

# Final clean function before modeling
def clean_df(df):
    """Return a float32 copy of ``df`` with no inf, no NaN and bounded values.

    Infinities are turned into NaN, every NaN is filled with 0, values are
    clipped to [-1e10, 1e10], and the result is cast to ``np.float32``.
    The input frame is not modified.
    """
    limit = 1e10
    without_inf = df.replace([np.inf, -np.inf], np.nan)
    filled = without_inf.fillna(0)
    bounded = filled.clip(lower=-limit, upper=limit)
    return bounded.astype(np.float32)

# Sanitize both feature matrices before any modeling.
X_train, X_test = clean_df(X_train), clean_df(X_test)

# === STEP 8: Prepare train/validation split ===
# Keep only rows that have at least one observed target, renumbered from 0.
mask = y_train.notna().any(axis=1)
X_all, y_all = (
    frame.loc[mask].reset_index(drop=True) for frame in (X_train, y_train)
)

# One shared 90/10 split; per-target missing labels are filtered later.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_all,
    y_all,
    test_size=0.1,
    random_state=RANDOM_SEED,
)

# === STEP 9: Train models per target with progress bar ===
val_scores, models = {}, {}

# Settings shared by every per-target LightGBM model.
lgb_params = dict(
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=64,
    random_state=RANDOM_SEED,
    n_jobs=-1,
    verbose=-1,
)

print("🚀 Training models:")
for target in tqdm(TARGETS):
    # Rows of each fold that actually carry a label for this target.
    idx_tr = y_tr[target].notna()
    idx_val = y_val[target].notna()
    if not (idx_tr.any() and idx_val.any()):
        # Nothing to fit on, or nothing to score against: record and skip.
        val_scores[target] = None
        continue

    X_tr_t, y_tr_t = clean_df(X_tr.loc[idx_tr]), y_tr.loc[idx_tr, target].values
    X_val_t, y_val_t = clean_df(X_val.loc[idx_val]), y_val.loc[idx_val, target].values

    # LightGBM, stopped early on the validation fold's MAE.
    # NOTE: the same fold is scored below, so the reported MAE comes from
    # data that also chose the stopping round.
    lgb = LGBMRegressor(**lgb_params)
    stopper = early_stopping(stopping_rounds=50, verbose=False)
    lgb.fit(
        X_tr_t,
        y_tr_t,
        eval_set=[(X_val_t, y_val_t)],
        eval_metric='mae',
        callbacks=[stopper],
    )

    # Random Forest on the same training rows.
    rf = RandomForestRegressor(n_estimators=200, random_state=RANDOM_SEED, n_jobs=-1)
    rf.fit(X_tr_t, y_tr_t)

    # Equal-weight blend of the two models, scored on the validation fold.
    pred_val = 0.5 * lgb.predict(X_val_t) + 0.5 * rf.predict(X_val_t)
    val_scores[target] = mean_absolute_error(y_val_t, pred_val)
    models[target] = {'lgb': lgb, 'rf': rf}

print("\n✅ Validation MAE per target:")
for t, s in val_scores.items():
    if s is None:
        print(f"  • {t}: no data")
    else:
        print(f"  • {t}: {s:.4f}")

# === STEP 10: Retrain on full data & predict test set ===
print("\n🚀 Retraining on full data and predicting test set:")
test_preds = pd.DataFrame(index=test_ids, columns=TARGETS)

for target in tqdm(TARGETS):
    idx_full = y_train[target].notna()
    if idx_full.sum() == 0:
        # No label anywhere for this target: emit a constant column.
        test_preds[target] = 0.0
        continue

    X_full = clean_df(X_train.loc[idx_full])
    y_full = y_train.loc[idx_full, target].values

    fitted = models.get(target)
    if fitted is None:
        # The validation step skips a target when either fold has no label
        # for it, yet labels can still exist in the full data. Build fresh
        # models with the same settings instead of failing on a missing key.
        lgb = LGBMRegressor(
            n_estimators=1000,
            learning_rate=0.05,
            num_leaves=64,
            random_state=RANDOM_SEED,
            n_jobs=-1,
            verbose=-1,
        )
        rf = RandomForestRegressor(n_estimators=200, random_state=RANDOM_SEED, n_jobs=-1)
    else:
        lgb, rf = fitted['lgb'], fitted['rf']
        # The refit below has no validation set, so without this it would
        # run all configured rounds and discard the round count selected by
        # early stopping. Reuse that count as the number of trees.
        best_iter = getattr(lgb, 'best_iteration_', None)
        if best_iter:
            lgb.set_params(n_estimators=int(best_iter))

    lgb.fit(X_full, y_full)
    rf.fit(X_full, y_full)

    # Same equal-weight blend that was scored during validation.
    preds = 0.5 * lgb.predict(X_test) + 0.5 * rf.predict(X_test)
    test_preds[target] = preds

# === STEP 11: Create submission ===
# One row per test id, then one prediction column per target in TARGETS order.
target_columns = {name: test_preds[name].values for name in TARGETS}
submission = test_ids.to_frame(name='id').assign(**target_columns)

submission.to_csv('submission.csv', index=False)
print("✅ submission.csv created")


🔬 Featurizing training data...
🔬 Featurizing test data...
🚀 Training models:


100%|██████████| 5/5 [02:00<00:00, 24.10s/it]



✅ Validation MAE per target:
  • Tg: 53.8798
  • FFV: 0.0052
  • Tc: 0.0259
  • Density: 0.0371
  • Rg: 1.9206

🚀 Retraining on full data and predicting test set:


100%|██████████| 5/5 [02:35<00:00, 31.12s/it]

✅ submission.csv created



