In [1]:
# --- STEP 1: Load basic packages and data ---
import pandas as pd
import numpy as np

# Load datasets
train = pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/train.csv')
test = pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/test.csv')
submission = pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/sample_submission.csv')

# Target columns
TARGETS = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']

# --- STEP 2: Install RDKit offline ---
!pip install --no-index /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl

# --- STEP 3: Feature Extraction with RDKit ---
from rdkit import Chem
from rdkit.Chem import Descriptors
from tqdm import tqdm

tqdm.pandas()

descriptor_funcs = [desc[1] for desc in Descriptors._descList]
descriptor_names = [desc[0] for desc in Descriptors._descList]

def featurize_smiles(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return [np.nan] * len(descriptor_funcs)
        return [func(mol) for func in descriptor_funcs]
    except Exception as e:
        return [np.nan] * len(descriptor_funcs)

# Train set features
train_desc = train['SMILES'].progress_apply(featurize_smiles)
train_desc_df = pd.DataFrame(train_desc.tolist(), columns=descriptor_names)
X_train = pd.concat([train[['id']], train_desc_df], axis=1)
y_train = train[TARGETS]

# Test set features
test_desc = test['SMILES'].progress_apply(featurize_smiles)
X_test = pd.DataFrame(test_desc.tolist(), columns=descriptor_names)
X_test = pd.concat([test[['id']], X_test], axis=1)

# --- STEP 4: Preprocessing ---
X_train_features = X_train.drop(columns=['id'])
X_test_features = X_test.drop(columns=['id'])

# Fill missing values with train column means
train_feature_means = X_train_features.mean()
X_train_filled = X_train_features.fillna(train_feature_means)
X_test_filled = X_test_features.fillna(train_feature_means)

X_test_ids = X_test['id'].copy()

# --- STEP 5: Model Training with LightGBM ---
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error
from lightgbm import early_stopping

# Initialize dicts
val_scores = {}
test_predictions = {}

for target in TARGETS:
    try:
        print(f"\n🔧 Training model for: {target}")

        valid_rows = y_train[target].notna()
        X_valid = X_train_filled.loc[valid_rows]
        y_valid = y_train[target].loc[valid_rows]

        X_tr, X_val, y_tr, y_val = train_test_split(
            X_valid, y_valid, test_size=0.2, random_state=42
        )

        model = LGBMRegressor(
            n_estimators=1000,
            learning_rate=0.05,
            random_state=42,
            n_jobs=-1,
            verbose=-1
        )

        model.fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            eval_metric='mae',
            callbacks=[
                early_stopping(stopping_rounds=50, verbose=False)
            ]
        )

        val_scores[target] = mean_absolute_error(y_val, model.predict(X_val))
        test_predictions[target] = model.predict(X_test_filled)

    except Exception as e:
        print(f"⚠️ Failed to train model for {target}: {e}")
        val_scores[target] = None
        test_predictions[target] = [0.0] * len(X_test_filled)

# --- STEP 6: Create Submission ---
submission_df = pd.DataFrame({'id': X_test_ids})
for target in TARGETS:
    submission_df[target] = test_predictions.get(target, [0.0] * len(X_test_ids))

submission_df.to_csv('submission.csv', index=False)
print("✅ Submission file created: submission.csv")


Processing /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl
Installing collected packages: rdkit
Successfully installed rdkit-2025.3.3


100%|██████████| 7973/7973 [02:19<00:00, 57.19it/s]
100%|██████████| 3/3 [00:00<00:00, 44.59it/s]



🔧 Training model for: Tg

🔧 Training model for: FFV

🔧 Training model for: Tc

🔧 Training model for: Density

🔧 Training model for: Rg
✅ Submission file created: submission.csv
