In [1]:
import random
import numpy as np
import pandas as pd
import transformers
from transformers import AutoTokenizer, AutoModel, RobertaTokenizer, RobertaModel
transformers.logging.set_verbosity_error()
import re
import tensorflow as tf
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
pd.set_option("display.max_columns", None)

# 1. Load the dataset
data0 = pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/train.csv')
display(data0.info())
targets = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']
data = data0.drop('id',axis=1)
data.columns=['text']+targets
display(data)

TEST0 = pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/test.csv')
TEST = TEST0[['SMILES']]
TEST.columns=['text']

# 2. Tokenization and data preparation
model_path="/kaggle/input/c/transformers/default/1/ChemBERTa-77M-MLM"
tokenizer = AutoTokenizer.from_pretrained(model_path)
max_len = 128

def create_data(text):
    encoded = tokenizer.batch_encode_plus(
        text.tolist(),
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True
    )

    input_ids = np.array(encoded["input_ids"], dtype="int32")
    attention_masks = np.array(encoded["attention_mask"], dtype="float64")

    return {"input_ids": input_ids,"attention_mask": attention_masks}

from transformers import AutoModel, AutoTokenizer
import torch

def extract_roberta_features(texts, model_path, batch_size=32, max_len=512):
    """
    Extract features from text using RoBERTa model

    Args:
        texts: List of input text strings
        model_path: Path to model (local or Hugging Face hub name)
        batch_size: Batch size for processing
        max_len: Maximum sequence length

    Returns:
        numpy array of extracted features
    """
    # Load model and tokenizer
    model = AutoModel.from_pretrained(model_path, local_files_only=True)
    tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)

    # Move model to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    features = []

    # Process in batches
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]

        # Tokenization (using PyTorch tensors)
        encoded = tokenizer(
            batch_texts,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # Move tensors to the same device as model
        encoded = {k: v.to(device) for k, v in encoded.items()}

        # Extract features
        with torch.no_grad():
            outputs = model(**encoded)

        # Average pooling (mean of last hidden states)
        pooled_features = outputs.last_hidden_state.mean(dim=1)

        # Move to CPU and convert to numpy
        features.append(pooled_features.cpu().numpy())

    return np.vstack(features)

# Learning rate scheduler
def scheduler(epoch):
    learning_rate = 2e-5
    if epoch == 0:
        return learning_rate * 0.1
    else:
        return learning_rate * (0.95 ** epoch)

# 4. Train the model (modified for feature extraction approach)
def train_model(train_texts, train_targets):
    print("Extracting features from training data...")
    train_features = extract_roberta_features(train_texts, model_path)

    model = XGBRegressor(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        tree_method="hist",
        verbosity=0
    )

    model.fit(train_features, train_targets)
    return model

# 5. Predict and evaluate (for feature extraction approach)
def evaluate_model(model, test_targets):
    # Extract features from test data
    print("Extracting features from test data...")
    test_texts = test['text'].tolist()
    test_features = extract_roberta_features(test_texts, model_path)

    # Make predictions
    predictions = model.predict(test_features)
    predictions = predictions.flatten()

    # Evaluation metrics
    mse = np.mean((predictions - test_targets) ** 2)
    mae = np.mean(np.abs(predictions - test_targets))
    rmse = np.sqrt(mse)

    # Correlation coefficient
    correlation = np.corrcoef(predictions, test_targets)[0, 1]

    print(f"MSE: {mse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"Correlation: {correlation:.4f}")

    return predictions

def build_regression_model(input_dim):
    model = tf.keras.Sequential([
        tf.keras.Input(shape=(input_dim,)),
        tf.keras.layers.Dense(512, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(1)
    ])

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
                  loss='mse',
                  metrics=['mae'])

    return model

def train_nn_model(train_texts, train_targets):
    print("Extracting features for NN...")
    train_features = extract_roberta_features(train_texts, model_path)

    model = build_regression_model(train_features.shape[1])
    model.fit(
        train_features, train_targets,
        epochs=50,
        batch_size=32,
        validation_split=0.2,
        callbacks=[tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)],
        verbose=0
    )

    return model



if __name__ == "__main__":
    nn_models = {}
    xgb_models = {}

    for target in targets:
        filtered_df = data.dropna(subset=[target])
        train, test = train_test_split(filtered_df, test_size=0.1, random_state=42)

        train_texts = train['text'].tolist()
        test_texts = test['text'].tolist()
        train_targets = train[target].values
        test_targets = test[target].values

        print(f"\nTraining for target: {target}")

        if target in ['Tg', 'Rg']:
            model = train_nn_model(train_texts, train_targets)
            nn_models[target] = model
        elif target in ['Tc', 'FFV']:
            model = train_model(train_texts, train_targets)
            xgb_models[target] = model
        elif target == 'Density':
            nn_model = train_nn_model(train_texts, train_targets)
            xgb_model = train_model(train_texts, train_targets)
            nn_models[target] = nn_model
            xgb_models[target] = xgb_model

        print(f"Evaluating for target: {target}")
        test_features = extract_roberta_features(test_texts, model_path)

        if target in ['Tg', 'Rg']:
            predictions = nn_models[target].predict(test_features).flatten()
        elif target in ['Tc', 'FFV']:
            predictions = xgb_models[target].predict(test_features)
        elif target == 'Density':
            nn_pred = nn_models[target].predict(test_features).flatten()
            xgb_pred = xgb_models[target].predict(test_features)
            predictions = 0.2 * nn_pred + 0.8 * xgb_pred

        mse = np.mean((predictions - test_targets) ** 2)
        mae = np.mean(np.abs(predictions - test_targets))
        rmse = np.sqrt(mse)
        correlation = np.corrcoef(predictions, test_targets)[0, 1]

        print(f"MSE: {mse:.4f}")
        print(f"MAE: {mae:.4f}")
        print(f"RMSE: {rmse:.4f}")
        print(f"Correlation: {correlation:.4f}")



test_texts = TEST['text'].tolist()
test_features = extract_roberta_features(test_texts, model_path)

RESULT = np.zeros((len(TEST), len(targets)))

for i, target in enumerate(targets):
    if target in ['Tg', 'Rg']:
        pred = nn_models[target].predict(test_features).flatten()
    elif target in ['Tc', 'FFV']:
        pred = xgb_models[target].predict(test_features)
    elif target == 'Density':
        nn_pred = nn_models[target].predict(test_features).flatten()
        xgb_pred = xgb_models[target].predict(test_features)
        pred = 0.2 * nn_pred + 0.8 * xgb_pred

    RESULT[:, i] = pred

submit = pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/sample_submission.csv')
submit.iloc[:, 1:] = RESULT
submit.to_csv('submission_chemberta.csv', index=False)
display(submit)


2025-06-29 12:30:42.427445: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751200242.702063      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751200242.784728      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7973 entries, 0 to 7972
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       7973 non-null   int64  
 1   SMILES   7973 non-null   object 
 2   Tg       511 non-null    float64
 3   FFV      7030 non-null   float64
 4   Tc       737 non-null    float64
 5   Density  613 non-null    float64
 6   Rg       614 non-null    float64
dtypes: float64(5), int64(1), object(1)
memory usage: 436.2+ KB


None

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,text,Tg,FFV,Tc,Density,Rg
0,*CC(*)c1ccccc1C(=O)OCCCCCC,,0.374645,0.205667,,
1,*Nc1ccc([C@H](CCC)c2ccc(C3(c4ccc([C@@H](CCC)c5...,,0.370410,,,
2,*Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(Oc6ccc(...,,0.378860,,,
3,*Nc1ccc(-c2c(-c3ccc(C)cc3)c(-c3ccc(C)cc3)c(N*)...,,0.387324,,,
4,*Oc1ccc(OC(=O)c2cc(OCCCCCCCCCOCC3CCCN3c3ccc([N...,,0.355470,,,
...,...,...,...,...,...,...
7968,*Oc1cc(CCCCCCCC)cc(OC(=O)c2cccc(C(*)=O)c2)c1,,0.367498,,,
7969,*C(=O)OCCN(CCOC(=O)c1ccc2c(c1)C(=O)N(c1cccc(N3...,,0.353280,,,
7970,*c1cc(C(=O)NCCCCCCCC)cc(N2C(=O)c3ccc(-c4ccc5c(...,,0.369411,,,
7971,*C=C(*)c1ccccc1C,261.662355,,,,



Training for target: Tg
Extracting features for NN...


2025-06-29 12:31:23.596190: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


Evaluating for target: Tg
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
MSE: 3552.7252
MAE: 45.4965
RMSE: 59.6047
Correlation: 0.8167

Training for target: FFV
Extracting features from training data...
Evaluating for target: FFV
MSE: 0.0003
MAE: 0.0085
RMSE: 0.0185
Correlation: 0.8258

Training for target: Tc
Extracting features from training data...
Evaluating for target: Tc
MSE: 0.0025
MAE: 0.0320
RMSE: 0.0495
Correlation: 0.8511

Training for target: Density
Extracting features for NN...
Extracting features from training data...
Evaluating for target: Density
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
MSE: 0.0144
MAE: 0.0655
RMSE: 0.1200
Correlation: 0.4715

Training for target: Rg
Extracting features for NN...
Evaluating for target: Rg
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
MSE: 6.6502
MAE: 1.8487
RMSE: 2.5788
Correlation: 0.8340
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms

  submit.iloc[:, 1:] = RESULT
  submit.iloc[:, 1:] = RESULT
  submit.iloc[:, 1:] = RESULT
  submit.iloc[:, 1:] = RESULT
  submit.iloc[:, 1:] = RESULT


Unnamed: 0,id,Tg,FFV,Tc,Density,Rg
0,1109053969,136.274185,0.375326,0.217022,1.116531,20.669937
1,1422188626,183.227768,0.373053,0.23188,1.10297,22.36183
2,2032016830,129.4505,0.352207,0.257917,1.128183,19.003965


In [2]:
import shutil, os, subprocess, sys

# ────────────────────────────────────────────────────────────────
# 1. Fix the torch_spline_conv filename and move it to /kaggle/working
# ────────────────────────────────────────────────────────────────
src = "/kaggle/input/pyg-wheel/torch_spline_conv-1.2.2+pt26cpu-cp311-cp311-linux_x86_64 - Copy.whl"
dst = "/kaggle/working/torch_spline_conv-1.2.2+pt26cpu-cp311-cp311-linux_x86_64.whl"
if not os.path.exists(dst):
    shutil.copy(src, dst)

# ────────────────────────────────────────────────────────────────
# 2. Install the four other PyG wheels in one shot
# ────────────────────────────────────────────────────────────────
!pip install --no-index --find-links /kaggle/input/pyg-wheel \
    torch_scatter torch_sparse torch_cluster torch_geometric -q

# ────────────────────────────────────────────────────────────────
# 3. Install torch_spline_conv from the cleaned‑up filename
# ────────────────────────────────────────────────────────────────
!pip install /kaggle/working/torch_spline_conv-1.2.2+pt26cpu-cp311-cp311-linux_x86_64.whl -q

# ────────────────────────────────────────────────────────────────
# 4. Install RDKit directly from your wheel
# ────────────────────────────────────────────────────────────────
!pip install /kaggle/input/rdkit-wheels/rdkit-2023.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl -q

# ────────────────────────────────────────────────────────────────
# 5. Quick sanity check
# ────────────────────────────────────────────────────────────────
try:
    import torch, torch_geometric, rdkit
    from rdkit import Chem
    print("✅  PyTorch:", torch.__version__)
    print("✅  PyG    :", torch_geometric.__version__)
    print("✅  RDKit  :", rdkit.__version__)
    mol = Chem.MolFromSmiles("CCO")
    print("RDKit test (atoms):", mol.GetNumAtoms())
except Exception as e:
    print("⚠️  Install check failed:", e, file=sys.stderr)


import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GATConv, global_mean_pool
from rdkit import Chem
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

train_df = pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/train.csv')
test_df = pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/test.csv')
train_df = train_df.rename(columns={"SMILES": "text"})
test_df = test_df.rename(columns={"SMILES": "text"})
targets = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']

# Feature extractors
def atom_features(atom):
    return np.array([
        atom.GetAtomicNum(),
        atom.GetDegree(),
        atom.GetFormalCharge(),
        int(atom.GetHybridization()),
        atom.GetIsAromatic(),
        atom.GetTotalNumHs()
    ], dtype=np.float32)

def bond_features(bond):
    return np.array([
        bond.GetBondTypeAsDouble(),
        bond.GetIsConjugated(),
        bond.IsInRing()
    ], dtype=np.float32)

def smiles_to_graph(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    
    Chem.Kekulize(mol, clearAromaticFlags=True)
    mol = Chem.AddHs(mol)

    x = torch.tensor([atom_features(atom) for atom in mol.GetAtoms()], dtype=torch.float)

    edge_index = []
    for bond in mol.GetBonds():
        i = bond.GetBeginAtomIdx()
        j = bond.GetEndAtomIdx()
        edge_index.append([i, j])
        edge_index.append([j, i])

    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    return Data(x=x, edge_index=edge_index)

class PolymerDataset(Dataset):
    def __init__(self, df, targets=None):
        self.graphs = [smiles_to_graph(smile) for smile in df['text']]
        self.graphs = [g for g in self.graphs if g is not None]
        self.targets = torch.tensor(targets[:len(self.graphs)], dtype=torch.float32).view(-1, 1) if targets is not None else None

    def __len__(self):
        return len(self.graphs)

    def __getitem__(self, idx):
        if self.targets is not None:
            return self.graphs[idx], self.targets[idx]
        return self.graphs[idx]

class GNNModel(nn.Module):
    def __init__(self, in_channels, hidden_dim=128):
        super(GNNModel, self).__init__()
        self.conv1 = GATConv(in_channels, hidden_dim, heads=4, concat=False)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.conv2 = GATConv(hidden_dim, hidden_dim, heads=4, concat=False)
        self.bn2 = nn.BatchNorm1d(hidden_dim)
        self.lin = nn.Sequential(
            nn.Linear(hidden_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 1)
        )

    def forward(self, x, edge_index, batch):
        x = self.conv1(x, edge_index)
        x = self.bn1(x)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = self.bn2(x)
        x = F.relu(x)
        x = global_mean_pool(x, batch)
        return self.lin(x)

def train_model(model, loader, optimizer, criterion):
    model.train()
    total_loss = 0
    for data, target in loader:
        data = data.to(device)
        target = target.to(device)
        optimizer.zero_grad()
        output = model(data.x, data.edge_index, data.batch)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate_model(model, loader):
    model.eval()
    all_preds, all_targets = [], []
    with torch.no_grad():
        for data, target in loader:
            data = data.to(device)
            target = target.to(device)
            output = model(data.x, data.edge_index, data.batch)
            all_preds.append(output.cpu().numpy())
            all_targets.append(target.cpu().numpy())
    preds = np.concatenate(all_preds)
    targets = np.concatenate(all_targets)
    mse = np.mean((preds - targets) ** 2)
    mae = np.mean(np.abs(preds - targets))
    rmse = np.sqrt(mse)
    corr = np.corrcoef(preds.flatten(), targets.flatten())[0, 1]
    return mse, mae, rmse, corr, preds

submission = pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/sample_submission.csv')
result = np.zeros((len(test_df), len(targets)))

for i, target in enumerate(targets):
    print(f"\nTraining for target: {target}")
    filtered = train_df.dropna(subset=[target])
    train_data, val_data = train_test_split(filtered, test_size=0.1, random_state=42)

    scaler = StandardScaler()
    y_train = scaler.fit_transform(train_data[[target]])
    y_val = scaler.transform(val_data[[target]])

    train_dataset = PolymerDataset(train_data, targets=y_train)
    val_dataset = PolymerDataset(val_data, targets=y_val)

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32)

    model = GNNModel(in_channels=6).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.7)
    criterion = nn.MSELoss()

    for epoch in range(20):
        train_loss = train_model(model, train_loader, optimizer, criterion)
        scheduler.step()

    mse, mae, rmse, corr, _ = evaluate_model(model, val_loader)
    print(f"MSE: {mse:.4f}, MAE: {mae:.4f}, RMSE: {rmse:.4f}, Correlation: {corr:.4f}")

    test_dataset = PolymerDataset(test_df)
    test_loader = DataLoader(test_dataset, batch_size=32)

    model.eval()
    preds = []
    with torch.no_grad():
        for data in test_loader:
            data = data.to(device)
            output = model(data.x, data.edge_index, data.batch)
            preds.append(output.cpu().numpy())
    preds = np.concatenate(preds).flatten()
    result[:, i] = scaler.inverse_transform(preds.reshape(-1, 1)).flatten()

submission.iloc[:, 1:] = result
submission.to_csv("submission_gnn.csv", index=False)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


✅  PyTorch: 2.6.0+cu124
✅  PyG    : 2.6.1
✅  RDKit  : 2023.09.4
RDKit test (atoms): 3
Using device: cpu

Training for target: Tg


  x = torch.tensor([atom_features(atom) for atom in mol.GetAtoms()], dtype=torch.float)


MSE: 0.4727, MAE: 0.5474, RMSE: 0.6876, Correlation: 0.8133

Training for target: FFV
MSE: 0.2902, MAE: 0.3871, RMSE: 0.5387, Correlation: 0.9051

Training for target: Tc
MSE: 0.2704, MAE: 0.3615, RMSE: 0.5200, Correlation: 0.8729

Training for target: Density
MSE: 0.3268, MAE: 0.3121, RMSE: 0.5716, Correlation: 0.7722

Training for target: Rg
MSE: 0.5050, MAE: 0.4683, RMSE: 0.7107, Correlation: 0.7324


  submission.iloc[:, 1:] = result
  submission.iloc[:, 1:] = result
  submission.iloc[:, 1:] = result
  submission.iloc[:, 1:] = result
  submission.iloc[:, 1:] = result


In [3]:
# Load both submission files
gnn_preds = pd.read_csv("submission_gnn.csv")
chemberta_preds = pd.read_csv("submission_chemberta.csv")

# Blend the predictions
targets = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']
ensemble = gnn_preds.copy()
for target in targets:
    if target in ['Tg', 'Rg']:
        ensemble[target] = gnn_preds[target]  # GNN stronger
    elif target in ['Tc', 'FFV']:
        ensemble[target] = chemberta_preds[target]  # ChemBERTa stronger
    elif target == 'Density':
        ensemble[target] = gnn_preds[target]  # Assuming GNN is better

# Save as the required submission file
ensemble.to_csv("submission.csv", index=False)