In [None]:
import numpy as np 
import pandas as pd 

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)


In [None]:
!pip install pandas numpy rdkit-pypi scikit-learn joblib
!pip install tensorflow -U -q  # On Kaggle
#Operating on Kaggle

In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import joblib
import os
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import load_model

In [None]:
# Load the competition training table.
train_data = pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/train.csv')

# Column holding the SMILES strings, and the five properties to predict.
smiles_column = 'SMILES'
target_columns = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']

# Make sure the output folders for saved models and results exist.
for output_dir in ('models', 'results'):
    os.makedirs(output_dir, exist_ok=True)

# Quick overview: total row count and number of labelled rows per target.
print("数据集基本信息：")
print(f"总样本数：{len(train_data)}")
print("各指标非空样本数：")
non_null_counts = train_data[target_columns].notna().sum()
print(non_null_counts)

In [None]:
def smiles_to_fingerprint(smiles, radius=2, n_bits=2048):
    """Turn a SMILES string into a Morgan fingerprint array.

    Args:
        smiles: SMILES string describing the molecule.
        radius: Radius passed to the Morgan fingerprint routine.
        n_bits: Number of bits requested for the fingerprint vector.

    Returns:
        A NumPy array built from the fingerprint bit vector, or ``None`` when
        RDKit cannot parse the SMILES or any exception is raised on the way
        (the exception is reported via ``print``).
    """
    result = None
    try:
        molecule = Chem.MolFromSmiles(smiles)
        # MolFromSmiles signals an unparsable string by returning None.
        if molecule is not None:
            bit_vector = AllChem.GetMorganFingerprintAsBitVect(
                molecule, radius=radius, nBits=n_bits
            )
            result = np.array(bit_vector)
    except Exception as e:
        print(f"处理SMILES时出错: {smiles}, 错误: {e}")
        result = None
    return result

In [None]:
# Compute one fingerprint per row from the SMILES column.
print("开始生成分子指纹...")
fingerprint_series = train_data[smiles_column].apply(smiles_to_fingerprint)
train_data['fingerprint'] = fingerprint_series

# Drop rows whose fingerprint is missing (smiles_to_fingerprint returned None).
valid_mask = train_data['fingerprint'].notna()
train_data = train_data.loc[valid_mask].copy()
print(f"有效样本数: {len(train_data)} (已过滤无效SMILES或指纹生成失败的样本)")

In [None]:

# Train a dense neural network (MLP) for the FFV target.
target = 'FFV'

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# batch shuffling are repeatable between runs.
tf.keras.utils.set_random_seed(42)

# Keep only the rows that have a label for this target.
target_data = train_data.dropna(subset=[target]).copy()
X = np.stack(target_data['fingerprint'].values)
y = target_data[target].values

# Split BEFORE scaling: the scaler must learn its mean/variance from the
# training rows only, otherwise validation statistics leak into preprocessing
# and the validation MAE is optimistic.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
# Kept so that any code still referring to `X_scaled` continues to work.
X_scaled = scaler.transform(X)

# Three hidden layers with L2 weight penalties and dropout on the first two;
# a single linear output unit for regression.
model = Sequential([
    Dense(512, activation='relu', kernel_regularizer=l2(0.001), input_shape=(X_train.shape[1],)),
    Dropout(0.3),

    Dense(256, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.2),

    Dense(128, activation='relu', kernel_regularizer=l2(0.0005)),

    Dense(1)
])

# Optimise MAE directly, the same metric that is reported below.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='mean_absolute_error'
)

# Stop once validation loss has not improved for 20 epochs and roll back to
# the best weights seen.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=20, restore_best_weights=True
)

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=200,
    batch_size=64,
    callbacks=[early_stop],
    verbose=1
)

# Report MAE on both splits.
y_train_pred = model.predict(X_train).flatten()
y_val_pred = model.predict(X_val).flatten()
print(f"训练集MAE: {mean_absolute_error(y_train, y_train_pred):.4f}")
print(f"验证集MAE: {mean_absolute_error(y_val, y_val_pred):.4f}")

# Persist the model together with the scaler it was trained with; inference
# must apply the same scaling.
model.save(f"models/mlp_{target}_model.h5")

joblib.dump(scaler, f"models/scaler_{target}_model.pkl")

In [None]:
# Keyword arguments unpacked into RandomForestRegressor for the non-FFV targets.
rf_params = dict(
    n_estimators=100,
    criterion='absolute_error',  # split criterion matches the MAE metric reported for these models
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
    n_jobs=-1,
)

In [None]:
# Metrics for every random-forest target. Created once, before the loop, so
# entries from earlier targets are not thrown away on each iteration.
results = {}

for target in target_columns:
    # FFV is modelled by the neural network, not by a random forest.
    if target == 'FFV':
        continue

    print(f"\n----- 开始训练 {target} 模型 -----")

    # Only rows with a label for this target can be used.
    target_data = train_data.dropna(subset=[target]).copy()
    print(f"用于训练{target}的样本数: {len(target_data)}")

    X = np.stack(target_data['fingerprint'].values)
    y = target_data[target].values

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model = RandomForestRegressor(**rf_params)
    model.fit(X_train, y_train)

    # Evaluate on both splits.
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    train_mae = mean_absolute_error(y_train, y_train_pred)
    val_mae = mean_absolute_error(y_val, y_val_pred)

    results[target] = {
        'train_samples': len(X_train),
        'val_samples': len(X_val),
        'train_mae': train_mae,
        'val_mae': val_mae
    }

    print(f"{target} 模型训练集MAE: {train_mae:.4f}")
    print(f"{target} 模型验证集MAE: {val_mae:.4f}")

    # One pickle per target; the path is rebuilt from the target name at
    # inference time.
    model_path = f"models/rf_{target}_model.pkl"
    joblib.dump(model, model_path)
    print(f"{target} 模型已保存至: {model_path}")

In [None]:
# Inference inputs: the competition test set and the sample submission that
# serves as the template for the output file.
test_data = pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/test.csv')
submission = pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/sample_submission.csv')

# Targets to predict, re-declared here so the inference section lists them itself.
target_columns = list(('Tg', 'FFV', 'Tc', 'Density', 'Rg'))

def generate_fingerprint(smiles, radius=2, nBits=2048):
    """Generate a Morgan fingerprint array from a SMILES string.

    This is a best-effort variant for inference: when the SMILES cannot be
    parsed or fingerprinting raises an error, an all-zero vector is returned
    instead of failing, so every input row still yields a feature vector.

    Args:
        smiles: SMILES string describing the molecule.
        radius: Radius passed to the Morgan fingerprint routine.
        nBits: Number of bits requested for the fingerprint vector.

    Returns:
        A NumPy array built from the fingerprint bit vector, or
        ``np.zeros(nBits)`` on failure.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return np.zeros(nBits)
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nBits)
        return np.array(fp)
    except Exception as e:
        # Catch only ordinary errors: a bare `except:` would also swallow
        # KeyboardInterrupt/SystemExit and make the cell impossible to stop.
        # Report the failure so zero-vector rows are not silent.
        print(f"处理SMILES时出错: {smiles}, 错误: {e}，使用全零指纹")
        return np.zeros(nBits)

# Features can only be built from SMILES; fail early with a message that
# matches the check actually performed.
if 'SMILES' not in test_data.columns:
    raise ValueError("测试数据中没有'SMILES'列，无法生成特征")

print("从SMILES生成指纹特征...")
test_data['fingerprint'] = test_data['SMILES'].apply(generate_fingerprint)

X_test = np.stack(test_data['fingerprint'].values)

# The MLP was trained on standardized inputs, so apply the scaler saved with it.
scaler = joblib.load('models/scaler_FFV_model.pkl')
X_test_scaled = scaler.transform(X_test)

for target in target_columns:
    print(f"预测 {target}...")

    if target == 'FFV':
        # compile=False: only predict() is needed, so skip restoring the
        # loss/optimizer state from the saved file.
        model = load_model(f"models/mlp_{target}_model.h5", compile=False)
        predictions = model.predict(X_test_scaled).flatten()
    else:
        # The random forests were fitted on the raw (unscaled) fingerprints.
        model = joblib.load(f"models/rf_{target}_model.pkl")
        predictions = model.predict(X_test)

    submission[target] = predictions

submission.to_csv('submission.csv', index=False)
print("提交文件已生成: submission.csv")
    