In [None]:
# Hazardous materials test
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem
import joblib
from sklearn.preprocessing import StandardScaler

# Read the hazardous-waste list to score and the training table whose
# columns define the feature set the model was trained on.
hw_list_data = pd.read_excel('HW_list.xlsx')
training_data = pd.read_csv('imputed_selected_features_Toxcity.csv')

# Every training column except the SMILES string and the two label columns
# is treated as a model feature.
# NOTE(review): Index.difference returns the labels sorted, not in file
# order — confirm this matches the column order used when the scaler and
# model were fitted.
non_feature_columns = ['SMILES', 'Toxicity', 'Classification']
training_features = training_data.columns.difference(non_feature_columns)

# Split the training features into those HW_list provides and those it lacks.
hw_columns = hw_list_data.columns
matching_features = []
missing_features = []
for feature in training_features:
    if feature in hw_columns:
        matching_features.append(feature)
    else:
        missing_features.append(feature)

hw_matching_features = hw_list_data[matching_features]

# Report (but do not abort on) training features absent from HW_list.
if len(missing_features) > 0:
    print(f"以下特征在 HW_list.xlsx 中缺失: {missing_features}")

def smiles_to_features(smiles, radius=2, n_bits=2048):
    """Convert a SMILES string into a descriptor + Morgan-fingerprint vector.

    Args:
        smiles: SMILES string describing the molecule.
        radius: Radius passed to the Morgan fingerprint (default 2).
        n_bits: Length of the Morgan bit vector (default 2048).

    Returns:
        A 1-D numpy array of length ``4 + n_bits`` laid out as
        ``[MolWt, MolLogP, NumHDonors, NumHAcceptors, bit_0, ..., bit_{n_bits-1}]``,
        or ``None`` when ``smiles`` is not a string or RDKit cannot parse it.
    """
    # Non-string values (e.g. the NaN pandas uses for an empty cell) are not
    # valid arguments for MolFromSmiles; report them as "unparsable" the same
    # way as a malformed SMILES instead of letting the call raise.
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # Scalar molecular descriptors.
    descriptors = [
        Descriptors.MolWt(mol),  # molecular weight
        Descriptors.MolLogP(mol),  # LogP
        Descriptors.NumHDonors(mol),  # number of hydrogen-bond donors
        Descriptors.NumHAcceptors(mol)  # number of hydrogen-bond acceptors
    ]

    # Morgan fingerprint, copied into a numpy array sized from the same
    # n_bits so the buffer can never disagree with the fingerprint length.
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    fingerprint_array = np.zeros((n_bits,))
    Chem.DataStructs.ConvertToNumpyArray(fingerprint, fingerprint_array)

    # Descriptors first, fingerprint bits after.
    features = np.concatenate([descriptors, fingerprint_array])
    return features

# Featurise every SMILES in HW_list and remember which rows parsed, so the
# tabular features and the final predictions stay aligned with the molecules
# that were actually featurised.
hw_smiles_list = hw_list_data['SMILES'].tolist()
per_row_features = [smiles_to_features(smiles) for smiles in hw_smiles_list]
valid_smiles_mask = np.array(
    [feature is not None for feature in per_row_features], dtype=bool
)

# Without at least one parsed molecule there is nothing to stack or predict.
if not valid_smiles_mask.any():
    raise ValueError("HW_list.xlsx 中没有可解析的 SMILES，无法进行预测")

invalid_count = int((~valid_smiles_mask).sum())
if invalid_count:
    print(f"警告: 有 {invalid_count} 个 SMILES 无法解析，这些行的预测结果将留空")

# Stack the per-molecule vectors into a 2-D numpy array.
hw_smiles_features = np.array(
    [feature for feature in per_row_features if feature is not None]
)

# Sanity check on the SMILES-derived feature count.
print(f"SMILES 特征数量: {hw_smiles_features.shape[1]} (应为 2052，包含分子描述符和指纹)")

# Append the matching HW_list columns, keeping only rows whose SMILES parsed
# so both halves have the same number of rows.
# NOTE(review): the column order (SMILES-derived features first, then the
# tabular features) must match the order used when the scaler was fitted —
# confirm against the training script.
hw_all_features = np.hstack(
    (hw_smiles_features, hw_matching_features.values[valid_smiles_mask])
)

# Sanity check on the combined feature count.
print(f"合并后特征数量: {hw_all_features.shape[1]} (应为 2113)")

# Load the persisted StandardScaler and model.
scaler = joblib.load('scaler.pkl')
best_model = joblib.load('XGBoost_best_model.pkl')

# Standardise all features. On failure, print the diagnostics and re-raise:
# continuing would only hit a NameError on the undefined scaled matrix.
try:
    hw_features_scaled = scaler.transform(hw_all_features)
except ValueError as e:
    print(f"标准化错误: {e}")
    print(f"当前特征数: {hw_all_features.shape[1]}，期望特征数: {scaler.n_features_in_}")
    raise

# Predict toxicity with the persisted model.
hw_toxicity_predictions = best_model.predict(hw_features_scaled)

# Show the predictions.
print("毒性预测结果：")
print(hw_toxicity_predictions)

# Attach predictions to the original table and save it. Predictions are
# indexed by the rows that were featurised and then reindexed to the full
# table, so rows with unparsable SMILES get an empty prediction.
hw_list_data['Toxicity_Prediction'] = pd.Series(
    hw_toxicity_predictions, index=hw_list_data.index[valid_smiles_mask]
).reindex(hw_list_data.index)
hw_list_data.to_excel('HW_list_with_predictions.xlsx', index=False)
print("预测结果已保存到 HW_list_with_predictions.xlsx")