In [6]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdchiral.template_extractor import extract_from_reaction
from rdchiral.main import rdchiralRunText
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


# 步骤1：加载和预处理数据集
data = pd.read_csv('raw_train.csv')  # 替换为您的数据集文件路径
data = data.dropna()  # 删除含有缺失值的行

# 步骤2：提取反应模板
def extract_template(reaction):
    reactants, products = reaction.split('>>')
    input_rec = {'_id': None, 'reactants': reactants, 'products': products}
    ans = extract_from_reaction(input_rec)
    if 'reaction_smarts' in ans.keys():
        return ans['reaction_smarts']
    else:
        return None

data['template'] = data['reactants>reagents>production'].apply(extract_template)
data = data.dropna()  # 删除缺少模板的反应

# 步骤3：将产物转换为Morgan指纹
def convert_to_fingerprint(product):
    mol = Chem.MolFromSmiles(product)
    
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
    on_bits = list(fp.GetOnBits())
    arr = np.zeros(fp.GetNumBits(), dtype=bool)
    arr[on_bits] = 1
    return arr

# 分割"reactants>reagents>production"列为"reactants"和"products"列
data[['reactants', 'products']] = data['reactants>reagents>production'].str.split('>>', expand=True)

data['fingerprint'] = data['products'].apply(convert_to_fingerprint)

# 步骤4：准备用于模型训练的数据
X = data['fingerprint'].tolist()
y = data['template'].tolist()

# 步骤5：对指纹进行向量化
vectorizer = CountVectorizer(binary=True, dtype=bool)

# 将指纹数组转换为字符串文本
X_str = [np.array2string(fp) for fp in X]
X_vec = vectorizer.fit_transform(X_str)

# 步骤6：训练SVM模型
model = SVC()
model.fit(X_vec, y)

# 步骤7：对测试数据进行预测
test_data = pd.read_csv('raw_test.csv')  # 替换为您的测试数据集文件路径
test_data = test_data.dropna()

test_data['template'] = test_data['reactants>reagents>production'].apply(extract_template)
test_data = test_data.dropna()  # 删除缺少模板的反应

test_data[['reactants', 'products']] = test_data['reactants>reagents>production'].str.split('>>', expand=True)
test_data['fingerprint'] = test_data['products'].apply(convert_to_fingerprint)
X_test = test_data['fingerprint'].tolist()
X_test_vec = vectorizer.transform([np.array2string(fp) for fp in X_test])

y_test_pred = []
for i in range(X_test_vec.shape[0]):
    prediction = model.predict(X_test_vec[i])
    y_test_pred.append(prediction[0])

test_accuracy = accuracy_score(test_data['template'], y_test_pred)
print(f"accuracy：{test_accuracy}")


accuracy：0.256651
