In [None]:
import pandas as pd 
import numpy as np
import datetime
from rdkit.Chem import rdMolDescriptors
from rdkit import RDLogger,Chem
from rdkit.Chem import Descriptors
RDLogger.DisableLog('rdApp.*')


In [None]:
# Locations of the round-1 competition tables, relative to the notebook
train_relative_path = '../data/round1_train_data.csv'
test_relative_path = '../data/round1_test_data.csv'

# Read both tables in one pass (training table first, then the test table)
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (train_relative_path, test_relative_path)
)

# Quick sanity check on the number of rows that were loaded
print(f'Training set size: {len(train_df)}, test set size: {len(test_df)}')

In [None]:
# Convert a molecule into a fixed-length fingerprint vector
def mfgen(mol,nBits=2048, radius=2):
    '''
    Compute the Morgan fingerprint of a molecule as a 0/1 integer vector.

    Parameters
    ----------
    mol : mol
        RDKit mol object.
    nBits : int
        Number of bits for the fingerprint.
    radius : int
        Radius of the Morgan fingerprint.
    Returns
    -------
    mf_desc_map : ndarray
        1-D integer ndarray with one 0/1 entry per fingerprint bit.
    '''
    # Bit-vector form of the Morgan fingerprint
    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol,radius=radius,nBits=nBits)
    # ToBitString() yields a string of '0'/'1' characters. Decode the whole string
    # at once from its ASCII bytes instead of calling eval() on every character.
    bit_codes = np.frombuffer(fp.ToBitString().encode('ascii'), dtype=np.uint8)
    # Subtracting the code of '0' maps '0' -> 0 and '1' -> 1; cast to the default
    # integer dtype so the result matches an array built from Python ints.
    return (bit_codes - ord('0')).astype(int)

# Turn a list of SMILES strings into a matrix of fingerprints
def vec_cpd_lst(smi_lst, nBits=2048, radius=2):
    '''
    Vectorise a list of SMILES strings, one fingerprint row per entry.

    Each distinct SMILES is parsed and fingerprinted once and the result is
    reused for every occurrence in the list.

    Parameters
    ----------
    smi_lst : list
        SMILES strings. Empty strings and non-string entries (e.g. the NaN
        that pandas uses for a missing cell) are mapped to an all-zero vector.
    nBits : int
        Number of bits for the fingerprint.
    radius : int
        Radius of the Morgan fingerprint.
    Returns
    -------
    vec_arr : ndarray
        Array of shape (len(smi_lst), nBits).
    '''
    import warnings

    zero_vec = np.zeros(nBits)
    smi_vec_map = {'': zero_vec}
    unparsable = []
    # No progress bar here: tqdm is never imported in this notebook, so the
    # previous `tqdm(...)` wrapper raised NameError on a fresh kernel.
    for smi in set(smi_lst):
        if not isinstance(smi, str) or smi == '':
            continue  # missing value, handled by the zero vector
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # RDKit returns None for SMILES it cannot parse; fall back to the
            # zero vector instead of crashing inside mfgen.
            unparsable.append(smi)
            smi_vec_map[smi] = zero_vec
        else:
            smi_vec_map[smi] = mfgen(mol, nBits=nBits, radius=radius)

    if unparsable:
        warnings.warn(
            f'{len(unparsable)} SMILES could not be parsed and were encoded as zeros, '
            f'e.g. {unparsable[0]!r}'
        )

    vec_lst = [smi_vec_map[smi] if isinstance(smi, str) else zero_vec for smi in smi_lst]
    if not vec_lst:
        # Keep the result 2-D so that concatenation along axis=1 still works
        return np.empty((0, nBits))
    return np.array(vec_lst)



In [None]:
# Compute the logP and TPSA descriptors of a single SMILES value
def _logp_tpsa(smile):
    ''' Return (logP, TPSA) for one SMILES value, or (nan, nan) when it cannot be computed. '''
    nan = float('nan')
    # Missing cells are not strings (pandas stores them as NaN/None); skip them
    # quietly rather than sending them through the error path.
    if not isinstance(smile, str):
        return nan, nan
    try:
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            return nan, nan  # invalid SMILES
        # Both descriptors are computed before anything is stored, so a failure
        # in either one yields a single (nan, nan) pair for the row.
        return Descriptors.MolLogP(mol), Descriptors.TPSA(mol)
    except Exception as e:
        print(f"Error processing SMILES {smile}: {e}")
        return nan, nan


# Extract the SMILES strings and compute logP and TPSA for each of them
def get_logP_and_TPSA(df, cols):
    '''
    Add logP and TPSA descriptor columns for every SMILES column in cols.

    Parameters
    ----------
    df : DataFrame
        Table containing the SMILES columns. It is modified in place.
    cols : list of str
        Names of the SMILES columns to process.
    Returns
    -------
    df : DataFrame
        The same object that was passed in, with `logP_<col>` and `TPSA_<col>`
        columns added; rows without a usable SMILES hold NaN.
    Raises
    ------
    ValueError
        If one of cols is not a column of df.
    '''
    # Validate every requested column before touching the frame
    for col in cols:
        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found in DataFrame")

    for col in cols:
        pairs = [_logp_tpsa(smile) for smile in df[col]]
        # NaN (not None) as the missing marker keeps the new columns float typed
        # even when no SMILES in the column could be processed.
        df[f'logP_{col}'] = [pair[0] for pair in pairs]
        df[f'TPSA_{col}'] = [pair[1] for pair in pairs]

    return df

In [None]:
# SMILES columns for which logP / TPSA descriptor columns are computed
cols = ['Reactant1', 'Reactant2', 'Product', 'Additive', 'Solvent']

# get_logP_and_TPSA adds the columns to the frame it receives and returns that
# same frame, so train_df1 / test_df1 refer to train_df / test_df.
train_df1, test_df1 = (
    get_logP_and_TPSA(frame, cols)
    for frame in (train_df, test_df)
)

In [None]:
# Pull the SMILES columns out of the training table
train_rct1_smi, train_rct2_smi, train_add_smi, train_sol_smi = (
    train_df[name].to_list()
    for name in ('Reactant1', 'Reactant2', 'Additive', 'Solvent')
)

# Convert each SMILES list into a fingerprint matrix
train_rct1_fp, train_rct2_fp, train_add_fp, train_sol_fp = map(
    vec_cpd_lst,
    (train_rct1_smi, train_rct2_smi, train_add_smi, train_sol_smi),
)
# Concatenate along axis=1: each row becomes the Reactant1, Reactant2, Additive
# and Solvent Morgan fingerprints joined into one vector (Product is not included).
train_x = np.concatenate(
    (train_rct1_fp, train_rct2_fp, train_add_fp, train_sol_fp),
    axis=1,
)
train_y = train_df.loc[:, 'Yield'].to_numpy()

# Same procedure for the test table
test_rct1_smi, test_rct2_smi, test_add_smi, test_sol_smi = (
    test_df[name].to_list()
    for name in ('Reactant1', 'Reactant2', 'Additive', 'Solvent')
)

test_rct1_fp, test_rct2_fp, test_add_fp, test_sol_fp = map(
    vec_cpd_lst,
    (test_rct1_smi, test_rct2_smi, test_add_smi, test_sol_smi),
)
test_x = np.concatenate(
    (test_rct1_fp, test_rct2_fp, test_add_fp, test_sol_fp),
    axis=1,
)

In [None]:
# Merge the fingerprint features with the molecular property descriptors

X_train_1 = pd.DataFrame(train_x)
X_test_1 = pd.DataFrame(test_x)

cols = ['logP_Reactant1', 'TPSA_Reactant1', 'logP_Reactant2', 'TPSA_Reactant2', 'logP_Product', 'TPSA_Product', 'logP_Additive',
       'TPSA_Additive', 'logP_Solvent', 'TPSA_Solvent']
# Append all descriptor columns with one concat per frame; concatenating inside a
# loop would copy the whole fingerprint frame once for every descriptor column.
X_train_1 = pd.concat([X_train_1, train_df1[cols]], axis=1)
X_test_1 = pd.concat([X_test_1, test_df1[cols]], axis=1)

# The fingerprint columns are labelled with integers and the descriptor columns
# with strings; convert every column name to a string so the labels are uniform.
X_train_1.columns = X_train_1.columns.astype(str)
X_test_1.columns = X_test_1.columns.astype(str)

### 特征工程

In [None]:
# Drop feature columns that consist of zeros
def eliminate_zeros(df1, df2, threshold=1):
    '''
    Drop zero-dominated columns from both frames.

    A column is dropped when the fraction of entries equal to zero in df1 is
    greater than or equal to threshold (the default of 1 means all-zero
    columns). The columns are chosen from df1 only and removed from df1 and
    df2 alike; the inputs themselves are left untouched.
    '''
    # Per-column share of zero entries in the reference frame
    zero_share = df1.eq(0).sum(axis=0) / len(df1)

    # Labels of the columns whose zero share reaches the threshold
    sparse_cols = zero_share.loc[zero_share >= threshold].index

    # Remove the same columns from both frames
    return df1.drop(columns=sparse_cols), df2.drop(columns=sparse_cols)


In [None]:
# Remove the columns that are entirely zero in the training features from both sets
X_train_temp, X_test_temp = eliminate_zeros(df1=X_train_1, df2=X_test_1)

#### 使用模型进行特征选择
__说明__ : 这一步比较费时间，我在机器上跑了大概 60 多分钟。


In [None]:
# Model-based feature selection
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import SelectFromModel

# Initialize GradientBoostingRegressor with parameters.
# random_state is fixed because subsample < 1 fits every tree on a random subset
# of the rows; without a seed the importances, and therefore the selected
# features, differ from one run to the next.
gbdt = GradientBoostingRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=13,
    subsample=0.7,
    random_state=42,
)

gbdt.fit(X_train_temp, train_y)

# prefit=True reuses the model fitted above; threshold="mean" keeps the features
# whose importance is at least the mean importance.
selector = SelectFromModel(gbdt, prefit=True, threshold="mean")

X_train2 = selector.transform(X_train_temp)
X_test2 = selector.transform(X_test_temp)

print(X_train2.shape, X_test2.shape)

### 使用 GBDT 模型进行预测

In [None]:

def train_gbdt_and_predict(X_train, y_train, X_test):
    ''' Fit a GBDT regressor on the training data and return its predictions for X_test. '''

    # Hyper-parameters of the GBDT model
    regressor = GradientBoostingRegressor(
        n_estimators=1000,
        learning_rate=0.01,
        subsample=0.8,
        max_depth=20,
        max_features='sqrt',
        min_samples_split=5,
        min_samples_leaf=2,
    )

    # Train on the full training set, then predict the test set
    regressor.fit(X_train, y_train)
    return regressor.predict(X_test)

In [None]:
import os

y_pred = train_gbdt_and_predict(X_train2, train_y, X_test2)

# Build the submission table directly: one row per test reaction, ids numbered
# from test1, yields formatted with four decimals.
df = pd.DataFrame({
    'rxnid': [f'test{idx+1}' for idx in range(len(y_pred))],
    'Yield': [f'{y_hat:.4f}' for y_hat in y_pred],
})

# Timestamp so that each run writes its own file
timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')

# Output file name
filename = f"../submit/submit_{timestamp}.txt"

# Make sure the output directory exists, so the long training run is not lost
# to a missing folder at the very last step.
os.makedirs(os.path.dirname(filename), exist_ok=True)

# Save as CSV text with the 'rxnid,Yield' header row and without the index
df.to_csv(filename, header=True, index=False)

print(f"提交文件已保存为 {filename}")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, r2_score
from sklearn.ensemble import GradientBoostingRegressor

# Hold out 20% of the training rows for validation.
# NOTE(review): this cell used `X_train_temp_3`, which is not defined anywhere in
# the notebook and raised NameError on a fresh kernel. X_train2 (the selected
# features used for the submission model) is assumed to be the intended input;
# confirm.
X_train, X_val, y_train, y_val = train_test_split(X_train2, train_y, test_size=0.2, random_state=42)
def train_gbdt(X_train, y_train, X_val, y_val, training_name):
    ''' Fit a GBDT regressor on the training split and print its R2 score on the validation split. '''

    estimator = GradientBoostingRegressor(
        n_estimators=1700,
        learning_rate=0.01,
        max_depth=20,
        max_features='sqrt',
        min_samples_split=5,
        min_samples_leaf=2,
    )
    estimator.fit(X_train, y_train)

    # Score the held-out predictions against the true validation targets
    r2 = r2_score(y_val, estimator.predict(X_val))

    print(f" 使用 GBDT 在 {training_name} 数据集上的 R2 分数: {r2}")

# Fit on the training split and report R2 on the held-out split
train_gbdt(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    training_name="lgbm",
)