In [None]:
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor as GBR
from matplotlib import rcParams
from sklearn.model_selection import GridSearchCV

# import seaborn as sns  # 引入 seaborn 以利用其色彩方案
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error

In [None]:
# Global font settings shared by every figure in the notebook
global_font_size = 14
font_family = 'serif'
font_serif = ['Times New Roman']
font_weight = 'bold'

# Push the font settings into matplotlib's global rcParams
rcParams.update({
    'font.family': font_family,
    'font.serif': font_serif,
    'font.weight': font_weight,
    'font.size': global_font_size,
})


data = pd.read_csv(r'training_set-411.csv')
# Features: every column except the first one and the last five.
# .copy() gives X its own storage so the column assignments below do not
# write into a slice of `data` (which triggers SettingWithCopyWarning on
# pandas versions without Copy-on-Write).
X = data.iloc[:, 1:-5].copy()
# Targets: the last five columns.
y = data.iloc[:, -5:]
# X.replace('none', np.nan, inplace=True)
symbol_columns = ['TM', 'modifications_ring', 'modifications_chain', 'Conjugate_structure','period','group_id']
for column in symbol_columns:
    # Replace each categorical column by integer codes; with pandas' default
    # settings missing values are encoded as -1. Indexing [0] keeps only the
    # codes and avoids rebinding the notebook-level `_` variable.
    X[column] = pd.factorize(X[column])[0]
# imputer = SimpleImputer(strategy='mean')
# data_filled = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

In [None]:
# Hyperparameter tuning
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between true and predicted values."""
    squared_error_mean = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error_mean)
def TrainSVM(X, y):
    """Tune an RBF-kernel SVR by grid search and report train/test metrics.

    The data is split 80/20 with ``random_state=42``. A 5-fold grid search
    over ``C`` and ``gamma`` is run on the training split only, so the
    held-out test rows never influence model selection or the refit model.
    The function prints the best parameters and the RMSE / R2 values, and
    shows a predicted-vs-true scatter plot.

    Parameters
    ----------
    X : features, in any form accepted by scikit-learn estimators.
    y : single target column aligned with ``X``.

    Returns
    -------
    float
        RMSE of the best model on the held-out test split.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # greater_is_better=False makes the scorer return -RMSE, so GridSearchCV
    # (which maximises the score) selects the lowest RMSE.
    rmse_scorer = make_scorer(rmse, greater_is_better=False)
    param_grid = {
        'C': [0.1, 1,100 , 1000],
        'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
        'kernel': ['rbf'],
    }

    # Initialise the regressor and search the grid
    svr = SVR()
    grid_search = GridSearchCV(estimator=svr, param_grid=param_grid, cv=5, scoring=rmse_scorer)
    # Fit on the training split only: fitting on all of X would let the refit
    # best estimator see the test rows and make the test metrics meaningless.
    grid_search.fit(X_train, y_train)

    # Report the best parameter combination and its score (a negated RMSE)
    print("Best parameters found: ", grid_search.best_params_)
    print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

    # Predict with the best estimator (refit on the training split)
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    y_out = best_model.predict(X_train)

    # RMSE between predictions and true labels
    train_rmse = rmse(y_train, y_out)
    test_rmse = rmse(y_test, y_pred)
    print("Test RMSE: {:.2f}".format(test_rmse))
    print("Training RMSE: {:.2f}".format(train_rmse))

    # R2
    trainR2 = r2_score(y_train, y_out)
    testR2 = r2_score(y_test, y_pred)
    print("SVR \n RMSE: ", test_rmse, "train R2: ", trainR2, "test R2: ", testR2)

    # Predicted-vs-true scatter plot with the metrics annotated bottom-right
    fig, ax1 = plt.subplots()
    ax1.scatter(y_train, y_out, color='turquoise', label='Train data',alpha=0.7)
    ax1.scatter(y_test, y_pred, color='deeppink', label='Test data',alpha=0.7)
    text_props = {'ha': 'right', 'va': 'bottom', 'fontsize': 12}
    # Inside an f-string `{2}` is a replacement field that renders as "2",
    # so the mathtext seen by matplotlib is `$^2$`.
    ax1.text(0.99, 0.16, f"Training R$^{2}$: {trainR2:.2f}", transform=ax1.transAxes, **text_props)
    ax1.text(0.99, 0.11, f"Training RMSE: {train_rmse:.2f}", transform=ax1.transAxes, **text_props)
    ax1.text(0.99, 0.06, f"Test R$^{2}$: {testR2:.2f}", transform=ax1.transAxes, **text_props)
    ax1.text(0.99, 0.01, f"Test RMSE: {test_rmse:.2f}", transform=ax1.transAxes, **text_props)
    ax1.legend(loc='upper left',  frameon=False)
    # y = x reference line spanning the current x-limits
    ax1.plot(ax1.get_xlim(), ax1.get_xlim(), color='gray', linewidth=2, linestyle='--')
    ax1.set_xlabel("E(DFT) (eV)", fontweight='bold')
    ax1.set_ylabel('E(ML) (eV)', fontweight='bold')
    plt.show()
    # Return the metric value, not the `rmse` function object.
    return test_rmse

In [None]:
# Hyperparameter tuning
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between true and predicted values."""
    squared_error_mean = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error_mean)
def TrainDeciTree(X, y):
    """Tune a decision-tree regressor by grid search and report train/test metrics.

    The data is split 80/20 with ``random_state=42``. A 5-fold grid search
    over depth, split/leaf sizes and ``max_features`` is run on the training
    split only, so the held-out test rows never influence model selection or
    the refit model. The function prints the best parameters and the
    RMSE / R2 values, and shows a predicted-vs-true scatter plot.

    Parameters
    ----------
    X : features, in any form accepted by scikit-learn estimators.
    y : single target column aligned with ``X``.

    Returns
    -------
    float
        RMSE of the best model on the held-out test split.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # greater_is_better=False makes the scorer return -RMSE, so GridSearchCV
    # (which maximises the score) selects the lowest RMSE.
    rmse_scorer = make_scorer(rmse, greater_is_better=False)
    param_grid = {
        'max_depth': [None, 5, 10, 20, 40],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    }

    # Initialise the decision-tree regressor; the seed makes the random
    # feature subsampling (max_features) reproducible between runs.
    dt = DecisionTreeRegressor(random_state=42)
    grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, scoring=rmse_scorer)
    # Fit on the training split only: fitting on all of X would let the refit
    # best estimator see the test rows and make the test metrics meaningless.
    grid_search.fit(X_train, y_train)

    # Report the best parameter combination and its score (a negated RMSE)
    print("Best parameters found: ", grid_search.best_params_)
    print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

    # Predict with the best estimator (refit on the training split)
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    y_out = best_model.predict(X_train)

    # RMSE between predictions and true labels
    train_rmse = rmse(y_train, y_out)
    test_rmse = rmse(y_test, y_pred)
    print("Test RMSE: {:.2f}".format(test_rmse))
    print("Training RMSE: {:.2f}".format(train_rmse))

    # R2
    trainR2 = r2_score(y_train, y_out)
    testR2 = r2_score(y_test, y_pred)
    print("Decision Tree Regression \n RMSE: ", test_rmse, "train R2: ", trainR2, "test R2: ", testR2)

    # Predicted-vs-true scatter plot with the metrics annotated bottom-right
    fig, ax1 = plt.subplots()
    ax1.scatter(y_train, y_out, color='turquoise', label='Train data',alpha=0.7)
    ax1.scatter(y_test, y_pred, color='deeppink', label='Test data',alpha=0.7)
    text_props = {'ha': 'right', 'va': 'bottom', 'fontsize': 12}
    # Inside an f-string `{2}` is a replacement field that renders as "2",
    # so the mathtext seen by matplotlib is `$^2$`.
    ax1.text(0.99, 0.16, f"Training R$^{2}$: {trainR2:.2f}", transform=ax1.transAxes, **text_props)
    ax1.text(0.99, 0.11, f"Training RMSE: {train_rmse:.2f}", transform=ax1.transAxes, **text_props)
    ax1.text(0.99, 0.06, f"Test R$^{2}$: {testR2:.2f}", transform=ax1.transAxes, **text_props)
    ax1.text(0.99, 0.01, f"Test RMSE: {test_rmse:.2f}", transform=ax1.transAxes, **text_props)
    ax1.legend(loc='upper left',  frameon=False)
    # y = x reference line spanning the current x-limits
    ax1.plot(ax1.get_xlim(), ax1.get_xlim(), color='gray', linewidth=2, linestyle='--')
    ax1.set_xlabel("E(DFT) (eV)", fontweight='bold')
    ax1.set_ylabel('E(ML) (eV)', fontweight='bold')
    plt.show()
    # Return the metric value, not the `rmse` function object.
    return test_rmse

In [None]:

# Hyperparameter tuning
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between true and predicted values."""
    squared_error_mean = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error_mean)
def TrainRandForest(X, y):
    """Tune a random-forest regressor by grid search and report train/test metrics.

    The data is split 80/20 with ``random_state=42``. A 5-fold grid search
    over the number of trees and split/leaf sizes is run on the training
    split only, so the held-out test rows never influence model selection or
    the refit model. The function prints the best parameters and the
    RMSE / R2 values, and shows a predicted-vs-true scatter plot.

    Parameters
    ----------
    X : features, in any form accepted by scikit-learn estimators.
    y : single target column aligned with ``X``.

    Returns
    -------
    float
        RMSE of the best model on the held-out test split.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # greater_is_better=False makes the scorer return -RMSE, so GridSearchCV
    # (which maximises the score) selects the lowest RMSE.
    rmse_scorer = make_scorer(rmse, greater_is_better=False)
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [20],
        'min_samples_split': [2, 5, 10, 15, 20],
        'min_samples_leaf': [1, 2, 4, 10]
    }

    # Initialise the forest; the seed makes bootstrap sampling reproducible.
    rf = RandomForestRegressor(random_state=42)
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring=rmse_scorer)
    # Fit on the training split only: fitting on all of X would let the refit
    # best estimator see the test rows and make the test metrics meaningless.
    grid_search.fit(X_train, y_train)

    # Report the best parameter combination and its score (a negated RMSE)
    print("Best parameters found: ", grid_search.best_params_)
    print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

    # Predict with the best estimator (refit on the training split)
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    y_out = best_model.predict(X_train)

    # RMSE between predictions and true labels
    train_rmse = rmse(y_train, y_out)
    test_rmse = rmse(y_test, y_pred)
    print("Test RMSE: {:.2f}".format(test_rmse))
    print("Training RMSE: {:.2f}".format(train_rmse))

    # R2
    trainR2 = r2_score(y_train, y_out)
    testR2 = r2_score(y_test, y_pred)
    print("Random Forest Regression \n RMSE: ", test_rmse, "train R2: ", trainR2, "test R2: ", testR2)

    # Predicted-vs-true scatter plot with the metrics annotated bottom-right
    fig, ax1 = plt.subplots()
    ax1.scatter(y_train, y_out, color='turquoise', label='Train data',alpha=0.7)
    ax1.scatter(y_test, y_pred, color='deeppink', label='Test data',alpha=0.7)
    text_props = {'ha': 'right', 'va': 'bottom', 'fontsize': 12}
    # Inside an f-string `{2}` is a replacement field that renders as "2",
    # so the mathtext seen by matplotlib is `$^2$`.
    ax1.text(0.99, 0.16, f"Training R$^{2}$: {trainR2:.2f}", transform=ax1.transAxes, **text_props)
    ax1.text(0.99, 0.11, f"Training RMSE: {train_rmse:.2f}", transform=ax1.transAxes, **text_props)
    ax1.text(0.99, 0.06, f"Test R$^{2}$: {testR2:.2f}", transform=ax1.transAxes, **text_props)
    ax1.text(0.99, 0.01, f"Test RMSE: {test_rmse:.2f}", transform=ax1.transAxes, **text_props)
    ax1.legend(loc='upper left',  frameon=False)
    # y = x reference line spanning the current x-limits
    ax1.plot(ax1.get_xlim(), ax1.get_xlim(), color='gray', linewidth=2, linestyle='--')
    ax1.set_xlabel("E(DFT) (eV)", fontweight='bold')
    ax1.set_ylabel('E(ML) (eV)', fontweight='bold')
    plt.show()
    # Return the metric value, not the `rmse` function object.
    return test_rmse



In [None]:
# Hyperparameter tuning
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between true and predicted values."""
    squared_error_mean = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error_mean)
def TrainBoost(X, y):
    """Tune a gradient-boosting regressor by grid search and report train/test metrics.

    The data is split 80/20 with ``random_state=42``. A 5-fold grid search
    over the number of estimators and the learning rate is run on the
    training split only, so the held-out test rows never influence model
    selection or the refit model. The function prints the best parameters
    and the RMSE / R2 values, and shows a predicted-vs-true scatter plot.

    Parameters
    ----------
    X : features, in any form accepted by scikit-learn estimators.
    y : single target column aligned with ``X``.

    Returns
    -------
    float
        RMSE of the best model on the held-out test split.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # greater_is_better=False makes the scorer return -RMSE, so GridSearchCV
    # (which maximises the score) selects the lowest RMSE.
    rmse_scorer = make_scorer(rmse, greater_is_better=False)
    param_grid = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.5]
#         'max_depth': [3, 5, 7],
#         'min_samples_split': [2, 5, 10],
#         'min_samples_leaf': [1, 2, 4]
    }

    # Initialise the booster with a fixed seed so repeated runs agree.
    gb = GBR(random_state=42)
    grid_search = GridSearchCV(estimator=gb, param_grid=param_grid, cv=5, scoring=rmse_scorer)
    # Fit on the training split only: fitting on all of X would let the refit
    # best estimator see the test rows and make the test metrics meaningless.
    grid_search.fit(X_train, y_train)

    # Report the best parameter combination and its score (a negated RMSE)
    print("Best parameters found: ", grid_search.best_params_)
    print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

    # Predict with the best estimator (refit on the training split)
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    y_out = best_model.predict(X_train)

    # RMSE between predictions and true labels
    train_rmse = rmse(y_train, y_out)
    test_rmse = rmse(y_test, y_pred)
    print("Test RMSE: {:.2f}".format(test_rmse))
    print("Training RMSE: {:.2f}".format(train_rmse))

    # R2
    trainR2 = r2_score(y_train, y_out)
    testR2 = r2_score(y_test, y_pred)
    print("GBR \n RMSE: ", test_rmse, "train R2: ", trainR2, "test R2: ", testR2)

    # Predicted-vs-true scatter plot with the metrics annotated bottom-right
    fig, ax1 = plt.subplots()
    ax1.scatter(y_train, y_out, color='turquoise', label='Train data',alpha=0.7)
    ax1.scatter(y_test, y_pred, color='deeppink', label='Test data',alpha=0.7)
    text_props = {'ha': 'right', 'va': 'bottom', 'fontsize': 12}
    # Inside an f-string `{2}` is a replacement field that renders as "2",
    # so the mathtext seen by matplotlib is `$^2$`.
    ax1.text(0.99, 0.16, f"Training R$^{2}$: {trainR2:.2f}", transform=ax1.transAxes, **text_props)
    ax1.text(0.99, 0.11, f"Training RMSE: {train_rmse:.2f}", transform=ax1.transAxes, **text_props)
    ax1.text(0.99, 0.06, f"Test R$^{2}$: {testR2:.2f}", transform=ax1.transAxes, **text_props)
    ax1.text(0.99, 0.01, f"Test RMSE: {test_rmse:.2f}", transform=ax1.transAxes, **text_props)
    ax1.legend(loc='upper left',  frameon=False)
    # y = x reference line spanning the current x-limits
    ax1.plot(ax1.get_xlim(), ax1.get_xlim(), color='gray', linewidth=2, linestyle='--')
    ax1.set_xlabel("E(DFT) (eV)", fontweight='bold')
    ax1.set_ylabel('E(ML) (eV)', fontweight='bold')
    plt.show()
    # Return the metric value, not the `rmse` function object.
    return test_rmse


In [None]:
# Results table skeleton: one column per target plus a leading 'Method' column.
result = pd.DataFrame(columns=y.columns)
result.insert(
    0,
    'Method',
    [
        'SVM',
        'Linear Regression',
        'Decision Tree Regression',
        'Random Forest Regression',
        'Gradient Boosting Regression',
        'Mean MSE',
    ],
)
# Position of the target column to model (the loop over all targets is disabled).
# for i in range(5):
i = 4
print(y.columns[i], "as label")
# Append the chosen target as the last column next to the features.
data1 = pd.concat([X, y.iloc[:, i]], axis=1)

# Drop the samples whose label is missing
label_column = data1.columns[-1]
data_cleaned = data1.dropna(subset=[label_column])

# Separate features (all but the last column) from the label (last column)
X_cleaned = data_cleaned.iloc[:, :-1]
y_cleaned = data_cleaned.iloc[:, -1]



In [None]:
# Run the SVR grid search, metric printout and predicted-vs-true plot on the cleaned data.
# NOTE(review): the variable name says MSE, but TrainSVM computes and prints RMSE values.
mseSVM = TrainSVM(X_cleaned, y_cleaned)

In [None]:
# Run the decision-tree grid search, metric printout and predicted-vs-true plot on the cleaned data.
# NOTE(review): the variable name says MSE, but TrainDeciTree computes and prints RMSE values.
mseDeciTree = TrainDeciTree(X_cleaned, y_cleaned)

In [None]:
# Run the random-forest grid search, metric printout and predicted-vs-true plot on the cleaned data.
# NOTE(review): the variable name says MSE, but TrainRandForest computes and prints RMSE values.
mseRandForest = TrainRandForest(X_cleaned, y_cleaned)

In [None]:
# Run the gradient-boosting grid search, metric printout and predicted-vs-true plot on the cleaned data.
# NOTE(review): the variable name says MSE, but TrainBoost computes and prints RMSE values.
mseBoost = TrainBoost(X_cleaned, y_cleaned)