In [4]:
# !pip install pysr
import pandas as pd
import numpy as np
from pysr import PySRRegressor
import matplotlib.pyplot as plt
import sympy
import pandas as pd

# Load the dose-response parameter table.
df = pd.read_csv("./dose_response_parameters.csv")

# Regression target. Y_SCALE is the hook for rescaling y when the loss becomes
# so small that precision suffers; a value of 1 leaves the target unchanged.
Y_SCALE = 1
y = df['P_Km_relative'].values * Y_SCALE

# Features: natural log of E and of k. LOG_EPS is added before the log so that
# a value of exactly zero does not produce -inf.
LOG_EPS = 1e-8
X = np.column_stack([
    np.log(df['E'].values + LOG_EPS),
    np.log(df['k'].values + LOG_EPS),
])

# It's a good practice to check the shapes of the data
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

# Let's inspect the first few rows to ensure the data is loaded correctly.
# The columns are labelled as logs because X holds the transformed values.
print("\nFirst 5 rows of the input data (X):")
print(pd.DataFrame(X, columns=['log(E)', 'log(k)']).head())
print("\nFirst 5 values of the output data (y):")
print(y[:5])

# Check for NaN or Inf values in X and y, and report the largest magnitude.
print(np.isnan(X).any(), np.isinf(X).any(), np.max(np.abs(X)))
print(np.isnan(y).any(), np.isinf(y).any(), np.max(np.abs(y)))


Shape of X: (21, 2)
Shape of y: (21,)

First 5 rows of the input data (X):
          E         k
0  7.808307  1.218405
1  7.608937  0.406188
2  7.877354  0.599790
3  7.805786  0.581259
4  7.623823  0.502339

First 5 values of the output data (y):
[0.94971704 0.75842551 1.29493945 1.33593881 1.16693506]
False False 7.984403532244132
False False 2.76022757841034


In [5]:
# Binary operators the symbolic search may combine terms with.
binary_ops = ["+", "*", "/", "-"]

# Unary operators, each mapped to the complexity it contributes to an
# expression. The key order doubles as the unary operator list.
unary_op_costs = {
    "exp": 1,
    "log": 1,
    "tanh": 2,
    "inv": 1,
    "square": 2,
    "cube": 2,
}

model = PySRRegressor(
    niterations=100,
    populations=20,
    binary_operators=binary_ops,
    unary_operators=list(unary_op_costs),
    complexity_of_operators=unary_op_costs,
    verbosity=1,
    # NOTE(review): PySR documents alpha as the simulated-annealing
    # temperature; confirm it has any effect without annealing enabled.
    alpha=1,
    # Budget for the search expressed as a number of expression evaluations
    # (not wall-clock time).
    max_evals=4000000,
)

# Fit one regression on the whole dataset: both features jointly predict y.
model.fit(X, y)

Compiling Julia backend...
[ Info: Started!



Expressions evaluated per second: 2.860e+04
Progress: 172 / 2000 total iterations (8.600%)
════════════════════════════════════════════════════════════════════════════════════════════════════
───────────────────────────────────────────────────────────────────────────────────────────────────
Complexity  Loss       Score      Equation
1           2.584e-01  0.000e+00  y = 1.0814
3           2.272e-01  6.441e-02  y = 0.59403 / x₁
4           2.193e-01  3.538e-02  y = inv(x₁ + 0.3437)
5           1.940e-01  1.229e-01  y = (30.549 / x₀) + -2.9451
7           1.675e-01  7.338e-02  y = ((29.569 / x₀) - 2.2001) - x₁
10          1.491e-01  3.876e-02  y = exp(square(tanh(log(log(x₀ + -5.0423)))))
13          1.223e-01  6.590e-02  y = exp(cube(square(square(log(log(log(x₀ - 2.0133)))))))
26          1.201e-01  1.404e-03  y = cube(cube(cube(square(square(inv(cube(tanh((square(squ...
                                      are(x₀ * 0.13575)) * x₀) + -0.014873))))))))
────────────────────────────────

[ Info: Final population:
[ Info: Results saved to:


0,1,2
,model_selection,'best'
,binary_operators,"['+', '*', ...]"
,unary_operators,"['exp', 'log', ...]"
,expression_spec,
,niterations,100
,populations,20
,population_size,27
,max_evals,4000000
,maxsize,30
,maxdepth,


  - outputs/20250919_202142_rNbjit/hall_of_fame.csv


In [6]:
# Every candidate equation PySR kept, as a pandas DataFrame.
all_equations = model.equations_

# Two rankings of the same table: lowest loss first, and highest score first
# (the score balances loss against complexity).
sorted_by_loss = all_equations.sort_values("loss")
sorted_by_score = all_equations.sort_values("score", ascending=False)

# Show the top 10 rows of each ranking under its heading.
for heading, ranking in (
    ("按损失排序的前10个方程:", sorted_by_loss),
    ("按得分排序的前10个方程:", sorted_by_score),
):
    print(heading)
    display(ranking.head(10))


按损失排序的前10个方程:


Unnamed: 0,complexity,loss,equation,score,sympy_format,lambda_format
20,30,0.042756,exp((x1 + inv(x0)) * (x1 * (((1.2361686 * log(...,0.003091,exp(x1*(x1 + 1/x0)*(1.2361686*log(log(x0 - 5.0...,PySRFunction(X=>exp(x1*(x1 + 1/x0)*(1.2361686*...
19,29,0.042889,exp(((((log(log(x0 + -5.085079)) / (exp(x1) + ...,0.000143,exp(x1*(-0.09926369950028 + 1.2600658*log(log(...,PySRFunction(X=>exp(x1*(-0.09926369950028 + 1....
18,28,0.042895,exp(((x1 + inv(x0)) * x1) * ((1.2361686 * (log...,0.003409,exp(x1*(x1 + 1/x0)*(1.2361686*log(log(x0 - 5.0...,PySRFunction(X=>exp(x1*(x1 + 1/x0)*(1.2361686*...
17,26,0.043188,exp((inv(x0) + (x1 * x1)) * ((log(log(x0 + -5....,0.000524,exp((x1**2 + 1/x0)*(log(log(x0 - 5.0327706))/(...,PySRFunction(X=>exp((x1**2 + 1/x0)*(log(log(x0...
16,24,0.043233,exp((x1 * ((log(log(x0 + -5.0820527)) / ((log(...,0.026523,exp(x1**2*(-0.11095682 + log(log(x0 - 5.082052...,PySRFunction(X=>exp(x1**2*(-0.11095682 + log(l...
15,22,0.045589,exp((((log(log(x0 + -5.0429616)) / (exp(x1) + ...,0.015414,0.934339811982409*exp(0.793209051752687*x1*log...,PySRFunction(X=>0.934339811982409*exp(0.793209...
14,20,0.047016,exp((log(log(-5.1858373 + x0)) / (log(square(x...,0.405816,exp(x1**2*log(log(x0 - 5.1858373))/(exp(x1) + ...,PySRFunction(X=>exp(x1**2*log(log(x0 - 5.18583...
13,19,0.070549,exp(tanh(log(log(-5.238939 + x0)) / tanh(inv(t...,0.004081,exp(tanh(log(log(x0 - 5.238939))/tanh(1/tanh(1...,PySRFunction(X=>exp(tanh(log(log(x0 - 5.238939...
12,18,0.070837,exp(tanh(log(log(x0 + -5.238939)) / tanh(tanh(...,0.019355,exp(tanh(log(log(x0 - 5.238939))/tanh(tanh(1/(...,PySRFunction(X=>exp(tanh(log(log(x0 - 5.238939...
11,16,0.073633,exp(tanh(inv(log(x1) + 0.62985474)) * tanh(log...,0.284352,exp(tanh(1/(log(x1) + 0.62985474))*tanh(log(lo...,PySRFunction(X=>exp(tanh(1/(log(x1) + 0.629854...


按得分排序的前10个方程:


Unnamed: 0,complexity,loss,equation,score,sympy_format,lambda_format
14,20,0.047016,exp((log(log(-5.1858373 + x0)) / (log(square(x...,0.405816,exp(x1**2*log(log(x0 - 5.1858373))/(exp(x1) + ...,PySRFunction(X=>exp(x1**2*log(log(x0 - 5.18583...
3,5,0.159156,inv(log(x0 + -4.786181)),0.320604,1/log(x0 - 4.786181),PySRFunction(X=>1/log(x0 - 4.786181))
11,16,0.073633,exp(tanh(inv(log(x1) + 0.62985474)) * tanh(log...,0.284352,exp(tanh(1/(log(x1) + 0.62985474))*tanh(log(lo...,PySRFunction(X=>exp(tanh(1/(log(x1) + 0.629854...
6,9,0.11387,exp(log(x1) * log(log(x0 + -4.9099708))),0.114832,exp(log(x1)*log(log(x0 - 4.9099708))),PySRFunction(X=>exp(log(x1)*log(log(x0 - 4.909...
5,8,0.127726,exp(log(log(x0 + -5.244945)) / -5.1728654),0.097309,log(x0 - 5.244945)**(-0.193316454744792),PySRFunction(X=>log(x0 - 5.244945)**(-0.193316...
1,3,0.227209,0.59402996 / x1,0.06441,0.59402996/x1,PySRFunction(X=>0.59402996/x1)
4,7,0.14078,exp(cube(inv(x0 + -5.24489))),0.061344,exp(0.00693091938063737/(0.190661767930309*x0 ...,PySRFunction(X=>exp(0.00693091938063737/(0.190...
7,11,0.101723,exp((x1 + log(x1)) * log(log(x0 + -5.1632667))),0.0564,exp((x1 + log(x1))*log(log(x0 - 5.1632667))),PySRFunction(X=>exp((x1 + log(x1))*log(log(x0 ...
2,4,0.219311,inv(x1 + 0.34370077),0.03538,1/(x1 + 0.34370077),PySRFunction(X=>1/(x1 + 0.34370077))
16,24,0.043233,exp((x1 * ((log(log(x0 + -5.0820527)) / ((log(...,0.026523,exp(x1**2*(-0.11095682 + log(log(x0 - 5.082052...,PySRFunction(X=>exp(x1**2*(-0.11095682 + log(l...


In [9]:
# Manually selected row (0-based) of the score-ranked table. Row 1 is chosen
# by hand rather than taking the top-scoring row.
SELECTED_RANK = 1
best_equation_info = sorted_by_score.iloc[SELECTED_RANK]
best_equation_str = best_equation_info['equation']

print("找到的最佳通用方程:")
print(f"Equation: {best_equation_str}")
print(f"Loss: {best_equation_info['loss']:.4f}")
print(f"Score: {best_equation_info['score']:.4f}")

best_equation = best_equation_info['sympy_format']

# x0 and x1 are PySR's names for the columns of X, in order; they are the
# model inputs, not P_Km_relative / P_Vmax_relative.
x0_sym, x1_sym = sympy.symbols('x0 x1')
_best_equation_fn = sympy.lambdify(
    [x0_sym, x1_sym],
    best_equation,
    modules=['numpy']
)


def eval_func(x0, x1):
    """Evaluate the selected equation at (x0, x1).

    The inputs are broadcast against each other and the result always has
    the broadcast shape. A bare lambdified function returns a scalar for a
    constant expression, and an array shaped like only one input when the
    expression omits the other variable; broadcasting here keeps downstream
    metrics and grid plots working for any selected equation.
    """
    x0, x1 = np.broadcast_arrays(x0, x1)
    return np.array(np.broadcast_to(_best_equation_fn(x0, x1), x0.shape))

找到的最佳通用方程:
Equation: inv(log(x0 + -4.786181))
Loss: 0.1592
Score: 0.3206


In [11]:
import plotly.graph_objects as go
from sklearn.metrics import r2_score

# R² of the selected equation on the fitted data points. broadcast_to guards
# against an equation that ignores an input (or is constant) and therefore
# returns a scalar.
y_pred = np.broadcast_to(eval_func(X[:, 0], X[:, 1]), y.shape)
r2 = r2_score(y, y_pred)

# Evaluation grid in the SAME feature space the model was fitted on (the
# columns of X). Building it from other dataframe columns evaluates the
# equation outside its domain and yields a NaN surface. The grid is padded
# additively by a fraction of each axis span so the surface extends slightly
# past the data.
GRID_PAD = 0.1
GRID_POINTS = 50

x0_lo, x0_hi = X[:, 0].min(), X[:, 0].max()
x1_lo, x1_hi = X[:, 1].min(), X[:, 1].max()
x0_pad = GRID_PAD * (x0_hi - x0_lo)
x1_pad = GRID_PAD * (x1_hi - x1_lo)
x0_vals = np.linspace(x0_lo - x0_pad, x0_hi + x0_pad, GRID_POINTS)
x1_vals = np.linspace(x1_lo - x1_pad, x1_hi + x1_pad, GRID_POINTS)

X0, X1 = np.meshgrid(x0_vals, x1_vals)
Y_pred = np.broadcast_to(eval_func(X0, X1), X0.shape)

fig = go.Figure()

# Observed data points.
fig.add_trace(go.Scatter3d(
    x=X[:, 0],
    y=X[:, 1],
    z=y,
    mode='markers',
    marker=dict(size=4, color='blue', opacity=0.6),
    name="Actual Data"
))

# Predicted surface over the grid.
fig.add_trace(go.Surface(
    x=x0_vals, y=x1_vals, z=Y_pred,
    colorscale='Viridis',
    opacity=0.7,
    name="Predicted Surface"
))

# Put the equation and R² in the title; the x/y axes are the log-transformed
# features held in X.
fig.update_layout(
    scene=dict(
        xaxis_title="log E (x0)",
        yaxis_title="log k (x1)",
        zaxis_title="Km (y)",
    ),
    title=f"Best Equation Fit<br>Eq: {best_equation_str}<br>R² = {r2:.3f}"
)

fig.show()


  return log(x0 - 4.786181)**(-1.0)


In [29]:
# Evaluation grid spanning the observed range of the model inputs (the columns
# of X). The grid and the scatter points must both live in the feature space
# the equation was fitted on, otherwise the contour is evaluated outside the
# equation's domain and the points do not line up with it.
x0_vals = np.linspace(X[:, 0].min(), X[:, 0].max(), 100)
x1_vals = np.linspace(X[:, 1].min(), X[:, 1].max(), 100)
X0, X1 = np.meshgrid(x0_vals, x1_vals)
# broadcast_to guards against an equation that ignores an input (or is
# constant) and therefore does not return a full grid.
Y_pred = np.broadcast_to(eval_func(X0, X1), X0.shape)

fig = go.Figure()

# Contour map of the predicted value.
fig.add_trace(go.Contour(
    x=x0_vals,
    y=x1_vals,
    z=Y_pred,
    colorscale="Viridis",
    contours=dict(showlabels=True),
    colorbar=dict(title="Predicted y"),
    opacity=0.8,
    name="Prediction Contour"
))

# Observed data points, placed at their feature coordinates and coloured by
# the observed target on the same colour scale.
fig.add_trace(go.Scatter(
    x=X[:, 0],
    y=X[:, 1],
    mode="markers",
    marker=dict(size=6, color=y, colorscale="Viridis", showscale=False),
    name="Actual Data"
))

fig.update_layout(
    xaxis_title="log E (x0)",
    yaxis_title="log k (x1)",
    title=f"Best Equation Contour Plot<br>Eq: {best_equation_str}<br>R² = {r2:.3f}"
)

fig.show()


Error in callback _flush_stdio (for post_execute), with arguments args (),kwargs {}:


UnicodeDecodeError: 'utf-8' codec can't decode bytes in position 4094-4095: unexpected end of data