In [25]:
# ==========================
# Основные библиотеки
# ==========================
import numpy as np
import pandas as pd
import warnings

# ==========================
# Визуализация
# ==========================
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# ==========================
# Предобработка данных
# ==========================
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, RobustScaler

# ==========================
# Модели и обучение
# ==========================
from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import (
    RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor,
    StackingRegressor, AdaBoostRegressor
)
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

# ==========================
# Метрики
# ==========================
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score, median_absolute_error
)

# ==========================
# Библиотеки для бустинга
# ==========================
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor, Pool

# ==========================
# Статистика
# ==========================
from scipy.stats import f_oneway


In [26]:
df = pd.read_csv("df.csv")
df_bin = pd.read_csv("df_bin.csv")
df_cut = pd.read_csv("df_cut.csv")
df_cut_bin = pd.read_csv("df_cut_bin.csv")

огрешности двух моделей складываются при вычислении разности

При вычитании предсказаний двух моделей ошибки могут усиливаться и приводить к сильному ухудшению качества.

Сильная корреляция IC50 и CC50?

Если IC50 и CC50 в данных коррелируют, но модели обучены отдельно, то простое вычитание может не давать точного результата для SI.

Погрешности в данных

Если в данных есть шум или выбросы, они влияют сильнее при вычислении разности.

Сравниваем разные модели

In [29]:
!pip install catboost



In [30]:
df = df.drop(columns=['IC50, mM'])
df_bin = df_bin.drop(columns=['IC50, mM'])
df_cut = df_cut.drop(columns=['IC50, mM'])
df_cut_bin = df_cut_bin.drop(columns=['IC50, mM'])
df = df.drop(columns=['SI'])
df_bin = df_bin.drop(columns=['SI'])
df_cut = df_cut.drop(columns=['SI'])
df_cut_bin = df_cut_bin.drop(columns=['SI'])
df = df.drop(columns=['log_IC50'])
df_bin = df_bin.drop(columns=['log_IC50'])
df_cut = df_cut.drop(columns=['log_IC50'])
df_cut_bin = df_cut_bin.drop(columns=['log_IC50'])
df = df.drop(columns=['log_CC50'])
df_bin = df_bin.drop(columns=['log_CC50'])
df_cut = df_cut.drop(columns=['log_CC50'])
df_cut_bin = df_cut_bin.drop(columns=['log_CC50'])
df = df.drop(columns=['CC50, mM'])
df_bin = df_bin.drop(columns=['CC50, mM'])
df_cut = df_cut.drop(columns=['CC50, mM'])
df_cut_bin = df_cut_bin.drop(columns=['CC50, mM'])

In [None]:
warnings.filterwarnings("ignore")

# Очистка имён колонок и замена inf на nan
for d in [df, df_cut, df_bin, df_cut_bin]:
    d.columns = d.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
    d.replace([np.inf, -np.inf], np.nan, inplace=True)

estimators = [
    ('rf', RandomForestRegressor(random_state=42)),
    ('xgb', XGBRegressor(random_state=42, verbosity=0)),
    ('gb', GradientBoostingRegressor(random_state=42)),
    ('cat', CatBoostRegressor(verbose=0, random_state=42)),
]

final_estimator = LinearRegression()
stacking_model = StackingRegressor(
    estimators=estimators,
    final_estimator=final_estimator,
    cv=5,
    n_jobs=-1
)

models = {
    "KNN": KNeighborsRegressor(),
    "RandomForest": RandomForestRegressor(random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
    "HistGradientBoosting": HistGradientBoostingRegressor(random_state=42),
    "AdaBoost": AdaBoostRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42, verbosity=0),
    "LightGBM": LGBMRegressor(random_state=42, min_gain_to_split=0, max_depth=6, num_leaves=31, verbose=-1),
    "CatBoost": CatBoostRegressor(
        verbose=0,
        random_state=42,
        bagging_temperature=0,
        depth=5,
        iterations=300,
        l2_leaf_reg=3,
        learning_rate=0.05
    ),
    "Stacking": stacking_model
}


def evaluate_models(df_input, y_col='log_SI'):
    df_eval = df_input.copy()
    df_eval = df_eval.drop(columns=['Unnamed_0'], errors='ignore')

    X = df_eval.drop(columns=[y_col])
    y = df_eval[y_col]

    X.replace([np.inf, -np.inf], np.nan, inplace=True)

    imputer = SimpleImputer(strategy='mean')
    X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

    scaler = RobustScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X_imputed), columns=X.columns)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    results = []

    for name, model in tqdm(models.items(), desc=f"Cross-validation ({df_input.shape[1]} features)"):
        try:
            cv_results = cross_validate(
                model, X_scaled, y,
                cv=kf,
                scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2', 'neg_median_absolute_error'],
                return_train_score=False,
                n_jobs=-1
            )

            mae_mean = -np.mean(cv_results['test_neg_mean_absolute_error'])
            mse_mean = -np.mean(cv_results['test_neg_mean_squared_error'])
            rmse_mean = np.sqrt(mse_mean)
            r2_mean = np.mean(cv_results['test_r2'])
            medae_mean = -np.mean(cv_results['test_neg_median_absolute_error'])

            results.append({
                "Model": name,
                "MAE": mae_mean,
                "MSE": mse_mean,
                "RMSE": rmse_mean,
                "MedAE": medae_mean,
                "R²": r2_mean
            })
        except Exception as e:
            print(f"Error in model {name}: {e}")
    return pd.DataFrame(results).sort_values(by="RMSE"), X_scaled, y, imputer, scaler

# Оценка всех датасетов и сбор результатов
print("=== Evaluation on df ===")
results_df, X_df_scaled, y_df, imputer_df, scaler_df = evaluate_models(df)
results_df['Dataset'] = 'df'

print("=== Evaluation on df_cut ===")
results_df_cut, X_cut_scaled, y_cut, imputer_cut, scaler_cut = evaluate_models(df_cut)
results_df_cut['Dataset'] = 'df_cut'

print("=== Evaluation on df_bin ===")
results_df_bin, X_bin_scaled, y_bin, imputer_bin, scaler_bin = evaluate_models(df_bin)
results_df_bin['Dataset'] = 'df_bin'

print("=== Evaluation on df_cut_bin ===")
results_df_cut_bin, X_cut_bin_scaled, y_cut_bin, imputer_cut_bin, scaler_cut_bin = evaluate_models(df_cut_bin)
results_df_cut_bin['Dataset'] = 'df_cut_bin'

comparison_df = pd.concat([results_df, results_df_cut, results_df_bin, results_df_cut_bin], ignore_index=True)

# Визуализация RMSE
plt.figure(figsize=(14, 7))
sns.barplot(data=comparison_df, x='RMSE', y='Model', hue='Dataset', palette='Set2')
plt.title("Model RMSE Comparison Across Datasets")
plt.xlabel("RMSE")
plt.ylabel("Model")
plt.legend(title="Dataset")
plt.tight_layout()
plt.show()

# Визуализация R²
plt.figure(figsize=(14, 7))
sns.barplot(data=comparison_df, x='R²', y='Model', hue='Dataset', palette='Set1')
plt.title("Model R² Comparison Across Datasets")
plt.xlabel("R²")
plt.ylabel("Model")
plt.legend(title="Dataset")
plt.tight_layout()
plt.show()

print("\nModel metrics comparison:")
print(comparison_df.pivot_table(index="Model", columns="Dataset", values=["RMSE", "R²", "MAE"]).round(4))

# --- Визуализация важности признаков для CatBoost на df (пример) ---
cat_model = models['CatBoost']
cat_model.fit(X_df_scaled, y_df)

# Получение важности признаков
feature_importances = cat_model.get_feature_importance(Pool(X_df_scaled, label=y_df))
feat_imp_df = pd.DataFrame({
    'Feature': X_df_scaled.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

plt.figure(figsize=(10, 8))
sns.barplot(x='Importance', y='Feature', data=feat_imp_df.head(20))
plt.title("Top 20 Feature Importances - CatBoost (df)")
plt.tight_layout()
plt.show()

# --- График остатков для лучшей модели на df ---
# Определим лучшую модель по RMSE на df
best_model_name = results_df.loc[results_df['RMSE'].idxmin(), 'Model']
print(f"Best model on df by RMSE: {best_model_name}")

best_model = models[best_model_name]

# Для графика остатков нужна обучающая и тестовая выборка
X_train, X_test, y_train, y_test = train_test_split(X_df_scaled, y_df, test_size=0.2, random_state=42)
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
residuals = y_test - y_pred

plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_pred, y=residuals)
plt.axhline(0, color='red', linestyle='--')
plt.xlabel('Predicted values')
plt.ylabel('Residuals (Actual - Predicted)')
plt.title(f'Residuals plot for best model: {best_model_name}')
plt.tight_layout()
plt.show()

# --- Сохраняем результаты в Excel и PDF ---
from matplotlib.backends.backend_pdf import PdfPages

with PdfPages('regression_evaluation_report_SI.pdf') as pdf:
    # RMSE Plot
    plt.figure(figsize=(14, 7))
    sns.barplot(data=comparison_df, x='RMSE', y='Model', hue='Dataset', palette='Set2')
    plt.title("Model RMSE Comparison Across Datasets")
    plt.xlabel("RMSE")
    plt.ylabel("Model")
    plt.legend(title="Dataset")
    plt.tight_layout()
    pdf.savefig()
    plt.close()

    # R² Plot
    plt.figure(figsize=(14, 7))
    sns.barplot(data=comparison_df, x='R²', y='Model', hue='Dataset', palette='Set1')
    plt.title("Model R² Comparison Across Datasets")
    plt.xlabel("R²")
    plt.ylabel("Model")
    plt.legend(title="Dataset")
    plt.tight_layout()
    pdf.savefig()
    plt.close()

    # Feature importance CatBoost
    plt.figure(figsize=(10, 8))
    sns.barplot(x='Importance', y='Feature', data=feat_imp_df.head(20))
    plt.title("Top 20 Feature Importances - CatBoost (df)")
    plt.tight_layout()
    pdf.savefig()
    plt.close()

    # Residuals plot best model
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=y_pred, y=residuals)
    plt.axhline(0, color='red', linestyle='--')
    plt.xlabel('Predicted values')
    plt.ylabel('Residuals (Actual - Predicted)')
    plt.title(f'Residuals plot for best model: {best_model_name}')
    plt.tight_layout()
    pdf.savefig()
    plt.close()

    # Табличное сравнение
    fig, ax = plt.subplots(figsize=(14, 7))
    ax.axis('off')
    ax.axis('tight')

    table_data = comparison_df.pivot_table(
        index="Model",
        columns="Dataset",
        values=["RMSE", "R²", "MAE"]
    ).round(4)

    tbl = ax.table(
        cellText=table_data.values,
        colLabels=[f'{metric}_{ds}' for metric, ds in table_data.columns],
        rowLabels=table_data.index,
        loc='center',
        cellLoc='center'
    )
    tbl.auto_set_font_size(False)
    tbl.set_fontsize(10)
    tbl.scale(1.2, 1.2)
    ax.set_title("Model metrics comparison")
    pdf.savefig()
    plt.close()

with pd.ExcelWriter("regression_comparison_metrics_SI.xlsx") as writer:
    results_df.to_excel(writer, sheet_name="df", index=False)
    results_df_cut.to_excel(writer, sheet_name="df_cut", index=False)
    results_df_bin.to_excel(writer, sheet_name="df_bin", index=False)
    results_df_cut_bin.to_excel(writer, sheet_name="df_cut_bin", index=False)
    comparison_df.to_excel(writer, sheet_name="Comparison", index=False)


=== Evaluation on df ===


Cross-validation (228 features):  11%|█         | 1/9 [00:03<00:30,  3.76s/it]

In [25]:
si_columns = [col for col in df_bin.columns if 'SI' in col]

if si_columns:
    print("Найдены колонки, содержащие 'SI':")
    for col in si_columns:
        print(f"- {col}")
else:
    print("Колонки, содержащие 'SI', не найдены.")


Найдены колонки, содержащие 'SI':
- log_SI


In [45]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, median_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostRegressor, Pool
import joblib

# === Подготовка данных ===
# df_bin уже должен быть загружен на этом этапе

# === Добавление признака log_IC50 ===
ic50_model = joblib.load('regression_IC50_catboost_final_model_bin.pkl')

X_ic50 = df_bin.copy()
X_ic50.columns = X_ic50.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
X_ic50.replace([np.inf, -np.inf], np.nan, inplace=True)

if 'log_IC50' in X_ic50.columns:
    X_ic50.drop(columns=['log_IC50'], inplace=True)

ic50_imputer = SimpleImputer(strategy='most_frequent')
X_ic50_imputed = pd.DataFrame(ic50_imputer.fit_transform(X_ic50), columns=X_ic50.columns)

df_bin['log_IC50'] = ic50_model.predict(X_ic50_imputed)

# === Добавление признака log_CC50 ===
cc50_model = joblib.load('regression_CC50_lightgbm_final_model_bin.pkl')

X_cc50 = df_bin.copy()
X_cc50.columns = X_cc50.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
X_cc50.replace([np.inf, -np.inf], np.nan, inplace=True)

# Удаляем признаки, которых не было при обучении CC50 модели
for col_to_drop in ['log_IC50', 'log_CC50', 'log_SI']:
    if col_to_drop in X_cc50.columns:
        X_cc50.drop(columns=[col_to_drop], inplace=True)

cc50_imputer = SimpleImputer(strategy='most_frequent')
X_cc50_imputed = pd.DataFrame(cc50_imputer.fit_transform(X_cc50), columns=X_cc50.columns)

df_bin['log_CC50'] = cc50_model.predict(X_cc50_imputed)

df_bin['log_2'] = df_bin['log_IC50'] - df_bin['log_CC50']

df_bin['S_2'] = 10**df_bin['log_2']

print("Признаки log_IC50 и log_CC50 добавлены в df_bin.")

# === Обучение модели для log_SI ===
df_copy = df_bin.copy()
df_copy.columns = df_copy.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
df_copy.replace([np.inf, -np.inf], np.nan, inplace=True)
if 'Unnamed_0' in df_copy.columns:
    df_copy.drop(columns=['Unnamed_0'], inplace=True)

X = df_copy.drop(columns=['log_SI'])
y = df_copy['log_SI']

# Импутация
imputer = SimpleImputer(strategy='mean')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Параметры CatBoost
catboost_params = {
    'bagging_temperature': 0.13,
    'depth': 5,
    'iterations': 626,
    'l2_leaf_reg': 4.2,
    'learning_rate': 0.04,
    'verbose': 0,
    'random_state': 42,
    'loss_function': 'RMSE'
}

# Кросс-валидация
kf = KFold(n_splits=6, shuffle=True, random_state=42)
mae_list, mse_list, rmse_list, medae_list, r2_list = [], [], [], [], []

print("Running 6-fold cross-validation...")

for fold, (train_index, val_index) in enumerate(kf.split(X_imputed), 1):
    X_train_cv, X_val_cv = X_imputed.iloc[train_index], X_imputed.iloc[val_index]
    y_train_cv, y_val_cv = y.iloc[train_index], y.iloc[val_index]

    model_cv = CatBoostRegressor(**catboost_params)
    model_cv.fit(X_train_cv, y_train_cv, verbose=0)

    y_pred_cv = model_cv.predict(X_val_cv)

    mae_list.append(mean_absolute_error(y_val_cv, y_pred_cv))
    mse_list.append(mean_squared_error(y_val_cv, y_pred_cv))
    rmse_list.append(np.sqrt(mse_list[-1]))
    medae_list.append(median_absolute_error(y_val_cv, y_pred_cv))
    r2_list.append(r2_score(y_val_cv, y_pred_cv))

    print(f"Fold {fold} R²: {r2_list[-1]:.4f}")

print("\nCross-validation results:")
print(f"MAE: {np.mean(mae_list):.4f} ± {np.std(mae_list):.4f}")
print(f"RMSE: {np.mean(rmse_list):.4f} ± {np.std(rmse_list):.4f}")
print(f"R²: {np.mean(r2_list):.4f} ± {np.std(r2_list):.4f}")

# Оценка на тестовой выборке
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.25, random_state=42)

cat_model_eval = CatBoostRegressor(**catboost_params)
cat_model_eval.fit(X_train, y_train)

y_pred = cat_model_eval.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
medae = median_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nTest set performance:")
print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MedAE: {medae:.4f}")
print(f"R²: {r2:.4f}")

# График остатков
residuals = y_test - y_pred
plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_pred, y=residuals)
plt.axhline(0, color='red', linestyle='--')
plt.xlabel('Predicted values')
plt.ylabel('Residuals (Actual - Predicted)')
plt.title(f'log(SI): Residuals plot - CatBoost evaluation model (Test set R²={r2:.4f})')
plt.tight_layout()
plt.savefig("log(SI) catboost_residuals_plot.png")
plt.close()

# Важность признаков
cat_model_final = CatBoostRegressor(**catboost_params)
cat_model_final.fit(X_imputed, y)

feature_importances = cat_model_final.get_feature_importance(Pool(X_imputed, label=y))
feat_imp_df = pd.DataFrame({
    'Feature': X_imputed.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

print("\nTop 30 важнейших признаков:")
for i, row in feat_imp_df.head(30).iterrows():
    print(f"{row['Feature']}: {row['Importance']:.4f}")

plt.figure(figsize=(10, 12))
sns.barplot(x='Importance', y='Feature', data=feat_imp_df.head(30))
plt.title("log(SI): Top 30 Feature Importances CatBoost Final Model")
plt.tight_layout()
plt.savefig("log(SI) catboost_feature_importance_top30.png")
plt.close()

# Сохранение модели
joblib.dump(cat_model_final, 'catboost_final_model_SI.pkl')
print("Финальная модель обучена на всех данных и сохранена в 'catboost_final_model_SI.pkl'")
print("Графики сохранены в 'log(SI) catboost_residuals_plot.png' и 'log(SI) catboost_feature_importance_top30.png'")


Признаки log_IC50 и log_CC50 добавлены в df_bin.
Running 6-fold cross-validation...
Fold 1 R²: 0.4905
Fold 2 R²: 0.5167
Fold 3 R²: 0.5772
Fold 4 R²: 0.5689
Fold 5 R²: 0.5844
Fold 6 R²: 0.5413

Cross-validation results:
MAE: 0.3366 ± 0.0153
RMSE: 0.5032 ± 0.0300
R²: 0.5465 ± 0.0340

Test set performance:
MAE: 0.3581
MSE: 0.2963
RMSE: 0.5443
MedAE: 0.2375
R²: 0.5052

Top 30 важнейших признаков:
S_2: 34.2444
log_2: 27.4106
log_IC50: 8.9237
log_CC50: 1.5336
VSA_EState8: 1.2799
PEOE_VSA9: 0.8182
FpDensityMorgan3: 0.6371
SlogP_VSA5: 0.5134
EState_VSA3: 0.4939
EState_VSA4: 0.4870
VSA_EState6: 0.4391
EState_VSA7: 0.4262
MinAbsEStateIndex: 0.4186
VSA_EState7: 0.4085
qed: 0.4048
EState_VSA2: 0.3847
VSA_EState3: 0.3834
AvgIpc: 0.3825
PEOE_VSA7: 0.3781
EState_VSA8: 0.3722
BCUT2D_MRLOW: 0.3698
FpDensityMorgan1: 0.3594
VSA_EState1: 0.3569
MolWt_TPSA_ratio: 0.3529
Chi_std: 0.3445
EState_VSA_std: 0.3412
Log_Flexibility: 0.3363
BCUT2D_MRHI: 0.3273
VSA_EState4: 0.3157
SMR_VSA_max: 0.3056
Финальная модел

- по важным признакам только

In [66]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, median_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostRegressor, Pool
import joblib

# === Подготовка данных ===
# df_bin уже должен быть загружен

# === Добавление признаков log_IC50, log_CC50 и расчет log_2, S_2 ===
ic50_model = joblib.load('regression_IC50_catboost_final_model_bin.pkl')
cc50_model = joblib.load('regression_CC50_lightgbm_final_model_bin.pkl')

def add_predicted_feature(df, model, drop_cols=None, imputer_strategy='most_frequent'):
    df_tmp = df.copy()
    df_tmp.columns = df_tmp.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
    df_tmp.replace([np.inf, -np.inf], np.nan, inplace=True)
    if drop_cols:
        for c in drop_cols:
            if c in df_tmp.columns:
                df_tmp.drop(columns=[c], inplace=True)
    imputer = SimpleImputer(strategy=imputer_strategy)
    X_imputed = pd.DataFrame(imputer.fit_transform(df_tmp), columns=df_tmp.columns)
    preds = model.predict(X_imputed)
    return preds

df_bin['log_IC50'] = add_predicted_feature(df_bin, ic50_model, drop_cols=['log_IC50'])
df_bin['log_CC50'] = add_predicted_feature(df_bin, cc50_model, drop_cols=['log_IC50', 'log_CC50', 'log_SI'])
df_bin['log_2'] = df_bin['log_IC50'] - df_bin['log_CC50']
df_bin['S_2'] = 10**df_bin['log_2']

print("Признаки log_IC50 и log_CC50 добавлены в df_bin.")

# === Подготовка датасета для log_SI ===
df_copy = df_bin.copy()
df_copy.columns = df_copy.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
df_copy.replace([np.inf, -np.inf], np.nan, inplace=True)
if 'Unnamed_0' in df_copy.columns:
    df_copy.drop(columns=['Unnamed_0'], inplace=True)

X = df_copy.drop(columns=['log_SI'])
y = df_copy['log_SI']

# Импутация
imputer = SimpleImputer(strategy='mean')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Параметры CatBoost
catboost_params = {
    'bagging_temperature': 0.13,
    'depth': 5,
    'iterations': 626,
    'l2_leaf_reg': 4.2,
    'learning_rate': 0.04,
    'verbose': 0,
    'random_state': 42,
    'loss_function': 'RMSE'
}

# Обучаем модель на всех признаках, чтобы получить важность
model_full = CatBoostRegressor(**catboost_params)
model_full.fit(X_imputed, y)

feature_importances = model_full.get_feature_importance(Pool(X_imputed, label=y))
feat_imp_df = pd.DataFrame({
    'Feature': X_imputed.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

# Выбираем топ-50 признаков
top_50_features = feat_imp_df.head(11)['Feature'].tolist()
print(f"Используем топ-50 признаков по важности для обучения модели:")

for f in top_50_features:
    print(f"- {f}")

# Отбираем только топ-50 признаков
X_top50 = X_imputed[top_50_features]

# Кросс-валидация на топ-50 признаках
kf = KFold(n_splits=6, shuffle=True, random_state=42)
mae_list, mse_list, rmse_list, medae_list, r2_list = [], [], [], [], []

print("\nRunning 6-fold cross-validation on top-50 features...")

for fold, (train_index, val_index) in enumerate(kf.split(X_top50), 1):
    X_train_cv, X_val_cv = X_top50.iloc[train_index], X_top50.iloc[val_index]
    y_train_cv, y_val_cv = y.iloc[train_index], y.iloc[val_index]

    model_cv = CatBoostRegressor(**catboost_params)
    model_cv.fit(X_train_cv, y_train_cv, verbose=0)

    y_pred_cv = model_cv.predict(X_val_cv)

    mae_list.append(mean_absolute_error(y_val_cv, y_pred_cv))
    mse_list.append(mean_squared_error(y_val_cv, y_pred_cv))
    rmse_list.append(np.sqrt(mse_list[-1]))
    medae_list.append(median_absolute_error(y_val_cv, y_pred_cv))
    r2_list.append(r2_score(y_val_cv, y_pred_cv))

    print(f"Fold {fold} R²: {r2_list[-1]:.4f}")

print("\nCross-validation results (top-50 features):")
print(f"MAE: {np.mean(mae_list):.4f} ± {np.std(mae_list):.4f}")
print(f"RMSE: {np.mean(rmse_list):.4f} ± {np.std(rmse_list):.4f}")
print(f"R²: {np.mean(r2_list):.4f} ± {np.std(r2_list):.4f}")

# Разделение на train/test с топ-50 признаками
X_train, X_test, y_train, y_test = train_test_split(X_top50, y, test_size=0.25, random_state=42)

# Обучение модели на train с топ-50 признаками
cat_model_eval = CatBoostRegressor(**catboost_params)
cat_model_eval.fit(X_train, y_train)

# Предсказания и оценка на тесте
y_pred = cat_model_eval.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
medae = median_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nTest set performance (top-50 features):")
print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MedAE: {medae:.4f}")
print(f"R²: {r2:.4f}")

# График остатков
residuals = y_test - y_pred
plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_pred, y=residuals)
plt.axhline(0, color='red', linestyle='--')
plt.xlabel('Predicted values')
plt.ylabel('Residuals (Actual - Predicted)')
plt.title(f'log(SI): Residuals plot - CatBoost evaluation model (Test set R²={r2:.4f})')
plt.tight_layout()
plt.savefig("log(SI)_catboost_residuals_plot_top50.png")
plt.close()

# Финальная модель на всех данных с топ-50 признаками
cat_model_final = CatBoostRegressor(**catboost_params)
cat_model_final.fit(X_top50, y)

feature_importances_final = cat_model_final.get_feature_importance(Pool(X_top50, label=y))
feat_imp_df_final = pd.DataFrame({
    'Feature': X_top50.columns,
    'Importance': feature_importances_final
}).sort_values(by='Importance', ascending=False)

print("\nTop 30 важнейших признаков финальной модели:")
print(feat_imp_df_final.head(30))

plt.figure(figsize=(10, 12))
sns.barplot(x='Importance', y='Feature', data=feat_imp_df_final.head(30))
plt.title("log(SI): Top 30 Feature Importances CatBoost Final Model (top-50 features)")
plt.tight_layout()
plt.savefig("log(SI)_catboost_feature_importance_top30_top50.png")
plt.close()

# Сохраняем финальную модель
joblib.dump(cat_model_final, 'catboost_final_model_SI_top50.pkl')
print("Финальная модель обучена на топ-50 признаках и сохранена в 'catboost_final_model_SI_top50.pkl'")
print("Графики сохранены с суффиксом '_top50'.")


Признаки log_IC50 и log_CC50 добавлены в df_bin.
Используем топ-50 признаков по важности для обучения модели:
- S_2
- log_2
- log_IC50
- log_CC50
- VSA_EState8
- PEOE_VSA9
- FpDensityMorgan3
- SlogP_VSA5
- EState_VSA3
- EState_VSA4
- VSA_EState6

Running 6-fold cross-validation on top-50 features...
Fold 1 R²: 0.4976
Fold 2 R²: 0.5242
Fold 3 R²: 0.5947
Fold 4 R²: 0.5888
Fold 5 R²: 0.6010
Fold 6 R²: 0.5648

Cross-validation results (top-50 features):
MAE: 0.3294 ± 0.0202
RMSE: 0.4945 ± 0.0317
R²: 0.5618 ± 0.0385

Test set performance (top-50 features):
MAE: 0.3494
MSE: 0.2788
RMSE: 0.5280
MedAE: 0.2318
R²: 0.5344

Top 30 важнейших признаков финальной модели:
             Feature  Importance
0                S_2   31.241017
1              log_2   29.671547
2           log_IC50   11.835889
3           log_CC50    4.979729
4        VSA_EState8    4.753341
5          PEOE_VSA9    3.838632
6   FpDensityMorgan3    3.292648
10       VSA_EState6    2.956686
8        EState_VSA3    2.782749
9   

In [11]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, median_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
import joblib

# === Подготовка данных ===
# df_bin уже должен быть загружен

# === Добавление признаков log_IC50, log_CC50 и расчет log_2, S_2 ===
ic50_model = joblib.load('regression_IC50_catboost_final_model_bin.pkl')
cc50_model = joblib.load('regression_CC50_lightgbm_final_model_bin.pkl')

def add_predicted_feature(df, model, drop_cols=None, imputer_strategy='most_frequent'):
    df_tmp = df.copy()
    df_tmp.columns = df_tmp.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
    df_tmp.replace([np.inf, -np.inf], np.nan, inplace=True)
    if drop_cols:
        for c in drop_cols:
            if c in df_tmp.columns:
                df_tmp.drop(columns=[c], inplace=True)
    imputer = SimpleImputer(strategy=imputer_strategy)
    X_imputed = pd.DataFrame(imputer.fit_transform(df_tmp), columns=df_tmp.columns)
    preds = model.predict(X_imputed)
    return preds

df_bin['log_IC50'] = add_predicted_feature(df_bin, ic50_model, drop_cols=['log_IC50'])
df_bin['log_CC50'] = add_predicted_feature(df_bin, cc50_model, drop_cols=['log_IC50', 'log_CC50', 'log_SI'])
df_bin['log_2'] = df_bin['log_IC50'] - df_bin['log_CC50']
df_bin['S_2'] = 10**df_bin['log_2']

print("Признаки log_IC50 и log_CC50 добавлены в df_bin.")

# === Подготовка датасета для log_SI ===
df_copy = df_bin.copy()
df_copy.columns = df_copy.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
df_copy.replace([np.inf, -np.inf], np.nan, inplace=True)
if 'Unnamed_0' in df_copy.columns:
    df_copy.drop(columns=['Unnamed_0'], inplace=True)

X = df_copy.drop(columns=['log_SI'])
y = df_copy['log_SI']

# Импутация
imputer = SimpleImputer(strategy='mean')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Параметры RandomForest
rf_params = {
    'n_estimators': 100,
    'max_depth': None,
    'random_state': 42,
    'n_jobs': -1
}

# Обучаем модель на всех признаках, чтобы получить важность
model_full = RandomForestRegressor(**rf_params)
model_full.fit(X_imputed, y)

feature_importances = model_full.feature_importances_
feat_imp_df = pd.DataFrame({
    'Feature': X_imputed.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

# Выбираем топ-50 признаков (в вашем примере 11, оставил 11)
top_50_features = feat_imp_df.head(11)['Feature'].tolist()
print(f"Используем топ-50 признаков по важности для обучения модели:")

for f in top_50_features:
    print(f"- {f}")

# Отбираем только топ-50 признаков
X_top50 = X_imputed[top_50_features]

# Кросс-валидация на топ-50 признаках
kf = KFold(n_splits=6, shuffle=True, random_state=42)
mae_list, mse_list, rmse_list, medae_list, r2_list = [], [], [], [], []

print("\nRunning 6-fold cross-validation on top-50 features...")

for fold, (train_index, val_index) in enumerate(kf.split(X_top50), 1):
    X_train_cv, X_val_cv = X_top50.iloc[train_index], X_top50.iloc[val_index]
    y_train_cv, y_val_cv = y.iloc[train_index], y.iloc[val_index]

    model_cv = RandomForestRegressor(**rf_params)
    model_cv.fit(X_train_cv, y_train_cv)

    y_pred_cv = model_cv.predict(X_val_cv)

    mae_list.append(mean_absolute_error(y_val_cv, y_pred_cv))
    mse_list.append(mean_squared_error(y_val_cv, y_pred_cv))
    rmse_list.append(np.sqrt(mse_list[-1]))
    medae_list.append(median_absolute_error(y_val_cv, y_pred_cv))
    r2_list.append(r2_score(y_val_cv, y_pred_cv))

    print(f"Fold {fold} R²: {r2_list[-1]:.4f}")

print("\nCross-validation results (top-50 features):")
print(f"MAE: {np.mean(mae_list):.4f} ± {np.std(mae_list):.4f}")
print(f"RMSE: {np.mean(rmse_list):.4f} ± {np.std(rmse_list):.4f}")
print(f"R²: {np.mean(r2_list):.4f} ± {np.std(r2_list):.4f}")

# Разделение на train/test с топ-50 признаками
X_train, X_test, y_train, y_test = train_test_split(X_top50, y, test_size=0.25, random_state=42)

# Обучение модели на train с топ-50 признаками
rf_model_eval = RandomForestRegressor(**rf_params)
rf_model_eval.fit(X_train, y_train)

# Предсказания и оценка на тесте
y_pred = rf_model_eval.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
medae = median_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nTest set performance (top-50 features):")
print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MedAE: {medae:.4f}")
print(f"R²: {r2:.4f}")

# График остатков
residuals = y_test - y_pred
plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_pred, y=residuals)
plt.axhline(0, color='red', linestyle='--')
plt.xlabel('Predicted values')
plt.ylabel('Residuals (Actual - Predicted)')
plt.title(f'log(SI): Residuals plot - RandomForest evaluation model (Test set R²={r2:.4f})')
plt.tight_layout()
plt.savefig("log(SI)_rf_residuals_plot_top50.png")
plt.close()

# Финальная модель на всех данных с топ-50 признаками
rf_model_final = RandomForestRegressor(**rf_params)
rf_model_final.fit(X_top50, y)

feature_importances_final = rf_model_final.feature_importances_
feat_imp_df_final = pd.DataFrame({
    'Feature': X_top50.columns,
    'Importance': feature_importances_final
}).sort_values(by='Importance', ascending=False)

print("\nTop 30 важнейших признаков финальной модели:")
print(feat_imp_df_final.head(30))

plt.figure(figsize=(10, 12))
sns.barplot(x='Importance', y='Feature', data=feat_imp_df_final.head(30))
plt.title("log(SI): Top 30 Feature Importances RandomForest Final Model (top-50 features)")
plt.tight_layout()
plt.savefig("log(SI)_rf_feature_importance_top30_top50.png")
plt.close()

# Сохраняем финальную модель
joblib.dump(rf_model_final, 'rf_final_model_SI_top50.pkl')
print("Финальная модель обучена на топ-50 признаках и сохранена в 'rf_final_model_SI_top50.pkl'")
print("Графики сохранены с суффиксом '_top50'.")


Признаки log_IC50 и log_CC50 добавлены в df_bin.
Используем топ-50 признаков по важности для обучения модели:
- log_2
- S_2
- log_IC50
- log_CC50
- MolLogP
- VSA_EState4
- FpDensityMorgan1
- EState_VSA_std
- FpDensityMorgan2
- EState_VSA4
- BCUT2D_MRLOW

Running 6-fold cross-validation on top-50 features...
Fold 1 R²: 0.4648
Fold 2 R²: 0.5179
Fold 3 R²: 0.5497
Fold 4 R²: 0.5416
Fold 5 R²: 0.5918
Fold 6 R²: 0.5056

Cross-validation results (top-50 features):
MAE: 0.3471 ± 0.0188
RMSE: 0.5131 ± 0.0334
R²: 0.5286 ± 0.0394

Test set performance (top-50 features):
MAE: 0.3543
MSE: 0.2892
RMSE: 0.5378
MedAE: 0.2306
R²: 0.5171

Top 30 важнейших признаков финальной модели:
             Feature  Importance
1                S_2    0.398206
0              log_2    0.385436
2           log_IC50    0.041064
3           log_CC50    0.028586
5        VSA_EState4    0.025041
4            MolLogP    0.024379
7     EState_VSA_std    0.020963
9        EState_VSA4    0.019664
10      BCUT2D_MRLOW    0.019

In [80]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, median_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostRegressor, Pool
import joblib

# === Подготовка данных ===
# df_bin уже должен быть загружен

# === Добавление признаков log_IC50, log_CC50 и расчет log_2, S_2 ===
ic50_model = joblib.load('regression_IC50_catboost_final_model_bin.pkl')
cc50_model = joblib.load('regression_CC50_lightgbm_final_model_bin.pkl')

def add_predicted_feature(df, model, drop_cols=None, imputer_strategy='most_frequent'):
    df_tmp = df.copy()
    df_tmp.columns = df_tmp.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
    df_tmp.replace([np.inf, -np.inf], np.nan, inplace=True)
    if drop_cols:
        for c in drop_cols:
            if c in df_tmp.columns:
                df_tmp.drop(columns=[c], inplace=True)
    imputer = SimpleImputer(strategy=imputer_strategy)
    X_imputed = pd.DataFrame(imputer.fit_transform(df_tmp), columns=df_tmp.columns)
    preds = model.predict(X_imputed)
    return preds

df_bin['log_IC50'] = add_predicted_feature(df_bin, ic50_model, drop_cols=['log_IC50'])
df_bin['log_CC50'] = add_predicted_feature(df_bin, cc50_model, drop_cols=['log_IC50', 'log_CC50', 'log_SI'])
df_bin['log_2'] = df_bin['log_IC50'] - df_bin['log_CC50']
df_bin['S_2'] = 10**df_bin['log_2']

print("Признаки log_IC50 и log_CC50 добавлены в df_bin.")

# === Подготовка датасета для log_SI ===
df_copy = df_bin.copy()
df_copy.columns = df_copy.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
df_copy.replace([np.inf, -np.inf], np.nan, inplace=True)
if 'Unnamed_0' in df_copy.columns:
    df_copy.drop(columns=['Unnamed_0'], inplace=True)

X = df_copy.drop(columns=['log_SI'])
y = df_copy['log_SI']

# Импутация
imputer = SimpleImputer(strategy='mean')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Параметры CatBoost
catboost_params = {
    'bagging_temperature': 0.13,
    'depth': 5,
    'iterations': 626,
    'l2_leaf_reg': 4.2,
    'learning_rate': 0.04,
    'verbose': 0,
    'random_state': 42,
    'loss_function': 'RMSE'
}

# Обучаем модель на всех признаках, чтобы получить важность
model_full = CatBoostRegressor(**catboost_params)
model_full.fit(X_imputed, y)

feature_importances = model_full.get_feature_importance(Pool(X_imputed, label=y))
feat_imp_df = pd.DataFrame({
    'Feature': X_imputed.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

# Выбираем топ-50 признаков
top_50_features = feat_imp_df.head(24)['Feature'].tolist()
print(f"Используем топ-50 признаков по важности для обучения модели:")

for f in top_50_features:
    print(f"- {f}")

# Отбираем только топ-50 признаков
X_top50 = X_imputed[top_50_features]

# Кросс-валидация на топ-50 признаках
kf = KFold(n_splits=6, shuffle=True, random_state=42)
mae_list, mse_list, rmse_list, medae_list, r2_list = [], [], [], [], []

print("\nRunning 6-fold cross-validation on top-50 features...")

for fold, (train_index, val_index) in enumerate(kf.split(X_top50), 1):
    X_train_cv, X_val_cv = X_top50.iloc[train_index], X_top50.iloc[val_index]
    y_train_cv, y_val_cv = y.iloc[train_index], y.iloc[val_index]

    model_cv = CatBoostRegressor(**catboost_params)
    model_cv.fit(X_train_cv, y_train_cv, verbose=0)

    y_pred_cv = model_cv.predict(X_val_cv)

    mae_list.append(mean_absolute_error(y_val_cv, y_pred_cv))
    mse_list.append(mean_squared_error(y_val_cv, y_pred_cv))
    rmse_list.append(np.sqrt(mse_list[-1]))
    medae_list.append(median_absolute_error(y_val_cv, y_pred_cv))
    r2_list.append(r2_score(y_val_cv, y_pred_cv))

    print(f"Fold {fold} R²: {r2_list[-1]:.4f}")

print("\nCross-validation results (top-50 features):")
print(f"MAE: {np.mean(mae_list):.4f} ± {np.std(mae_list):.4f}")
print(f"RMSE: {np.mean(rmse_list):.4f} ± {np.std(rmse_list):.4f}")
print(f"R²: {np.mean(r2_list):.4f} ± {np.std(r2_list):.4f}")

# Разделение на train/test с топ-50 признаками
X_train, X_test, y_train, y_test = train_test_split(X_top50, y, test_size=0.25, random_state=42)

# Обучение модели на train с топ-50 признаками
cat_model_eval = CatBoostRegressor(**catboost_params)
cat_model_eval.fit(X_train, y_train)

# Предсказания и оценка на тесте
y_pred = cat_model_eval.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
medae = median_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nTest set performance (top-50 features):")
print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MedAE: {medae:.4f}")
print(f"R²: {r2:.4f}")

# График остатков
residuals = y_test - y_pred
plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_pred, y=residuals)
plt.axhline(0, color='red', linestyle='--')
plt.xlabel('Predicted values')
plt.ylabel('Residuals (Actual - Predicted)')
plt.title(f'log(SI): Residuals plot - CatBoost evaluation model (Test set R²={r2:.4f})')
plt.tight_layout()
plt.savefig("log(SI)_catboost_residuals_plot_top50.png")
plt.close()

# Финальная модель на всех данных с топ-50 признаками
cat_model_final = CatBoostRegressor(**catboost_params)
cat_model_final.fit(X_top50, y)

feature_importances_final = cat_model_final.get_feature_importance(Pool(X_top50, label=y))
feat_imp_df_final = pd.DataFrame({
    'Feature': X_top50.columns,
    'Importance': feature_importances_final
}).sort_values(by='Importance', ascending=False)

print("\nTop 30 важнейших признаков финальной модели:")
print(feat_imp_df_final.head(30))

plt.figure(figsize=(10, 12))
sns.barplot(x='Importance', y='Feature', data=feat_imp_df_final.head(30))
plt.title("log(SI): Top 30 Feature Importances CatBoost Final Model (top-50 features)")
plt.tight_layout()
plt.savefig("log(SI)_catboost_feature_importance_top30_top50.png")
plt.close()

# Сохраняем финальную модель
joblib.dump(cat_model_final, 'catboost_final_model_SI_top50.pkl')
print("Финальная модель обучена на топ-50 признаках и сохранена в 'catboost_final_model_SI_top50.pkl'")
print("Графики сохранены с суффиксом '_top50'.")

# === ДОПОЛНИТЕЛЬНО: Тест модели на top-10 до top-50 признаках ===
print("\nТест производительности модели CatBoost для различного количества топ-признаков:")

for n_feats in range(10, 90):
    top_features_n = feat_imp_df.head(n_feats)['Feature'].tolist()
    X_top_n = X_imputed[top_features_n]

    X_train_n, X_test_n, y_train_n, y_test_n = train_test_split(X_top_n, y, test_size=0.25, random_state=42)

    model_n = CatBoostRegressor(**catboost_params)
    model_n.fit(X_train_n, y_train_n, verbose=0)

    y_pred_n = model_n.predict(X_test_n)

    mae_n = mean_absolute_error(y_test_n, y_pred_n)
    r2_n = r2_score(y_test_n, y_pred_n)

    print(f"Iteration top-{n_feats} features:")
    print(f"MAE: {mae_n:.4f}")
    print(f"R²:  {r2_n:.4f}\n")



Признаки log_IC50 и log_CC50 добавлены в df_bin.
Используем топ-50 признаков по важности для обучения модели:
- S_2
- log_2
- log_IC50
- log_CC50
- VSA_EState8
- PEOE_VSA9
- FpDensityMorgan3
- SlogP_VSA5
- EState_VSA3
- EState_VSA4
- VSA_EState6
- EState_VSA7
- MinAbsEStateIndex
- VSA_EState7
- qed
- EState_VSA2
- VSA_EState3
- AvgIpc
- PEOE_VSA7
- EState_VSA8
- BCUT2D_MRLOW
- FpDensityMorgan1
- VSA_EState1
- MolWt_TPSA_ratio

Running 6-fold cross-validation on top-50 features...
Fold 1 R²: 0.4981
Fold 2 R²: 0.5265
Fold 3 R²: 0.5857
Fold 4 R²: 0.5770
Fold 5 R²: 0.6006
Fold 6 R²: 0.5462

Cross-validation results (top-50 features):
MAE: 0.3309 ± 0.0168
RMSE: 0.4981 ± 0.0311
R²: 0.5557 ± 0.0357

Test set performance (top-50 features):
MAE: 0.3516
MSE: 0.2823
RMSE: 0.5313
MedAE: 0.2270
R²: 0.5286

Top 30 важнейших признаков финальной модели:
              Feature  Importance
1               log_2   31.489676
0                 S_2   25.742169
2            log_IC50   10.634645
3            l

In [94]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, median_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostRegressor, Pool
import joblib

# === Добавление признака log_IC50 на основе ранее обученной модели ===

# Загрузка модели IC50
ic50_model = joblib.load('regression_IC50_catboost_final_model_bin.pkl')

# Подготовка данных для предсказания log_IC50
X_ic50 = df_bin.copy()
X_ic50.columns = X_ic50.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
X_ic50.replace([np.inf, -np.inf], np.nan, inplace=True)

# Удаление log_IC50, если он уже есть
if 'log_IC50' in X_ic50.columns:
    X_ic50 = X_ic50.drop(columns=['log_IC50'])

# Импутация пропусков (most_frequent, как в обучении модели)
ic50_imputer = SimpleImputer(strategy='most_frequent')
X_ic50_imputed = pd.DataFrame(ic50_imputer.fit_transform(X_ic50), columns=X_ic50.columns)

# Предсказание и добавление признака
df_bin['log_IC50'] = ic50_model.predict(X_ic50_imputed)
print("Признак log_IC50 добавлен в df_bin с использованием модели regression_IC50_catboost_final_model_bin.pkl")

# === Обучение модели для log_SI ===

# Копируем и чистим данные
df_copy = df_bin.copy()
df_copy.columns = df_copy.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
df_copy.replace([np.inf, -np.inf], np.nan, inplace=True)
if 'Unnamed_0' in df_copy.columns:
    df_copy.drop(columns=['Unnamed_0'], inplace=True)

X = df_copy.drop(columns=['log_SI'])
y = df_copy['log_SI']

# Импутация
imputer = SimpleImputer(strategy='mean')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Параметры CatBoost
catboost_params = {
    'bagging_temperature': 0.13,
    'depth': 5,
    'iterations': 626,
    'l2_leaf_reg': 4.2,
    'learning_rate': 0.04,
    'verbose': 0,
    'random_state': 42,
    'loss_function': 'RMSE'
}

# Кросс-валидация
kf = KFold(n_splits=6, shuffle=True, random_state=42)
mae_list, mse_list, rmse_list, medae_list, r2_list = [], [], [], [], []

print("Running 6-fold cross-validation...")

for fold, (train_index, val_index) in enumerate(kf.split(X_imputed), 1):
    X_train_cv, X_val_cv = X_imputed.iloc[train_index], X_imputed.iloc[val_index]
    y_train_cv, y_val_cv = y.iloc[train_index], y.iloc[val_index]

    model_cv = CatBoostRegressor(**catboost_params)
    model_cv.fit(X_train_cv, y_train_cv, verbose=0)

    y_pred_cv = model_cv.predict(X_val_cv)

    mae_list.append(mean_absolute_error(y_val_cv, y_pred_cv))
    mse_list.append(mean_squared_error(y_val_cv, y_pred_cv))
    rmse_list.append(np.sqrt(mse_list[-1]))
    medae_list.append(median_absolute_error(y_val_cv, y_pred_cv))
    r2_list.append(r2_score(y_val_cv, y_pred_cv))

    print(f"Fold {fold} R²: {r2_list[-1]:.4f}")

print("\nCross-validation results:")
print(f"MAE: {np.mean(mae_list):.4f} ± {np.std(mae_list):.4f}")
print(f"RMSE: {np.mean(rmse_list):.4f} ± {np.std(rmse_list):.4f}")
print(f"R²: {np.mean(r2_list):.4f} ± {np.std(r2_list):.4f}")

# Оценка на отложенной выборке
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.25, random_state=42)

cat_model_eval = CatBoostRegressor(**catboost_params)
cat_model_eval.fit(X_train, y_train)

y_pred = cat_model_eval.predict(X_test)

# Метрики
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
medae = median_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nTest set performance:")
print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MedAE: {medae:.4f}")
print(f"R²: {r2:.4f}")

# График остатков
residuals = y_test - y_pred
plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_pred, y=residuals)
plt.axhline(0, color='red', linestyle='--')
plt.xlabel('Predicted values')
plt.ylabel('Residuals (Actual - Predicted)')
plt.title(f'log(SI): Residuals plot - CatBoost evaluation model (Test set R²={r2:.4f})')
plt.tight_layout()
plt.savefig("log(SI) catboost_residuals_plot.png")
plt.close()

# Важность признаков
cat_model_final = CatBoostRegressor(**catboost_params)
cat_model_final.fit(X_imputed, y)

feature_importances = cat_model_final.get_feature_importance(Pool(X_imputed, label=y))
feat_imp_df = pd.DataFrame({
    'Feature': X_imputed.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

print("\nTop 30 важнейших признаков:")
for i, row in feat_imp_df.head(30).iterrows():
    print(f"{row['Feature']}: {row['Importance']:.4f}")

plt.figure(figsize=(10, 12))
sns.barplot(x='Importance', y='Feature', data=feat_imp_df.head(30))
plt.title("log(SI): Top 30 Feature Importances CatBoost Final Model")
plt.tight_layout()
plt.savefig("log(SI) catboost_feature_importance_top30.png")
plt.close()

# Сохранение модели
joblib.dump(cat_model_final, 'catboost_final_model_SI.pkl')
print("Финальная модель обучена на всех данных и сохранена в 'catboost_final_model_SI.pkl'")
print("Графики сохранены в 'log(SI) catboost_residuals_plot.png' и 'log(SI) catboost_feature_importance_top30.png'")


Признак log_IC50 добавлен в df_bin с использованием модели regression_IC50_catboost_final_model_bin.pkl
Running 6-fold cross-validation...
Fold 1 R²: 0.3427
Fold 2 R²: 0.3294
Fold 3 R²: 0.3465
Fold 4 R²: 0.3662
Fold 5 R²: 0.3686
Fold 6 R²: 0.3692

Cross-validation results:
MAE: 0.4338 ± 0.0206
RMSE: 0.6009 ± 0.0261
R²: 0.3538 ± 0.0152

Test set performance:
MAE: 0.4442
MSE: 0.3863
RMSE: 0.6216
MedAE: 0.2953
R²: 0.3548

Top 30 важнейших признаков:
log_IC50: 31.7864
VSA_EState8: 1.4716
BCUT2D_CHGLO: 1.3318
BCUT2D_MRHI: 1.2749
EState_VSA3: 1.2582
AvgIpc: 1.1778
FractionCSP3: 1.1684
RingCount: 1.1412
EState_VSA4: 1.1349
SMR_VSA10: 1.0929
FpDensityMorgan3: 1.0885
PEOE_VSA8: 1.0710
VSA_EState2: 1.0416
VSA_EState3: 0.9894
FpDensityMorgan2: 0.9701
PEOE_VSA6: 0.9619
PEOE_VSA_std: 0.9594
qed: 0.9165
VSA_EState4: 0.8782
MaxAbsPartialCharge: 0.8606
EState_VSA5: 0.8482
EState_VSA8: 0.8303
Log_MolWt_TPSA_ratio: 0.8100
BCUT2D_MWLOW_log: 0.7929
MolLogP: 0.7840
SlogP_VSA4: 0.7559
SlogP_VSA2: 0.7252
PEO

In [88]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, median_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostRegressor, Pool
import joblib

# Копируем данные, чистим
df_copy = df_bin.copy()
df_copy.columns = df_copy.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
df_copy.replace([np.inf, -np.inf], np.nan, inplace=True)
if 'Unnamed_0' in df_copy.columns:
    df_copy.drop(columns=['Unnamed_0'], inplace=True)

X = df_copy.drop(columns=['log_SI'])
y = df_copy['log_SI']

# Импутация
imputer = SimpleImputer(strategy='mean')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

catboost_params = {
    'bagging_temperature': 0.13,
    'depth': 5,
    'iterations': 626,
    'l2_leaf_reg': 4.2,
    'learning_rate': 0.04,
    'verbose': 0,
    'random_state': 42,
    'loss_function': 'RMSE'
}

# Кросс-валидация
kf = KFold(n_splits=6, shuffle=True, random_state=42)
mae_list, mse_list, rmse_list, medae_list, r2_list = [], [], [], [], []

print("Running 6-fold cross-validation...")

for fold, (train_index, val_index) in enumerate(kf.split(X_imputed), 1):
    X_train_cv, X_val_cv = X_imputed.iloc[train_index], X_imputed.iloc[val_index]
    y_train_cv, y_val_cv = y.iloc[train_index], y.iloc[val_index]

    model_cv = CatBoostRegressor(**catboost_params)
    model_cv.fit(X_train_cv, y_train_cv, verbose=0)

    y_pred_cv = model_cv.predict(X_val_cv)

    mae_list.append(mean_absolute_error(y_val_cv, y_pred_cv))
    mse_list.append(mean_squared_error(y_val_cv, y_pred_cv))
    rmse_list.append(np.sqrt(mse_list[-1]))
    medae_list.append(median_absolute_error(y_val_cv, y_pred_cv))
    r2_list.append(r2_score(y_val_cv, y_pred_cv))

    print(f"Fold {fold} R²: {r2_list[-1]:.4f}")

print("\nCross-validation results:")
print(f"MAE: {np.mean(mae_list):.4f} ± {np.std(mae_list):.4f}")
print(f"RMSE: {np.mean(rmse_list):.4f} ± {np.std(rmse_list):.4f}")
print(f"R²: {np.mean(r2_list):.4f} ± {np.std(r2_list):.4f}")

# Оценка на отложенной выборке
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.25, random_state=42)

cat_model_eval = CatBoostRegressor(**catboost_params)
cat_model_eval.fit(X_train, y_train)

y_pred = cat_model_eval.predict(X_test)

# Метрики
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
medae = median_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nTest set performance:")
print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MedAE: {medae:.4f}")
print(f"R²: {r2:.4f}")

# График остатков
residuals = y_test - y_pred
plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_pred, y=residuals)
plt.axhline(0, color='red', linestyle='--')
plt.xlabel('Predicted values')
plt.ylabel('Residuals (Actual - Predicted)')
plt.title(f'log(SI): Residuals plot - CatBoost evaluation model (Test set R²={r2:.4f})')
plt.tight_layout()
plt.savefig("log(SI) catboost_residuals_plot.png")
plt.close()

# Важность признаков
cat_model_final = CatBoostRegressor(**catboost_params)
cat_model_final.fit(X_imputed, y)

feature_importances = cat_model_final.get_feature_importance(Pool(X_imputed, label=y))
feat_imp_df = pd.DataFrame({
    'Feature': X_imputed.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

print("\nTop 30 важнейших признаков:")
for i, row in feat_imp_df.head(30).iterrows():
    print(f"{row['Feature']}: {row['Importance']:.4f}")

plt.figure(figsize=(10, 12))
sns.barplot(x='Importance', y='Feature', data=feat_imp_df.head(30))
plt.title("log(SI): Top 30 Feature Importances CatBoost Final Model")
plt.tight_layout()
plt.savefig("log(SI) catboost_feature_importance_top30.png")
plt.close()

# Сохранение модели
joblib.dump(cat_model_final, 'catboost_final_model_SI.pkl')
print("Финальная модель обучена на всех данных и сохранена в 'catboost_final_model_SI.pkl'")
print("Графики сохранены в 'log(SI) catboost_residuals_plot.png' и 'catboost_feature_importance_top30.png'")


Running 6-fold cross-validation...
Fold 1 R²: 0.2570
Fold 2 R²: 0.2114
Fold 3 R²: 0.2350
Fold 4 R²: 0.2382
Fold 5 R²: 0.2589
Fold 6 R²: 0.3044

Cross-validation results:
MAE: 0.4832 ± 0.0257
RMSE: 0.6471 ± 0.0331
R²: 0.2508 ± 0.0287

Test set performance:
MAE: 0.5004
MSE: 0.4468
RMSE: 0.6685
MedAE: 0.3724
R²: 0.2538

Top 30 важнейших признаков:
FractionCSP3: 3.8738
VSA_EState8: 3.0103
BCUT2D_CHGLO: 2.7643
VSA_EState6: 2.4517
VSA_EState4: 2.2712
BCUT2D_MRLOW: 2.2073
RingCount: 2.1850
SlogP_VSA5: 1.9432
FpDensityMorgan3: 1.9268
Log_Flexibility: 1.8815
qed: 1.8293
AvgIpc: 1.7758
EState_VSA4: 1.6430
VSA_EState2: 1.6094
NumSaturatedHeterocycles: 1.5524
HallKierAlpha: 1.4643
SMR_VSA4: 1.4329
NHOHCount: 1.4261
MinPartialCharge: 1.3925
NumSaturatedCarbocycles: 1.3219
Flexibility: 1.2937
MaxAbsPartialCharge: 1.1816
EState_VSA_std: 1.1403
PEOE_VSA7: 1.0894
CSP3_Kappa1_ratio: 1.0805
MinAbsEStateIndex: 1.0672
EState_VSA5: 0.9769
VSA_EState3: 0.9737
PEOE_VSA2: 0.9709
EState_VSA3: 0.9530
Финальная м

In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from catboost import CatBoostRegressor

# 1. Подготовка данных
X = df.drop(columns=['log_SI'])
y = df['log_SI']

# Импутация пропусков
imputer = SimpleImputer(strategy='most_frequent')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Делим на train/test
X_train, X_test, y_train, y_test = train_test_split(
    X_imputed, y, test_size=0.2, random_state=42
)

# 2. Настройка модели и параметров
model = CatBoostRegressor(verbose=0, random_state=42)

param_grid = {
    'depth': [4, 5, 6],
    'learning_rate': [0.03, 0.05, 0.08],
    'iterations': [300, 400],
    'l2_leaf_reg': [2, 3, 4],
    'bagging_temperature': [0, 1]
}

# 3. Метрики
scoring = {
    'rmse': make_scorer(mean_squared_error, greater_is_better=False),
    'r2': make_scorer(r2_score)
}

# 4. GridSearchCV
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=scoring,
    refit='r2',
    cv=3,
    n_jobs=-1,
    return_train_score=True
)

# Обучение
grid_search.fit(X_train, y_train)

# 5. Результаты
print("Лучшие параметры:")
print(grid_search.best_params_)

# Оценка на тестовой выборке
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

rmse_test = mean_squared_error(y_test, y_pred)
r2_test = r2_score(y_test, y_pred)

print("\nМетрики на тестовой выборке:")
print(f"RMSE: {rmse_test:.4f}")
print(f"R²:   {r2_test:.4f}")

# 6. Графики по результатам GridSearchCV
cv_results = pd.DataFrame(grid_search.cv_results_)

# Вычисляем RMSE (отрицательный MSE → RMSE)
cv_results['rmse'] = (-cv_results['mean_test_rmse'])**0.5
cv_results_sorted_rmse = cv_results.sort_values(by='rmse', ascending=False).reset_index(drop=True)

# График RMSE
plt.figure(figsize=(10, 5))
sns.lineplot(data=cv_results_sorted_rmse['rmse'], marker='o')
plt.title('RMSE по комбинациям параметров (от худших к лучшим)')
plt.xlabel('Комбинация параметров (индекс)')
plt.ylabel('RMSE')
plt.grid(True)
plt.tight_layout()
plt.show()

# График R²
cv_results_sorted_r2 = cv_results.sort_values(by='mean_test_r2', ascending=True).reset_index(drop=True)

plt.figure(figsize=(10, 5))
sns.lineplot(data=cv_results_sorted_r2['mean_test_r2'], marker='o')
plt.title('R² по комбинациям параметров (от худших к лучшим)')
plt.xlabel('Комбинация параметров (индекс)')
plt.ylabel('R²')
plt.grid(True)
plt.tight_layout()
plt.show()


KeyboardInterrupt: 

In [35]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.neural_network import MLPRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# === Настройки модели ===
mlp = MLPRegressor(
    hidden_layer_sizes=(1024, 512, 256, 128, 64, 32, 16),
    activation='relu',
    solver='adam',
    alpha=0.0005,
    batch_size=32,
    learning_rate='adaptive',
    learning_rate_init=0.0003,
    max_iter=3000,
    early_stopping=True,
    validation_fraction=0.15,
    n_iter_no_change=40,
    random_state=42,
    verbose=False
)

# === Пути к датасетам ===
datasets = {
    "df.csv": "df.csv",
    "df_bin.csv": "df_bin.csv",
    "df_cut.csv": "df_cut.csv",
    "df_cut_bin.csv": "df_cut_bin.csv"
}

# === Целевая переменная ===
target = "log_SI"

# === Результаты по каждому датасету ===
results = []

# tqdm для отображения прогресса
for name in tqdm(datasets, desc="Обработка датасетов"):
    path = datasets[name]
    tqdm.write(f"\n📂 Загружаем {name}")
    df = pd.read_csv(path)

    if target not in df.columns:
        tqdm.write(f"⚠️ Целевая переменная '{target}' не найдена в {name}, пропускаем.")
        continue

    # Удалим признаки, содержащие утечку
    leak_keywords = ['IC50', 'CC50', 'SI']
    leak_cols = [col for col in df.columns if any(key.lower() in col.lower() for key in leak_keywords)]
    leak_cols = list(set(leak_cols) - set([target]))  # Не удаляем целевую переменную

    if leak_cols:
        tqdm.write(f"🧹 Удаляем признаки с утечкой: {', '.join(leak_cols)}")

    # Отделяем признаки и целевую переменную
    X = df.drop(columns=[target] + leak_cols)
    y = df[target]

    # Очистка и препроцессинг
    tqdm.write("⚙️ Препроцессинг...")
    X.replace([np.inf, -np.inf], np.nan, inplace=True)
    imputer = SimpleImputer(strategy='mean')
    X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X_imputed), columns=X.columns)

    # Разделение на train/test
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # Обучение модели
    tqdm.write(" Обучаем модель...")
    model = MLPRegressor(**mlp.get_params())
    model.fit(X_train, y_train)
    tqdm.write(f" Обучение завершено. Итераций: {model.n_iter_}")

    # Предсказания и метрики
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # Кросс-валидация
    tqdm.write(" Кросс-валидация...")
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X_scaled, y, cv=kf, scoring='r2', n_jobs=-1)

    # Сохраняем метрики
    results.append({
        "Dataset": name,
        "Test MAE": mae,
        "Test RMSE": rmse,
        "Test R²": r2,
        "CV R² Mean": cv_scores.mean(),
        "CV R² Std": cv_scores.std()
    })

    tqdm.write(f" {name} готово: R² = {r2:.4f}, MAE = {mae:.4f}\n")

# === Вывод таблицы результатов ===
results_df = pd.DataFrame(results)
print("\n === Сравнение результатов по датасетам ===", flush=True)
print(results_df.to_string(index=False, float_format="%.4f"), flush=True)


Обработка датасетов:   0%|          | 0/4 [00:00<?, ?it/s]


📂 Загружаем df.csv
🧹 Удаляем признаки с утечкой: FpDensityMorgan2, SI, FpDensityMorgan3, IC50, mM, CC50, mM, log_IC50, FpDensityMorgan1, log_CC50
⚙️ Препроцессинг...
 Обучаем модель...


Обработка датасетов:   0%|          | 0/4 [01:02<?, ?it/s]

 Обучение завершено. Итераций: 58
 Кросс-валидация...


Обработка датасетов:  25%|██▌       | 1/4 [04:28<13:24, 268.28s/it]

 df.csv готово: R² = 0.2008, MAE = 0.5131


📂 Загружаем df_bin.csv
🧹 Удаляем признаки с утечкой: FpDensityMorgan2, SI, FpDensityMorgan3, IC50, mM, CC50, mM, log_IC50, FpDensityMorgan1, log_CC50
⚙️ Препроцессинг...
 Обучаем модель...


Обработка датасетов:  25%|██▌       | 1/4 [05:19<13:24, 268.28s/it]

 Обучение завершено. Итераций: 49
 Кросс-валидация...


Обработка датасетов:  50%|█████     | 2/4 [08:41<08:39, 259.55s/it]

 df_bin.csv готово: R² = 0.1903, MAE = 0.5108


📂 Загружаем df_cut.csv
🧹 Удаляем признаки с утечкой: SI, IC50, mM, CC50, mM, log_IC50, FpDensityMorgan1, log_CC50
⚙️ Препроцессинг...
 Обучаем модель...


Обработка датасетов:  50%|█████     | 2/4 [09:30<08:39, 259.55s/it]

 Обучение завершено. Итераций: 50
 Кросс-валидация...


Обработка датасетов:  75%|███████▌  | 3/4 [13:36<04:35, 275.77s/it]

 df_cut.csv готово: R² = 0.1933, MAE = 0.5112


📂 Загружаем df_cut_bin.csv
🧹 Удаляем признаки с утечкой: SI, IC50, mM, CC50, mM, log_IC50, FpDensityMorgan1, log_CC50
⚙️ Препроцессинг...
 Обучаем модель...


Обработка датасетов:  75%|███████▌  | 3/4 [14:23<04:35, 275.77s/it]

 Обучение завершено. Итераций: 47
 Кросс-валидация...


Обработка датасетов: 100%|██████████| 4/4 [17:30<00:00, 262.55s/it]

 df_cut_bin.csv готово: R² = 0.1976, MAE = 0.5087


 === Сравнение результатов по датасетам ===
       Dataset  Test MAE  Test RMSE  Test R²  CV R² Mean  CV R² Std
        df.csv    0.5131     0.6983   0.2008      0.2241     0.0638
    df_bin.csv    0.5108     0.6915   0.1903      0.2027     0.0675
    df_cut.csv    0.5112     0.7015   0.1933      0.1922     0.0499
df_cut_bin.csv    0.5087     0.6884   0.1976      0.2196     0.0531



