In [1]:
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

In [13]:
# Загрузка данных
data = pd.read_csv('data_merged.csv')

data

Unnamed: 0.1,Unnamed: 0,Npair,Nm1,Bm1_N,Bm1_S,Fm1_B,Nm2,Bm2_N,Bm2_S,Fm2_B,Bpair_N,Bpair_S,Fpair_N,Fpair_S
0,1,2,11,234,233,5.94,17,233,236,5.96,343,345,11.897638,11.897638
1,2,3,102,237,221,5.88,108,220,236,5.83,342,341,11.740157,11.740157
2,3,4,105,234,230,5.93,121,231,234,5.91,342,342,11.858268,11.858268
3,4,5,107,241,222,5.91,100,221,242,5.91,347,349,11.850394,11.850394
4,5,6,109,235,229,5.93,118,220,233,5.80,339,340,11.748031,11.748031
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228,242,288,95,235,228,5.91,485,227,235,5.87,341,341,11.811024,11.811024
229,243,289,96,233,230,5.91,117,233,234,5.94,342,341,11.881890,11.881890
230,244,290,97,241,220,5.92,394,221,240,5.91,347,346,11.866142,11.866142
231,245,291,98,248,219,5.89,1866,213,245,5.91,352,351,11.826772,11.842520


In [3]:
# Удаление строк с пропущенной целевой переменной
# data = data.dropna(subset=['В на пов-ти, мТл, N'])
data = data.dropna()
data

Unnamed: 0.1,Unnamed: 0,Npair,Nm1,Bm1_N,Bm1_S,Fm1_B,Nm2,Bm2_N,Bm2_S,Fm2_B,Bpair_N,Bpair_S,Fpair_N,Fpair_S
0,1,2,11,234,233,5.94,17,233,236,5.96,343,345,11.897638,11.897638
1,2,3,102,237,221,5.88,108,220,236,5.83,342,341,11.740157,11.740157
2,3,4,105,234,230,5.93,121,231,234,5.91,342,342,11.858268,11.858268
3,4,5,107,241,222,5.91,100,221,242,5.91,347,349,11.850394,11.850394
4,5,6,109,235,229,5.93,118,220,233,5.80,339,340,11.748031,11.748031
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228,242,288,95,235,228,5.91,485,227,235,5.87,341,341,11.811024,11.811024
229,243,289,96,233,230,5.91,117,233,234,5.94,342,341,11.881890,11.881890
230,244,290,97,241,220,5.92,394,221,240,5.91,347,346,11.866142,11.866142
231,245,291,98,248,219,5.89,1866,213,245,5.91,352,351,11.826772,11.842520


In [22]:
# Определение признаков и метки
# 2. Подготовка данных
X = data[['Bm1_N', 'Bm1_S', 'Fm1_B', 'Bm2_N', 'Bm2_S', 'Fm2_B']]
# y = data[['Bpair_N', 'Bpair_S', 'Fpair_N', 'Fpair_S']]
y = data[['Fpair_N']]

In [23]:
# Разделение данных
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [24]:
# Инициализация модели CatBoost
model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='RMSE',
    verbose=100,
    eval_metric='R2',
    early_stopping_rounds=50
)

In [25]:
# Обучение модели
model.fit(
    X_train, y_train,
    # eval_set=(X_test, y_test),
    # use_best_model=True
)

0:	learn: 0.0947259	total: 191ms	remaining: 3m 10s
100:	learn: 0.9270276	total: 283ms	remaining: 2.52s
200:	learn: 0.9697790	total: 481ms	remaining: 1.91s
300:	learn: 0.9858031	total: 597ms	remaining: 1.39s
400:	learn: 0.9936330	total: 720ms	remaining: 1.07s
500:	learn: 0.9970537	total: 798ms	remaining: 795ms
600:	learn: 0.9985815	total: 882ms	remaining: 586ms
700:	learn: 0.9993021	total: 969ms	remaining: 413ms
800:	learn: 0.9996678	total: 1.05s	remaining: 261ms
900:	learn: 0.9998292	total: 1.14s	remaining: 126ms
999:	learn: 0.9999091	total: 1.23s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x1a81efe9490>

In [27]:
# Предсказание и оценка
y_pred = model.predict(X_test)
# print(f'RMSE: {mean_squared_error(y_test, y_pred, squared=False)}')
print(f'R2: {r2_score(y_test, y_pred)}')

R2: 0.6790834618446642


In [28]:
# Сохранение модели
model.save_model('catboost_model.cbm')

In [29]:
# import matplotlib.pyplot as plt
# import seaborn as sns

plt.figure(figsize=(10, 6))
sns.set(style='whitegrid')

# Scatter plot предсказаний vs реальных значений
ax = sns.scatterplot(x=y_test, y=y_pred, alpha=0.6)
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2)  # Линия идеальных предсказаний

# Аннотация метрик
text = f'R²: {r2_score(y_test, y_pred):.3f}\nRMSE: {mean_squared_error(y_test, y_pred, squared=False):.3f}'
plt.text(0.05, 0.95, text, transform=ax.transAxes, 
         verticalalignment='top', bbox=dict(facecolor='white', alpha=0.8))

plt.title('Предсказанные vs Реальные значения: "В на пов-ти, мТл, N"', fontsize=14)
plt.xlabel('Реальные значения', fontsize=12)
plt.ylabel('Предсказанные значения', fontsize=12)
plt.tight_layout()
plt.show()

# Гистограмма ошибок
plt.figure(figsize=(10, 5))
errors = y_pred - y_test
sns.histplot(errors, kde=True, bins=30)
plt.title('Распределение ошибок предсказаний', fontsize=14)
plt.xlabel('Ошибка (Предсказание - Реальное значение)', fontsize=12)
plt.ylabel('Частота', fontsize=12)
plt.axvline(x=0, color='r', linestyle='--')
plt.tight_layout()
plt.show()

ValueError: Data must be 1-dimensional, got ndarray of shape (47, 1) instead

<Figure size 1000x600 with 0 Axes>