In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from catboost import CatBoostClassifier
# Important: make every transformer return a pandas DataFrame. The second
# ColumnTransformer in the pipeline below selects its columns by name, which
# requires the output of the first step to keep its column labels.
import sklearn
sklearn.set_config(transform_output="pandas")

# Load the raw dataset.
train = pd.read_csv('heart.csv')

# Derived feature: how far the recorded MaxHR is from 220 - Age
# (220 - Age is commonly used as an age-based estimate of maximum heart rate).
train['HR_Difference'] = 220 - train['Age'] - train['MaxHR']

# Separate the target column from the feature columns.
y = train['HeartDisease']
X = train.drop(columns='HeartDisease')

# Hold out 20% of the rows, stratified on the target, with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Imputation step of the preprocessing pipeline.
#
# Each listed column gets its own SimpleImputer; every other column is passed
# through untouched (remainder='passthrough'). verbose_feature_names_out=False
# keeps the original column names, so later steps can still select by name.
my_imputer = ColumnTransformer(
    transformers=[
        # Plain median imputation of NaN values.
        ('num_imputer_age', SimpleImputer(strategy='median'), ['Age']),
        ('num_imputer_hr_diffrence', SimpleImputer(strategy='median'), ['HR_Difference']),
        # For these two measurements a value of 0 is used as the "missing"
        # marker and is replaced with the median of the non-zero values.
        ('num_imputer_rest', SimpleImputer(strategy='median', missing_values=0), ['RestingBP']),
        ('num_imputer_chol', SimpleImputer(strategy='median', missing_values=0), ['Cholesterol']),
        # FastingBS is a binary 0/1 indicator in the heart-failure dataset
        # (confirm against heart.csv), so 0 is a legitimate value and must NOT
        # be treated as missing: doing so replaces every 0 with the median of
        # the remaining values (1) and turns the column into a constant.
        # Only genuine NaN values are filled, with the most frequent class.
        ('num_imputer_fast', SimpleImputer(strategy='most_frequent'), ['FastingBS']),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

# Columns standardised with StandardScaler.
numeric_columns = ['Age', 'HR_Difference', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
# Columns expanded into dense one-hot indicator columns.
categorical_columns = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

# Scaling / encoding step: any column not listed above is passed through
# unchanged, and output column names carry no transformer prefix.
scaler_and_encoder = ColumnTransformer(
    [
        ('num', StandardScaler(), numeric_columns),
        ('cat', OneHotEncoder(sparse_output=False), categorical_columns),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

# Full preprocessing pipeline: impute first, then scale numeric columns and
# one-hot encode categorical ones.
preprocessor = Pipeline(
    steps=[
        ('imputer', my_imputer),
        ('scaler_encoder', scaler_and_encoder),
    ]
)

# Fit the preprocessing pipeline on the training split and transform it.
# NOTE(review): X_valid / y_valid are not used anywhere in this cell, and the
# fitted preprocessor is not saved alongside the model.
X_train_transformed = preprocessor.fit_transform(X_train, y=y_train)

# Train a CatBoost classifier with training output silenced.
catboost_model = CatBoostClassifier(verbose=0)
catboost_model.fit(X_train_transformed, y_train)

# Persist the trained model and report where it was written.
model_path = 'catboost_model.cbm'
catboost_model.save_model(model_path)
print(f"Модель успешно сохранена в файл '{model_path}'.")


Модель успешно сохранена в файл 'catboost_model.cbm'.
