# Установка зависимостей

In [None]:
%pip install kagglehub
%pip install pandas
%pip install matplotlib
%pip install seaborn
%pip install sklearn

# Импорт библиотек

In [None]:
import kagglehub
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,\
    confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, RocCurveDisplay

import warnings
warnings.simplefilter("ignore", FutureWarning)
%matplotlib inline

# Вспомогательная функция для печати метрик

In [None]:
def printer(name, model, x_train_scaled, x_test_scaled, y_train, y_test):
    print("===== " + name + " =====\n")

    x_train_scaled_predicted = model.predict(x_train_scaled)
    x_test_scaled_predicted = model.predict(x_test_scaled)

    print("Точность (Train):           ", accuracy_score(y_train, x_train_scaled_predicted))
    print("Точность (Test):            ", accuracy_score(y_test, x_test_scaled_predicted))
    print()

    print("Точность (Train Precision): ", precision_score(y_train, x_train_scaled_predicted))
    print("Точность (Test Precision):  ", precision_score(y_test, x_test_scaled_predicted))
    print()

    print("Полнота (Train Recall):     ", recall_score(y_train, x_train_scaled_predicted))
    print("Полнота (Test Recall):      ", recall_score(y_test, x_test_scaled_predicted))
    print()

    print("F1-метрика (Train):         ", f1_score(y_train, x_train_scaled_predicted))
    print("F1-метрика (Test):          ", f1_score(y_test, x_test_scaled_predicted))
    print()

    print("Матрица ошибок (Train):")
    disp = ConfusionMatrixDisplay(confusion_matrix(y_train, x_train_scaled_predicted))
    disp.plot()
    plt.show()

    print("Матрица ошибок (Test):")
    disp = ConfusionMatrixDisplay(confusion_matrix(y_test, x_test_scaled_predicted))
    disp.plot()
    plt.show()

# Загрузка и подготовка данных

In [None]:
path = kagglehub.dataset_download("aadarshvelu/heart-failure-prediction-clinical-records")
print("Путь к файлам датасета:", path)

In [None]:
df = pd.read_csv("C:\\python\\GitHub Projects\\Heart-failure-prediction project\\heart_failure_clinical_records.csv")

In [None]:
x = df.drop(columns=["DEATH_EVENT"], axis=1)
y = df["DEATH_EVENT"]

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Логистическая регрессия

In [None]:
lr = LogisticRegression().fit(x_train_scaled, y_train)
printer("Логистическая регрессия", lr, x_train_scaled, x_test_scaled, y_train, y_test)

# Случайный лес

In [None]:
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(x_train_scaled, y_train)
printer("Случайный лес", rf, x_train_scaled, x_test_scaled, y_train, y_test)

# Метод ближайших соседей

In [None]:
kn = KNeighborsClassifier().fit(x_train_scaled, y_train)
printer("K-ближайших соседей", kn, x_train_scaled, x_test_scaled, y_train, y_test)

# XGBoost

In [None]:
xgb = XGBClassifier().fit(x_train_scaled, y_train)
printer("XGBoost", xgb, x_train_scaled, x_test_scaled, y_train, y_test)

# Сравнение моделей (кросс-валидация)

In [None]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
lr_scores = cross_val_score(lr, x_train_scaled, y_train, cv=skf, scoring='f1')
rf_scores = cross_val_score(rf, x_train_scaled, y_train, cv=skf, scoring='f1')
kn_scores = cross_val_score(kn, x_train_scaled, y_train, cv=skf, scoring='f1')
xgb_score = cross_val_score(xgb, x_train_scaled, y_train, cv=skf, scoring='f1')

In [None]:
print("Логистическая регрессия: %0.4f ± %0.4f" % (lr_scores.mean(), lr_scores.std()))
print("Случайный лес:           %0.4f ± %0.4f" % (rf_scores.mean(), rf_scores.std()))
print("Метод соседей:           %0.4f ± %0.4f" % (kn_scores.mean(), kn_scores.std()))
print("XGBoost:                 %0.4f ± %0.4f" % (xgb_score.mean(), xgb_score.std()))

# Boxplot сравнение моделей по F1-метрике

In [None]:
plt.figure(figsize=(7, 7))
sns.boxplot(data=[lr_scores, rf_scores, kn_scores, xgb_score])
plt.xticks([0, 1, 2, 3], ["LogReg", "RandomForest", "KNN", "XGB"])
plt.ylabel("F1-score")
plt.title("F1-score моделей (5-fold CV)")
plt.tight_layout()
plt.show()

# Подбор гиперпараметров (GridSearchCV)

In [None]:
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression())
])

param_grid = [
    {
        'clf': [LogisticRegression()], 
        'clf__C': [0.1, 1, 10]
    },
    {
        'clf': [RandomForestClassifier(random_state=42)], 
        'clf__n_estimators': [50, 100, 200], 
        'clf__max_depth': [None, 5, 10]
    },
    {
        'clf': [KNeighborsClassifier()], 
        'clf__n_neighbors': [5, 10, 15]
    },
    {
        'clf': [XGBClassifier()], 
        'clf__n_estimators': [100, 200], 
        'clf__max_depth': [3, 5, 7], 
        'clf__learning_rate': [0.01, 0.1]
    }
]

grid = GridSearchCV(pipe, param_grid, cv=5, scoring='f1')
grid.fit(x_train_scaled, y_train)

results = pd.DataFrame(grid.cv_results_)
cols = ['param_clf', 'mean_test_score', 'std_test_score', 'params']
print(results[cols].sort_values('mean_test_score', ascending=False))

# ROC-кривые и AUC для лучших моделей

In [None]:
RocCurveDisplay.from_estimator(xgb, x_test_scaled, y_test)
RocCurveDisplay.from_estimator(rf, x_test_scaled, y_test)

y_pred_xgb = xgb.predict_proba(x_test_scaled)[:, 1]
y_pred_rf = rf.predict_proba(x_test_scaled)[:, 1]

print("XGB AUC:          ", roc_auc_score(y_test, y_pred_xgb))
print("Random Forest AUC:", roc_auc_score(y_test, y_pred_rf))

# Важность признаков

In [None]:
dt_val_rf = pd.Series(rf.feature_importances_, index=x.columns).sort_values(ascending=False)
dt_val_xgb = pd.Series(xgb.feature_importances_, index=x.columns).sort_values(ascending=False)

plt.figure(figsize=(10, 5))
plt.bar(dt_val_rf.index, dt_val_rf.values, alpha=0.6, label='RandomForest')
plt.bar(dt_val_xgb.index, dt_val_xgb.values, alpha=0.6, label='XGBoost')
plt.xticks(rotation=45, ha='right')
plt.ylabel('Важность признака')
plt.title('Сравнение важности признаков')
plt.legend()
plt.tight_layout()
plt.show()

# Вывод

Модель **XGBoost** показала себя лучше, чем Random Forest. Несмотря на близкие значения AUC, у XGB более компактное распределение F1-оценок и лучшие результаты перекрёстной проверки.

F1-score XGB: **0.9873 ± 0.0031**

F1-score Random Forest: **0.9869 ± 0.0041**

Как и ожидалось, наибольшее влияние на предсказание оказывает параметр **time of monitoring (в днях)**.