In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('heart_attack_risk_dataset.csv')

In [3]:
df

Unnamed: 0,Age,Gender,Smoking,Alcohol_Consumption,Physical_Activity_Level,BMI,Diabetes,Hypertension,Cholesterol_Level,Resting_BP,Heart_Rate,Family_History,Stress_Level,Chest_Pain_Type,Thalassemia,Fasting_Blood_Sugar,ECG_Results,Exercise_Induced_Angina,Max_Heart_Rate_Achieved,Heart_Attack_Risk
0,69,Female,1,0,Moderate,34.61,1,0,152.1,171,85,0,Moderate,Non-anginal,Reversible defect,0,Normal,0,114,Low
1,32,Male,0,0,Moderate,22.75,0,0,166.8,126,103,0,Low,Asymptomatic,Normal,0,ST-T abnormality,0,173,Moderate
2,89,Male,0,1,Moderate,35.32,0,0,272.3,123,127,0,Low,Typical,Reversible defect,0,ST-T abnormality,0,109,Low
3,78,Male,0,1,Moderate,18.23,1,0,237.7,144,125,0,Low,Typical,Fixed defect,1,Left Ventricular Hypertrophy,0,129,Low
4,38,Female,1,0,Moderate,19.82,0,0,207.7,123,107,0,High,Asymptomatic,Reversible defect,0,ST-T abnormality,0,124,Moderate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,21,Male,0,0,Low,39.93,0,0,269.9,171,113,0,High,Typical,Reversible defect,0,ST-T abnormality,0,122,High
49996,35,Female,0,0,Low,18.10,0,0,235.8,146,71,0,Moderate,Non-anginal,Fixed defect,1,ST-T abnormality,0,121,Moderate
49997,46,Male,0,1,High,21.42,0,0,172.8,146,85,1,Low,Typical,Fixed defect,0,Left Ventricular Hypertrophy,0,125,Low
49998,56,Male,0,1,Low,29.93,0,0,244.1,151,110,0,Low,Asymptomatic,Reversible defect,0,Normal,0,149,Moderate


In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import StackingClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Загрузка данных
df = pd.read_csv('heart_attack_risk_dataset.csv')

# Кастомные медицинские признаки
df['Cardio_Stress_Index'] = df['Resting_BP'] * df['Heart_Rate'] / (df['Age'] + 1e-5)
df['Metabolic_Syndrome'] = (df['BMI'] > 30).astype(int) * (df['Hypertension'] == 1).astype(int) * (df['Diabetes'] == 1).astype(int)
df['Vascular_Age'] = df['Cholesterol_Level'] * 0.1 + df['Age'] * 0.8
df['BP_to_Cholesterol_Ratio'] = df['Resting_BP'] / (df['Cholesterol_Level'] + 1e-5)

# Обработка категориальных признаков
categorical_cols = [
    'Gender', 'Physical_Activity_Level', 'Chest_Pain_Type',
    'Thalassemia', 'ECG_Results', 'Stress_Level'
]

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
        ('num', RobustScaler(), ['Age', 'BMI', 'Cholesterol_Level', 'Resting_BP'])
    ],
    remainder='passthrough'
)

# Целевая переменная
y = df['Heart_Attack_Risk'].map({'Low':0, 'Moderate':1, 'High':2})

In [None]:
# Разделение данных
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('Heart_Attack_Risk', axis=1), 
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# Пайплайн с балансировкой и моделью
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(k_neighbors=3)),
    ('model', StackingClassifier(
        estimators=[
            ('xgb', XGBClassifier(
                max_depth=7,
                learning_rate=0.05,
                n_estimators=300,
                subsample=0.8,
                colsample_bytree=0.7,
                gamma=0.3,
                reg_alpha=0.1,
                reg_lambda=0.8,
                eval_metric='mlogloss',
                enable_categorical=True
            )),
            ('catboost', CatBoostClassifier(
                depth=8,
                iterations=1000,
                learning_rate=0.03,
                silent=True
            )),
            ('lgbm', LGBMClassifier(
                num_leaves=63,
                max_depth=8,
                learning_rate=0.05,
                n_estimators=500
            ))
        ],
        final_estimator=XGBClassifier(max_depth=5, learning_rate=0.1, n_estimators=200),
        cv=3,
        passthrough=True,
        stack_method='predict_proba'
    ))
])

# Параметры для оптимизации
param_grid = {
    'model__xgb__max_depth': [5, 7],
    'model__xgb__learning_rate': [0.03, 0.05],
    'model__catboost__depth': [6, 8],
    'model__lgbm__num_leaves': [63, 127],
    'smote__k_neighbors': [3, 5]
}

# Поиск по сетке
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=StratifiedKFold(n_splits=3, shuffle=True),
    scoring='roc_auc_ovr',
    n_jobs=2,  # Уменьшаем количество параллельных задач
    verbose=2
)

grid_search.fit(X_train, y_train)

# Лучшая модель
best_model = grid_search.best_estimator_

# Предсказания
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)

# Метрики
print("Best Parameters:", grid_search.best_params_)
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("ROC-AUC (OvR):", roc_auc_score(y_test, y_proba, multi_class='ovr'))


In [47]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import StackingClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from tqdm.auto import tqdm
import warnings

warnings.filterwarnings('ignore')

# Загрузка данных
df = pd.read_csv('heart_attack_risk_dataset.csv')

# Генерация ключевых признаков
df['BP_Age_Ratio'] = df['Resting_BP'] / (df['Age'] + 1e-5)
df['Cholesterol_Age_Index'] = df['Cholesterol_Level'] * df['Age']

# Выбор наиболее важных признаков (можно изменить)
selected_features = [
    'Age', 'Resting_BP', 'Cholesterol_Level', 'BMI',
    'BP_Age_Ratio', 'Cholesterol_Age_Index',
    'Gender', 'Chest_Pain_Type', 'ECG_Results'
]

# Препроцессинг
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), 
     ['Gender', 'Chest_Pain_Type', 'ECG_Results']),
    ('num', RobustScaler(), ['Age', 'BMI', 'Resting_BP', 'Cholesterol_Level'])
])

# Инициализация моделей с оптимизированными параметрами
models = {
    'xgb': XGBClassifier(
        n_estimators=10,
        eval_metric='mlogloss',
        enable_categorical=True,
        tree_method='hist'
    ),
    'lgbm': LGBMClassifier(
        n_estimators=10,
        verbose=-1,
        boosting_type='dart',
        objective='multiclass'
    ),
    'catboost': CatBoostClassifier(
        iterations=10,
        silent=True,
        task_type='CPU',
        auto_class_weights='Balanced'
    )
}

# Пайплайн с ансамблем
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(k_neighbors=3)),
    ('ensemble', StackingClassifier(
        estimators=list(models.items()),
        final_estimator=XGBClassifier(n_estimators=100),
        cv=3,
        passthrough=True,
        stack_method='predict_proba'
    ))
])

# Оптимизированная сетка параметров
param_grid = {
    'ensemble__xgb__max_depth': [3, 5],
    'ensemble__lgbm__num_leaves': [31, 63],
    'ensemble__catboost__depth': [4, 6],
    'smote__k_neighbors':  [3, 5]
}

# Инициализация GridSearch
grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=StratifiedKFold(2, shuffle=True),
    scoring='roc_auc_ovr',
    n_jobs=4,
    verbose=0
)

# Прогресс-бар
total_combinations = len(param_grid['ensemble__xgb__max_depth']) * \
                    len(param_grid['ensemble__lgbm__num_leaves']) * \
                    len(param_grid['ensemble__catboost__depth']) * \
                    len(param_grid['smote__k_neighbors']) * \
                    grid.cv.n_splits

with tqdm(total=total_combinations, desc='Grid Search') as pbar:
    grid.fit(df[selected_features], df['Heart_Attack_Risk'].map({'Low':0, 'Moderate':1, 'High':2}))
    pbar.update(total_combinations)

# Результаты
print(f"\nBest ROC-AUC: {grid.best_score_:.4f}")
print("Best Params:", grid.best_params_)


Grid Search: 100%|██████████| 32/32 [00:43<00:00,  1.35s/it]


Best ROC-AUC: 0.5010
Best Params: {'ensemble__catboost__depth': 4, 'ensemble__lgbm__num_leaves': 31, 'ensemble__xgb__max_depth': 3, 'smote__k_neighbors': 5}





In [50]:
# После получения предсказаний добавить:
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score

# Вычисление метрик
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr')

print("\n=== Final Metrics ===")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision (weighted): {precision:.4f}")
print(f"Recall (weighted): {recall:.4f}")
print(f"ROC-AUC (OvR): {roc_auc:.4f}\n")

# Детальный отчет по классам
print("=== Classification Report ===")
print(classification_report(y_test, y_pred, target_names=['Low', 'Moderate', 'High']))

# Матрица ошибок
print("=== Confusion Matrix ===")
print(pd.crosstab(y_test, y_pred, 
                rownames=['True'], 
                colnames=['Predicted'], 
                margins=True))



=== Final Metrics ===
Accuracy: 0.2918
Precision (weighted): 0.3803
Recall (weighted): 0.2918
ROC-AUC (OvR): 0.5008

=== Classification Report ===
              precision    recall  f1-score   support

         Low       0.50      0.01      0.02      5005
    Moderate       0.30      0.91      0.45      2981
        High       0.21      0.08      0.12      2014

    accuracy                           0.29     10000
   macro avg       0.33      0.33      0.19     10000
weighted avg       0.38      0.29      0.16     10000

=== Confusion Matrix ===
Predicted   0     1    2    All
True                           
0          39  4580  386   5005
1          19  2716  246   2981
2          20  1831  163   2014
All        78  9127  795  10000
