#  Medical Heart Disease **Prediction**

In [18]:
# # 📁 medical_heart_project.ipynb
# ├── 1️⃣ Настройка окружения и загрузка данных
# ├── 2️⃣ Подготовка данных (функции препроцессинга)
# ├── 3️⃣ Decision Tree (обучение, оптимизация, метрики)
# ├── 4️⃣ Logistic Regression (обучение, оптимизация, метрики)
# ├── 5️⃣ Сравнение моделей
# ├── 6️⃣ Сохранение моделей


Настройка окружения и загрузка данных

In [19]:
!pip install optuna



In [20]:
# === Установка зависимостей и импорт ===
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import pickle
import optuna

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (confusion_matrix, classification_report,
                             roc_auc_score, fbeta_score)

# === Загрузка данных ===
# Данные используются из Tech Weekend Data Science Hackathon (Kaggle)
# Альтернативная ссылка на архив: Dropbox
!wget -q 'https://www.dropbox.com/scl/fi/zos8534bw8jxqq8gzs279/tech-weekend-data-science-hackathon.zip?rlkey=fglhtz05wycbz51rtsmeso1b1&st=2h5ubtux&dl=0' -O 'tech-weekend.zip'
!unzip -q tech-weekend.zip

df = pd.read_csv('/content/train.csv')
df.head()


replace sample_submission.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: [n]o
error:  invalid response [[n]o]
replace sample_submission.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 0
error:  invalid response [0]
replace sample_submission.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

Unnamed: 0,ID,age,sex,chest,resting_blood_pressure,serum_cholestoral,fasting_blood_sugar,resting_electrocardiographic_results,maximum_heart_rate_achieved,exercise_induced_angina,oldpeak,slope,number_of_major_vessels,thal,class
0,0,49.207124,0,4.0,162.996167,181.108682,0,0,148.227858,1,0.944547,2,0,3,1
1,1,53.628425,1,1.741596,130.23373,276.47463,0,2,152.917139,0,0.11907,2,0,3,0
2,2,49.591426,1,4.0,146.999012,223.300517,1,2,102.35209,1,1.616747,2,2,7,1
3,3,58.991445,1,4.0,112.369143,187.245501,0,0,158.16475,1,0.0,1,1,7,1
4,4,51.053602,1,1.954609,138.032047,238.482868,0,0,172.540828,0,1.150464,1,1,3,0


Подготовка данных (функции препроцессинга)

In [21]:
def preprocess_data(df):
    """Обработка пропусков, масштабирование и разделение данных"""
    X = df.drop(columns=['ID', 'class'])
    y = df['class']

    binary_cols = ['sex', 'fasting_blood_sugar', 'exercise_induced_angina']
    num_cols = [col for col in X.columns if col not in binary_cols]

    X[num_cols] = X[num_cols].fillna(X[num_cols].median())
    X[binary_cols] = X[binary_cols].fillna(X[binary_cols].mode().iloc[0])

    scaler = StandardScaler()
    X[num_cols] = scaler.fit_transform(X[num_cols])

    return X, y, scaler

# --- Применение ---
X, y, scaler = preprocess_data(df)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
print("✅ Данные подготовлены:", X_train.shape, X_test.shape)


✅ Данные подготовлены: (480000, 13) (120000, 13)


3️⃣ Decision Tree

In [22]:
from sklearn.tree import DecisionTreeClassifier

def train_decision_tree(X_train, X_test, y_train, y_test):
    """Обучение DecisionTree с подбором параметров Optuna"""
    def objective(trial):
        params = {
            'max_depth': trial.suggest_int('max_depth', 3, 15),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
            'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
            'class_weight': 'balanced'
        }
        model = DecisionTreeClassifier(**params, random_state=42)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return fbeta_score(y_test, y_pred, beta=2)

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=30, timeout=300)

    best_params = study.best_params
    print("🎯 Лучшие параметры:", best_params)

    best_tree = DecisionTreeClassifier(**best_params, random_state=42)
    best_tree.fit(X_train, y_train)

    y_pred_proba = best_tree.predict_proba(X_test)[:, 1]
    thresholds = np.linspace(0.01, 0.99, 100)
    f2_scores = [fbeta_score(y_test, (y_pred_proba >= t).astype(int), beta=2) for t in thresholds]
    optimal_threshold = thresholds[np.argmax(f2_scores)]

    print(f"✅ Optimal threshold for F2={max(f2_scores):.3f}: {optimal_threshold:.3f}")
    y_pred_opt = (y_pred_proba >= optimal_threshold).astype(int)

    print("\n=== DecisionTree ===")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_opt))
    print("\nClassification Report:\n", classification_report(y_test, y_pred_opt))
    print("ROC AUC:", roc_auc_score(y_test, y_pred_proba))

    return best_tree, optimal_threshold

tree_model, dt_threshold = train_decision_tree(X_train, X_test, y_train, y_test)


[I 2025-10-07 10:12:35,691] A new study created in memory with name: no-name-c0dbb37e-7ee4-4ed9-af0c-d7175ff4a820
[I 2025-10-07 10:12:41,965] Trial 0 finished with value: 0.8797881056329334 and parameters: {'max_depth': 12, 'min_samples_split': 5, 'min_samples_leaf': 2, 'criterion': 'entropy'}. Best is trial 0 with value: 0.8797881056329334.
[I 2025-10-07 10:12:44,581] Trial 1 finished with value: 0.8486094680111919 and parameters: {'max_depth': 5, 'min_samples_split': 3, 'min_samples_leaf': 4, 'criterion': 'gini'}. Best is trial 0 with value: 0.8797881056329334.
[I 2025-10-07 10:12:48,626] Trial 2 finished with value: 0.8769741227465861 and parameters: {'max_depth': 7, 'min_samples_split': 5, 'min_samples_leaf': 2, 'criterion': 'entropy'}. Best is trial 0 with value: 0.8797881056329334.
[I 2025-10-07 10:12:53,635] Trial 3 finished with value: 0.8813349860736714 and parameters: {'max_depth': 9, 'min_samples_split': 8, 'min_samples_leaf': 4, 'criterion': 'entropy'}. Best is trial 3 with

🎯 Лучшие параметры: {'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 2, 'criterion': 'gini'}
✅ Optimal threshold for F2=0.910: 0.158

=== DecisionTree ===
Confusion Matrix:
 [[50412 16286]
 [ 2247 51055]]

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.76      0.84     66698
           1       0.76      0.96      0.85     53302

    accuracy                           0.85    120000
   macro avg       0.86      0.86      0.85    120000
weighted avg       0.87      0.85      0.85    120000

ROC AUC: 0.9558683365780674


4️⃣ Logistic Regression

In [23]:
from sklearn.linear_model import LogisticRegression

def train_logistic_regression(X_train, X_test, y_train, y_test):
    log_reg = LogisticRegression(
        class_weight='balanced',
        solver='liblinear',
        max_iter=1000,
        random_state=42
    )
    log_reg.fit(X_train, y_train)
    y_pred_proba = log_reg.predict_proba(X_test)[:, 1]

    thresholds = np.linspace(0.01, 0.99, 100)
    f2_scores = [fbeta_score(y_test, (y_pred_proba >= t).astype(int), beta=2) for t in thresholds]
    optimal_threshold = thresholds[np.argmax(f2_scores)]

    print(f"✅ Optimal threshold for F2={max(f2_scores):.3f}: {optimal_threshold:.3f}")
    y_pred_opt = (y_pred_proba >= optimal_threshold).astype(int)

    print("\n=== Logistic Regression ===")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_opt))
    print("\nClassification Report:\n", classification_report(y_test, y_pred_opt))
    print("ROC AUC:", roc_auc_score(y_test, y_pred_proba))

    return log_reg, optimal_threshold

log_reg_model, lr_threshold = train_logistic_regression(X_train, X_test, y_train, y_test)


✅ Optimal threshold for F2=0.901: 0.208

=== Logistic Regression ===
Confusion Matrix:
 [[48148 18550]
 [ 2366 50936]]

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.72      0.82     66698
           1       0.73      0.96      0.83     53302

    accuracy                           0.83    120000
   macro avg       0.84      0.84      0.83    120000
weighted avg       0.86      0.83      0.83    120000

ROC AUC: 0.9482811487853644


Сравнение моделей

In [24]:
models_comparison = pd.DataFrame({
    'Model': ['Decision Tree', 'Logistic Regression'],
    'Optimal Threshold': [dt_threshold, lr_threshold],
    'ROC AUC': [
        roc_auc_score(y_test, tree_model.predict_proba(X_test)[:, 1]),
        roc_auc_score(y_test, log_reg_model.predict_proba(X_test)[:, 1])
    ]
})
models_comparison


Unnamed: 0,Model,Optimal Threshold,ROC AUC
0,Decision Tree,0.158485,0.955868
1,Logistic Regression,0.20798,0.948281


Сохранение моделей

In [25]:
joblib.dump(tree_model, 'decision_tree_model.joblib')
joblib.dump(log_reg_model, 'logistic_regression_model.joblib')

with open('dt_threshold.pkl', 'wb') as f:
    pickle.dump(dt_threshold, f)

with open('lr_threshold.pkl', 'wb') as f:
    pickle.dump(lr_threshold, f)

print("✅ Модели и пороги сохранены!")


✅ Модели и пороги сохранены!
