# Исследование: Прогноз сердечных заболеваний

## Содержание: основной причиной смерти в развитых странах являются болезни сердца. Поэтому необходимо проделать работу, чтобы помочь предотвратить риск сердечного приступа или инсульта.

## Постановка задачи: использоват имеющийся набор данных, чтобы предсказать, какие пациенты с наибольшей вероятностью будут страдать от сердечно-сосудистых заболеваний в ближайшем будущем.

#### Материал взят из репозитория машинного обучения Калифорнийского университета в Ирвине по адресу https://archive.ics.uci.edu/ml/datasets/Heart+Disease.

## Шаг 1. Откроем файл с данными и прочитаем его

In [33]:
#Импорт библиотек и различных метрик для работы 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score 
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.utils import shuffle
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV, StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [34]:
data = pd.read_csv('C:/Users/User/Downloads/Heart_Disease_Prediction.csv') 

In [35]:
data.info()
display (data)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      270 non-null    int64  
 1   Sex                      270 non-null    int64  
 2   Chest pain type          270 non-null    int64  
 3   BP                       270 non-null    int64  
 4   Cholesterol              270 non-null    int64  
 5   FBS over 120             270 non-null    int64  
 6   EKG results              270 non-null    int64  
 7   Max HR                   270 non-null    int64  
 8   Exercise angina          270 non-null    int64  
 9   ST depression            270 non-null    float64
 10  Slope of ST              270 non-null    int64  
 11  Number of vessels fluro  270 non-null    int64  
 12  Thallium                 270 non-null    int64  
 13  Heart Disease            270 non-null    object 
dtypes: float64(1), int64(12), 

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,Presence
1,67,0,3,115,564,0,2,160,0,1.6,2,0,7,Absence
2,57,1,2,124,261,0,0,141,0,0.3,1,0,7,Presence
3,64,1,4,128,263,0,0,105,1,0.2,2,1,7,Absence
4,74,0,2,120,269,0,2,121,1,0.2,1,1,3,Absence
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,52,1,3,172,199,1,0,162,0,0.5,1,0,7,Absence
266,44,1,2,120,263,0,0,173,0,0.0,1,0,7,Absence
267,56,0,2,140,294,0,2,153,0,1.3,2,0,3,Absence
268,57,1,4,140,192,0,0,148,0,0.4,2,0,6,Absence


### Описание данных

Age: The person’s age in years

Sex: The person’s sex (1 = male, 0 = female)

Chest pain type: chest pain type:
                 Value 1: asymptomatic
                 Value 2: atypical angina
                 Value 3: non-anginal pain
                 Value 4: typical angina

BP: The person’s resting blood pressure (mm Hg on admission to the hospital)

Cholesterol: The person’s cholesterol measurement in mg/dl

FBS over 120: The person’s fasting blood sugar (> 120 mg/dl, 1 = true; 0 = false)

EKG results: resting electrocardiographic results — 
             Value 0: showing probable or definite left ventricular hypertrophy by Estes’ criteria
             Value 1: normal 
             Value 2: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)

Max HR: The person’s maximum heart rate achieved

Exercise angina: Exercise induced angina (1 = yes; 0 = no)

ST depression: ST depression induced by exercise relative to rest (‘ST’ relates to positions on the ECG plot. See more here)

Slope of ST: the slope of the peak exercise ST segment 
             0: downsloping; 
             1: flat; 
             2: upsloping

Number of vessels fluro: The number of major vessels (0–3)





Thallium: A blood disorder called thalassemia 
          Value 0: NULL (dropped from the dataset previously 
          Value 1: fixed defect (no blood flow in some part of the heart) 
          Value 2: normal blood flow 
          Value 3: reversible defect (a blood flow is observed but it is not normal)	

target: Heart Disease ((1 = yes(Presence), 0 = no(Absence)))



# Шаг 2. Предобработка и исследовательский анализ данных

In [36]:
#Изменим на змеиный шрифт названия столбцов и приведем к единообразию
data = data.rename(columns = {'Heart Disease': 'heart_disease', 'Age': 'age', 'Sex': 'sex', 
                              'Chest pain type': 'chest_pain_type', 'BP': 'bp', 'Cholesterol': 'cholesterol', 
                              'FBS over 120': 'fbs_over_120', 'EKG results': 'ekg_results', 'Max HR': 'max_hr', 
                              'Exercise angina': 'exercise_angina', 'ST depression': 'st_depression', 
                              'Slope of ST': 'slope_of_st','Number of vessels fluro': 'number_of_vessels_fluro', 
                              'Thallium': 'thallium', 'Heart Disease': 'heart_disease' })
#Проверим 
display (data)

Unnamed: 0,age,sex,chest_pain_type,bp,cholesterol,fbs_over_120,ekg_results,max_hr,exercise_angina,st_depression,slope_of_st,number_of_vessels_fluro,thallium,heart_disease
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,Presence
1,67,0,3,115,564,0,2,160,0,1.6,2,0,7,Absence
2,57,1,2,124,261,0,0,141,0,0.3,1,0,7,Presence
3,64,1,4,128,263,0,0,105,1,0.2,2,1,7,Absence
4,74,0,2,120,269,0,2,121,1,0.2,1,1,3,Absence
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,52,1,3,172,199,1,0,162,0,0.5,1,0,7,Absence
266,44,1,2,120,263,0,0,173,0,0.0,1,0,7,Absence
267,56,0,2,140,294,0,2,153,0,1.3,2,0,3,Absence
268,57,1,4,140,192,0,0,148,0,0.4,2,0,6,Absence


In [37]:
data.shape

(270, 14)

##### Проверим на возможность ошибок в датасете

In [38]:
data['age'].unique()

array([70, 67, 57, 64, 74, 65, 56, 59, 60, 63, 53, 44, 61, 71, 46, 40, 48,
       43, 47, 54, 51, 58, 66, 37, 50, 42, 62, 49, 52, 45, 41, 76, 39, 35,
       55, 34, 38, 69, 68, 77, 29], dtype=int64)

In [39]:
data['heart_disease'].unique()

array(['Presence', 'Absence'], dtype=object)

In [40]:
data.isna().sum()

age                        0
sex                        0
chest_pain_type            0
bp                         0
cholesterol                0
fbs_over_120               0
ekg_results                0
max_hr                     0
exercise_angina            0
st_depression              0
slope_of_st                0
number_of_vessels_fluro    0
thallium                   0
heart_disease              0
dtype: int64

In [41]:
data['ekg_results'].unique()

array([2, 0, 1], dtype=int64)

In [42]:
# Поиск дубликатов
data.duplicated().sum()

0

In [43]:
data.describe()

Unnamed: 0,age,sex,chest_pain_type,bp,cholesterol,fbs_over_120,ekg_results,max_hr,exercise_angina,st_depression,slope_of_st,number_of_vessels_fluro,thallium
count,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0
mean,54.433333,0.677778,3.174074,131.344444,249.659259,0.148148,1.022222,149.677778,0.32963,1.05,1.585185,0.67037,4.696296
std,9.109067,0.468195,0.95009,17.861608,51.686237,0.355906,0.997891,23.165717,0.470952,1.14521,0.61439,0.943896,1.940659
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0
25%,48.0,0.0,3.0,120.0,213.0,0.0,0.0,133.0,0.0,0.0,1.0,0.0,3.0
50%,55.0,1.0,3.0,130.0,245.0,0.0,2.0,153.5,0.0,0.8,2.0,0.0,3.0
75%,61.0,1.0,4.0,140.0,280.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,7.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0


data['heart_disease'].unique()

Вывод: данные чистые, предварительной обработки не потребовалось. Удалять столбцы не будем, т.к. это медицинские понятия и наличие одного или другого показателя может влиять на болезнь человека
Целевой признак будет формироваться из колонки наличие болезни "heart_disease", т.к. в задании просят предсказать, какие пациенты с наибольшей вероятностью будут страдать от сердечно-сосудистых заболеваний.

#### Применим one hot encoding 

In [44]:
# Применим one hot encoding к датафрейму, избежав "ловушку фиктивных признаков"
data_ohe=pd.get_dummies(data, drop_first=True)
data_ohe.head()

Unnamed: 0,age,sex,chest_pain_type,bp,cholesterol,fbs_over_120,ekg_results,max_hr,exercise_angina,st_depression,slope_of_st,number_of_vessels_fluro,thallium,heart_disease_Presence
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,1
1,67,0,3,115,564,0,2,160,0,1.6,2,0,7,0
2,57,1,2,124,261,0,0,141,0,0.3,1,0,7,1
3,64,1,4,128,263,0,0,105,1,0.2,2,1,7,0
4,74,0,2,120,269,0,2,121,1,0.2,1,1,3,0


In [45]:
features=data_ohe.drop(['heart_disease_Presence'],axis=1) # Признаки
target=data_ohe['heart_disease_Presence'] # Целевой признак

#### Данные разобьем на три части: обучающую, валидационную и тестовую. Исходные данные разбиваем в соотношении 3:1:1.

In [46]:
train_features, test_features, train_target, test_target = train_test_split(features, target, 
                                                    train_size=0.6, 
                                                    random_state=12345,
                                                    stratify=target,shuffle=True)
print('train_features',train_features.shape)

train_features (162, 13)


In [47]:
test_features, valid_features, test_target, valid_target = train_test_split(test_features, test_target, 
                                                    train_size=0.5, 
                                                    random_state=12345,
                                                    stratify= test_target, shuffle=True)

In [48]:
model_logistic=LogisticRegression(random_state=12345, solver='liblinear').fit(train_features,train_target)
prediction_logistic=model_logistic.predict(test_features)
print('F1:',f1_score(valid_target,prediction_logistic))

F1: 0.5106382978723404


#####   Проверим соотношение классов

In [49]:
print('Размеры выборок:')
print('train_features',train_features.shape)
print('valid_features', valid_features.shape)
print('test_features', test_features.shape)
print('train_target', train_target.shape)
print('valid_target', valid_target.shape)
print('test_target', test_target.shape)

Размеры выборок:
train_features (162, 13)
valid_features (54, 13)
test_features (54, 13)
train_target (162,)
valid_target (54,)
test_target (54,)


In [50]:
print('соотношение классов в исходном датасете:', round((target[target == 0].count()/target[target == 1].count()),2))
print('соотношение классов на учебной выборке:', round((train_target[train_target == 0
                                                             ].count()/train_target[train_target == 1].count()),2))
print('соотношение классов на валидационной выборке:', round((valid_target[valid_target == 0
                                                                   ].count()/valid_target[valid_target == 1].count()),2))
print('соотношение классов на тестовой выборке:', round((test_target[test_target == 0].count()/
                                                         test_target[test_target == 1].count()),2))

соотношение классов в исходном датасете: 1.25
соотношение классов на учебной выборке: 1.25
соотношение классов на валидационной выборке: 1.25
соотношение классов на тестовой выборке: 1.25


In [51]:
# Стандартизируем выборки, т.к есть значения и маленькие и очень большие. 
numeric=['age','bp','cholesterol','max_hr']
scaler=StandardScaler()
scaler.fit(train_features[numeric])
train_features[numeric] = scaler.transform(train_features[numeric])
scaler.fit(valid_features[numeric])
valid_features[numeric] = scaler.transform(valid_features[numeric])
scaler.fit(test_features[numeric])
test_features[numeric] = scaler.transform(test_features[numeric])
display(train_features.head(5))
display(valid_features.head(5))
display(test_features.head(5))

Unnamed: 0,age,sex,chest_pain_type,bp,cholesterol,fbs_over_120,ekg_results,max_hr,exercise_angina,st_depression,slope_of_st,number_of_vessels_fluro,thallium
94,-0.277557,1,4,-0.400792,-0.630817,0,0,0.857684,0,1.0,1,2,7
72,1.672702,1,2,1.259916,-0.019109,0,2,-0.239133,0,0.0,1,0,3
189,1.672702,1,3,1.474201,0.42577,0,0,-1.599186,1,2.9,2,1,7
47,-1.144339,1,4,-1.20436,-0.908866,0,2,1.252538,0,0.0,1,1,3
1,1.347658,0,3,-0.936504,5.894067,0,2,0.506702,0,1.6,2,0,7


Unnamed: 0,age,sex,chest_pain_type,bp,cholesterol,fbs_over_120,ekg_results,max_hr,exercise_angina,st_depression,slope_of_st,number_of_vessels_fluro,thallium
232,-0.951149,1,4,-0.98647,0.179209,0,2,1.305306,0,0.0,1,0,3
269,1.688956,1,4,1.700886,0.694801,0,2,-1.938844,1,1.5,2,3,3
188,1.088932,0,4,0.506506,2.836493,0,2,0.125615,0,1.2,2,0,3
119,0.368903,1,4,-0.090684,0.63531,1,2,-2.149503,1,1.6,3,0,7
137,0.368903,1,4,-0.38928,-0.038927,1,2,-0.422098,1,1.2,2,1,3


Unnamed: 0,age,sex,chest_pain_type,bp,cholesterol,fbs_over_120,ekg_results,max_hr,exercise_angina,st_depression,slope_of_st,number_of_vessels_fluro,thallium
209,-2.015415,1,3,0.14264,-0.211941,0,0,1.653599,0,3.5,3,0,3
163,0.265504,1,4,0.015325,-0.005522,0,2,-0.828009,1,3.0,2,2,7
240,1.351656,1,3,3.325511,0.338512,1,2,0.042731,1,1.6,2,0,7
145,-0.277572,1,4,-0.302962,0.521996,0,0,-2.351804,1,2.0,2,2,7
151,-0.603418,0,3,-0.493934,-0.922944,0,0,0.391027,0,1.6,2,0,3


### Обучение моделей

#### Модел DecisionTree

In [52]:
# Подберем оптимальные параметры для модели решающего дерева
parameters = {
    'criterion' : ['gini', 'entropy'],
    'max_depth':[5,10,15,20]
}

dtc = DecisionTreeClassifier(random_state=12345)

clf_dtc = GridSearchCV(
    estimator = dtc,
    param_grid = parameters,
    scoring = 'f1',
    cv=5)

clf_dtc.fit(train_features, train_target)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=12345),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [5, 10, 15, 20]},
             scoring='f1')

In [58]:
# Обучим модель решающего дерева с лучшими гиперпараметрами и рассчитаем метрики на тестовой выборке
model_tree = clf_dtc.best_estimator_.fit(train_features,train_target)
prediction_tree = model_tree.predict(valid_features)
print("accuracy:", round ((model_tree.score(train_features,train_target)),2))
print("precision:", round((precision_score(valid_target, prediction_tree)),2))
print("recall:", round((recall_score(valid_target, prediction_tree)),2))
print("F1:", round ((f1_score(valid_target,prediction_tree)),2))
print("AUC ROC:", round(roc_auc_score(valid_target, (model_tree.predict_proba(valid_features))[:, 1]),4))
print("confusion_matrix:")
print(confusion_matrix(valid_target, prediction_tree))

accuracy: 0.91
precision: 0.78
recall: 0.88
F1: 0.82
AUC ROC: 0.8944
confusion_matrix:
[[24  6]
 [ 3 21]]


### Борьба с дисбалансом

##### Воспользуемся функцией Upsampling и увеличим ... (target==1)

In [54]:
## Функция Upsampling
def upsampling (features,target,repeat):
    #Разделяем выборку по значениям целевой функции
    target_one=target[target==1]
    target_null=target[target==0]
    features_one=features[target==1]
    features_null=features[target==0]
    
    #Увеличиваем и соединяем обратно
    upsampling_features=pd.concat([features_null]+[features_one]*repeat)
    upsampling_target=pd.concat([target_null]+[target_one]*repeat)
    
    # Перемешиваем
    upsampling_features,upsampling_target=shuffle(upsampling_features,upsampling_target,random_state=1234)
    
    return upsampling_features,upsampling_target

In [55]:
features_balance_up,target_balance_up=upsampling(train_features,train_target,4)

##### Воспользуемся функцией Downsampling и уменьшим...(target==1)

In [56]:
## Функция Downsampling
def downsampling (features,target,repeat):
    #Разделяем выборку по значениям целевой функции
    target_one=target[target==1]
    target_null=target[target==0]
    features_one=features[target==1]
    features_null=features[target==0]
    
    #Увеличиваем и соединяем обратно
    downsampling_features=pd.concat([features_null]+[features_one]*repeat)
    downsampling_target=pd.concat([target_null]+[target_one]*repeat)
    
    # Перемешиваем
    downsampling_features,downsampling_target=shuffle(downsampling_features,downsampling_target,random_state=1234)
    
    return downsampling_features,downsampling_target

In [57]:
features_balance_down,target_balance_down=downsampling(train_features,train_target,4)