### Задание

В рамках этого итогового задания мы будем прогнозировать сердечную недостаточность.

Плана по выполнению задания не будет. 
Но есть несколько требований: 

- оберните весь конвейер преобразований в Pipeline

- подберите оптимальный вариант прогнозной модели с помощью GridSearchCV

- примените обученный на тренировочных данных конвейер к тестовым данным, никак не предобрабатывая их, а лишь загрузив из файла и отделив целевой признак от остальных

- получите на тестовой части качество не ниже 0.87 по метрике ROC AUC

Пояснение: если пропуски в новых данных и будут, то только в тех колонках, где они есть в тренировочной части.

In [5]:
import pandas as pd
import dill as pickle
import requests
import json
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin, BaseEstimator

In [6]:
# Load the training split and preview its first ten rows.
train_csv_path = 'heart_adapt_train.csv'
df_train = pd.read_csv(train_csv_path)
df_train.head(n=10)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,74.0,M,NAP,138.0,,0,Normal,116,N,0.2,Up,0
1,58.0,M,NAP,132.0,224.0,0,LVH,173,N,3.2,Up,1
2,44.0,M,ATA,150.0,288.0,0,Normal,150,Y,3.0,Flat,1
3,50.0,M,ASY,144.0,349.0,0,LVH,120,Y,1.0,Up,1
4,,M,ASY,145.0,248.0,0,Normal,96,Y,2.0,Flat,1
5,51.0,M,NAP,135.0,160.0,0,Normal,150,N,2.0,Flat,1
6,53.0,M,ASY,154.0,,1,ST,140,Y,1.5,Flat,1
7,38.0,M,NAP,138.0,175.0,0,Normal,173,N,0.0,Up,0
8,56.0,M,NAP,125.0,,1,Normal,98,N,-2.0,Flat,1
9,61.0,M,ASY,190.0,287.0,1,LVH,150,Y,2.0,Down,1


In [7]:
# Show each column's dtype and non-null count to see where values are missing.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 589 entries, 0 to 588
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             533 non-null    float64
 1   Sex             589 non-null    object 
 2   ChestPainType   589 non-null    object 
 3   RestingBP       588 non-null    float64
 4   Cholesterol     462 non-null    float64
 5   FastingBS       589 non-null    int64  
 6   RestingECG      589 non-null    object 
 7   MaxHR           589 non-null    int64  
 8   ExerciseAngina  589 non-null    object 
 9   Oldpeak         589 non-null    float64
 10  ST_Slope        589 non-null    object 
 11  HeartDisease    589 non-null    int64  
dtypes: float64(4), int64(3), object(5)
memory usage: 55.3+ KB


Пропуски присутствуют только в числовых колонках: 'Age' (56 пропусков), 'RestingBP' (1 пропуск) и 'Cholesterol' (127 пропусков).

In [8]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df_train.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,533.0,588.0,462.0,589.0,589.0,589.0,589.0
mean,54.195122,133.358844,245.632035,0.258065,134.893039,0.937521,0.646859
std,9.532661,18.851852,58.599184,0.437942,24.942596,1.071318,0.478352
min,28.0,80.0,85.0,0.0,63.0,-2.6,0.0
25%,48.0,120.0,209.0,0.0,117.0,0.0,0.0
50%,55.0,130.0,240.0,0.0,135.0,0.8,1.0
75%,61.0,144.0,279.75,1.0,154.0,1.6,1.0
max,77.0,200.0,603.0,1.0,195.0,5.0,1.0


In [9]:
# Replace missing values with the mean of the corresponding column.
# Each mean is taken from the column itself, so the fills are independent
# of one another and can be applied in a single call.
fill_values = {
    name: df_train[name].mean()
    for name in ('Age', 'RestingBP', 'Cholesterol')
}
df_train.fillna(fill_values, inplace=True)
# Confirm that no missing values remain.
df_train.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [10]:
# Summary (count / unique / top / freq) for the object-dtype (categorical) columns.
df_train.describe(include='object')

Unnamed: 0,Sex,ChestPainType,RestingECG,ExerciseAngina,ST_Slope
count,589,589,589,589,589
unique,2,4,3,2,3
top,M,ASY,Normal,N,Flat
freq,477,348,345,325,325


In [11]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of every categorical feature from the resulting dummy columns.
df_train_ohe = pd.get_dummies(data=df_train, drop_first=True)
# Show 7 randomly sampled rows, transposed so that features run down the page.
df_train_ohe.sample(n=7).transpose()

Unnamed: 0,237,248,372,160,15,175,216
Age,44.0,55.0,64.0,68.0,61.0,63.0,48.0
RestingBP,120.0,135.0,143.0,145.0,120.0,140.0,160.0
Cholesterol,220.0,250.0,306.0,245.632035,282.0,245.632035,268.0
FastingBS,0,0,1,1,0,1,0
MaxHR,170,161,115,136,135,149,103
Oldpeak,0.0,1.4,1.8,1.8,4.0,2.0,1.0
HeartDisease,0,0,1,1,1,1,1
Sex_M,True,False,True,True,True,True,True
ChestPainType_ATA,True,True,False,False,False,False,False
ChestPainType_NAP,False,False,False,False,False,False,False
