# Загружаем и обрабатываем данные

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
data = pd.read_csv('heart.csv')
data


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [2]:
x = data.drop(columns = 'HeartDisease')
y = data['HeartDisease']

In [3]:
categorials = x.select_dtypes('object').columns
x[categorials] = x[categorials].fillna('nan')
x_int =x.select_dtypes(['int', 'float']).columns.tolist()
x[x_int].head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak
0,40,140,289,0,172,0.0
1,49,160,180,0,156,1.0
2,37,130,283,0,98,0.0
3,48,138,214,0,108,1.5
4,54,150,195,0,122,0.0


In [4]:
dummy_x = pd.get_dummies(x[categorials], columns = categorials)
X = pd.concat([x[x_int].fillna(-999), dummy_x], axis=1)
X

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,1,0,1,0,0,0,1,0,1,0,0,0,1
1,49,160,180,0,156,1.0,1,0,0,0,1,0,0,1,0,1,0,0,1,0
2,37,130,283,0,98,0.0,0,1,0,1,0,0,0,0,1,1,0,0,0,1
3,48,138,214,0,108,1.5,1,0,1,0,0,0,0,1,0,0,1,0,1,0
4,54,150,195,0,122,0.0,0,1,0,0,1,0,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,110,264,0,132,1.2,0,1,0,0,0,1,0,1,0,1,0,0,1,0
914,68,144,193,1,141,3.4,0,1,1,0,0,0,0,1,0,1,0,0,1,0
915,57,130,131,0,115,1.2,0,1,1,0,0,0,0,1,0,0,1,0,1,0
916,57,130,236,0,174,0.0,1,0,0,1,0,0,1,0,0,1,0,0,1,0


# Строим модель логистической регрессии и получаем метрики с помощью cross_validate

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [6]:
from sklearn.linear_model  import LogisticRegression

In [7]:
model = LogisticRegression()

In [8]:
model.fit(X_train, y_train)

LogisticRegression()

In [9]:
pred = model.predict(X_test)


In [10]:
from sklearn.model_selection import cross_validate
scores = cross_validate(LogisticRegression(), X, y, cv=10, scoring=('accuracy','recall','precision','f1'))
mean_score = mean_scores = {(key, values.mean()) for (key, values) in scores.items()}
mean_score




{('fit_time', 0.027179813385009764),
 ('score_time', 0.003622579574584961),
 ('test_accuracy', 0.8527353081700909),
 ('test_f1', 0.8657492902467869),
 ('test_precision', 0.8688872883506861),
 ('test_recall', 0.8714901960784314)}

# Оптимизируем параметры модели с помощью GridSearchCV

In [11]:
from sklearn.model_selection import GridSearchCV
parameters = {'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 'penalty': ['elasticnet', 'l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV( estimator = model,  
    param_grid = parameters, cv=10, scoring='accuracy')
grid.fit(X, y)


GridSearchCV(cv=10, estimator=LogisticRegression(),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100],
                         'penalty': ['elasticnet', 'l1', 'l2'],
                         'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag',
                                    'saga']},
             scoring='accuracy')

In [12]:
print(grid.best_params_)

{'C': 1, 'penalty': 'l2', 'solver': 'newton-cg'}


In [13]:
model_grid = LogisticRegression(C=100, penalty='l2', solver='lbfgs')
model_grid.fit(X_train, y_train)

scores = cross_validate(model_grid, X, y, cv=10, scoring=['accuracy','recall','precision','f1'])
mean_score_g = {(key, values.mean()) for (key, values) in scores.items()}
mean_score_g

{('fit_time', 0.024716973304748535),
 ('score_time', 0.003563070297241211),
 ('test_accuracy', 0.8527353081700909),
 ('test_f1', 0.8664811559523521),
 ('test_precision', 0.8668852889286441),
 ('test_recall', 0.8754117647058823)}

# Оптимизируем параметры с помощью RandomizedSearchCV

In [14]:
from sklearn.model_selection import RandomizedSearchCV
rand = RandomizedSearchCV(model, parameters)
rand.fit(X, y)

RandomizedSearchCV(estimator=LogisticRegression(),
                   param_distributions={'C': [0.001, 0.01, 0.1, 1, 10, 100],
                                        'penalty': ['elasticnet', 'l1', 'l2'],
                                        'solver': ['newton-cg', 'lbfgs',
                                                   'liblinear', 'sag',
                                                   'saga']})

In [15]:
print(rand.best_params_)

{'solver': 'liblinear', 'penalty': 'l1', 'C': 1}


In [16]:
model_rand = LogisticRegression(C=0.1, penalty='l2', solver='lbfgs')
model_rand.fit(X_train, y_train)

scores = cross_validate(model_rand, X, y, cv=10, scoring=['accuracy','recall','precision','f1'])
mean_score_r = {(key, values.mean()) for (key, values) in scores.items()}

mean_score_r

{('fit_time', 0.02506439685821533),
 ('score_time', 0.0035636186599731444),
 ('test_accuracy', 0.8494983277591974),
 ('test_f1', 0.8625049076651354),
 ('test_precision', 0.8645221888146987),
 ('test_recall', 0.8696078431372548)}

# Выводы

Оптимизация параметров данном случае не показала существенного улучшения метрик модели, хотя если сравнивать с домашним заданием "Ансамблирование", то там мы сталкивались с переобучением модели в разных вариантов. Если подбор параметров решает эту проблему, то ёто уже отличный результат