<b>Tasks:</b>
- Loading and preparing data
- Fit LogisticRegression without parameters
- Fit LogisticRegression with GridSearchCV and RandomizedSearchCV.

In [47]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [48]:
df = pd.read_csv('datasets_features/heart.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [49]:
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [50]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [51]:
df = pd.get_dummies(df, columns=df.select_dtypes('object').columns)
df

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_F,Sex_M,ChestPainType_ASY,...,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,0,1,0,...,0,0,0,1,0,1,0,0,0,1
1,49,160,180,0,156,1.0,1,1,0,0,...,1,0,0,1,0,1,0,0,1,0
2,37,130,283,0,98,0.0,0,0,1,0,...,0,0,0,0,1,1,0,0,0,1
3,48,138,214,0,108,1.5,1,1,0,1,...,0,0,0,1,0,0,1,0,1,0
4,54,150,195,0,122,0.0,0,0,1,0,...,1,0,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,110,264,0,132,1.2,1,0,1,0,...,0,1,0,1,0,1,0,0,1,0
914,68,144,193,1,141,3.4,1,0,1,1,...,0,0,0,1,0,1,0,0,1,0
915,57,130,131,0,115,1.2,1,0,1,1,...,0,0,0,1,0,0,1,0,1,0
916,57,130,236,0,174,0.0,1,1,0,0,...,0,0,1,0,0,1,0,0,1,0


In [52]:
# Step 3.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler # Z - Standartization 

X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['HeartDisease']), df.HeartDisease, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [53]:
# Step 4-5. LogisticRegression and cross_validate scores
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate # кросс валидация


metric = ['accuracy','recall','precision','f1']
scores = cross_validate(LogisticRegression(), X_train, y_train, cv=10, scoring=metric)
print(f'accuracy {scores.get("test_accuracy").mean()}')
print(f'recall {scores.get("test_recall").mean()}')
print(f'precision {scores.get("test_precision").mean()}')
print(f'f1 {scores.get("test_f1").mean()}')

accuracy 0.8649944465012958
recall 0.8952439024390243
precision 0.8646520550878403
f1 0.8791797369847545


In [54]:
#Step 6a. GridSearchCV
from sklearn.model_selection import GridSearchCV

parameters = {
    "penalty": ['l1', 'l2'],
    "tol": [1e-4, 1e-2, 0.1, 1],
    "C": [0.1, 0.5, 1, 10],
}

lg = LogisticRegression(solver = 'liblinear')

grid = GridSearchCV(lg, parameters, cv=10, scoring='accuracy')
grid.fit(X_train, y_train)

# Лучший скор
print(grid.best_score_)

# dict в котором хранятся лучшие параметры
print(grid.best_params_)

# Объекты лучших моделей
print(grid.best_estimator_)

scores = cross_validate(grid.best_estimator_, X_train, y_train, cv=10, scoring=metric)
print(f'accuracy {scores.get("test_accuracy").mean()}')
print(f'recall {scores.get("test_recall").mean()}')
print(f'precision {scores.get("test_precision").mean()}')
print(f'f1 {scores.get("test_f1").mean()}')

0.8663643095149943
{'C': 0.5, 'penalty': 'l2', 'tol': 0.0001}
LogisticRegression(C=0.5, solver='liblinear')
accuracy 0.8663643095149943
recall 0.8952439024390244
precision 0.86638206075999
f1 0.8801233488305116


In [55]:
#Step 6b. RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV

parameters = {
    "penalty": ['l1', 'l2'],
    "tol": [1e-4, 1e-2, 0.1, 1],
    "C": [0.1, 0.5, 1, 10],
}

lg = LogisticRegression(solver = 'liblinear')

grid = RandomizedSearchCV(lg, parameters, cv=10, scoring='accuracy')
grid.fit(X_train, y_train)

# Лучший скор
print(grid.best_score_)

# dict в котором хранятся лучшие параметры
print(grid.best_params_)

# Объекты лучших моделей
print(grid.best_estimator_)

scores = cross_validate(grid.best_estimator_, X_train, y_train, cv=10, scoring=metric)
print(f'accuracy {scores.get("test_accuracy").mean()}')
print(f'recall {scores.get("test_recall").mean()}')
print(f'precision {scores.get("test_precision").mean()}')
print(f'f1 {scores.get("test_f1").mean()}')

0.8663643095149943
{'tol': 0.1, 'penalty': 'l2', 'C': 10}
LogisticRegression(C=10, solver='liblinear', tol=0.1)
accuracy 0.8663643095149943
recall 0.8952439024390243
precision 0.8665900395839644
f1 0.8802082412457007


In [57]:
#Step 6c. RandomizedSearchCV with some amount of models
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC  # Support Vector Classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier


models=[
      {'name':'Lr_1',"model": LogisticRegression(), 'params':{'solver': ['Lr_lbfgs', 'newton-cg', 'newton-cholesky', 'sag'], 'C':[0.1,0.2,0.3,0.5,0.7,1], 'tol': [1e-4, 1e-2, 0.1, 1], 'penalty':['l2', None]}},
      {'name':'Lr_2',"model": LogisticRegression(), 'params':{'solver': ['liblinear', 'saga'], 'C':[0.1,0.2,0.3,0.5,0.7,1], 'penalty':['l1', 'l2']}},
      {'name':'SVC',"model": SVC(), 'params':{'kernel':['linear', 'poly', 'rbf', 'sigmoid'], 'gamma':['scale', 'auto']}},
      {'name':'KN',"model": KNeighborsClassifier(), 'params':{'n_neighbors':list(range(1,30)),'weights': ['uniform', 'distance'], 'p':[1,2,3]}},
      {'name':'RF',"model": RandomForestClassifier(), 'params':{'n_estimators':[10,25,50,100,150,200], 'criterion':['gini', 'entropy'], 'max_depth':[3,5,7,9,11]}},
      {'name':'DT',"model": DecisionTreeClassifier(), 'params':{'criterion':['gini', 'entropy'], 'max_depth':[3,5,7,9,11]}}

]


res=[]
for v in  models:
    res.append((v['name'], RandomizedSearchCV(v['model'], v['params'], cv=10).fit(X_train, y_train)))
    
    
for r in res:
    print('\n\n--', r[0], r[1].best_score_, r[1].best_params_, '')
    scores = cross_validate(r[1].best_estimator_, X_train, y_train, cv=10, scoring=metric)
    print(f'accuracy {scores.get("test_accuracy").mean()}')
    print(f'recall {scores.get("test_recall").mean()}')
    print(f'precision {scores.get("test_precision").mean()}')
    print(f'f1 {scores.get("test_f1").mean()}')



-- Lr_1 0.8663643095149943 {'tol': 0.0001, 'solver': 'newton-cg', 'penalty': 'l2', 'C': 0.2} 
accuracy 0.8663643095149943
recall 0.8952439024390244
precision 0.86638206075999
f1 0.8801233488305116


-- Lr_2 0.8663643095149945 {'solver': 'saga', 'penalty': 'l1', 'C': 0.1} 
accuracy 0.8663643095149945
recall 0.8927439024390245
precision 0.8688854922750495
f1 0.8798952950886367


-- SVC 0.8677156608663458 {'kernel': 'rbf', 'gamma': 'scale'} 
accuracy 0.8677156608663458
recall 0.9201219512195122
precision 0.853064565419627
f1 0.8842058001921537


-- KN 0.8636060718252498 {'weights': 'distance', 'p': 2, 'n_neighbors': 25} 
accuracy 0.8636060718252498
recall 0.8976829268292683
precision 0.8622793769124458
f1 0.8785951701906519


-- RF 0.8718437615697889 {'n_estimators': 150, 'max_depth': 7, 'criterion': 'gini'} 
accuracy 0.8690670122176973
recall 0.9176219512195123
precision 0.8564419065365272
f1 0.884930340898558


-- DT 0.8432062199185488 {'max_depth': 3, 'criterion': 'entropy'} 
accurac

<b>Выводы: </b>

- RandomizedSearchCV подобрал модель которая сравнима с GridSearchCV.
- Лучший результат у KN.
- Метрика accuracy полученная с помощью ансамблей моделей на несколько процентов выше.


|                     |       accuracy      |     f1        |
|---------------------|---------------------|---------------|
|LogisticRegression   |         0.86        |      0.87     |
|GridSearchCV         |         0.86        |      0.88     |
|RandomizedSearchCV   |         0.86        |      0.88     |
|SVC                  |         0.86        |      0.88     |
|KN                   |         0.87        |      0.87     |
|RF                   |         0.86        |      0.88     |
|DT                   |         0.84        |      0.86     |




|Ensemblies            | accuracy |
|----------------------|----------|
|DecisionTreeClassifier|   0.86   |
|RandomForestClassifier|   0.89   |
|Bagging               |   0.89   |
|Stacking              |   0.89   |
