In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('../data_ml/heart_failure_clinical_records_dataset.csv')
df.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [3]:
# 다 숫자이므로 get_dummies 할 필요 없다
y = df.DEATH_EVENT
x = df.drop(columns = 'DEATH_EVENT')

In [4]:
x.isnull().sum() # nan 값 없음

age                         0
anaemia                     0
creatinine_phosphokinase    0
diabetes                    0
ejection_fraction           0
high_blood_pressure         0
platelets                   0
serum_creatinine            0
serum_sodium                0
sex                         0
smoking                     0
time                        0
dtype: int64

In [5]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state=111)

In [6]:
x_train

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time
278,50.0,1,1051,1,30,0,232000.0,0.7,136,0,0,246
218,68.0,1,1021,1,35,0,271000.0,1.1,134,1,0,197
128,61.0,0,248,0,30,1,267000.0,0.7,136,1,1,104
35,69.0,0,582,1,35,0,228000.0,3.5,134,1,0,30
184,58.0,1,145,0,25,0,219000.0,1.2,137,1,1,170
...,...,...,...,...,...,...,...,...,...,...,...,...
118,65.0,1,113,1,60,1,203000.0,0.9,140,0,0,94
169,70.0,0,835,0,35,1,305000.0,0.8,133,0,0,145
275,45.0,0,582,0,38,1,422000.0,0.8,137,0,0,245
86,55.0,0,47,0,35,1,173000.0,1.1,137,1,0,79


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import classification_report, f1_score, roc_auc_score

In [8]:
log = LogisticRegression()
rfc = RandomForestClassifier()
xgb = XGBClassifier()
lgbm = LGBMClassifier()

In [9]:
models = [log,rfc,xgb,lgbm]

for m in models:
    m.fit(x_train,y_train)
    preds = m.predict(x_test)
    proba = m.predict_proba(x_test)[:,1]
    print(m.__class__.__name__)
    print('classification :', classification_report(y_test,preds))
    print('roc_auc_score :', roc_auc_score(y_test, proba))
    print('-'*50)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression
classification :               precision    recall  f1-score   support

           0       0.85      0.95      0.90        37
           1       0.89      0.74      0.81        23

    accuracy                           0.87        60
   macro avg       0.87      0.84      0.85        60
weighted avg       0.87      0.87      0.86        60

roc_auc_score : 0.9224441833137486
--------------------------------------------------
RandomForestClassifier
classification :               precision    recall  f1-score   support

           0       0.86      0.97      0.91        37
           1       0.94      0.74      0.83        23

    accuracy                           0.88        60
   macro avg       0.90      0.86      0.87        60
weighted avg       0.89      0.88      0.88        60

roc_auc_score : 0.9424206815511164
--------------------------------------------------
XGBClassifier
classification :               precision    recall  f1-score   support

           0

In [10]:
# scaling

In [11]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [12]:
models = [log,rfc,xgb,lgbm]

for m in models:
    m.fit(x_train_scaled,y_train)
    preds = m.predict(x_test_scaled)
    proba = m.predict_proba(x_test_scaled)[:,1]
    print(m.__class__.__name__)
    print('classification :', classification_report(y_test,preds))
    print('roc_auc_score :', roc_auc_score(y_test, proba))
    print('-'*50)

LogisticRegression
classification :               precision    recall  f1-score   support

           0       0.88      0.97      0.92        37
           1       0.95      0.78      0.86        23

    accuracy                           0.90        60
   macro avg       0.91      0.88      0.89        60
weighted avg       0.90      0.90      0.90        60

roc_auc_score : 0.9212690951821386
--------------------------------------------------
RandomForestClassifier
classification :               precision    recall  f1-score   support

           0       0.88      0.95      0.91        37
           1       0.90      0.78      0.84        23

    accuracy                           0.88        60
   macro avg       0.89      0.86      0.87        60
weighted avg       0.88      0.88      0.88        60

roc_auc_score : 0.9565217391304348
--------------------------------------------------
XGBClassifier
classification :               precision    recall  f1-score   support

           0



In [13]:
from scipy.stats import skew

In [14]:
df.apply(lambda x :skew(x))

age                         0.420937
anaemia                     0.276863
creatinine_phosphokinase    4.440689
diabetes                    0.332251
ejection_fraction           0.552593
high_blood_pressure         0.623583
platelets                   1.454975
serum_creatinine            4.433610
serum_sodium               -1.042870
sex                        -0.623583
smoking                     0.766479
time                        0.127161
DEATH_EVENT                 0.766479
dtype: float64

In [15]:
# np.log1p 처리 'creatinine_phosphokinase',

In [16]:
# 치우침 심한 값 log 씌워주기

x_train_scaled[:,2] =np.log1p(x_train_scaled[:,2])
x_train_scaled[:,7] =np.log1p(x_train_scaled[:,7])

In [17]:
models = [log,rfc,xgb,lgbm]

for m in models:
    m.fit(x_train_scaled,y_train)
    preds = m.predict(x_test_scaled)
    proba = m.predict_proba(x_test_scaled)[:,1]
    print(m.__class__.__name__)
    print('classification :', classification_report(y_test,preds))
    print('roc_auc_score :', roc_auc_score(y_test, proba))
    print('-'*50)

LogisticRegression
classification :               precision    recall  f1-score   support

           0       0.88      0.95      0.91        37
           1       0.90      0.78      0.84        23

    accuracy                           0.88        60
   macro avg       0.89      0.86      0.87        60
weighted avg       0.88      0.88      0.88        60

roc_auc_score : 0.917743830787309
--------------------------------------------------
RandomForestClassifier
classification :               precision    recall  f1-score   support

           0       0.92      0.95      0.93        37
           1       0.91      0.87      0.89        23

    accuracy                           0.92        60
   macro avg       0.92      0.91      0.91        60
weighted avg       0.92      0.92      0.92        60

roc_auc_score : 0.9588719153936545
--------------------------------------------------
XGBClassifier
classification :               precision    recall  f1-score   support

           0 

