# 📌 로지스틱회귀 실습

## 1. 기본 설정 ❗

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from IPython.display import Image

import pandas_profiling

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LinearRegression

from sklearn.metrics import *

from sklearn.preprocessing import StandardScaler

from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from imblearn.under_sampling import *
from imblearn.over_sampling import *
from imblearn.combine import *

import statsmodels.api as sm
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

## 2. EDA 및 데이터 전처리 📂

In [2]:
heart= pd.read_csv('framingham_heart_disease.csv')

In [3]:
heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4238 entries, 0 to 4237
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   male             4238 non-null   int64  
 1   age              4238 non-null   int64  
 2   education        4133 non-null   float64
 3   currentSmoker    4238 non-null   int64  
 4   cigsPerDay       4209 non-null   float64
 5   BPMeds           4185 non-null   float64
 6   prevalentStroke  4238 non-null   int64  
 7   prevalentHyp     4238 non-null   int64  
 8   diabetes         4238 non-null   int64  
 9   totChol          4188 non-null   float64
 10  sysBP            4238 non-null   float64
 11  diaBP            4238 non-null   float64
 12  BMI              4219 non-null   float64
 13  heartRate        4237 non-null   float64
 14  glucose          3850 non-null   float64
 15  TenYearCHD       4238 non-null   int64  
dtypes: float64(9), int64(7)
memory usage: 529.9 KB


In [4]:
heart.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
male,4238.0,0.429212,0.495022,0.0,0.0,0.0,1.0,1.0
age,4238.0,49.584946,8.57216,32.0,42.0,49.0,56.0,70.0
education,4133.0,1.97895,1.019791,1.0,1.0,2.0,3.0,4.0
currentSmoker,4238.0,0.494101,0.500024,0.0,0.0,0.0,1.0,1.0
cigsPerDay,4209.0,9.003089,11.920094,0.0,0.0,0.0,20.0,70.0
BPMeds,4185.0,0.02963,0.169584,0.0,0.0,0.0,0.0,1.0
prevalentStroke,4238.0,0.005899,0.076587,0.0,0.0,0.0,0.0,1.0
prevalentHyp,4238.0,0.310524,0.462763,0.0,0.0,0.0,1.0,1.0
diabetes,4238.0,0.02572,0.158316,0.0,0.0,0.0,0.0,1.0
totChol,4188.0,236.721585,44.590334,107.0,206.0,234.0,263.0,696.0


In [5]:
heart.head(10)

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0
5,0,43,2.0,0,0.0,0.0,0,1,0,228.0,180.0,110.0,30.3,77.0,99.0,0
6,0,63,1.0,0,0.0,0.0,0,0,0,205.0,138.0,71.0,33.11,60.0,85.0,1
7,0,45,2.0,1,20.0,0.0,0,0,0,313.0,100.0,71.0,21.68,79.0,78.0,0
8,1,52,1.0,0,0.0,0.0,0,1,0,260.0,141.5,89.0,26.36,76.0,79.0,0
9,1,43,1.0,1,30.0,0.0,0,1,0,225.0,162.0,107.0,23.61,93.0,88.0,0


In [6]:
report = pandas_profiling.ProfileReport(heart)

In [7]:
display(report)

In [8]:
# 결측값 확인
heart.isnull().sum()

male                 0
age                  0
education          105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

### 모델 학습에 필요한 필수 전처리

#### 결측치 대체1_수치형 변수

In [4]:
# 정규분포 형태의 수치형 평균으로 결측값 대체
heart = heart.fillna({'totChol': heart['totChol'].mean()})
heart = heart.fillna({'BMI': heart['BMI'].mean()})
heart = heart.fillna({'heartRate': heart['heartRate'].mean()})
# 수치형 cigsPerDay 흡연자들의 결측치이므로 평균으로 결측값 대체
heart = heart.fillna({'cigsPerDay': heart['heartRate'].mean()})

#### 결측치 대체2_범주형 변수

In [5]:
# 범주형 BPMeds는 대부분이 0이므로 0으로 대체
heart = heart.fillna({'BPMeds':0})

In [6]:
# 범주형 education 최빈값으로 대체
most_freq=heart['education'].value_counts(dropna=True).idxmax()
heart['education'].fillna(most_freq,inplace=True)

#### 결측체 대체3_다른 변수에 비해 결측치가 상대적으로 많은 수치형 glucose 선형회귀로 예측 후 대체

In [7]:
# glucose train/test set 나누기(결측치 포함 행=test set)
glucose_test=heart.loc[heart.isnull()['glucose'],:]
glucose_train=heart.dropna(subset=['glucose'])

In [8]:
# glucose train/test set 나누기(결측치 포함 행=test set)
glucose_test=heart.loc[heart.isnull()['glucose'],:]
glucose_train=heart.dropna(subset=['glucose'])

In [9]:
# train set x,y 나누기
glucose_y=glucose_train['glucose']
glucose_x=glucose_train.drop(['glucose'],axis=1)

In [10]:
# train_test에서 glucose 칼럼 삭제
glucose_test=glucose_test.drop(['glucose'],axis=1)

In [11]:
# Linear Regression 사용
X_train, X_test, y_train, y_test = train_test_split(glucose_x,glucose_y,test_size=0.2,random_state=0)

reg = LinearRegression()
reg.fit(X_train,y_train)
glucose_predict=reg.predict(glucose_test)

In [12]:
glucose_test['glucose']=glucose_predict
glucose_test

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,TenYearCHD,glucose
14,0,39,2.0,1,9.0,0.0,0,0,0,226.000000,114.0,64.0,22.35,85.0,0,79.354615
21,0,43,1.0,0,0.0,0.0,0,0,0,185.000000,123.5,77.5,29.89,70.0,0,79.452585
26,0,60,1.0,0,0.0,0.0,0,0,0,260.000000,110.0,72.5,26.59,65.0,0,77.334605
42,0,52,1.0,0,0.0,1.0,0,1,0,236.721585,148.0,92.0,25.09,70.0,1,82.700351
54,0,39,2.0,1,20.0,0.0,0,0,0,209.000000,115.0,75.0,22.54,90.0,0,78.966066
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4170,0,41,2.0,1,5.0,0.0,0,0,0,205.000000,105.0,74.0,20.85,87.0,0,77.627531
4208,0,51,1.0,1,9.0,0.0,0,0,0,340.000000,152.0,76.0,25.74,70.0,0,79.853412
4229,0,51,3.0,1,20.0,0.0,0,1,0,251.000000,140.0,80.0,25.60,75.0,0,78.160770
4230,0,56,1.0,1,3.0,0.0,0,1,0,268.000000,170.0,102.0,22.89,57.0,0,75.994097


In [13]:
data=pd.concat([glucose_train,glucose_test])
data

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.000000,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.000000,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.000000,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.000000,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.10,85.0,85.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4170,0,41,2.0,1,5.0,0.0,0,0,0,205.0,105.0,74.0,20.85,87.0,77.627531,0
4208,0,51,1.0,1,9.0,0.0,0,0,0,340.0,152.0,76.0,25.74,70.0,79.853412,0
4229,0,51,3.0,1,20.0,0.0,0,1,0,251.0,140.0,80.0,25.60,75.0,78.160770,0
4230,0,56,1.0,1,3.0,0.0,0,1,0,268.0,170.0,102.0,22.89,57.0,75.994097,0


In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4238 entries, 0 to 4236
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   male             4238 non-null   int64  
 1   age              4238 non-null   int64  
 2   education        4238 non-null   float64
 3   currentSmoker    4238 non-null   int64  
 4   cigsPerDay       4238 non-null   float64
 5   BPMeds           4238 non-null   float64
 6   prevalentStroke  4238 non-null   int64  
 7   prevalentHyp     4238 non-null   int64  
 8   diabetes         4238 non-null   int64  
 9   totChol          4238 non-null   float64
 10  sysBP            4238 non-null   float64
 11  diaBP            4238 non-null   float64
 12  BMI              4238 non-null   float64
 13  heartRate        4238 non-null   float64
 14  glucose          4238 non-null   float64
 15  TenYearCHD       4238 non-null   int64  
dtypes: float64(9), int64(7)
memory usage: 562.9 KB


In [15]:
data.isnull().sum()

male               0
age                0
education          0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64

#### 범주형 변수로 타입 변경

In [16]:
data['male']=data['male'].astype(object)
data['education']=data['education'].astype(object)
data['currentSmoker']=data['currentSmoker'].astype(object)
data['BPMeds']=data['BPMeds'].astype(object)
data['prevalentStroke']=data['prevalentStroke'].astype(object)
data['prevalentHyp']=data['prevalentHyp'].astype(object)
data['diabetes']=data['diabetes'].astype(object)

#### 왜도가 1이상인 수치형 변수 추출 후 로그 변환

In [17]:
# 왜도가 1 이상인 수치형 변수 추출
from scipy.stats import skew

features_index =data.dtypes[data.dtypes != 'object'].index

skew_features = data[features_index].apply(lambda x : skew(x))

skew_features_top = skew_features[skew_features>1]
skew_features_top = skew_features_top[['glucose','cigsPerDay','sysBP']]
print(skew_features_top.sort_values(ascending=False))

glucose       6.426567
cigsPerDay    1.740732
sysBP         1.144957
dtype: float64


In [18]:
# 왜도가 1 이상인 변수 로그 변환
data[skew_features_top.index] = np.log1p(data[skew_features_top.index])

#### 범주형 변수 원-핫 인코딩

In [19]:
data=pd.get_dummies(data,columns=['male','education','currentSmoker','BPMeds','prevalentStroke','prevalentHyp','diabetes'])

In [20]:
data

Unnamed: 0,age,cigsPerDay,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD,male_0,...,currentSmoker_0,currentSmoker_1,BPMeds_0.0,BPMeds_1.0,prevalentStroke_0,prevalentStroke_1,prevalentHyp_0,prevalentHyp_1,diabetes_0,diabetes_1
0,39,0.000000,195.0,4.672829,70.0,26.97,80.0,4.356709,0,0,...,1,0,1,0,1,0,1,0,1,0
1,46,0.000000,250.0,4.804021,81.0,28.73,95.0,4.343805,0,1,...,1,0,1,0,1,0,1,0,1,0
2,48,3.044522,245.0,4.855929,80.0,25.34,75.0,4.262680,0,0,...,0,1,1,0,1,0,1,0,1,0
3,61,3.433987,225.0,5.017280,95.0,28.58,65.0,4.644391,1,1,...,0,1,1,0,1,0,0,1,1,0
4,46,3.178054,285.0,4.875197,84.0,23.10,85.0,4.454347,0,1,...,0,1,1,0,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4170,41,1.791759,205.0,4.663439,74.0,20.85,87.0,4.364722,0,1,...,0,1,1,0,1,0,1,0,1,0
4208,51,2.302585,340.0,5.030438,76.0,25.74,70.0,4.392638,0,1,...,0,1,1,0,1,0,1,0,1,0
4229,51,3.044522,251.0,4.948760,80.0,25.60,75.0,4.371481,0,1,...,0,1,1,0,1,0,0,1,1,0
4230,56,1.386294,268.0,5.141664,102.0,22.89,57.0,4.343729,0,1,...,0,1,1,0,1,0,0,1,1,0


- 로그변환 후 스케일링하기
- https://john-analyst.medium.com/%ED%94%BC%EC%B2%98-%EC%8A%A4%EC%BC%80%EC%9D%BC%EB%A7%81-%EC%8B%9C%EC%9E%91%ED%95%98%EA%B8%B0-f58f39f6a710

In [21]:
train = data.iloc[:3380, :]
test = data.iloc[3380:, :]

### Train/Test 데이터 분할 및 데이터 정규화

In [22]:
X_data = train.loc[:,train.columns != 'TenYearCHD']
y_target = train['TenYearCHD']

In [23]:
X_data = StandardScaler().fit_transform(X_data)

In [24]:
X_data

array([[-1.23698758, -0.92834429, -0.93771076, ..., -0.66851912,
         0.15768181, -0.15768181],
       [-0.41988093, -0.92834429,  0.28726254, ..., -0.66851912,
         0.15768181, -0.15768181],
       [-0.18642188,  1.1265378 ,  0.17590133, ..., -0.66851912,
         0.15768181, -0.15768181],
       ...,
       [-1.00352853, -0.92834429,  0.06454012, ...,  1.49584354,
         0.15768181, -0.15768181],
       [-0.41988093, -0.92834429, -0.55908265, ...,  1.49584354,
         0.15768181, -0.15768181],
       [ 0.74741429, -0.46050877,  0.50998496, ...,  1.49584354,
         0.15768181, -0.15768181]])

In [25]:
X_train, X_valid, y_train, y_valid=train_test_split(X_data, y_target,
                                                  test_size=0.2, random_state=2021)

In [26]:
print(X_train.shape)
print(y_train.shape)
print(X_valid.shape)
print(y_valid.shape)

(2704, 24)
(2704,)
(676, 24)
(676,)


In [27]:
X_train

array([[-0.88679901, -0.92834429,  0.19817357, ...,  1.49584354,
         0.15768181, -0.15768181],
       [-0.53661045,  1.49033024, -1.69496699, ..., -0.66851912,
         0.15768181, -0.15768181],
       [ 1.91470951, -0.92834429, -2.51903994, ...,  1.49584354,
         0.15768181, -0.15768181],
       ...,
       [-0.3031514 , -0.92834429, -0.1581823 , ..., -0.66851912,
         0.15768181, -0.15768181],
       [-1.58717614, -0.92834429, -1.20497767, ...,  1.49584354,
         0.15768181, -0.15768181],
       [-0.06969236,  0.62577165,  0.64361841, ...,  1.49584354,
         0.15768181, -0.15768181]])

## 3. 모델링

In [28]:
# 로지스틱 회귀를 이용해 학습 및 예측 수행
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_preds = lr.predict(X_valid)

In [29]:
# 정확도 / 정밀도 / 재현율 / ROC_AUC 측정
print('accuracy: {0:.4f}'.format(accuracy_score(y_valid,lr_preds)))
print('precision: {:0.4f}'.format(precision_score(y_valid, lr_preds)))
print('recall: {:0.4f}'.format(recall_score(y_valid, lr_preds)))
print('roc_auc: {0:.4f}'.format(roc_auc_score(y_valid,lr_preds)))

accuracy: 0.8432
precision: 0.6429
recall: 0.0818
roc_auc: 0.5365


In [30]:
# 로지스틱 회귀를 이용해 학습 및 예측 수행
lr = LogisticRegression(C = 0.8, solver='lbfgs', max_iter=10000)
lr.fit(X_train, y_train)
lr_preds = lr.predict(X_valid)

In [31]:
# 정확도 / 정밀도 / 재현율 / ROC_AUC 측정
print('accuracy: {0:.4f}'.format(accuracy_score(y_valid,lr_preds)))
print('precision: {:0.4f}'.format(precision_score(y_valid, lr_preds)))
print('recall: {:0.4f}'.format(recall_score(y_valid, lr_preds)))
print('roc_auc: {0:.4f}'.format(roc_auc_score(y_valid,lr_preds)))

accuracy: 0.8432
precision: 0.6429
recall: 0.0818
roc_auc: 0.5365


In [32]:
cv_method = KFold(n_splits=50)
params = {
    'C': [0.45,0.48,0.5, 0.53]
}
lr_grid_cv = GridSearchCV(lr, param_grid=params, scoring='recall', cv=cv_method, n_jobs=-1)
lr_grid_cv.fit(X_train, y_train)

lr_grid_cv_preds = lr_grid_cv.predict(X_valid)

print(lr_grid_cv.best_params_)
print(lr_grid_cv.best_score_)

{'C': 0.45}
0.09960939060939063


In [33]:
# 정확도 / 정밀도 / 재현율 / ROC_AUC 측정
print('accuracy: {0:.4f}'.format(accuracy_score(y_valid,lr_grid_cv_preds)))
print('precision: {:0.4f}'.format(precision_score(y_valid, lr_grid_cv_preds)))
print('recall: {:0.4f}'.format(recall_score(y_valid, lr_grid_cv_preds)))
print('roc_auc: {0:.4f}'.format(roc_auc_score(y_valid,lr_grid_cv_preds)))

accuracy: 0.8432
precision: 0.6429
recall: 0.0818
roc_auc: 0.5365


### Sampling

#### - Under sampling

##### 1. Random Under sampling

In [34]:
# random under sampling
rus = RandomUnderSampler(random_state=0)

X_rus, y_rus = rus.fit_resample(X_train, y_train)

In [35]:
lr_grid_cv.fit(X_rus, y_rus)
lr_preds = lr_grid_cv.predict(X_valid)

In [36]:
# 정확도 / 정밀도 / 재현율 / ROC_AUC 측정
print('accuracy: {0:.4f}'.format(accuracy_score(y_valid,lr_preds)))
print('precision: {:0.4f}'.format(precision_score(y_valid, lr_preds)))
print('recall: {:0.4f}'.format(recall_score(y_valid, lr_preds)))
print('roc_auc: {0:.4f}'.format(roc_auc_score(y_valid,lr_preds)))

accuracy: 0.6627
precision: 0.2731
recall: 0.6455
roc_auc: 0.6558


##### 2. TomekLinks sampling

In [116]:
# TomekLinks sampling
tls = TomekLinks()

X_tls, y_tls = tls.fit_resample(X_train, y_train)

In [117]:
lr_grid_cv.fit(X_tls, y_tls)
lr_preds = lr_grid_cv.predict(X_valid)

In [118]:
# 정확도 / 정밀도 / 재현율 / ROC_AUC 측정
print('accuracy: {0:.4f}'.format(accuracy_score(y_valid,lr_preds)))
print('precision: {:0.4f}'.format(precision_score(y_valid, lr_preds)))
print('recall: {:0.4f}'.format(recall_score(y_valid, lr_preds)))
print('roc_auc: {0:.4f}'.format(roc_auc_score(y_valid,lr_preds)))

accuracy: 0.8432
precision: 0.6250
recall: 0.0909
roc_auc: 0.5402


##### 3. Condensed Nearest Neighbour sampling

In [56]:
# CNN sampling
cnn = CondensedNearestNeighbour(random_state=0)

X_cnn, y_cnn = cnn.fit_resample(X_train, y_train)

In [57]:
lr = LogisticRegression()
lr.fit(X_cnn, y_cnn)
lr_preds = lr.predict(X_valid)

In [58]:
# 정확도 / 정밀도 / 재현율 / ROC_AUC 측정
print('accuracy: {0:.4f}'.format(accuracy_score(y_valid,lr_preds)))
print('precision: {:0.4f}'.format(precision_score(y_valid, lr_preds)))
print('recall: {:0.4f}'.format(recall_score(y_valid, lr_preds)))
print('roc_auc: {0:.4f}'.format(roc_auc_score(y_valid,lr_preds)))

accuracy: 0.8210
precision: 0.4098
recall: 0.2273
roc_auc: 0.5818


#### - Over sampling

##### 1. Random Over Sampling

In [82]:
ros = RandomOverSampler(random_state=0)

X_ros, y_ros = ros.fit_resample(X_train, y_train)

In [83]:
lr = LogisticRegression()
lr.fit(X_ros, y_ros)
lr_preds = lr.predict(X_valid)

In [84]:
# 정확도 / 정밀도 / 재현율 / ROC_AUC 측정
print('accuracy: {0:.4f}'.format(accuracy_score(y_valid,lr_preds)))
print('precision: {:0.4f}'.format(precision_score(y_valid, lr_preds)))
print('recall: {:0.4f}'.format(recall_score(y_valid, lr_preds)))
print('roc_auc: {0:.4f}'.format(roc_auc_score(y_valid,lr_preds)))

accuracy: 0.6864
precision: 0.2910
recall: 0.6455
roc_auc: 0.6699


##### 2. ADASYN (Adaptive Synthetic Sampling)

In [88]:
ada = ADASYN(random_state=0)

X_ada, y_ada = ada.fit_resample(X_train, y_train)

In [93]:
lr_grid_cv.fit(X_ada, y_ada)
lr_preds = lr_grid_cv.predict(X_valid)

In [94]:
# 정확도 / 정밀도 / 재현율 / ROC_AUC 측정
print('accuracy: {0:.4f}'.format(accuracy_score(y_valid,lr_preds)))
print('precision: {:0.4f}'.format(precision_score(y_valid, lr_preds)))
print('recall: {:0.4f}'.format(recall_score(y_valid, lr_preds)))
print('roc_auc: {0:.4f}'.format(roc_auc_score(y_valid,lr_preds)))

accuracy: 0.6538
precision: 0.2737
recall: 0.6818
roc_auc: 0.6651


##### 3. SMOTE

In [96]:
smo = SMOTE(random_state=0)

X_smo, y_smo = smo.fit_resample(X_train, y_train)

In [99]:
lr_grid_cv.fit(X_smo, y_smo)
lr_preds = lr_grid_cv.predict(X_valid)

In [100]:
# 정확도 / 정밀도 / 재현율 / ROC_AUC 측정
print('accuracy: {0:.4f}'.format(accuracy_score(y_valid,lr_preds)))
print('precision: {:0.4f}'.format(precision_score(y_valid, lr_preds)))
print('recall: {:0.4f}'.format(recall_score(y_valid, lr_preds)))
print('roc_auc: {0:.4f}'.format(roc_auc_score(y_valid,lr_preds)))

accuracy: 0.6731
precision: 0.2840
recall: 0.6636
roc_auc: 0.6693


##### 4. BorderlineSMOTE

In [37]:
bsm = BorderlineSMOTE(random_state=0)

X_bsm, y_bsm = bsm.fit_resample(X_train, y_train)

In [38]:
lr_grid_cv.fit(X_bsm, y_bsm)
lr_preds = lr_grid_cv.predict(X_valid)

In [39]:
# 정확도 / 정밀도 / 재현율 / ROC_AUC 측정
print('accuracy: {0:.4f}'.format(accuracy_score(y_valid,lr_preds)))
print('precision: {:0.4f}'.format(precision_score(y_valid, lr_preds)))
print('recall: {:0.4f}'.format(recall_score(y_valid, lr_preds)))
print('roc_auc: {0:.4f}'.format(roc_auc_score(y_valid,lr_preds)))

accuracy: 0.6982
precision: 0.3074
recall: 0.6818
roc_auc: 0.6916


#### - Combine Sampling

##### 1. SMOTE + ENN

In [106]:
sme = SMOTEENN(random_state=0)

X_sme, y_sme = sme.fit_resample(X_train, y_train)

In [107]:
lr = LogisticRegression()
lr.fit(X_sme, y_sme)
lr_preds = lr.predict(X_valid)

In [108]:
# 정확도 / 정밀도 / 재현율 / ROC_AUC 측정
print('accuracy: {0:.4f}'.format(accuracy_score(y_valid,lr_preds)))
print('precision: {:0.4f}'.format(precision_score(y_valid, lr_preds)))
print('recall: {:0.4f}'.format(recall_score(y_valid, lr_preds)))
print('roc_auc: {0:.4f}'.format(roc_auc_score(y_valid,lr_preds)))

accuracy: 0.5592
precision: 0.2446
recall: 0.8182
roc_auc: 0.6635


##### 2. SMOTE + TOMEK

In [111]:
smt = SMOTETomek(random_state=0)

X_smt, y_smt = smt.fit_resample(X_train, y_train)

In [114]:
lr_grid_cv.fit(X_smt, y_smt)
lr_preds = lr_grid_cv.predict(X_valid)

In [115]:
# 정확도 / 정밀도 / 재현율 / ROC_AUC 측정
print('accuracy: {0:.4f}'.format(accuracy_score(y_valid,lr_preds)))
print('precision: {:0.4f}'.format(precision_score(y_valid, lr_preds)))
print('recall: {:0.4f}'.format(recall_score(y_valid, lr_preds)))
print('roc_auc: {0:.4f}'.format(roc_auc_score(y_valid,lr_preds)))

accuracy: 0.6716
precision: 0.2829
recall: 0.6636
roc_auc: 0.6684


## 4. 최종 모델 선택 및 오즈비 출력

In [None]:
# BorderlineSMOTE Sampling 한 로지스틱회귀 선택
# accuracy: 0.6982 / recision: 0.3074 / recall: 0.6818 / roc_auc: 0.6916



#### 오즈비 출력

In [40]:
logit = sm.Logit(y_bsm, X_bsm)
result=logit.fit()

         Current function value: 0.580083
         Iterations: 35




In [41]:
result.summary()

0,1,2,3
Dep. Variable:,TenYearCHD,No. Observations:,4596.0
Model:,Logit,Df Residuals:,4579.0
Method:,MLE,Df Model:,16.0
Date:,"Thu, 05 Aug 2021",Pseudo R-squ.:,0.1631
Time:,23:35:01,Log-Likelihood:,-2666.1
converged:,False,LL-Null:,-3185.7
Covariance Type:,nonrobust,LLR p-value:,4.325e-211

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
x1,0.5785,0.041,13.991,0.000,0.497,0.660
x2,0.5159,0.110,4.702,0.000,0.301,0.731
x3,0.1528,0.037,4.184,0.000,0.081,0.224
x4,0.2965,0.068,4.384,0.000,0.164,0.429
x5,-0.0038,0.058,-0.066,0.948,-0.118,0.110
x6,0.1270,0.037,3.431,0.001,0.054,0.200
x7,-0.0470,0.037,-1.275,0.202,-0.119,0.025
x8,0.1379,0.040,3.415,0.001,0.059,0.217
x9,-0.1306,,,,,


In [42]:
print(np.exp(result.params))

x1     1.783325
x2     1.675113
x3     1.165110
x4     1.345184
x5     0.996180
x6     1.135403
x7     0.954110
x8     1.147875
x9     0.877573
x10    1.139506
x11    1.064827
x12    0.969997
x13    0.980783
x14    0.968073
x15    1.108296
x16    0.902286
x17    0.960188
x18    1.041455
x19    0.967317
x20    1.033801
x21    0.945725
x22    1.057389
x23    0.994522
x24    1.005507
dtype: float64
