In [25]:
import sklearn

In [26]:
import pandas as pd

# Load the tumor dataset from the CSV file in the working directory.
df = pd.read_csv('tumor.csv')

# Binary-encode the label column: 'B' becomes 0 and every other value becomes 1
# (presumably 'B' = benign and the other label = malignant -- confirm against the data).
df["diagnosis"] = df["diagnosis"].ne('B').astype('int64')

In [27]:
# Split the frame into the feature columns and the target column.
features = [
    'radius_mean',
    'texture_mean',
    'perimeter_mean',
    'area_mean',
    'smoothness_mean',
    'compactness_mean',
    'concavity_mean',
    'concave.points_mean',
    'symmetry_mean',
    'fractal_dimension_mean',
]

X = df.loc[:, features]     # feature matrix (one column per entry in `features`)
y = df.loc[:, 'diagnosis']  # target vector (the 0/1-encoded diagnosis)

In [28]:
# Split the data into training and test sets.
from sklearn.model_selection import train_test_split

X_tn, X_te, y_tn, y_te = train_test_split(
    X,
    y,
    random_state=1004,  # fixed seed so the shuffle (and thus the split) is reproducible
)

In [29]:
# Standardize the features.
from sklearn.preprocessing import StandardScaler

std_scale = StandardScaler()
# Learn the scaling statistics from the training data only and scale it in the same step ...
X_tn_std = std_scale.fit_transform(X_tn)
# ... then reuse those training statistics for the test data, so nothing is fitted on the test set.
X_te_std = std_scale.transform(X_te)

In [30]:
# Logistic regression with an L2 penalty, trained on the standardized training data.
from sklearn.linear_model import LogisticRegression

clf_logi_l2 = LogisticRegression(penalty='l2')
# Left as the last expression so the notebook still displays the fitted estimator.
clf_logi_l2.fit(X=X_tn_std, y=y_tn)

In [31]:
# Estimated parameters of the L2-penalized logistic regression model:
# first the coefficient array, then the intercept, each on its own line(s).
print(clf_logi_l2.coef_, clf_logi_l2.intercept_, sep='\n')

[[ 1.10696073  1.44877972  0.95219541  1.2755903   1.11339971 -0.52664765
   1.21540132  1.47082896  0.34279816 -0.22630378]]
[-0.91404725]


In [32]:
# Predict class labels (0/1) for the standardized test set.
pred_logistic = clf_logi_l2.predict(X=X_te_std)
print(pred_logistic)

[0 1 0 0 1 1 1 0 1 0 1 0 1 0 0 0 0 0 1 0 0 1 1 1 0 1 0 0 0 1 0 0 1 1 0 1 0
 1 1 0 1 1 0 1 0 0 1 1 1 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 0 1 0 0 0 1
 0 0 1 0 0 1 1 0 0 1 0 1 1 0 0 1 0 1 1 0 1 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1
 1 1 0 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 0 0 0 0 1 0 0 0 0 1 0 1 0 0]


In [33]:
# Predict class probabilities for the standardized test set
# (one row per test sample, one column per class).
pred_proba = clf_logi_l2.predict_proba(X=X_te_std)
print(pred_proba)

[[9.98586288e-01 1.41371173e-03]
 [2.32794726e-02 9.76720527e-01]
 [6.26209347e-01 3.73790653e-01]
 [9.95374166e-01 4.62583414e-03]
 [1.29813865e-02 9.87018613e-01]
 [1.85307616e-04 9.99814692e-01]
 [2.46876199e-02 9.75312380e-01]
 [9.83196995e-01 1.68030055e-02]
 [1.24416543e-01 8.75583457e-01]
 [9.99740819e-01 2.59180824e-04]
 [8.09286882e-02 9.19071312e-01]
 [9.80420475e-01 1.95795253e-02]
 [5.94660494e-04 9.99405340e-01]
 [9.64865519e-01 3.51344810e-02]
 [9.98143873e-01 1.85612671e-03]
 [9.93556327e-01 6.44367320e-03]
 [9.86734552e-01 1.32654477e-02]
 [9.93182194e-01 6.81780576e-03]
 [6.64884589e-02 9.33511541e-01]
 [9.90154201e-01 9.84579926e-03]
 [9.92631073e-01 7.36892673e-03]
 [3.32311884e-04 9.99667688e-01]
 [2.24275412e-01 7.75724588e-01]
 [2.46355339e-01 7.53644661e-01]
 [9.21749041e-01 7.82509589e-02]
 [2.09412443e-04 9.99790588e-01]
 [9.95968706e-01 4.03129416e-03]
 [9.99946362e-01 5.36379206e-05]
 [8.75523457e-01 1.24476543e-01]
 [3.30290842e-01 6.69709158e-01]
 [9.765701

In [35]:
# Recall on the test set.
from sklearn.metrics import recall_score

# average='macro' reports the unweighted mean of the per-class recalls,
# not the recall of the positive (label 1) class alone.
recall = recall_score(y_true=y_te, y_pred=pred_logistic, average='macro')
print(recall)

0.9121835443037974
