In [2]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


cancer  = load_breast_cancer()

In [3]:
from sklearn.preprocessing import StandardScaler

#평균이 0, 분산이 1인 데이터 분포도로 변환
scaler = StandardScaler()
data_scaled = scaler.fit_transform(cancer.data)

X_train, X_test, y_train, y_test = train_test_split(data_scaled, cancer.target, test_size=0.3, random_state=11)

In [7]:
from sklearn.metrics import accuracy_score, roc_auc_score

#로지스틱 회귀를 이용하여 학습 및 예측 수행
#solver 인자 default는 solver = 'lbfgs'
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)
lr_preds = lr_clf.predict(X_test)
lr_preds_proba = lr_clf.predict_proba(X_test)[:,1]

print('accuracy: {0:.3f}, roc_auc: {1:.3f}'.format(accuracy_score(y_test, lr_preds),
                                                    roc_auc_score(y_test, lr_preds_proba)))

accuracy: 0.953, roc_auc: 0.989


In [8]:
# 여러 solver 값 별로 LogisticRegression 학습 후 성능 평가 
solvers = ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga']

for solver in solvers:
    lr_clf = LogisticRegression(solver= solver, max_iter = 600)
    lr_clf.fit(X_train, y_train)
    lr_preds = lr_clf.predict(X_test)
    lr_preds_proba = lr_clf.predict_proba(X_test)[:,1]

    print('solver: {0} ,accuracy: {1:.3f}, roc_auc: {2:.3f}'.format(solver,
                                                                    accuracy_score(y_test, lr_preds),
                                                                    roc_auc_score(y_test, lr_preds_proba)))

solver: liblinear ,accuracy: 0.959, roc_auc: 0.990
solver: newton-cg ,accuracy: 0.953, roc_auc: 0.989
solver: lbfgs ,accuracy: 0.953, roc_auc: 0.989
solver: sag ,accuracy: 0.953, roc_auc: 0.989
solver: saga ,accuracy: 0.953, roc_auc: 0.989


In [13]:
from sklearn.model_selection import GridSearchCV

# params = {
#     'solver': ['liblinear', 'lbfgs'],
#     'penalty': ['l2', 'l1'],
#     'C' : [0.01, 0.1, 1, 5, 10, ]
# }
params = [
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': [0.01, 0.1, 1, 5, 10, 100]},
    {'solver': ['lbfgs', 'newton-cg'], 'penalty': ['l2'], 'C': [0.1, 1, 5, 10, 100]}
]
lr_clf = LogisticRegression(max_iter=1000)

grid_clf = GridSearchCV(lr_clf, param_grid=params, scoring='accuracy', cv=3)
grid_clf.fit(data_scaled, cancer.target)
print(f'Best parameter : {grid_clf.best_params_}, Best_accuracy {grid_clf.best_score_}')

Best parameter : {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}, Best_accuracy 0.9789102385593614


In [14]:
best_param = grid_clf.best_params_

best_lr_clf = LogisticRegression(**best_param)
best_lr_clf.fit(X_train, y_train)

lr_preds = best_lr_clf.predict(X_test)
lr_preds_proba = best_lr_clf.predict_log_proba(X_test)[:,1]

print(f'accuracy : {accuracy_score(y_test, lr_preds)}, roc_auc : {roc_auc_score(y_test, lr_preds)} ')

accuracy : 0.9649122807017544, roc_auc : 0.9581222056631894 
