In [9]:
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [10]:
# 데이터 준비
breast_cancer = load_breast_cancer()
breast_cancer_data = breast_cancer.data

In [14]:
# 데이터 이해하기

# Feature Data 저장하기
breast_cancer_df = pd.DataFrame(data=breast_cancer_data,  columns=breast_cancer.feature_names)

# Label Data 지정하기
breast_cancer_df['label'] = breast_cancer.target
breast_cancer_label = breast_cancer.target

# Target Names 출력해 보기
print(breast_cancer.target_names)

# 데이터 Describe 해 보기
print(breast_cancer_df.describe())

['malignant' 'benign']
       mean radius  mean texture  mean perimeter    mean area  \
count   569.000000    569.000000      569.000000   569.000000   
mean     14.127292     19.289649       91.969033   654.889104   
std       3.524049      4.301036       24.298981   351.914129   
min       6.981000      9.710000       43.790000   143.500000   
25%      11.700000     16.170000       75.170000   420.300000   
50%      13.370000     18.840000       86.240000   551.100000   
75%      15.780000     21.800000      104.100000   782.700000   
max      28.110000     39.280000      188.500000  2501.000000   

       mean smoothness  mean compactness  mean concavity  mean concave points  \
count       569.000000        569.000000      569.000000           569.000000   
mean          0.096360          0.104341        0.088799             0.048919   
std           0.014064          0.052813        0.079720             0.038803   
min           0.052630          0.019380        0.000000           

In [12]:
# train, test 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(breast_cancer_data,
                                                    breast_cancer_label,
                                                    test_size=0.2,
                                                    random_state=7)

In [19]:
# 다양한 모델로 학습시켜보기

from sklearn.metrics import recall_score

# Decision Tree
from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier(random_state=32)
decision_tree.fit(X_train, y_train)

print("y_train 고유값 분포:\n", pd.Series(y_train).value_counts())
print("y_test 고유값 분포:\n", pd.Series(y_test).value_counts())

print("-------------------------------------------")

decision_tree_y_pred = decision_tree.predict(X_test)
recall = recall_score(y_test, decision_tree_y_pred, pos_label=1)
print('Decision 재현율 : ', recall)

# Random Forest
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(random_state=32)
random_forest.fit(X_train, y_train)

random_forest_y_pred = random_forest.predict(X_test)
print('Random Forest 재현율 : ', recall_score(y_test, random_forest_y_pred, pos_label=1))

# SVM
from sklearn import svm

svm_model = svm.SVC()
svm_model.fit(X_train, y_train)

svm_model_y_pred = svm_model.predict(X_test)
print("SVM 정확도 : ", recall_score(y_test, svm_model_y_pred, pos_label=1))

# SGD Classifier
from sklearn.linear_model import SGDClassifier

sgd_model = SGDClassifier()
sgd_model.fit(X_train, y_train)

sgd_model_y_pred = sgd_model.predict(X_test)
print("SGD 정확도 : ", recall_score(y_test, sgd_model_y_pred, pos_label=1))

# Logistic Regression
from sklearn.linear_model import LogisticRegression

logistic_model = LogisticRegression(max_iter=200)
logistic_model.fit(X_train, y_train)

logistic_model_y_pred = logistic_model.predict(X_test)
print("Logistics 정확도 : ", recall_score(y_test, logistic_model_y_pred, pos_label=1))

# 암 진단의 경우 양성 판단이 중요하기 때문에 pos_label을 1로 설정해서 재현율을 양성 클래스를 보고 계산하게 합니다.
# 재현율은 찾아야 할 것을 얼마나 놓치지 않고 잘 찾았는지에 대한 비율이기 때문에 양성을 기준으로 놓치지 않고 잘 찾는 모델을 선택하는 것이 중요하다.

y_train 고유값 분포:
 1    283
0    172
Name: count, dtype: int64
y_test 고유값 분포:
 1    74
0    40
Name: count, dtype: int64
-------------------------------------------
Decision 재현율 :  0.9594594594594594
Random Forest 재현율 :  1.0
SVM 정확도 :  1.0
SGD 정확도 :  0.9594594594594594
Logistics 정확도 :  1.0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
