scikit-learnに内包されている、アメリカウィスコンシン州の乳がんのデータ元に、良性・悪性の二値分類を行って下さい。
評価は、F1スコアで評価した結果が良くなるように、アルゴリズム選定から、説明変数の選定及びパラメータ調整を行って下さい。

最終的な目的は、汎化性能が高いモデルを構築することです。
用いて良いツールは、scikit-learnのみとし、ディープラーニングなどの別の手法は採用しないものとします。

In [1]:
from sklearn.datasets import load_breast_cancer
breast_cancer = load_breast_cancer()
X = breast_cancer.data
y = breast_cancer.target

以下は回答例

In [2]:
X = X[:, :10]
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X, y)
y_pred = clf.predict(X)

In [3]:
from sklearn.metrics import accuracy_score
accuracy_score(y, y_pred)

0.9086115992970123

In [4]:
from sklearn.metrics import classification_report
print(classification_report(y, y_pred))

             precision    recall  f1-score   support

          0       0.92      0.83      0.87       212
          1       0.90      0.96      0.93       357

avg / total       0.91      0.91      0.91       569



# 回答は以下

以下にセルを追加して、実装を書き加えてください。

In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import LabelEncoder
%matplotlib inline

In [6]:
df = pd.DataFrame(breast_cancer.data, columns=breast_cancer.feature_names) #datasetをデータフレーム化
le = LabelEncoder()
df["target"] = breast_cancer.target_names[breast_cancer.target]
df["target"] = le.fit_transform(df["target"]) #クラスラベルのエンコードmalignant=1,benign=0 

In [7]:
df.head(10)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,1
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,1
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,1
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,1
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,1
5,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,0.07613,...,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244,1
6,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,0.1794,0.05742,...,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368,1
7,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,0.2196,0.07451,...,28.14,110.6,897.0,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151,1
8,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,0.235,0.07389,...,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072,1
9,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,0.203,0.08243,...,40.68,97.65,711.4,0.1853,1.058,1.105,0.221,0.4366,0.2075,1


In [8]:
df.isnull().sum() #欠損値の確認，欠損値なし

mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
target                     0
dtype: int64

In [9]:
df.shape #列数の確認

(569, 31)

In [10]:
X = df.iloc[:,0:30].values
y = df["target"]

In [11]:
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
#classifier
clf1 = LogisticRegression(penalty='l2',C=0.001,random_state=0)
clf2 = KNeighborsClassifier(n_neighbors=1, p=2,metric='minkowski')
clf3 = SVC(random_state=0)
clf4 = DecisionTreeClassifier(max_depth=3,criterion='entropy',random_state=0)
clf5 = RandomForestClassifier(random_state=0)

#パイプライン処理
pipe1 = Pipeline([['sc', StandardScaler()], ['clf', clf1]])
pipe2 = Pipeline([['sc', StandardScaler()], ['clf', clf2]])
pipe3 = Pipeline([['sc', StandardScaler()], ['clf', clf3]])

clf_labels = ['Logistic Regression', 'KNN','SVM','Decision Tree', 'Random Forest']


In [12]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import  f1_score
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.model_selection import GridSearchCV

#各classifierに対しk=5でcross validation
for clf, label in zip([pipe1, pipe2, pipe3,clf4,clf5], clf_labels):
    scores = cross_val_score(estimator=clf,
                             X=X,
                             y=y,
                             cv=5,
                             scoring='accuracy')
    print("F1_score: %0.3f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))
#SVMのF1scoreが高い

F1_score: 0.946 (+/- 0.02) [Logistic Regression]
F1_score: 0.954 (+/- 0.02) [KNN]
F1_score: 0.975 (+/- 0.01) [SVM]
F1_score: 0.933 (+/- 0.02) [Decision Tree]
F1_score: 0.960 (+/- 0.02) [Random Forest]


In [13]:
#SVMのパラメーターチューニング
model = SVC(random_state=0)
sc = StandardScaler()
X=sc.fit_transform(X)

params= [{'C': [1, 10, 100, 1000], 'gamma': [0.01,0.001, 0.0001], 'kernel': ['rbf']}]

clf = GridSearchCV(model,param_grid=params,scoring="f1_macro",cv=5)
clf.fit(X,y)

print("Best Score:\n",clf.best_score_)
print("Best Hyper Parameters:\n",clf.best_params_)

Best Score:
 0.977274904776
Best Hyper Parameters:
 {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}


In [14]:
#ベストパラメーターをセット
clf_svm = SVC(C=10,gamma=0.01,kernel="rbf",random_state=0)
pipe = Pipeline([['sc', StandardScaler()], ['clf', clf_svm]])
scores = cross_val_score(estimator=pipe,
                             X=X,
                             y=y,
                             cv=5,
                             scoring='f1_macro')
print("F1_score: %0.3f (+/- %0.2f)"
          % (scores.mean(), scores.std()))

F1_score: 0.977 (+/- 0.01)
