In [1]:
import os

import numpy as np
import pandas as pd

import config

In [2]:
column_names = [
    "Sample code number",
    "Clump Thickness",
    "Uniformity of Cell Size",
    "Uniformity of Cell Shape",
    "Marginal Adhesion",
    "Single Epithelial Cell Size",
    "Bare Nuclei",
    "Bland Chromatin",
    "Normal Nucleoli",
    "Mitoses",
    "Class"
]

data = pd.read_csv(os.path.join(config.PROJECT_ROOT, "breast-cancer-wisconsin", "breast-cancer-wisconsin.data"), names=column_names)

data.shape

(699, 11)

In [3]:
# 数据清洗
data = data.replace(to_replace="?", value=np.nan)
data = data.dropna(how="any")

data.shape

(683, 11)

In [4]:
data

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
5,1017122,8,10,10,8,7,10,9,7,1,4
6,1018099,1,1,1,1,2,10,3,1,1,2
7,1018561,2,1,2,1,2,1,3,1,1,2
8,1033078,2,1,1,1,2,1,1,1,5,2
9,1033078,4,2,1,1,2,1,2,1,1,2


In [5]:
from sklearn.model_selection import train_test_split

# 训练集与测试集划分
X_train, X_test, y_train, y_test = train_test_split(data[column_names[1: 10]], data[column_names[10]], test_size=0.25, random_state=33)

In [6]:
data[column_names[10]].value_counts()

2    444
4    239
Name: Class, dtype: int64

In [7]:
y_train.value_counts()

2    344
4    168
Name: Class, dtype: int64

In [8]:
y_test.value_counts()

2    100
4     71
Name: Class, dtype: int64

In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.fit_transform(X_test)

X_train

array([[-1.19196677, -0.68788787, -0.03809494, ..., -0.55666054,
        -0.58626819, -0.34343195],
       [-1.19196677,  0.30608745, -0.03809494, ...,  0.68611648,
         1.09441105, -0.34343195],
       [ 0.24231522, -0.02523765, -0.72246721, ..., -0.97091955,
        -0.58626819, -0.34343195],
       ..., 
       [ 0.24231522, -0.68788787, -0.38028108, ..., -0.97091955,
        -0.58626819, -0.34343195],
       [-1.19196677, -0.68788787, -0.72246721, ..., -0.55666054,
        -0.58626819, -0.34343195],
       [ 0.95945621, -0.02523765, -0.38028108, ...,  0.68611648,
         0.42213936,  1.42674078]])

In [10]:
# 创建对率回归模型
lr = LogisticRegression()

# 训练, 这尼玛就完了？
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [11]:
# 预测
lr_y_predict = lr.predict(X_test)

lr_y_predict

array([2, 2, 4, 4, 2, 2, 2, 4, 2, 2, 2, 2, 4, 2, 4, 4, 4, 4, 4, 2, 2, 4, 4,
       2, 4, 4, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 4, 4, 4, 2, 4, 2, 2,
       2, 2, 2, 4, 4, 2, 2, 2, 4, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 4, 2, 2, 2,
       2, 4, 2, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 4, 2, 2, 2, 4, 2, 4, 2, 4, 4,
       2, 2, 2, 2, 4, 4, 2, 2, 2, 4, 2, 2, 4, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2,
       2, 2, 4, 2, 2, 2, 2, 2, 4, 2, 2, 2, 4, 2, 2, 4, 4, 2, 4, 4, 2, 2, 2,
       2, 4, 2, 4, 2, 4, 2, 2, 2, 2, 2, 4, 4, 2, 4, 4, 2, 4, 2, 2, 2, 2, 4,
       4, 4, 2, 4, 2, 2, 4, 2, 4, 4])

In [12]:
from sklearn.metrics import classification_report

print("Accuracy of LR Classifier:", lr.score(X_test, y_test))
print(classification_report(y_test, lr_y_predict, target_names=["Benign", "Malignant"]))

Accuracy of LR Classifier: 0.970760233918
             precision    recall  f1-score   support

     Benign       0.96      0.99      0.98       100
  Malignant       0.99      0.94      0.96        71

avg / total       0.97      0.97      0.97       171



In [13]:
sgdc = SGDClassifier()

sgdc.fit(X_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [14]:
sgdc_y_predict = sgdc.predict(X_test)

sgdc_y_predict

array([2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 4, 2, 4, 2, 2, 4, 4,
       2, 4, 4, 2, 2, 2, 4, 2, 4, 4, 4, 2, 2, 2, 4, 4, 4, 2, 4, 2, 4, 2, 2,
       4, 2, 2, 4, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 4, 2, 2, 2,
       2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 4, 2,
       2, 2, 2, 2, 2, 4, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2,
       2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 2,
       2, 4, 2, 4, 2, 2, 2, 2, 2, 2, 2, 4, 4, 2, 4, 4, 2, 4, 2, 2, 2, 2, 4,
       4, 4, 2, 4, 4, 2, 2, 2, 4, 4])

In [15]:
print("Accuracy of SGD Classifier:", sgdc.score(X_test, y_test))
print(classification_report(y_test, sgdc_y_predict, target_names=["Benign", "Malignant"]))

Accuracy of SGD Classifier: 0.842105263158
             precision    recall  f1-score   support

     Benign       0.79      0.99      0.88       100
  Malignant       0.98      0.63      0.77        71

avg / total       0.87      0.84      0.83       171

