In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

%matplotlib inline

#### 从url中导入数据

In [2]:
URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"

columns = ['sample_code_number','clump thickness','uniformity_cell_size','uniformity_cell_shape','marginal_adhesion','single_ep_cell_size',\
          'bare_nuclei','bland_chromatin','normal_nucleoli','mitoses','class']

In [3]:
data = pd.read_csv(URL,names=columns)

In [4]:
data.head()

Unnamed: 0,sample_code_number,clump thickness,uniformity_cell_size,uniformity_cell_shape,marginal_adhesion,single_ep_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
sample_code_number       699 non-null int64
clump thickness          699 non-null int64
uniformity_cell_size     699 non-null int64
uniformity_cell_shape    699 non-null int64
marginal_adhesion        699 non-null int64
single_ep_cell_size      699 non-null int64
bare_nuclei              699 non-null object
bland_chromatin          699 non-null int64
normal_nucleoli          699 non-null int64
mitoses                  699 non-null int64
class                    699 non-null int64
dtypes: int64(10), object(1)
memory usage: 60.1+ KB


#### 缺失值 替换为标准的

In [15]:
data[data['bare_nuclei']=='?']

Unnamed: 0,sample_code_number,clump thickness,uniformity_cell_size,uniformity_cell_shape,marginal_adhesion,single_ep_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class


In [33]:
data = data.replace(to_replace='?',value=np.nan)

data = data.dropna(how='any')

data.shape

(683, 11)

In [34]:
x = data[columns[1:10]]

y = data[columns[10]]

In [35]:
x.shape

(683, 9)

In [37]:
x.dtypes

clump thickness           int64
uniformity_cell_size      int64
uniformity_cell_shape     int64
marginal_adhesion         int64
single_ep_cell_size       int64
bare_nuclei              object
bland_chromatin           int64
normal_nucleoli           int64
mitoses                   int64
dtype: object

In [20]:
y.shape

(683,)

#### 数据分割

In [22]:
from sklearn.cross_validation import train_test_split

In [38]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=.2,random_state=2018)

In [40]:
type(x_test)

pandas.core.frame.DataFrame

In [25]:
y_train.value_counts()

2    358
4    188
Name: class, dtype: int64

In [26]:
y_test.value_counts()

2    86
4    51
Name: class, dtype: int64

#### 数据标准化

In [30]:
from sklearn.preprocessing import StandardScaler

In [41]:
ss = StandardScaler()

x_train = ss.fit_transform(x_train)
x_test = ss.transform(x_test)

注意：经过标准化后，数据由 pandas.core.frame.DataFrame 类型 变成了 numpy.ndarray 类型

In [56]:
type(y_train)

pandas.core.series.Series

In [57]:
type(x_train)

numpy.ndarray

#### 训练、评估、预测

In [43]:
from sklearn.linear_model import LogisticRegression

In [44]:
lr = LogisticRegression()

In [45]:
# 训练

lr.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [47]:
# 评估
lr.score(x_test,y_test)

0.9635036496350365

In [48]:
# 预测
lr.predict(x_test)

array([2, 2, 2, 4, 4, 2, 4, 2, 2, 2, 4, 4, 2, 2, 4, 4, 4, 2, 2, 4, 2, 4,
       2, 2, 2, 2, 4, 2, 4, 2, 4, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 2, 2, 2,
       2, 4, 2, 4, 2, 4, 4, 4, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 4, 2, 4, 4, 4, 4, 2, 4, 2, 4,
       4, 2, 4, 2, 2, 2, 2, 2, 4, 2, 2, 4, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2,
       4, 4, 4, 2, 4, 2, 4, 2, 4, 4, 2, 2, 2, 2, 4, 2, 2, 4, 2, 4, 2, 2,
       4, 2, 2, 2, 2], dtype=int64)

In [52]:
# 预测错了 5个
1 - 5/137

0.9635036496350365

#### 召回率、精确率、F1及准确率

In [53]:
from sklearn.metrics import classification_report

In [55]:
print(classification_report(y_test,lr.predict(x_test),target_names=['阴性','阳性']))

             precision    recall  f1-score   support

         阴性       0.98      0.97      0.97        86
         阳性       0.94      0.96      0.95        51

avg / total       0.96      0.96      0.96       137

