# 逻辑回归 API 选取部分特征

## 导入所需的包

In [20]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split #分割数据集
from sklearn.linear_model import LogisticRegression #逻辑回归模型
from sklearn.preprocessing import StandardScaler #特征标准化
from sklearn.feature_selection import VarianceThreshold

## 数据处理

In [21]:
# 1. Load the data set.
# The raw file has no header row, so the column names are supplied here.
names = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]
data = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "breast-cancer-wisconsin/breast-cancer-wisconsin.data",
    names=names,
)

In [22]:
# 2. Handle missing values.
# The file marks missing entries with "?"; turn them into NaN so that
# dropna() can discard every row containing one.
# `np.nan` is used instead of the `np.NaN` alias, which was removed in
# NumPy 2.0 and raises AttributeError there.
data = data.replace(to_replace="?", value=np.nan)
data = data.dropna()

In [28]:
# 3. Separate the target vector from the feature matrix.
# The "Class" column is the target.
y = data.loc[:, "Class"]
# All rows, columns at positions 1 through 9 (the slice is half-open, so
# position 10 is excluded); position 0, the sample code number, is skipped.
x = data.iloc[:, 1:10]
x.shape

(683, 9)

### 表示舍弃所有方差不超过5的特征（VarianceThreshold(5) 只保留方差大于5的特征）

### 从全部9个特征中选取了7个特征

In [31]:
# Filter out low-variance features: with a threshold of 5, only the columns
# whose variance is above 5 are kept.
selector = VarianceThreshold(threshold=5)
selector.fit(x)
X_var0 = selector.transform(x)
X_var0.shape

(683, 7)

In [32]:
# 4. Split the data into a training set and a test set.
# The first two arguments are the samples and their labels; test_size=0.2
# reserves 20% of the rows for testing; random_state fixes the seed so the
# same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    X_var0,
    y,
    test_size=0.2,
    random_state=44,
)

In [33]:
# 5. Standardise the features.
# The scaler is fitted on the training split only; the statistics it learned
# there are then applied to both the training and the test split.
transfer = StandardScaler()
transfer.fit(x_train)
x_train = transfer.transform(x_train)
x_test = transfer.transform(x_test)

## logisticRegression

In [34]:
# Fit a logistic-regression classifier on the standardised training data.
# Note: predict() returns one class label per sample. The per-class
# probabilities come from predict_proba(), which returns an array of shape
# (n_samples, n_classes): entry (i, j) is the predicted probability that
# sample i belongs to class lr.classes_[j], and every row sums to 1.
# Example: a row [0.66651809, 0.33348191] means the sample belongs to the
# first class with probability 0.66651809 and to the second class with
# probability 0.33348191.
lr = LogisticRegression()
lr.fit(x_train, y_train)

LogisticRegression()

In [35]:
# Predict a class label for every sample in the test split; the bare name on
# the last line makes the notebook display the resulting array.
y_predict = lr.predict(x_test)
y_predict


array([2, 2, 2, 2, 2, 2, 2, 4, 4, 2, 4, 4, 4, 2, 2, 4, 4, 2, 2, 2, 2, 2,
       2, 4, 4, 4, 4, 2, 4, 4, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 4, 4, 2,
       4, 2, 2, 2, 4, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 4, 2,
       4, 4, 4, 4, 2, 4, 2, 2, 2, 2, 2, 2, 4, 2, 4, 4, 2, 4, 4, 2, 2, 2,
       4, 2, 4, 4, 4, 4, 4, 2, 2, 2, 4, 2, 2, 4, 2, 2, 4, 2, 2, 4, 2, 2,
       2, 4, 2, 4, 2, 4, 2, 2, 2, 4, 2, 4, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2,
       2, 2, 2, 2, 2], dtype=int64)

## 性能测评

### 准确率为 0.9562043795620438，低于使用全部特征预测时的 0.9635036496350365

In [36]:
# Evaluate the fitted model on the held-out test split; for a classifier,
# score() reports the mean accuracy of the predictions against y_test.
lr.score(x_test,y_test)

0.9562043795620438

### 总体预测结果比使用全部特征时略差