# 符号标记
- X_train 训练数据
- y_train  训练集标签
- X_test  测试数据
- y_test  测试集标签
- X 完整数据
- y 数据标签

# 基本建模流程

## 导入工具包

In [12]:
from sklearn import datasets, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

## 加载数据

In [13]:
# Load the Boston house-price dataset through sklearn.datasets.
# NOTE(review): load_boston is reported as deprecated in scikit-learn 1.0 and
# removed in 1.2 — confirm the installed version before running this cell.
boston = datasets.load_boston()
X = boston.data  # full feature data
y = boston.target  # labels / regression targets

## 训练集测试集划分

In [14]:
# Hold out 30% of the samples as the test split; the remainder is the
# training split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
)

## 数据预处理

In [15]:
# Fit the standardiser on the training split only, then apply that same
# fitted transform to both the training and the test split.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

## 模型构建与拟合

In [16]:
# Build a linear regression model with default settings and fit it on the
# training split. The fit call is the cell's last expression, so the fitted
# estimator is echoed as the cell output.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

## 模型预测与评价

In [17]:
# Predict on the held-out split and report the coefficient of determination
# (R^2) of those predictions against the true test labels.
y_pred = lr.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.591314236002812

# 数据加载
- Scikit-learn支持以NumPy的arrays对象、Pandas对象、SciPy的稀疏矩阵及其他可转换为数值型arrays的数据结构作为其输入，前提是数据必须是数值型的.
- sklearn.datasets模块提供了一系列加载和获取著名数据集如鸢尾花、波士顿房价、Olivetti人脸、MNIST数据集等的工具，也包括了一些toy data如S型数据等的生成工具.

In [18]:
from sklearn.datasets import load_iris

# Load the iris dataset and unpack its features and labels in one step.
iris = load_iris()
X, y = iris.data, iris.target

# 训练集-测试集划分

    将完整数据集的70%作为训练集，30%作为测试集，并使得测试集和训练集中各类别数据的比例与原始数据集比例一致(stratify分层策略)，另外可通过设置 shuffle=True 提前打乱数据.

In [19]:
from sklearn.model_selection import train_test_split

# 70/30 split with a fixed seed; stratify=y keeps the class proportions of
# both splits in line with the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=12,
)

# 数据预处理

    from sklearn.preprocessing import StandardScaler
- ① 构建转换器实例

        scaler = StandardScaler()
- ② 拟合及转换 scaler.fit_transform(X_train)

- 最小最大标准化           MinMaxScaler
- One-Hot 编码            OneHotEncoder
- 归一化                      Normalizer
- 二值化（单个特征转换）Binarizer
- 标签编码                   LabelEncoder
- 缺失值填补                Imputer（新版本 scikit-learn 中已由 sklearn.impute.SimpleImputer 取代）
- 多项式特征生成          PolynomialFeatures

# 特征选择

- from sklearn import feature_selection as fs
- fs.SelectKBest(score_func, k) | 过滤式(Filter)，保留得分排名前k的特征(top k方式).
- fs.RFECV(estimator, scoring="r2") | 封装式(Wrapper)，结合交叉验证的递归特征消除法，自动选择最优特征个数.
- fs.SelectFromModel(estimator) | 嵌入式(Embedded)，从模型中自动选择特征，任何具有coef_或者feature_importances_的基模型都可以作为estimator参数传入.

In [21]:
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectKBest, chi2

# Filter-style selection: score every feature with the chi-squared test and
# keep the 20 best-scoring ones.
X, y = load_digits(return_X_y=True)
print('X.shape:', X.shape)

selector = SelectKBest(score_func=chi2, k=20)
X_new = selector.fit_transform(X, y)
X_new.shape

X.shape: (1797, 64)


(1797, 20)

In [26]:
from sklearn.datasets import load_digits
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

# Wrapper-style selection: recursive feature elimination combined with
# cross-validation, which chooses the number of features to keep.
X, y = load_digits(return_X_y=True)
print('X.shape:', X.shape)

rfecv = RFECV(
    estimator=LogisticRegression(),
    # The digits labels are classes, so score the folds with accuracy rather
    # than the regression metric r2.
    scoring='accuracy',
    min_features_to_select=1,
)
# fit_transform returns the reduced feature matrix. fit alone returns the
# selector object itself, which has no .shape attribute.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt; if
# it is too slow, consider a larger `step` or fewer CV folds — confirm.
X_new = rfecv.fit_transform(X, y)
X_new.shape

X.shape: (1797, 64)




















KeyboardInterrupt: 

In [27]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Recursive feature elimination; fit_transform returns the data restricted
# to the selected features.
#   estimator            -- the base model
#   n_features_to_select -- how many features to keep
rfe = RFE(estimator=LogisticRegression(), n_features_to_select=2)
rfe.fit_transform(iris.data, iris.target)



array([[3.5, 0.2],
       [3. , 0.2],
       [3.2, 0.2],
       [3.1, 0.2],
       [3.6, 0.2],
       [3.9, 0.4],
       [3.4, 0.3],
       [3.4, 0.2],
       [2.9, 0.2],
       [3.1, 0.1],
       [3.7, 0.2],
       [3.4, 0.2],
       [3. , 0.1],
       [3. , 0.1],
       [4. , 0.2],
       [4.4, 0.4],
       [3.9, 0.4],
       [3.5, 0.3],
       [3.8, 0.3],
       [3.8, 0.3],
       [3.4, 0.2],
       [3.7, 0.4],
       [3.6, 0.2],
       [3.3, 0.5],
       [3.4, 0.2],
       [3. , 0.2],
       [3.4, 0.4],
       [3.5, 0.2],
       [3.4, 0.2],
       [3.2, 0.2],
       [3.1, 0.2],
       [3.4, 0.4],
       [4.1, 0.1],
       [4.2, 0.2],
       [3.1, 0.2],
       [3.2, 0.2],
       [3.5, 0.2],
       [3.6, 0.1],
       [3. , 0.2],
       [3.4, 0.2],
       [3.5, 0.3],
       [2.3, 0.3],
       [3.2, 0.2],
       [3.5, 0.6],
       [3.8, 0.4],
       [3. , 0.3],
       [3.8, 0.2],
       [3.2, 0.2],
       [3.7, 0.2],
       [3.3, 0.2],
       [3.2, 1.4],
       [3.2, 1.5],
       [3.1,

In [28]:
from sklearn.svm import LinearSVC
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel

# Load the iris dataset. The result is bound to `iris` rather than to
# `load_iris`, so the imported loader function is not shadowed and the cell
# can be re-run.
iris = load_iris()
X, y = iris['data'], iris['target']
print("X 共有 %s 个特征"%X.shape[1])

# Embedded-style selection: fit an L1-penalised linear SVM first, then let
# SelectFromModel pick features from that fitted model.
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
# prefit=True tells SelectFromModel that lsvc has already been fitted.
model = SelectFromModel(lsvc,prefit=True)
X_new = model.transform(X)
print("X_new 共有 %s 个特征"%X_new.shape[1])

X 共有 4 个特征
X_new 共有 3 个特征


# 有监督学习算法

    回归
- from sklearn.linear_model import LinearRegression
    - 构建模型实例
- lr = LinearRegression(normalize=True)  
    - 训练模型
- lr.fit(X_train, y_train)
    - 作出预测
- y_pred = lr.predict(X_test)

# 模型评估

- sklearn.metrics模块包含了一系列用于评价模型的评分函数、损失函数以及成对数据的距离度量函数.
- from sklearn import metrics
- metrics.accuracy_score(y_true, y_pred)
- 对于测试集而言，y_test即是y_true，大部分函数都必须包含真实值y_true和预测值y_pred，基于排版的关系，以下不再重复写这两个参数.

## 回归模型

In [29]:
"""
metrics.mean_absolute_error()| 平均绝对误差MAE. 
metrics.mean_squared_error() | 均方误差MSE. 
metrics.r2_score() | 决定系数R2.    
"""

'\nmetrics.mean_absolute_error()| 平均绝对误差MAE. \nmetrics.mean_squared_error() | 均方误差MSE. \nmetrics.r2_score() | 决定系数R2.    \n'

# 分类模型评价

In [30]:
"""
metrics.accuracy_score()| 正确率.
metrics.precision_score() | 各类精确率. 
metrics.f1_score() | F1值.
metrics.log_loss() | 对数损失或交叉熵损失. 
metrics.confusion_matrix | 混淆矩阵.
metrics.classification_report | 含多种评价的分类报告.
"""

'\nmetrics.accuracy_score()| 正确率.\nmetrics.precision_score() | 各类精确率. \nmetrics.f1_score() | F1值.\nmetrics.log_loss() | 对数损失或交叉熵损失. \nmetrics.confusion_matrix | 混淆矩阵.\nmetrics.classification_report | 含多种评价的分类报告.\n'

# 交叉验证及超参数调优

## 交叉验证

    使用5折交叉验证对决策树模型进行评估，使用的评分函数为F1值.
    sklearn提供了部分带交叉验证功能的模型类如LassoCV、LogisticRegressionCV等，这些类包含cv参数.

In [31]:
"""
from sklearn.model_selection import cross_val_score
clf = DecisionTreeClassifier(max_depth=5)
scores = cross_val_score(clf, X_train, y_train,
                       cv=5, scoring='f1_weighted')
"""

"\nfrom sklearn.model_selection import cross_val_score\nclf = DecisionTreeClassifier(max_depth=5)\nscores = cross_val_score(clf, X_train, y_train,\n                       cv=5, scoring='f1_weighted')\n"

## 超参数调优⸺网格搜索
    在参数网格上进行穷举搜索，方法简单但是搜索速度慢(超参数较多时)，且不容易找到参数空间中的局部最优.

In [32]:
"""
from sklearn.model_selection import GridSearchCV
svc = svm.SVC()
params = {'kernel':['linear', 'rbf'], 'C':[1, 10]}
grid_search = GridSearchCV(svc, params, cv=5)
grid_search.fit(X_train, y_train)
grid_search.best_params_
"""

"\nfrom sklearn.model_selection import GridSearchCV\nsvc = svm.SVC()\nparams = {'kernel':['linear', 'rbf'], 'C':[1, 10]}\ngrid_search = GridSearchCV(svc, params, cv=5)\ngrid_search.fit(X_train, y_train)\ngrid_search.best_params_\n"

## 超参数调优⸺随机搜索
    在参数子空间中进行随机搜索，选取空间中的100个点进行建模(可从scipy.stats常见分布如正态分布norm、均匀分布uniform中随机采样得到)，时间耗费较少，更容易找到局部最优.

In [33]:
"""
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
svc = svm.SVC()
param_dist = {'kernel':['linear', 'rbf'], 'C':randint(1, 20)}
random_search = RandomizedSearchCV(svc, param_dist, n_iter=10)
random_search.fit(X_train, y_train)
random_search.best_params_
"""

"\nfrom sklearn.model_selection import RandomizedSearchCV\nfrom scipy.stats import randint\nsvc = svm.SVC()\nparam_dist = {'kernel':['linear', 'rbf'], 'C':randint(1, 20)}\nrandom_search = RandomizedSearchCV(svc, param_dist, n_iter=10)\nrandom_search.fit(X_train, y_train)\nrandom_search.best_params_\n"