# 案例 - 线性回归 - 波士顿房价预测

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression,SGDRegressor
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error

In [2]:
# 获取数据
boston = load_boston()

In [4]:
# 划分数据集
x_train,x_test,y_train,y_test = train_test_split(boston.data,boston.target,random_state=8)

In [5]:
# 特征工程，标准化
# 1> 创建一个转换器
transfer = StandardScaler()
# 2> 数据标准化
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

In [9]:
# 方法一：正规方程求解
# 模型训练
# 1> 创建一个估计器
estimator_1 = LinearRegression()
# 2> 传入训练数据，进行机器学习
estimator_1.fit(x_train,y_train)
# 3> 打印梯度下降优化后的模型结果系数
print(estimator_1.coef_)
# 4> 打印梯度下降优化后的模型结果偏置
print(estimator_1.intercept_)

[-0.97231971  1.15986824  0.18913175  0.64921268 -1.48767577  2.6684006
 -0.16577486 -3.00663077  2.29528894 -1.83528364 -1.9229166   0.86732009
 -4.06006833]
22.52163588390508


In [13]:
# 方法二：梯度下降求解
# 模型训练
# 1> 创建一个估计器，可以通过调参数，找到学习率效果更好的值
estimator_2 = SGDRegressor(learning_rate='constant', eta0=0.001)
# 2> 传入训练数据，进行机器学习
estimator_2.fit(x_train,y_train)
# 3> 打印梯度下降优化后的模型结果系数
print(estimator_2.coef_)
# 4> 打印梯度下降优化后的模型结果偏置
print(estimator_2.intercept_)

[-0.60698913  0.53770286 -0.34562626  0.85302216 -0.305297    3.0631437
 -0.18783533 -1.37624705  0.37695909 -0.34913506 -1.63565401  0.70975816
 -3.00318461]
[19.18347064]




In [14]:
# 模型评估
# 使用均方误差对正规方程模型评估
y_predict = estimator_1.predict(x_test)
error = mean_squared_error(y_test,y_predict)
print('正规方程优化的均方误差为:\n',error)

# 使用均方误差对梯度下降模型评估
y_predict = estimator_2.predict(x_test)
error = mean_squared_error(y_test,y_predict)
print('梯度下降优化的均方误差为:\n',error)

正规方程优化的均方误差为:
 22.683150079709872
梯度下降优化的均方误差为:
 33.61046865346014


# 案例 - 岭回归 - 波士顿房价预测

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error

In [2]:
# 获取数据
boston = load_boston()

In [3]:
# 划分数据集
x_train, x_test, y_train, y_test = train_test_split(boston.data, boston.target, random_state=6)

In [4]:
# 特征工程：标准化
# 1）实例化一个转换器类
transfer = StandardScaler()
# 2）调用fit_transform
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

In [5]:
# 岭回归的预估器流程
estimator = Ridge()
estimator.fit(x_train, y_train)
y_predict = estimator.predict(x_test)
print("岭回归求出模型参数的方法预测的房屋价格为：\n", y_predict)

岭回归求出模型参数的方法预测的房屋价格为：
 [25.08460129 25.30307602 30.00448905 23.62600532 19.9768458  19.86241677
 16.50319497 40.62011448 20.75469738 19.56482676 23.56171257 12.777669
 28.57012218 34.19807441 32.72920466 18.17373065 13.08205288 28.84863583
 17.84336136 16.44163295 16.45130478 28.88749598 25.20275399 19.62877603
 20.26380009 19.60365368 20.80046422 31.50318212 19.92507094 25.76456174
 25.60397968 23.52732003 31.91924079 36.45724564 15.88775156 23.32795564
 14.80689408 21.23868885 19.99525176  3.05121005 11.6301399  11.17265832
 12.56960452 13.17050702 21.85747749 24.95335471 33.47596997 23.58819664
 19.58648961 18.80318134 21.29960584 16.53994932 24.20196534 20.81683737
 34.87429349 24.28417878 12.82102308 32.47880946 17.61288023 19.31542732
 24.12245971 23.9350186  11.82501344 29.7240362  14.54951584 16.1892973
 34.81842309 14.88301224 26.3474939  34.45589384  8.25192645 21.96223752
 16.37847526 22.78849139 30.27291398 22.84489328 14.15098486 42.35278012
 12.51338765 21.72067867 20.495

In [6]:
# 打印模型相关属性
print("岭回归求出的回归系数为：\n", estimator.coef_)
print("岭回归求出的偏置为：\n", estimator.intercept_)

岭回归求出的回归系数为：
 [-6.89058421e-01  1.31601008e+00 -2.68847598e-01  6.03967510e-01
 -1.64645481e+00  2.27142492e+00 -2.36046443e-04 -3.38063215e+00
  2.81504239e+00 -2.40058911e+00 -2.08064288e+00  8.16332904e-01
 -4.01891878e+00]
岭回归求出的偏置为：
 22.7759894459103


In [7]:
# 模型评估——均方误差
error = mean_squared_error(y_test, y_predict)
print("岭回归的均方误差为：\n", error)

岭回归的均方误差为：
 25.58537175025409


# 案例 - 逻辑回归 - 癌症分类预测-良／恶性乳腺癌肿瘤预测¶

In [13]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score,classification_report
from sklearn.preprocessing import StandardScaler

In [6]:
# 获取数据并添加字段名
column_name = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',
                   'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
                   'Normal Nucleoli', 'Mitoses', 'Class']
cancer=pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",names=column_name)
cancer.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [7]:
# 缺失值处理
cancer=cancer.replace(to_replace="?",value=np.nan)
cancer=cancer.dropna()

In [8]:
# 数据集划分
# 1> 提取特征数据与目标数据
x=cancer.iloc[:,1:-2]
y=cancer.iloc[:,-1]
# 2> 划分数据集
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3)

In [9]:
# 标准化处理
transfer=StandardScaler()
x_train=transfer.fit_transform(x_train)
x_test=transfer.transform(x_test)

In [10]:
# 模型训练
# 创建一个逻辑回归估计器
estimator=LogisticRegression()
# 训练模型，进行机器学习
estimator.fit(x_train,y_train)
# 得到模型，打印模型回归系数，即权重值
print("logist回归系数为:\n",estimator.coef_)

logist回归系数为:
 [[1.54170901 0.15463467 0.5009949  1.00937972 0.26290729 1.14735367
  1.06181553 0.67939914]]


In [11]:
# 模型评估
# 方法1：真实值与预测值比对
y_predict=estimator.predict(x_test)
print("预测值为:\n",y_predict)
print("真实值与预测值比对:\n",y_predict==y_test)
# 方法2：计算准确率
print("直接计算准确率为:\n",estimator.score(x_test,y_test))

预测值为:
 [2 4 2 2 2 2 2 4 4 4 4 4 2 2 4 2 2 2 2 2 2 4 2 2 2 2 4 2 2 4 4 2 2 4 2 4 2
 2 2 4 4 2 4 2 2 2 4 4 2 2 2 4 4 2 2 4 2 2 4 4 2 2 2 4 2 2 2 4 4 2 2 4 2 4
 2 2 2 2 2 2 4 4 2 4 2 2 2 2 2 4 4 4 2 2 2 2 2 4 2 2 4 2 2 2 4 2 4 2 2 2 2
 2 2 4 2 4 4 4 4 2 4 4 2 2 4 2 2 4 4 2 2 2 2 2 4 4 4 2 2 4 2 4 4 2 4 2 4 4
 2 2 4 4 2 2 2 4 2 2 4 2 2 2 4 4 2 2 2 4 4 2 4 4 2 2 2 2 2 2 2 2 4 4 2 4 4
 2 2 2 2 2 2 2 2 2 2 4 2 2 2 2 4 4 4 2 4]
真实值与预测值比对:
 171    True
132    True
686    True
537    True
371    True
34     True
507    True
266    True
221    True
515    True
218    True
113    True
395    True
140    True
366    True
510    True
589    True
225    True
94     True
409    True
652    True
106    True
29     True
347    True
666    True
298    True
546    True
638    True
306    True
482    True
       ... 
281    True
463    True
650    True
649    True
256    True
159    True
594    True
17     True
344    True
128    True
450    True
534    True
133    True
544    True
394    True
516    True


In [14]:
#打印精确率、召回率、F1 系数以及该类占样本数
print("精确率与召回率为:\n",classification_report(y_test,y_predict,labels=[2,4],target_names=["良性","恶性"]))

精确率与召回率为:
              precision    recall  f1-score   support

         良性       0.99      0.98      0.98       131
         恶性       0.96      0.99      0.97        74

avg / total       0.98      0.98      0.98       205



In [15]:
###模型评估
#ROC曲线与AUC值
# 把输出的 2 4 转换为 0 或 1
y_test=np.where(y_test>2,1,0)  # 大于2就变为1，否则变为0
print("AUC值:\n",roc_auc_score(y_test,y_predict))

AUC值:
 0.9817928615638539
