欠拟合：训练集得分低，测试集得分低  
过拟合：训练集得分高，测试集得分低

In [1]:
from sklearn.datasets import load_boston # 加载波士顿房价数据集
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [2]:
# Load the Boston housing data as a (features, target) pair.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm the installed scikit-learn version before re-running.
x, y = load_boston(return_X_y=True)
# Hold out 20% of the samples for testing (an 8:2 split); the fixed
# random_state keeps the split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=123,
)
print(x_train.shape)

(404, 13)


模型虽然比较简单，但训练集得分（约 0.76）明显高于测试集得分（约 0.66），说明出现了一定程度的过拟合。

In [3]:
# Baseline: ordinary least squares fitted on the raw, unscaled features.
# fit() returns the estimator itself, so construction and training are chained.
model = LinearRegression().fit(x_train, y_train)
# Report the score (R^2 for scikit-learn regressors) on both splits so the
# train/test gap is visible.
print('train', model.score(x_train, y_train))
print('test', model.score(x_test, y_test))

train 0.7559380876016175
test 0.6592466510354097


In [4]:
# Inspect the raw feature matrix: the columns span very different magnitudes,
# which is the motivation for trying feature scaling next.
x

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
        4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
        9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
        4.0300e+00],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        5.6400e+00],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
        6.4800e+00],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        7.8800e+00]])

In [5]:
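# Min-max scaling maps each feature onto the [0, 1] range seen in the training set.
# Ordinary least squares is invariant to this kind of linear rescaling, so the
# LinearRegression scores are not expected to change after scaling.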
from sklearn.preprocessing import MinMaxScaler

In [6]:
# Learn the per-feature min/max from the training set only, so that no
# test-set statistics leak into the scaler.
minmax = MinMaxScaler().fit(x_train)
# Apply the same fitted transform to both splits. The right-hand side is
# evaluated before assignment, so both calls see the unscaled arrays.
x_train, x_test = minmax.transform(x_train), minmax.transform(x_test)

In [7]:
# Refit the same linear model on the min-max scaled features.
# Ordinary least squares is invariant to linear rescaling of the features
# (the coefficients and intercept simply absorb the scale), so the scores
# match the unscaled baseline; scaling by itself does not reduce overfitting here.
model = LinearRegression().fit(x_train, y_train)
print('train', model.score(x_train, y_train))
print('test', model.score(x_test, y_test))

train 0.7559380876016175
test 0.6592466510354096


正则化是处理模型过拟合最常用的方式。

In [8]:
# L1 (Lasso) can drive weights to exactly zero, acting as feature selection;
# L2 (Ridge) shrinks the weights to reduce overfitting.
from sklearn.linear_model import Lasso, Ridge

In [13]:
# L1-regularized linear regression; alpha is the regularization strength.
l1 = Lasso(alpha=0.1).fit(x_train, y_train)
print('train', l1.score(x_train, y_train))
print('test', l1.score(x_test, y_test))

train 0.7122445217557418
test 0.5890105377961613


In [14]:
# Learned weights (coefficients); L1 regularization can shrink some of them
# to exactly zero.
l1.coef_

array([ -0.        ,   0.        ,  -0.        ,   0.22769147,
        -0.        ,  21.22252651,  -0.        ,  -1.86674769,
         0.        ,  -1.25476345,  -6.96153372,   1.1295474 ,
       -19.33703323])

In [16]:
# L2-regularized linear regression; alpha is the regularization strength.
l2 = Ridge(alpha=0.5).fit(x_train, y_train)
print('train', l2.score(x_train, y_train))
print('test', l2.score(x_test, y_test))

train 0.754785017061654
test 0.6628944355802883


网格搜索交叉验证  
调参 查找最优参数值 

In [17]:
from sklearn.model_selection import GridSearchCV

In [19]:
# Step 1: create the estimator to tune. alpha is left unset here because the
# grid below supplies the candidate values.
l2 = Ridge()
# Step 2: declare the search space as {parameter name: [candidate values]}.
pg = dict(alpha=[0.2, 0.4, 0.6, 0.8])
# Step 3: build the grid search from the estimator, the parameter grid and the
# number of cross-validation folds, then run it on the training split.
model = GridSearchCV(estimator=l2, param_grid=pg, cv=5)
model.fit(x_train, y_train)
# best_score_ is the mean cross-validated score of the best candidate.
print('打印模型最优得分', model.best_score_)
print('模型最优参数', model.best_params_)

打印模型最优得分 0.7324499790523256
模型最优参数 {'alpha': 0.4}


In [20]:
# Build the final Ridge model from the hyperparameters selected by the grid
# search and report its train/test scores.
# `model` is the fitted GridSearchCV from the previous cell. Reading
# best_params_ instead of hard-coding alpha=0.4 keeps this cell in sync with
# the search result if the grid, the data or the split ever changes.
l2 = Ridge(**model.best_params_)
l2.fit(x_train, y_train)
print('train', l2.score(x_train, y_train))
print('test', l2.score(x_test, y_test))

train 0.7551700779427534
test 0.6622833325446515
