# 模型评价：数据集划分方法

#### 划分基本准则：保持训练集和验证集之间的互斥性

准则解释：验证样本应尽量不在训练样本中出现，以保证验证集上的表现能代表模型的泛化能力

#### 留出法：

直接将数据集划分成两个互斥的集合，其中一个做训练集，一个做验证集

常用划分比例：7:3、7.5:2.5、8:2

#### 交叉验证法（cv）

将数据集划分为k个大小相似的互斥子集，每次用其中k-1个子集做训练、剩下的1个子集做验证，共训练k次，最终返回k次验证结果的均值，因此交叉验证法又称为k折交叉验证法（k-fold cross-validation）

In [29]:
# 基本数据读取
import pandas as pd
import matplotlib.pyplot as plt
# Load the sample dataset from the Excel workbook into a DataFrame.
sample_path = 'realestate_sample_preprocessed.xlsx'
df = pd.read_excel(sample_path)
# Based on the collinearity matrix, keep daytime population (the feature most
# correlated with house price) and turn night-time population and night-time
# population aged 20-39 into a single ratio.
def age_percent(row):
    """Return the share of the night-time population that is aged 20-39.

    Args:
        row: A mapping-like record (e.g. one DataFrame row) providing the
            keys ``'nightpop'`` and ``'night20-39'``.

    Returns:
        ``row['night20-39'] / row['nightpop']``, or ``0`` when
        ``row['nightpop']`` equals zero (avoids dividing by zero).
    """
    night_total = row['nightpop']
    if night_total != 0:
        return row['night20-39'] / night_total
    return 0
# Compute the 20-39 share for every row, then discard the two raw night-time
# population columns the ratio was derived from.
df['per_a20_39'] = df.apply(age_percent, axis=1)
raw_night_columns = ['nightpop', 'night20-39']
df = df.drop(columns=raw_night_columns)
# Quick overview of the dataset: dimensions, column dtypes, null counts.
for overview in (df.shape, df.dtypes, df.isnull().sum()):
    print(overview)

(898, 9)
id                 int64
complete_year      int64
average_price    float64
area             float64
daypop           float64
sub_kde          float64
bus_kde          float64
kind_kde         float64
per_a20_39       float64
dtype: object
id               0
complete_year    0
average_price    0
area             0
daypop           0
sub_kde          0
bus_kde          0
kind_kde         0
per_a20_39       0
dtype: int64


In [30]:
# 构建模型
import numpy as np
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
# Build the modelling workflow: standardise, power-transform, expand to
# degree-3 polynomial features, then fit a cross-validated Lasso.
# The inner CV splitter shuffles the rows, so it is given a fixed seed:
# without one, the alpha chosen by LassoCV (and every metric computed from
# this pipeline) changes from run to run, while the hold-out split performed
# with train_test_split is already seeded with random_state=1.
lasso_alphas = list(np.arange(8, 10) * 10)  # candidate penalties: 80 and 90
pipe_lm = Pipeline([
    ('sc', StandardScaler()),
    ('power_trans', PowerTransformer()),
    ('polynom_trans', PolynomialFeatures(degree=3)),
    ('lasso_regr', LassoCV(
        alphas=lasso_alphas,
        cv=KFold(n_splits=3, shuffle=True, random_state=1),
        n_jobs=-1,
    )),
])
print(pipe_lm)

Pipeline(memory=None,
     steps=[('sc', StandardScaler(copy=True, with_mean=True, with_std=True)), ('power_trans', PowerTransformer(copy=True, method='yeo-johnson', standardize=True)), ('polynom_trans', PolynomialFeatures(degree=3, include_bias=True, interaction_only=False)), ('lasso_regr', LassoCV(alphas=[80, 90], copy_X=Tr...ve=False, precompute='auto', random_state=None,
    selection='cyclic', tol=0.0001, verbose=False))])


In [31]:
# 留出法进行数据集划分
# 载入sklearn中数据集划分的方法
from sklearn.model_selection import train_test_split
# Split the dataset into a training set (75%) and a validation set (25%).
training, testing = train_test_split(df, test_size=0.25, random_state=1)
feature_columns = ['complete_year', 'area', 'daypop', 'sub_kde',
                   'bus_kde', 'kind_kde', 'per_a20_39']
target_column = 'average_price'
# Features (x) and target (y) of the training set.
x_train = training[feature_columns].copy()
y_train = training[target_column].copy()
# Features (x) and target (y) of the validation set.
x_test = testing[feature_columns].copy()
y_test = testing[target_column].copy()
print(f'the shape of training set is: {training.shape}')
print(f'the shape of testing set is: {testing.shape}')

the shape of training set is: (673, 9)
the shape of testing set is: (225, 9)


In [32]:
# 查看留出法验证集上模型的表现
import warnings
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Suppress all warnings from here on (presumably to keep the cell output
# limited to the metrics).
warnings.filterwarnings('ignore')
# Fit the pipeline on the training split, then predict the validation split.
pipe_lm.fit(x_train, y_train)
y_predict = pipe_lm.predict(x_test)
# Score the validation-set predictions with the three regression metrics.
holdout_mse = mean_squared_error(y_test, y_predict)
holdout_mae = mean_absolute_error(y_test, y_predict)
holdout_r2 = r2_score(y_test, y_predict)
print(f'mean squared error is: {holdout_mse}')
print(f'mean absolute error is: {holdout_mae}')
print(f'R Squared is: {holdout_r2}')

mean squared error is: 37631800.27942381
mean absolute error is: 4385.844228977758
R Squared is: 0.57602148623751


In [None]:
# 交叉验证法进行数据集划分
from sklearn.model_selection import KFold
# Feature matrix and target for cross-validation, taken from the full dataset.
x = df[['complete_year', 'area', 'daypop', 'sub_kde',
        'bus_kde', 'kind_kde', 'per_a20_39']]
y = df['average_price']
# 10-fold splitter. Rows are shuffled before being assigned to folds, so a
# fixed seed is supplied; without it the fold assignment, and therefore every
# per-fold metric, differs on each run and cannot be reproduced.
k = 10
kf = KFold(n_splits=k, shuffle=True, random_state=1)

In [33]:
# Evaluate the pipeline with k-fold cross-validation, collecting one score
# per fold for each metric.
mse = []
mae = []
r_s2 = []
# KFold.split yields *positional* row indices, so rows must be selected with
# .iloc. Using .loc would interpret the numbers as index labels, which only
# happens to work while the frame carries a default 0..n-1 index and would
# pick wrong rows (or raise KeyError) after any filtering or re-indexing.
for train_index, test_index in kf.split(x):  # split into folds
    x_traincv, x_testcv = x.iloc[train_index], x.iloc[test_index]
    y_traincv, y_testcv = y.iloc[train_index], y.iloc[test_index]
    pipe_lm.fit(x_traincv, y_traincv)  # train on the k-1 training folds
    y_predictcv = pipe_lm.predict(x_testcv)  # predict the held-out fold
    k_mse = mean_squared_error(y_testcv, y_predictcv)
    mse.append(k_mse)
    print(f'mean squared error is: {k_mse}')
    k_mae = mean_absolute_error(y_testcv, y_predictcv)
    mae.append(k_mae)
    print(f'mean absolute error is: {k_mae}')
    k_r_s2 = r2_score(y_testcv, y_predictcv)
    r_s2.append(k_r_s2)
    print(f'R Squared is: {k_r_s2}')

mean squared error is: 35343529.48829773
mean absolute error is: 4440.496229506002
R Squared is: 0.624576918867833
mean squared error is: 37183112.59906151
mean absolute error is: 4298.302676162478
R Squared is: 0.4024228305339529
mean squared error is: 49452327.64452156
mean absolute error is: 4993.620904192527
R Squared is: 0.5383265602659288
mean squared error is: 29010157.909958623
mean absolute error is: 3687.2699912256844
R Squared is: 0.6556250793164027
mean squared error is: 30181604.83441743
mean absolute error is: 3927.2142567288797
R Squared is: 0.6662200142470954
mean squared error is: 42522602.850770354
mean absolute error is: 4669.897100062009
R Squared is: 0.5356679506743394
mean squared error is: 22961161.757603135
mean absolute error is: 3689.4269032828956
R Squared is: 0.6816637313881144
mean squared error is: 31222683.601054277
mean absolute error is: 4050.6438241341784
R Squared is: 0.5963288876075987
mean squared error is: 33605097.50118247
mean absolute error is: 

In [34]:
import numpy as np
# Average each metric over all cross-validation folds and report it.
cv_mse_mean = np.mean(mse)
cv_mae_mean = np.mean(mae)
cv_r2_mean = np.mean(r_s2)
print(f'mean squared error is: {cv_mse_mean}')
print(f'mean absolute error is: {cv_mae_mean}')
print(f'R Squared is: {cv_r2_mean}')

mean squared error is: 35107089.48225556
mean absolute error is: 4242.573669116604
R Squared is: 0.5729295106316359
