在之前的数据训练中，我发现线下测试的误差率和线上得到的误差率相差达到 2-4 个百分点，这对我们的模型评估非常不利。这可能跟我在线下评估模型时划分 validation 与 train 的方式有关。

为了更好地验证数据，可以尝试一下按照销售时间来划分训练集和验证集。

因为 train_data 中不同月份的数据价格差达到两万，而不同年份的最大价格差仅有一万，所以这里以月份为依据来划分。

In [56]:
import pandas as pd 
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LassoCV,Lasso
from sklearn.metrics import mean_squared_error
import seaborn as sns
import numpy as np 
%matplotlib inline 
PATH = '~/Documents/Github/learning_notes/project/kaggle/houseprice/data/'
test_data = pd.read_csv(PATH + 'new_test.csv')
train_data = pd.read_csv(PATH + 'new_train.csv')
print(list(train_data.columns))

['1stFlrSF', '2ndFlrSF', '3SsnPorch', 'Alley', 'BedroomAbvGr', 'BsmtCond', 'BsmtExposure', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFinType1', 'BsmtFinType2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtQual', 'BsmtUnfSF', 'CentralAir', 'EnclosedPorch', 'ExterCond', 'ExterQual', 'Fence', 'FireplaceQu', 'Fireplaces', 'FullBath', 'Functional', 'GarageArea', 'GarageCars', 'GarageCond', 'GarageFinish', 'GarageQual', 'GarageYrBlt', 'GrLivArea', 'HalfBath', 'HeatingQC', 'Id', 'KitchenAbvGr', 'KitchenQual', 'LandSlope', 'LotArea', 'LotFrontage', 'LotShape', 'LowQualFinSF', 'MasVnrArea', 'MiscVal', 'OpenPorchSF', 'OverallQual', 'PavedDrive', 'PoolArea', 'PoolQC', 'SalePrice', 'SaleTime', 'ScreenPorch', 'TotRmsAbvGrd', 'TotalBsmtSF', 'WoodDeckSF', 'YearBuilt', 'YearRemodAdd', 'ReferencePrice', 'TotalSF', 'Age', 'Remodeled', 'RecentRemodel', 'TimeSinceSold', 'TotalArea', 'BldgType_1Fam', 'BldgType_2fmCon', 'BldgType_Duplex', 'BldgType_Twnhs', 'BldgType_TwnhsE', 'Condition1_Artery', 'Condition1_Feedr', 'Conditi

In [57]:
def tr_te_split(data, test_size=0.2, random_state=None):
    """Split ``data`` into train/validation parts separately for each sale month.

    For every ``MoSold_*`` indicator column, the rows where that column equals
    1 are split on their own with ``train_test_split`` and the per-month pieces
    are concatenated, so each month contributes to both partitions in roughly
    the same proportion.

    Args:
        data: DataFrame containing the ``MoSold_*`` indicator columns plus
            ``SalePrice``, ``SaleTime`` and ``Id``.
        test_size: Passed through to ``train_test_split`` for every month.
        random_state: Passed through to ``train_test_split`` for every month;
            leave as ``None`` for a different random split on each call, or
            set it to make the split reproducible.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test)``. The feature frames have
        ``SalePrice``, ``SaleTime`` and ``Id`` removed; the targets are the
        ``SalePrice`` column.

    Raises:
        ValueError: From ``pd.concat`` when no month yields rows for one of
            the partitions (for example, when ``data`` is empty).
    """
    # These are month-of-sale indicator columns (not years).
    month_columns = ['MoSold_0', 'MoSold_1', 'MoSold_10', 'MoSold_11',
                     'MoSold_2', 'MoSold_3', 'MoSold_4', 'MoSold_5',
                     'MoSold_6', 'MoSold_7', 'MoSold_8', 'MoSold_9']
    dropped_columns = ['SalePrice', 'SaleTime', 'Id']
    x_trains = []
    x_tests = []
    y_trains = []
    y_tests = []
    for month in month_columns:
        month_rows = data[data[month] == 1]
        if month_rows.empty:
            # Nothing was sold in this month; there is nothing to split.
            continue
        x = month_rows.drop(dropped_columns, axis=1)
        y = month_rows['SalePrice']
        if len(month_rows) == 1:
            # A single row cannot be divided into two non-empty partitions,
            # so keep it for training instead of letting the split fail.
            x_trains.append(x)
            y_trains.append(y)
            continue
        x_tr, x_te, y_tr, y_te = train_test_split(
            x, y, test_size=test_size, random_state=random_state)
        x_trains.append(x_tr)
        x_tests.append(x_te)
        y_trains.append(y_tr)
        y_tests.append(y_te)
    x_train = pd.concat(x_trains, axis=0)
    x_test = pd.concat(x_tests, axis=0)
    y_train = pd.concat(y_trains, axis=0)
    y_test = pd.concat(y_tests, axis=0)

    return x_train, x_test, y_train, y_test

In [66]:
# Month-wise split of the training data with the default test_size.
x_train,x_test,y_train,y_test = tr_te_split(train_data)
# Display the shapes of the four pieces as a sanity check.
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((1159, 241), (297, 241), (1159,), (297,))

In [68]:
# Fit a Lasso model with a fixed regularisation strength on the training split.
clf = Lasso(alpha = 0.0005)
clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)
# Root-mean-squared error of the predictions on the held-out split.
print(np.sqrt(np.mean(np.square(y_pred-y_test))))

0.12040930989939924


In [69]:
def score(clf, data, cv=5):
    """Estimate the validation RMSE of ``clf`` over repeated month-wise splits.

    Each repetition draws a fresh split of ``data`` with ``tr_te_split``, fits
    an unfitted copy of ``clf`` on the training part and records the
    root-mean-squared error on the validation part.

    Args:
        clf: Estimator exposing ``fit`` and ``predict``. It is cloned for every
            repetition, so the object passed in is left untouched.
        data: DataFrame accepted by ``tr_te_split``.
        cv: Number of repetitions (independent random splits).

    Returns:
        List with one RMSE value per repetition.
    """
    # Imported locally so this cell does not rely on an extra top-level import.
    from sklearn.base import clone

    scores = []
    for _ in range(cv):
        # Use the frame and estimator supplied by the caller rather than the
        # notebook-level train_data and a hard-coded Lasso.
        x_train, x_test, y_train, y_test = tr_te_split(data)
        model = clone(clf)
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        scores.append(np.sqrt(np.mean(np.square(y_pred - y_test))))
    return scores

In [77]:
# Average the per-repetition validation errors returned by score().
np.mean(score(clf,train_data))



0.11985804502747319

修改划分方式之后本地验证的效果与线上验证的效果之差降低到了2个百分点以内，效果显著

**调参手段中RandomSearch和GridSearch的对比**

In [82]:
import numpy as np 
from time import time
from scipy.stats import randint as sp_randint
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

# Feature matrix: every column except the target and the 'SaleTime'/'Id' columns.
x = train_data.drop(['SalePrice','SaleTime','Id'],axis=1)
# Regression target.
y = train_data['SalePrice']
# Rebinds clf (previously the Lasso model) to a random forest with default
# hyper-parameters; this is the estimator the searches tune.
clf = RandomForestRegressor()

# Utility function to report best scores
def report(results, n_top=3):
    """Print the candidates ranked 1 through ``n_top`` from a search result.

    Args:
        results: Mapping with the entries ``'rank_test_score'``,
            ``'mean_test_score'``, ``'std_test_score'`` and ``'params'``,
            such as the ``cv_results_`` of a fitted search object.
        n_top: Highest rank to print; tied candidates are all printed.
    """
    for rank in range(1, n_top + 1):
        # Positions of every candidate that shares this rank.
        tied_positions = np.flatnonzero(results['rank_test_score'] == rank)
        for pos in tied_positions:
            print("Model with rank: {0}".format(rank))
            mean_score = results['mean_test_score'][pos]
            score_std = results['std_test_score'][pos]
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                mean_score, score_std))
            chosen_params = results['params'][pos]
            print("Parameters: {0}".format(chosen_params))
            print("")


# Search space for the randomized search: lists are sampled uniformly from
# their entries, scipy randint(low, high) draws integers in [low, high).
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(2, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              }

# Run the randomized search with a fixed budget of sampled settings.
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)

# Time only the fit, then print the top-ranked candidates.
start = time()
random_search.fit(x, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)

RandomizedSearchCV took 2.68 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.839 (std: 0.017)
Parameters: {'bootstrap': False, 'max_depth': None, 'max_features': 9, 'min_samples_leaf': 3, 'min_samples_split': 7}

Model with rank: 2
Mean validation score: 0.839 (std: 0.017)
Parameters: {'bootstrap': False, 'max_depth': None, 'max_features': 7, 'min_samples_leaf': 1, 'min_samples_split': 3}

Model with rank: 3
Mean validation score: 0.823 (std: 0.009)
Parameters: {'bootstrap': False, 'max_depth': None, 'max_features': 7, 'min_samples_leaf': 5, 'min_samples_split': 7}



In [85]:
# Exhaustive grid over the same hyper-parameters:
# 2 * 3 * 3 * 3 * 2 = 108 combinations.
param_grid = {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              }

# Run the grid search, timing only the fit.
grid_search = GridSearchCV(clf, param_grid=param_grid)
start = time()
grid_search.fit(x, y)

# Report the elapsed time, the number of evaluated settings and the top
# candidates.
print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)

GridSearchCV took 13.19 seconds for 108 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.850 (std: 0.016)
Parameters: {'bootstrap': False, 'max_depth': None, 'max_features': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}

Model with rank: 2
Mean validation score: 0.848 (std: 0.021)
Parameters: {'bootstrap': False, 'max_depth': None, 'max_features': 10, 'min_samples_leaf': 3, 'min_samples_split': 10}

Model with rank: 3
Mean validation score: 0.847 (std: 0.011)
Parameters: {'bootstrap': False, 'max_depth': None, 'max_features': 10, 'min_samples_leaf': 3, 'min_samples_split': 3}

