In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

import warnings
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn 

In [2]:
# Load the feature-engineered training table; header=0 takes the column names from the first CSV row.
train_df = pd.read_csv('./2-train-feature-engineering.csv',header=0)
# Preview the first rows as the cell output.
train_df.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,YrSold_2008,YrSold_2007,YrSold_2006,YrSold_2009,YrSold_2010,SalePrice
0,11.692623,11.686189,0.0,1,3,4,3,11.170327,0.0,2,...,0,0,0,1,1,0,0,0,0,12.247699
1,12.792276,0.0,0.0,1,3,4,1,12.062832,0.0,0,...,0,0,0,1,0,1,0,0,0,12.109016
2,11.892039,11.724598,0.0,1,3,4,2,10.200343,0.0,2,...,0,0,0,1,1,0,0,0,0,12.317171
3,12.013683,11.354094,0.0,1,3,1,3,8.274266,0.0,0,...,0,0,0,1,0,0,1,0,0,11.849405
4,12.510588,12.271365,0.0,1,4,4,0,10.971129,0.0,2,...,0,0,0,1,1,0,0,0,0,12.42922


In [3]:
# Feature matrix: every column except the row identifier and the target.
# NOTE(review): the head() preview of train_df lists an 'Unnamed: 0' column that is
# not dropped here, so it appears to remain among the features — confirm this is intended.
train = train_df.drop(['Id', 'SalePrice'], axis='columns')
train.shape

(1458, 249)

In [4]:
# Regression target, taken from the same frame the features came from.
y_train = train_df['SalePrice']
y_train.shape

(1458,)

In [5]:
# Load the feature-engineered test table (no SalePrice column in its preview).
test_df = pd.read_csv('./2-test-feature-engineering.csv')
# Preview the first rows as the cell output.
test_df.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,YrSold_2008,YrSold_2007,YrSold_2006,YrSold_2009,YrSold_2010
0,11.81868,0.0,0.0,1,2,4,3,10.105326,7.397498,5,...,0,0,0,0,1,0,0,0,0,1
1,12.943735,0.0,0.0,1,3,4,3,11.901094,0.0,0,...,0,0,0,0,1,0,0,0,0,1
2,11.916131,11.151348,0.0,1,3,4,3,11.476685,0.0,2,...,0,0,0,0,1,0,0,0,0,1
3,11.910125,11.062536,0.0,1,3,4,3,10.749651,0.0,2,...,0,0,0,0,1,0,0,0,0,1
4,12.833625,0.0,0.0,1,2,4,3,8.72017,0.0,0,...,0,0,0,0,1,0,0,0,0,1


In [6]:
# Test feature matrix: drop only the row identifier so the columns line up with `train`.
# NOTE(review): the head() preview of test_df also lists 'Unnamed: 0', which is kept here — confirm.
test = test_df.drop(['Id'], axis='columns')
test.shape

(1459, 249)

### 模型选择

预测房价属于回归问题，选用了几个回归模型，通过GridSearchCV暴力寻找最优参数。

In [7]:
# Scoring metric used throughout the notebook
def rmse(y, y_pred):
    """Return the root mean squared error between ``y`` and ``y_pred``.

    Both arguments are passed straight to ``mean_squared_error``; the square
    root of its result is returned.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

- Lasso是加入L1正则化项的线性回归模型。L1正则化会把部分特征的系数压缩为0，从而得到稀疏的模型（相当于自动做特征选择）；对特征相关性较高的数据集，可以抑制过拟合。

In [9]:
# Grid-search the Lasso regularisation strength behind a RobustScaler,
# using 5 shuffled folds and negative MSE as the score.
lasso = Pipeline([('rbs',RobustScaler()),('lasso',Lasso())])
params = {'lasso__alpha':[0.5,0.1,0.05,0.01,0.005,0.001,0.0005]}
kf = KFold(5, shuffle=True, random_state=42)
grid_res = GridSearchCV(lasso,scoring='neg_mean_squared_error',param_grid=params,cv=kf)
grid_res.fit(train.values,y_train)
# best_score_ is a negative MSE, so negate it before taking the root.
print(grid_res.best_params_, np.sqrt(-grid_res.best_score_))

# Build the report on a DataFrame copy instead of overwriting cv_results_:
# the fitted search object keeps its original scores and this step can be re-run safely.
# Only mean_test_score is converted to RMSE; std_test_score is left as computed
# on the negative-MSE scores, so the two columns are not in the same units.
lasso_cv_report = pd.DataFrame(grid_res.cv_results_)[['params','mean_test_score','std_test_score']].assign(
    mean_test_score=lambda table: np.sqrt(-table['mean_test_score'])
)
print(lasso_cv_report)

# Training-set RMSE of the refitted best estimator (shown as the cell output).
grid_best_train_pred = grid_res.best_estimator_.predict(train.values)
rmse(y_train,grid_best_train_pred)

{'lasso__alpha': 0.0005} 0.111470252229
                     params  mean_test_score  std_test_score
0     {'lasso__alpha': 0.5}         0.399626        0.013581
1     {'lasso__alpha': 0.1}         0.226071        0.005620
2    {'lasso__alpha': 0.05}         0.176119        0.002294
3    {'lasso__alpha': 0.01}         0.129153        0.002050
4   {'lasso__alpha': 0.005}         0.124591        0.002268
5   {'lasso__alpha': 0.001}         0.113460        0.001631
6  {'lasso__alpha': 0.0005}         0.111470        0.001509


0.099834027382049342

- Ridge是增加L2正则化项的线性回归，可以抑制过拟合。

In [10]:
# Grid-search the Ridge regularisation strength behind a RobustScaler,
# using 5 shuffled folds and negative MSE as the score.
ridge = Pipeline([('rbs',RobustScaler()),('ridge',Ridge())])
params = {'ridge__alpha':[50,30,10,8,5,3,1]}
kf = KFold(5, shuffle=True, random_state=42)
ridge_grid = GridSearchCV(ridge,param_grid=params,cv=kf,scoring='neg_mean_squared_error')
ridge_grid.fit(train.values,y_train)
# best_score_ is a negative MSE, so negate it before taking the root.
print(ridge_grid.best_params_, np.sqrt(-ridge_grid.best_score_))

# Build the report on a DataFrame copy instead of overwriting cv_results_:
# the fitted search object keeps its original scores and this step can be re-run safely.
# Only mean_test_score is converted to RMSE; std_test_score is left as computed
# on the negative-MSE scores, so the two columns are not in the same units.
ridge_cv_report = pd.DataFrame(ridge_grid.cv_results_)[['params','mean_test_score','std_test_score']].assign(
    mean_test_score=lambda table: np.sqrt(-table['mean_test_score'])
)
print(ridge_cv_report)

# Training-set RMSE of the refitted best estimator (shown as the cell output).
ridge_grid_train_pred = ridge_grid.best_estimator_.predict(train.values)
rmse(y_train,ridge_grid_train_pred)

{'ridge__alpha': 10} 0.112911314624
                 params  mean_test_score  std_test_score
0  {'ridge__alpha': 50}         0.114270        0.001842
1  {'ridge__alpha': 30}         0.113389        0.001807
2  {'ridge__alpha': 10}         0.112911        0.001755
3   {'ridge__alpha': 8}         0.113011        0.001744
4   {'ridge__alpha': 5}         0.113443        0.001718
5   {'ridge__alpha': 3}         0.114290        0.001695
6   {'ridge__alpha': 1}         0.117521        0.001717


0.096620134809426148

- ElasticNet

In [11]:
# Grid-search ElasticNet over the overall penalty strength and the L1/L2 mix,
# behind a RobustScaler, using 5 shuffled folds and negative MSE as the score.
ENet = Pipeline([('rbs',RobustScaler()),('enet',ElasticNet())])
params = {'enet__alpha':[0.1,0.01,0.001,0.0005,0.0001],
         'enet__l1_ratio':[0.1,0.3,0.5,0.9]}
kf = KFold(5, shuffle=True, random_state=42)
ENet_grid = GridSearchCV(ENet,param_grid=params,cv=kf,scoring='neg_mean_squared_error')
ENet_grid.fit(train.values,y_train)
# best_score_ is a negative MSE, so negate it before taking the root.
print(ENet_grid.best_params_, np.sqrt(-ENet_grid.best_score_))

# Build the report on a DataFrame copy instead of overwriting cv_results_:
# the fitted search object keeps its original scores and this step can be re-run safely.
# Only mean_test_score is converted to RMSE; std_test_score is left as computed
# on the negative-MSE scores, so the two columns are not in the same units.
enet_cv_report = pd.DataFrame(ENet_grid.cv_results_)[['params','mean_test_score','std_test_score']].assign(
    mean_test_score=lambda table: np.sqrt(-table['mean_test_score'])
)
print(enet_cv_report)

# Training-set RMSE of the refitted best estimator (shown as the cell output).
ENet_grid_train_pred = ENet_grid.best_estimator_.predict(train.values)
rmse(y_train,ENet_grid_train_pred)

{'enet__alpha': 0.0005, 'enet__l1_ratio': 0.9} 0.111440844985
                                            params  mean_test_score  \
0      {'enet__alpha': 0.1, 'enet__l1_ratio': 0.1}         0.132490   
1      {'enet__alpha': 0.1, 'enet__l1_ratio': 0.3}         0.156847   
2      {'enet__alpha': 0.1, 'enet__l1_ratio': 0.5}         0.178813   
3      {'enet__alpha': 0.1, 'enet__l1_ratio': 0.9}         0.215808   
4     {'enet__alpha': 0.01, 'enet__l1_ratio': 0.1}         0.114749   
5     {'enet__alpha': 0.01, 'enet__l1_ratio': 0.3}         0.120985   
6     {'enet__alpha': 0.01, 'enet__l1_ratio': 0.5}         0.124430   
7     {'enet__alpha': 0.01, 'enet__l1_ratio': 0.9}         0.128178   
8    {'enet__alpha': 0.001, 'enet__l1_ratio': 0.1}         0.113391   
9    {'enet__alpha': 0.001, 'enet__l1_ratio': 0.3}         0.111867   
10   {'enet__alpha': 0.001, 'enet__l1_ratio': 0.5}         0.111654   
11   {'enet__alpha': 0.001, 'enet__l1_ratio': 0.9}         0.112910   
12  {'enet__alp

0.099230215312877595

- KernelRidge

In [13]:
# Grid-search the KernelRidge penalty (degree-2 polynomial kernel, coef0=2.5)
# behind a RobustScaler, using 5 shuffled folds and negative MSE as the score.
KRR = Pipeline([('rbs',RobustScaler()),('krr',KernelRidge(kernel='polynomial',degree=2,coef0=2.5))])
params = {'krr__alpha':[0.7,0.8,0.9]}
kf = KFold(5, shuffle=True, random_state=42)
KRR_grid = GridSearchCV(KRR,param_grid=params,cv=kf,scoring='neg_mean_squared_error')
KRR_grid.fit(train.values,y_train)
# best_score_ is a negative MSE, so negate it before taking the root.
print(KRR_grid.best_params_, np.sqrt(-KRR_grid.best_score_))

# Build the report on a DataFrame copy instead of overwriting cv_results_:
# the fitted search object keeps its original scores and this step can be re-run safely.
# Only mean_test_score is converted to RMSE; std_test_score is left as computed
# on the negative-MSE scores, so the two columns are not in the same units.
krr_cv_report = pd.DataFrame(KRR_grid.cv_results_)[['params','mean_test_score','std_test_score']].assign(
    mean_test_score=lambda table: np.sqrt(-table['mean_test_score'])
)
print(krr_cv_report)

# Training-set RMSE of the refitted best estimator (shown as the cell output).
KRR_grid_train_pred = KRR_grid.best_estimator_.predict(train.values)
rmse(y_train,KRR_grid_train_pred)

{'krr__alpha': 0.7} 0.112640395419
                params  mean_test_score  std_test_score
0  {'krr__alpha': 0.7}         0.112640        0.001752
1  {'krr__alpha': 0.8}         0.112915        0.001760
2  {'krr__alpha': 0.9}         0.113192        0.001768


0.093872902050172374

根据网格搜索的最优参数，构建回归模型

In [14]:
# Cross-validation with 5 folds
n_folds = 5
def rmse_cv(model):
    """Return the per-fold RMSE of ``model`` on the notebook's training data.

    Scores ``model`` with ``cross_val_score`` on ``train.values`` / ``y_train``
    using ``n_folds`` shuffled folds (random_state=42) and negative MSE, then
    converts each fold's score to RMSE.
    """
    splitter = KFold(n_folds, shuffle=True, random_state=42)
    neg_mse_per_fold = cross_val_score(
        model,
        train.values,
        y_train,
        scoring="neg_mean_squared_error",
        cv=splitter,
    )
    return np.sqrt(-neg_mse_per_fold)

In [15]:
# Lasso behind a RobustScaler, with alpha fixed to the value the grid search reported as best.
lasso = make_pipeline(
    RobustScaler(),
    Lasso(alpha=0.0005),
)

In [16]:
# Per-fold cross-validated RMSE of the Lasso pipeline, reported as mean and standard deviation.
score = rmse_cv(lasso)
print("\nLasso score: mean({:.4f}) std({:.4f})\n".format(score.mean(), score.std()))


Lasso score: mean(0.1113) std(0.0069)



In [17]:
# Ridge behind a RobustScaler, with alpha fixed to the value the grid search reported as best.
ridge = make_pipeline(
    RobustScaler(),
    Ridge(alpha=10),
)

In [18]:
# Per-fold cross-validated RMSE of the Ridge pipeline, reported as mean and standard deviation.
score = rmse_cv(ridge)
print('\nRidge score:mean({:.4f})  std({:.4f})\n'.format(score.mean(),score.std()))


Ridge score:mean(0.1126)  std(0.0080)



In [19]:
# ElasticNet behind a RobustScaler, with alpha and l1_ratio fixed to the grid search's best pair.
ENet = make_pipeline(
    RobustScaler(),
    ElasticNet(alpha=0.0005, l1_ratio=.9),
)

In [20]:
# Per-fold cross-validated RMSE of the ElasticNet pipeline, reported as mean and standard deviation.
score = rmse_cv(ENet)
print('\nENet score:mean({:.4f})  std({:.4f})\n'.format(score.mean(),score.std()))


ENet score:mean(0.1112)  std(0.0069)



In [21]:
# KernelRidge (degree-2 polynomial kernel) behind a RobustScaler, alpha taken from the grid search.
KRR = make_pipeline(
    RobustScaler(),
    KernelRidge(alpha=0.7, kernel='polynomial', degree=2, coef0=2.5),
)

In [22]:
# Per-fold cross-validated RMSE of the KernelRidge pipeline, printed as "mean (std)".
score = rmse_cv(KRR)
print("\n KRR score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


 KRR score: 0.1124 (0.0080)



In [23]:
# GradientBoostingRegressor: the Huber loss makes it robust to outliers
GBoost = GradientBoostingRegressor(
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
    random_state=5,
)

In [24]:
# Per-fold cross-validated RMSE of the gradient boosting model, printed as "mean (std)".
score = rmse_cv(GBoost)
print("\n GBoost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


 GBoost score: 0.1171 (0.0100)



- 模型融合：Average-Stacking（简单平均）    
    采用对各基础模型的预测结果取简单平均的方法；为此构建了一个继承scikit-learn基类的新类，使融合后的模型可以像普通的scikit-learn估计器一样使用。

In [25]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Ensemble regressor that averages the predictions of several base models.

    ``models`` is an iterable of unfitted estimators. ``fit`` trains a clone of
    each one, so the objects passed in are left untouched; ``predict`` returns
    the unweighted mean of the clones' predictions.

    Note: ``TransformerMixin`` is among the bases although this class defines
    no ``transform`` method.
    """

    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        """Clone every base model, fit each clone on ``(X, y)`` and return ``self``."""
        # Clone first so that fitting never mutates the estimators held in self.models.
        self.models_ = list(map(clone, self.models))
        for estimator in self.models_:
            estimator.fit(X, y)
        return self

    def predict(self, X):
        """Return the row-wise mean of the fitted clones' predictions for ``X``."""
        # One column per base model, then average across the columns.
        per_model = [estimator.predict(X) for estimator in self.models_]
        return np.column_stack(per_model).mean(axis=1)

In [27]:
# Fit the Lasso pipeline on the full training set and report its training RMSE.
lasso.fit(train.values, y_train)
lasso_train_pred = lasso.predict(train.values)
# expm1 maps the test predictions back from the (presumably log1p-transformed) SalePrice scale — TODO confirm against the feature-engineering step.
lasso_pred = np.expm1(lasso.predict(test.values))
print(rmse(y_train, lasso_train_pred))

0.099834027382


In [28]:
# Fit the Ridge pipeline on the full training set and report its training RMSE.
ridge.fit(train.values,y_train)
ridge_train_pred = ridge.predict(train.values)
# expm1 maps the test predictions back from the (presumably log1p-transformed) SalePrice scale — TODO confirm against the feature-engineering step.
ridge_pred = np.expm1(ridge.predict(test.values))
print(rmse(y_train,ridge_train_pred))

0.0966201348094


In [29]:
# Fit the ElasticNet pipeline on the full training set and report its training RMSE.
ENet.fit(train.values,y_train)
ENet_train_pred = ENet.predict(train.values)
# expm1 maps the test predictions back from the (presumably log1p-transformed) SalePrice scale — TODO confirm against the feature-engineering step.
ENet_pred = np.expm1(ENet.predict(test.values))
print(rmse(y_train,ENet_train_pred))

0.0992302153129


In [30]:
# Fit the KernelRidge pipeline on the full training set and report its training RMSE.
KRR.fit(train.values, y_train)
KRR_train_pred = KRR.predict(train.values)
# expm1 maps the test predictions back from the (presumably log1p-transformed) SalePrice scale — TODO confirm against the feature-engineering step.
KRR_pred = np.expm1(KRR.predict(test.values))
print(rmse(y_train, KRR_train_pred))

0.0938729020502


In [31]:
# Fit the gradient boosting model on the full training set and report its training RMSE.
GBoost.fit(train.values, y_train)
GBoost_train_pred = GBoost.predict(train.values)
# expm1 maps the test predictions back from the (presumably log1p-transformed) SalePrice scale — TODO confirm against the feature-engineering step.
GBoost_pred = np.expm1(GBoost.predict(test.values))
print(rmse(y_train, GBoost_train_pred))

0.0504016382037


平均四个模型ENet，GBoost，KRR和lasso。利用上面重写的方法，我们可以轻松地添加更多的模型：

In [33]:
# Average four base models; AveragingModels clones them in fit, so the objects passed here are not modified.
averaged_models = AveragingModels(models = (ENet, GBoost,lasso,KRR))

# Per-fold cross-validated RMSE of the averaged ensemble, printed as "mean (std)".
score = rmse_cv(averaged_models)
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

 Averaged base models score: 0.1092 (0.0080)



In [35]:
# Fit the averaged ensemble on the full training set and report its training RMSE.
averaged_models.fit(train.values, y_train)
aver_train_pred = averaged_models.predict(train.values)
# expm1 maps the test predictions back from the (presumably log1p-transformed) SalePrice scale — TODO confirm against the feature-engineering step.
aver_pred = np.expm1(averaged_models.predict(test.values))
print(rmse(y_train, aver_train_pred))

0.0819943122008


使用目前比较火的Xgboost和LightGBM

In [36]:
# XGBoost regressor with fixed hyper-parameters.
# NOTE(review): `silent` and `nthread` look like legacy spellings that newer XGBoost
# releases replace with `verbosity` / `n_jobs` — confirm against the installed version.
model_xgb = xgb.XGBRegressor(
    colsample_bytree=0.4603,
    gamma=0.0468,
    learning_rate=0.06,
    max_depth=3,
    min_child_weight=1.7817,
    n_estimators=2200,
    reg_alpha=0.4640,
    reg_lambda=0.8571,
    subsample=0.5213,
    silent=1,
    random_state=7,
    nthread=-1,
)

In [38]:
# Per-fold cross-validated RMSE of the XGBoost model, printed as "mean (std)".
score = rmse_cv(model_xgb)
print("Xgboost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Xgboost score: 0.1172 (0.0085)



In [39]:
# XGBoost: fit on the full training set and report the training RMSE.
# Unlike the scikit-learn models, this cell passes the DataFrames themselves rather than `.values`.
model_xgb.fit(train, y_train)
xgb_train_pred = model_xgb.predict(train)
# expm1 maps the test predictions back from the (presumably log1p-transformed) SalePrice scale — TODO confirm against the feature-engineering step.
xgb_pred = np.expm1(model_xgb.predict(test))
print(rmse(y_train, xgb_train_pred))

0.079166223058


In [40]:
# LightGBM regressor with fixed hyper-parameters; keyword order is kept as originally written.
model_lgb = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=5,
    learning_rate=0.05,
    n_estimators=720,
    max_bin=55,
    bagging_fraction=0.8,
    bagging_freq=5,
    feature_fraction=0.2319,
    feature_fraction_seed=9,
    bagging_seed=9,
    min_data_in_leaf=6,
    min_sum_hessian_in_leaf=11,
)

In [41]:
# Per-fold cross-validated RMSE of the LightGBM model, printed as "mean (std)".
score = rmse_cv(model_lgb)
print("LGBM score: {:.4f} ({:.4f})\n" .format(score.mean(), score.std()))

LGBM score: 0.1176 (0.0084)



In [42]:
# LightGBM: fit on the full training set and report the training RMSE.
# The model is fitted on the `train` DataFrame, so the test set is also passed as a
# DataFrame (previously `test.values`) to keep the input type the same for fit and predict.
model_lgb.fit(train, y_train)
lgb_train_pred = model_lgb.predict(train)
# expm1 maps the test predictions back from the (presumably log1p-transformed) SalePrice scale — TODO confirm against the feature-engineering step.
lgb_pred = np.expm1(model_lgb.predict(test))
print(rmse(y_train, lgb_train_pred))

0.0730660720136


将平均模型（Averaged models）、XGBoost和LightGBM三者的预测结果按0.65、0.10、0.25的权重加权融合，得到最终的Ensemble prediction：

In [43]:
# RMSE on the entire training data for the weighted blend of the three predictors.
print('RMSLE score on train data:')
print(rmse(
    y_train,
    0.65 * aver_train_pred + 0.1 * xgb_train_pred + 0.25 * lgb_train_pred,
))

RMSLE score on train data:
0.0772755604335


In [44]:
# Weighted blend of the test predictions: averaged model, XGBoost, LightGBM.
ensemble = 0.65 * aver_pred + 0.1 * xgb_pred + 0.25 * lgb_pred

得到待提交的CSV文件：

In [45]:
# Pair each test Id with its blended prediction and write the file without the DataFrame index.
submission = pd.DataFrame({'Id': test_df['Id'], 'SalePrice': ensemble})
submission.to_csv("./submission.csv", index=False)