# Content

__1. Exploratory Visualization__  
__2. Data Cleaning__  
__3. Feature Engineering__  
__4. Modeling & Evaluation__  
__5. Ensemble Methods__  

HIGHLIGHT: ... 的部分是需要补全的部分, 请根据代码和注释对相应的部分进行补全 

In [None]:

# 加载数据处理分析，可视化等实验过程中可能用到的库
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings


## 忽视掉 一些 warning 提醒
warnings.filterwarnings('ignore')
## 设置可视化的风格 
plt.style.use('ggplot')

In [None]:
## 加载机器学习相关的库
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline, make_pipeline
from scipy.stats import skew
from sklearn.decomposition import PCA, KernelPCA
try:
    from sklearn.preprocessing import Imputer
except ImportError:
    # Imputer was removed from sklearn.preprocessing in scikit-learn 0.22;
    # SimpleImputer (default strategy "mean") is its replacement.
    from sklearn.impute import SimpleImputer as Imputer

In [None]:
## 加载机器学习相关的库
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.linear_model import ElasticNet, SGDRegressor, BayesianRidge
from sklearn.kernel_ridge import KernelRidge
from xgboost import XGBRegressor

In [None]:
## 设置 panda 展示数据时的相关参数
pd.set_option('max_colwidth',200)
pd.set_option('display.width',200)
pd.set_option('display.max_columns',500)
pd.set_option('display.max_rows',1000)

In [None]:
# 填写数据的路径并读取
train_path = ...
test_path = ...
train=pd.read_csv(...)
test=pd.read_csv(...)

In [None]:
# 简单看下数据
train

# Exploratory Visualization

In [None]:
# train的属性有 YearBuilt, SalePrice，可以用 train.YearBuilt 和 train.SalePrice 来获取
# 新住宅是否比老住宅均价更高？

plt.figure(figsize=(15,8))
sns.boxplot(..., ...)  # 可以按照年份画出价格的箱线图

In [None]:
sns.regplot(..., ...) # 可以按照年份画出价格的线性图

+ __好像年份也是和价格有关系的，后续可以把年份也作为一项特征.__

# Data Cleaning

In [None]:
# HIGHLIGHT: 在数据清洗阶段注重理解背后的逻辑，我们会在删除和填充等做法各举一个例子，然后引导大家分析其他的特征。

### Deleting Data

In [None]:
# 查看训练集数据的其他属性比如 GrLivArea 和 SalePrice 的关系
plt.figure(figsize=(12,6))
plt.scatter(x=..., y=...) # 可以按照GrLivArea的大小来画出价格的散点图
plt.xlabel("GrLivArea", fontsize=13)
plt.ylabel("SalePrice", fontsize=13)
plt.ylim(0,800000)

+ __As the scatter plot above shows, the two points at the bottom right with extremely large GrLivArea are likely to be outliers, so we delete them.__

In [None]:
# GrLivArea字段过大的值可能是离群点, 可以通过删除训练样本的方式处理

In [None]:
## 丢掉 GrLivArea 大于 4000 且 SalePrice 小于 300000 的点
train.drop(train[(train["..."]>...)&(train["..."]<...)].index,inplace=True) 

In [None]:
# 初步了解数据后，我们把训练数据和测试数据拼在一起方便后续处理
# 注意 训练集在前
full=pd.concat([..., ...], ignore_index=True)

In [None]:
## 去掉 id 这一列的属性 查看数据的形状
full.drop(['...'],axis=1, inplace=True)
full.shape

### Missing Data

In [None]:
## 查看数据是否有缺失
aa = full.isnull().sum()
aa[aa>0].sort_values(ascending=False)  

# 我们将对缺失的数据进行补充

In [None]:
# 对于部分字段的缺失值, 可以通过填充中位数处理, 例如 “LotFrontage” 
# Let's first impute the missing values of LotFrontage using the median LotFrontage of houses grouped by LotArea and Neighborhood.

In [None]:
# 我们对整个数据根据 Neighborhood 进行分组, 并计算 LotFrontage 的均值 (mean), 中值 (median) 和数目 (count)
# 并查看结果
 
full.groupby(['Neighborhood'])[['...']].agg(['...','...','count']) 

In [None]:
 # 由于LotArea是一个连续的特性, 我们使用qcut函数将其划分为10个部分

full["LotAreaCut"] = pd.qcut(full.LotArea, q=...)

In [None]:
# 我们根据 LotAreaCut 对数据进行聚类，计算 LotFrontage 均值, 中值, 和 数目
# 并查看结果
full.groupby(['...'])[['LotFrontage']].agg(['mean','median','...']) 

In [None]:
# 根据 LotAreaCut 和 Neighborhood 对数据进行聚类, 对 LotFrontage 空的值填充 LotFrontage 的中值 

full['LotFrontage']=full.groupby(['...','...'])['LotFrontage'].transform(lambda x: x.fillna(x.median()))

In [None]:
# 由于LotArea和Neighborhood的某些组合不可用, 因此我们仅使用LotAreaCut来进行聚类, 对LotFrontage填充 LotFrontage 的 中值

full['LotFrontage']=full.groupby(['...'])['LotFrontage'].transform(lambda x: x.fillna(x.median()))

+ __Then we fill in the other missing values according to data_description.__

In [None]:
# According to data_description, a missing value in these columns has a meaning:
# e.g. a missing PoolQC means the house has no pool. Such gaps are therefore
# filled with the string None (fill in the blank below).
cols1 = ["PoolQC" , "MiscFeature", "Alley", "Fence", "FireplaceQu", "GarageQual", "GarageCond", "GarageFinish", "GarageYrBlt", "GarageType", "BsmtExposure", "BsmtCond", "BsmtQual", "BsmtFinType2", "BsmtFinType1", "MasVnrType"]
for col in cols1:
    # Assign the result back instead of calling fillna(inplace=True) on the
    # column selection: the in-place form works on an intermediate object and
    # does not update `full` under pandas copy-on-write.
    full[col] = full[col].fillna("...")

In [None]:
### Most of these columns record an area (e.g. TotalBsmtSF is the basement area);
### a house without a basement can simply get 0 (fill in the blank below).
cols=["MasVnrArea", "BsmtUnfSF", "TotalBsmtSF", "GarageCars", "BsmtFinSF2", "BsmtFinSF1", "GarageArea"]
for col in cols:
    # Assign back rather than fillna(inplace=True) on a column selection, which
    # does not reliably modify `full` under pandas copy-on-write.
    full[col] = full[col].fillna(...)

# fill in with mode
cols2 = ["MSZoning", "BsmtFullBath", "BsmtHalfBath", "Utilities", "Functional", "Electrical", "KitchenQual", "SaleType","Exterior1st", "Exterior2nd"]
for col in cols2:
    # mode()[0] is the most frequent value of the column.
    full[col] = full[col].fillna(full[col].mode()[0])

+ __Now there is no missing data left except for the value we want to predict!__

In [None]:
## 检查是否还有其他需要填充的数据呢？

full.isnull().sum()[full.isnull().sum()>0]

# Feature Engineering

In [None]:
# 部分类别特征, 可以通过映射成数值特征来完成转换, 如下所示
# 换句话说，我们可以通过设置一个映射函数, 把部分类别特征转化为数值特征, 这样可以很方便后续我们进行预测

In [None]:
# These columns are treated as categorical features (usually recorded as
# strings), so they are all cast to the string type in one step.
NumStr = [
    "MSSubClass", "BsmtFullBath", "BsmtHalfBath", "HalfBath",
    "BedroomAbvGr", "KitchenAbvGr", "MoSold", "YrSold",
    "YearBuilt", "YearRemodAdd", "LowQualFinSF", "GarageYrBlt",
]
full[NumStr] = full[NumStr].astype(str)

In [None]:
# 问题就转化为了如何构造这个映射函数呢？ 
# 我们可以根据一个特性按SalePrice分组, 并根据平均值和中值对其进行排序
# 以 MSSubClass 为例, 我们根据 MSSubClass 进行聚类，计算 SalePrice 的统计均值, 中位数等信息
full.groupby(['...'])[['SalePrice']].agg(['mean','median','count'])

+ __根据上面的计算结果，我们设计了如下的映射函数__  

                '180' : 1
                '30' : 2   '45' : 2
                '190' : 3, '50' : 3, '90' : 3,
                '85' : 4, '40' : 4, '160' : 4
                '70' : 5, '20' : 5, '75' : 5, '80' : 5, '150' : 5
                '120': 6, '60' : 6

+ __你也可以根据你自己的直觉和偏好来设计出你自己的映射函数，但是必须符合逻辑，否则模型很难从这一特征中学到有价值的信息__  

In [None]:
# 得到映射函数, 我们用代码实现这个构造函数
full["oMSSubClass"] = full.MSSubClass.map({'180':1, 
                                        '30':2, '45':2, 
                                        '190':3, '50':3, '90':3, 
                                        '85':4, '40':4, '160':4, 
                                        '70':5, '20':5, '75':5, '80':5, '150':5,
                                        '120': 6, '60':6})

+ __我们可以参考上述过程继续构造下面的映射函数，我还在特性前面添加了一个小“o”，以便保留原始特性，以便稍后使用 get_dummies 编码函数.__

In [None]:
### 鼓励大家尝试参考上述方案构造映射函数，填空的部分较为简单
### 我也把参考的映射函数放在代码的最后

def map_values():
    """Add ordinal-encoded copies of categorical columns to the global ``full``.

    Each source column (e.g. ``MSSubClass``) is mapped through a dict of
    category -> integer and stored under the same name prefixed with "o"
    (e.g. ``oMSSubClass``), so the original column is kept.

    This is the exercise version: every ``.map(...)`` call is a blank that has
    to be replaced by a mapping dict before the function can run.

    Returns the string "Done!".
    """
    full["oMSSubClass"] = full.MSSubClass.map({'180':1, 
                                        '30':2, '45':2, 
                                        '190':3, '50':3, '90':3, 
                                        '85':4, '40':4, '160':4, 
                                        '70':5, '20':5, '75':5, '80':5, '150':5,
                                        '120': 6, '60':6})
    
    full["oMSZoning"] = full.MSZoning.map(...)
    
    full["oNeighborhood"] = full.Neighborhood.map({'MeadowV':1,
                                               'IDOTRR':2, 'BrDale':2,
                                               'OldTown':3, 'Edwards':3, 'BrkSide':3,
                                               'Sawyer':4, 'Blueste':4, 'SWISU':4, 'NAmes':4,
                                               'NPkVill':5, 'Mitchel':5,
                                               'SawyerW':6, 'Gilbert':6, 'NWAmes':6,
                                               'Blmngtn':7, 'CollgCr':7, 'ClearCr':7, 'Crawfor':7,
                                               'Veenker':8, 'Somerst':8, 'Timber':8,
                                               'StoneBr':9,
                                               'NoRidge':10, 'NridgHt':10})
    
    full["oCondition1"] = full.Condition1.map(...)
    
    full["oBldgType"] = full.BldgType.map(...)
    
    full["oHouseStyle"] = full.HouseStyle.map(...)
    
    full["oExterior1st"] = full.Exterior1st.map({'BrkComm':1,
                                             'AsphShn':2, 'CBlock':2, 'AsbShng':2,
                                             'WdShing':3, 'Wd Sdng':3, 'MetalSd':3, 'Stucco':3, 'HdBoard':3,
                                             'BrkFace':4, 'Plywood':4,
                                             'VinylSd':5,
                                             'CemntBd':6,
                                             'Stone':7, 'ImStucc':7})
    
    full["oMasVnrType"] = full.MasVnrType.map(...)
    
    full["oExterQual"] = full.ExterQual.map(...)
    
    full["oFoundation"] = full.Foundation.map({'Slab':1, 
                                           'BrkTil':2, 'CBlock':2, 'Stone':2,
                                           'Wood':3, 'PConc':4})
    
    full["oBsmtQual"] = full.BsmtQual.map({'Fa':2, 'None':1, 'TA':3, 'Gd':4, 'Ex':5})
    
    full["oBsmtExposure"] = full.BsmtExposure.map(...)
    
    full["oHeating"] = full.Heating.map({'Floor':1, 'Grav':1, 'Wall':2, 'OthW':3, 'GasW':4, 'GasA':5})
    
    full["oHeatingQC"] = full.HeatingQC.map({'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5})
    
    full["oKitchenQual"] = full.KitchenQual.map(...)
    
    full["oFunctional"] = full.Functional.map({'Maj2':1, 'Maj1':2, 'Min1':2, 'Min2':2, 'Mod':2, 'Sev':2, 'Typ':3})
    
    full["oFireplaceQu"] = full.FireplaceQu.map(...)
    
    full["oGarageType"] = full.GarageType.map({'CarPort':1, 'None':1,
                                           'Detchd':2,
                                           '2Types':3, 'Basment':3,
                                           'Attchd':4, 'BuiltIn':5})
    
    full["oGarageFinish"] = full.GarageFinish.map(...)
    
    full["oPavedDrive"] = full.PavedDrive.map(...)
    
    full["oSaleType"] = full.SaleType.map({'COD':1, 'ConLD':1, 'ConLI':1, 'ConLw':1, 'Oth':1, 'WD':1,
                                       'CWD':2, 'Con':3, 'New':3})
    
    full["oSaleCondition"] = full.SaleCondition.map({'AdjLand':1, 'Abnorml':2, 'Alloca':2, 'Family':2, 'Normal':3, 'Partial':4})            
                
                        
    return "Done!"

In [None]:
map_values()

In [None]:
# Remove the two unwanted columns, LotAreaCut and SalePrice, in a single call.
full.drop(["LotAreaCut", "SalePrice"], axis=1, inplace=True)

## Pipeline

+ __接下来我们可以建立一个管道。 拥有管道后，可以很方便地试验不同的功能组合.__

In [None]:
# As observed at the start, year-related variables are related to the price,
# so the three year columns are label-encoded.
class labelenc(BaseEstimator, TransformerMixin):
    """Transformer that label-encodes the three year columns of a DataFrame."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned here; return self."""
        return self

    def transform(self, X):
        """Replace YearBuilt, YearRemodAdd and GarageYrBlt by integer codes.

        The encoder is fitted on the data being transformed. ``X`` is modified
        in place and also returned.
        """
        for year_column in ("YearBuilt", "YearRemodAdd", "GarageYrBlt"):
            X[year_column] = LabelEncoder().fit_transform(X[year_column])
        return X

In [None]:
# Measure the skewness of each feature, apply log1p to the features that meet
# the skewness threshold, then one-hot encode with get_dummies.
class skew_dummies(BaseEstimator, TransformerMixin):
    """Log-transform skewed numeric columns, then one-hot encode the frame.

    Parameters
    ----------
    skew : float, default 0.5
        Columns whose absolute skewness is at least this value get log1p.
    """

    def __init__(self, skew=0.5):
        self.skew = skew

    def fit(self, X, y=None):
        """Nothing is learned here; return self."""
        return self

    def transform(self, X):
        """Return ``pd.get_dummies`` of ``X`` after log1p on its skewed columns.

        The log1p step writes into ``X`` itself; the returned frame is the new
        object produced by ``get_dummies``.
        """
        # scipy.stats.skew is about 0 for normally distributed data; a value
        # > 0 means more weight in the right tail of the distribution, e.g.
        #   skew([1, 2, 3, 4, 5])             -> 0.0
        #   skew([2, 8, 0, 4, 1, 9, 9, 0])    -> 0.2650554122698573
        numeric_part = X.select_dtypes(exclude=["object"])
        column_skew = numeric_part.apply(lambda column: skew(column))
        is_skewed = column_skew.abs() >= self.skew
        skewed_columns = column_skew[is_skewed].index
        X[skewed_columns] = np.log1p(X[skewed_columns])
        return pd.get_dummies(X)

In [None]:
# build pipeline
pipe = Pipeline([
    ('labenc', labelenc()),
    ('skew_dummies', skew_dummies(skew=1)),
    ])

In [None]:
full.shape

In [None]:
# save the original data for later use
full2 = full.copy()
data_pipe = pipe.fit_transform(full2)
data_pipe.shape

In [None]:
### 你能理清楚为什么 full.shape 是怎样一步一步变化到 data_pipe的shape的吗？
# 可以画一个流程图，帮助自己理解整个过程
# (i,j)-> action -> (m,n) -> action ->...... 

In [None]:
data_pipe.head() # 展示头几个数据

+ __因为咱们的数据里面还有其他异常值，所以我们需要某些特定的函数来进行缩放比如：robustscaler .__

In [None]:
scaler = RobustScaler()  # 对数据进行缩放

In [None]:


# 首先对训练数据进行缩放，根据对训练数据拟合的结果然后再对测试数据进行缩放

n_train=train.shape[...] #获得有多少条训练数据

X = data_pipe[:n_train] # 获得训练数据
test_X = data_pipe[...:] # 获得测试数据
y= train.SalePrice

X_scaled = scaler.fit(X).transform(X)
y_log = np.log(train.SalePrice) # 对于 y 进行人为定义log 进行缩放 
test_X_scaled = scaler.transform(...) # 对测试数据进行缩放

## Feature Selection

+ __上面的特征工程还不够，所以我们需要更多.__   
+ __组合不同的特征通常是一个好方法，但我们不知道应该选择什么特征。 幸运的是，有些模型可以提供特征选择，这里我使用 Lasso，但你可以自由选择 Ridge、RandomForest 或 GradientBoostingTree.__

In [None]:
# 一般可以先构造线性模型，分析权重来判断特征的重要性

lasso=Lasso(alpha=0.001)
lasso.fit(..., y_log)  # 用lasso 拟合 缩放后的训练数据 和 对应的 y 值 

In [None]:
FI_lasso = pd.DataFrame({"Feature Importance":lasso.coef_}, index=data_pipe.columns)

In [None]:
FI_lasso.sort_values("Feature Importance",ascending=False)  # 利用Lasso计算出一个特征重要性的分数

In [None]:
FI_lasso[FI_lasso["Feature Importance"]!=0].sort_values("Feature Importance").plot(kind="barh",figsize=(15,25))
plt.xticks(rotation=90)
plt.show()  ### 更直观的可视化一下

+ __基于上述的可视化，我们还可以向管道中添加一些特征.__

In [None]:

# Based on the feature importances, new features can be built by adding,
# subtracting or multiplying existing ones.
# A reference set of features is given at the end of the code.

class add_feature(BaseEstimator, TransformerMixin):
    """Transformer that appends combined area/room features to a DataFrame.

    Parameters
    ----------
    additional : int, default 1
        1 adds only ``TotalHouse`` and ``TotalArea``; any other value also adds
        the extra features (``Bsmt``, ``Rooms``, ``PorchArea``, ``TotalPlace``).
    """

    def __init__(self, additional=1):
        self.additional = additional

    def fit(self, X, y=None):
        """Nothing is learned here; return self."""
        return self

    def transform(self, X):
        """Add the new columns to ``X`` in place and return ``X``."""
        # These two totals are wanted in both modes.
        X["TotalHouse"] = X["TotalBsmtSF"] + X["1stFlrSF"] + X["2ndFlrSF"]
        X["TotalArea"] = X["TotalBsmtSF"] + X["1stFlrSF"] + X["2ndFlrSF"] + X["GarageArea"]

        if self.additional != 1:
            X["Bsmt"] = X["BsmtFinSF1"] + X["BsmtFinSF2"] + X["BsmtUnfSF"]
            X["Rooms"] = X["FullBath"]+X["TotRmsAbvGrd"]
            X["PorchArea"] = X["OpenPorchSF"]+X["EnclosedPorch"]+X["3SsnPorch"]+X["ScreenPorch"]
            X["TotalPlace"] = X["TotalBsmtSF"] + X["1stFlrSF"] + X["2ndFlrSF"] + X["GarageArea"] + X["OpenPorchSF"]+X["EnclosedPorch"]+X["3SsnPorch"]+X["ScreenPorch"]

            ## here you add your new feature

        # Return for every mode: previously the return sat inside the else
        # branch, so additional=1 yielded None and broke the next pipeline step.
        return X

+ __By using a pipeline, you can quickly experiment with different feature combinations.__

In [None]:
## 再一次完善我们的 pipeline
pipe = Pipeline([
    ('labenc', labelenc()),
    ('add_feature', add_feature(additional=2)),
    ('skew_dummies', skew_dummies(skew=1)),
    ])

In [None]:
## 思考： 我们设计的这些特征（原始的和新增加的）是否存在着冗余呢？ 这些冗余（特征高度相关，导致了多重共线性）可能不利于模型更好的拟合 

## PCA

In [None]:
## PCA 可以解除这些特征的关联！

+ __在PCA中使用与原始数据中大致相同的维度.__

In [None]:
full_pipe = pipe.fit_transform(full)

In [None]:
full.shape, data_pipe.shape, full_pipe.shape

In [None]:
## 想想为什么特征维度也不一样，是什么导致的呢？和data_pipe的特征维度之间又有哪些不同呢？

In [None]:
## 类似的，对数据进行缩放
n_train=train.shape[0]
X = full_pipe[:...]
test_X = full_pipe[n_train:]
y= train.SalePrice

X_scaled = scaler.fit(X).transform(X)
y_log = np.log(train.SalePrice)
test_X_scaled = scaler.transform(...)

In [None]:
pca = PCA(n_components=410)  # pca 降维，去掉特征间的冗余

In [None]:
X_scaled=pca.fit_transform(...)
test_X_scaled = pca.transform(...)

In [None]:
X_scaled.shape, test_X_scaled.shape

# Modeling & Evaluation

In [None]:
# define the cross-validation strategy
def rmse_cv(model,X,y):
    """Return the array of per-fold RMSE values of ``model`` on ``(X, y)``.

    ``cross_val_score`` is run with ``neg_mean_squared_error``; the sign is
    flipped and the square root taken to obtain RMSE. The ``cv=...`` argument
    is an exercise blank to be filled in (the notebook asks for 5 folds).
    """
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=...))  # 5-fold cross-validation
    return rmse

## 五折交叉验证： 把数据平均分成5等份，每次实验拿一份做测试，其余用做训练。

+ __我们选择了 13 个模型并使用 5 折交叉验证来评估这些模型.__

Models include:

+ LinearRegression (LinearRegression)
+ Ridge (Ridge)
+ Lasso (Lasso)
+ Random Forest (RandomForestRegressor)
+ Gradient Boosting Tree (GradientBoostingRegressor)
+ Support Vector Regression (SVR)
+ Linear Support Vector Regression (LinearSVR)
+ ElasticNet (ElasticNet)
+ Stochastic Gradient Descent (SGDRegressor)
+ BayesianRidge (BayesianRidge)
+ KernelRidge (KernelRidge)
+ ExtraTreesRegressor (ExtraTreesRegressor)
+ XgBoost (XGBRegressor)

In [None]:
## 对这些模型进行初始化
models = [LinearRegression(),Ridge(), ...(alpha=0.01,max_iter=10000), ...(), ...(), ...(), ...(),
          ...(alpha=0.001,max_iter=10000), ...(max_iter=1000,tol=1e-3), ...(), ...(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5),
          ...(), ...()] 

In [None]:
names = ["LR", "Ridge", "Lasso", "RF", "GBR", "SVR", "LinSVR", "Ela","SGD","Bay","Ker","Extra","Xgb"]
for name, model in zip(names, models):
    score = rmse_cv(model, X_scaled, y_log)
    print("{}: {:.6f}, {:.4f}".format(name,score.mean(),score.std()))

+ __接下来我们进行一些超参数调整。 首先定义一个grid search方法.__

In [None]:
## 网格搜索 可以简单理解为：对超参数空间进行等分或者不等分，遍历所有可能的值

In [None]:
class grid:
    """Small helper that grid-searches one model and prints RMSE results."""

    def __init__(self, model):
        self.model = model

    def grid_get(self, X, y, param_grid):
        """Run a 5-fold GridSearchCV over ``param_grid`` and print the outcome.

        Prints the best parameters with their RMSE, followed by a table of
        every parameter combination. Nothing is returned.
        """
        search = GridSearchCV(self.model, param_grid, cv=5, scoring="neg_mean_squared_error")
        search.fit(X, y)
        print(search.best_params_, np.sqrt(-search.best_score_))

        # Convert the mean score from negative MSE to RMSE before tabulating
        # (std_test_score is left as reported by the search).
        results = search.cv_results_
        results['mean_test_score'] = np.sqrt(-results['mean_test_score'])
        report = pd.DataFrame(results)
        print(report[['params', 'mean_test_score', 'std_test_score']])

### Lasso

In [None]:
grid(Lasso()).grid_get(X_scaled, ..., {'alpha': [0.0004,0.0005,0.0007,0.0006,0.0009,0.0008],'max_iter':[10000]})

## [0.0004,0.0005,0.0007,0.0006,0.0009,0.0008] 就是需要搜索的超参数，下面同理

### Ridge

In [None]:
...(...()).grid_get(..., y_log,{'alpha':[35,40,45,50,55,60,65,70,80,90]})

### SVR

In [None]:
grid(SVR()).grid_get(..., ..., {'C':[11,12,13,14,15],'kernel':["rbf"],"gamma":[0.0003,0.0004],"epsilon":[0.008,0.009]})

### Kernel Ridge

In [None]:
param_grid={'alpha':[0.2,0.3,0.4,0.5], 'kernel':["polynomial"], 'degree':[3],'coef0':[0.8,1,1.2]}
grid(KernelRidge()).grid_get(X_scaled, y_log, ...)

### ElasticNet

In [None]:
grid(...()).grid_get(X_scaled,y_log,{'alpha':[0.0005,0.0008,0.004,0.005],'l1_ratio':[0.08,0.1,0.3,0.5,0.7],'max_iter':[10000]})

# Ensemble Methods 

In [None]:
## 对模型进行集成。
## 我们将多个表现较好的模型进行集成，这样可以进一步的提升性能。

### Weight Average

+ __根据权重对基础模型进行平均.__

In [None]:
class AverageWeight(BaseEstimator, RegressorMixin):
    """Regressor that predicts a weighted sum of several base models.

    Parameters
    ----------
    mod : list of estimators
        Base models; each is cloned and fitted on the full training data.
    weight : sequence of float
        One weight per base model, applied to that model's predictions.
    """

    def __init__(self, mod, weight):
        self.mod = mod
        self.weight = weight

    def fit(self, X, y):
        """Fit a fresh clone of every base model on ``(X, y)``.

        Raises
        ------
        ValueError
            If the number of weights differs from the number of models
            (previously the extra models or weights were silently ignored).
        """
        if len(self.mod) != len(self.weight):
            raise ValueError(
                "AverageWeight needs one weight per model: got "
                "{} models and {} weights".format(len(self.mod), len(self.weight))
            )
        self.models_ = [clone(x) for x in self.mod]
        for model in self.models_:
            model.fit(X, y)
        return self

    def predict(self, X):
        """Return the weighted sum of the base models' predictions as an array."""
        # Rows are models, columns are samples.
        pred = np.array([model.predict(X) for model in self.models_])
        weights = np.asarray(self.weight, dtype=float)
        # Contract the model axis: sum_i weight[i] * pred[i] for every sample,
        # replacing the per-sample Python loop.
        return np.tensordot(weights, pred, axes=1)

In [None]:
lasso = Lasso(alpha=0.0005,max_iter=10000)
ridge = Ridge(alpha=60)
svr = SVR(gamma= 0.0004,kernel='rbf',C=13,epsilon=0.009)
ker = KernelRidge(alpha=0.2 ,kernel='polynomial',degree=3 , coef0=0.8)
ela = ElasticNet(alpha=0.005,l1_ratio=0.08,max_iter=10000)
bay = BayesianRidge()

In [None]:
# assign weights based on their gridsearch score
# 根据他们的网格搜索 给出的测试分数 分配权重
# 我也把参考权重放在代码的最后，鼓励大家自己去探索，知其然，知其所以然
w1 = ...
w2 = ...
w3 = ...
w4 = ...
w5 = ...
w6 = ...

In [None]:
weight_avg = AverageWeight(mod = [..., ..., ..., ..., ..., ...],weight=[w1,w2,w3,w4,w5,w6])

In [None]:
rmse_cv(weight_avg,X_scaled,y_log),  rmse_cv(weight_avg,X_scaled,y_log).mean()

+ __但是如果我们只对两个最好的模型进行平均，我们会获得更好的交叉验证分数.__

In [None]:
weight_avg = AverageWeight(mod = [..., ...],weight=[0.5,0.5])

In [None]:
rmse_cv(weight_avg,X_scaled,y_log),  rmse_cv(weight_avg,X_scaled,y_log).mean()

In [None]:
## 可见在对模型的预测结果进行加权平均的时候，选择什么模型作为我们的基础模型很重要！

## Stacking

+ __除了正常的堆叠外，还添加了“get_oof”方法，可将堆叠生成的特征与原始特征结合起来.__

In [None]:
class stacking(BaseEstimator, RegressorMixin, TransformerMixin):
    """Stacked regressor: base-model out-of-fold predictions feed a meta-model.

    Parameters
    ----------
    mod : list of estimators
        Base models. A clone of each is fitted on every training fold.
    meta_model : estimator
        Fitted on the out-of-fold predictions of the base models. Note that
        this object itself is fitted (it is not cloned).

    ``X`` is indexed positionally by rows (``X[train_index]``), so it must be
    an array-like that supports that, e.g. a NumPy array.
    """

    def __init__(self, mod, meta_model):
        self.mod = mod
        self.meta_model = meta_model
        self.kf = KFold(n_splits=5, random_state=42, shuffle=True)

    def fit(self, X, y):
        """Fit the per-fold base models, then the meta-model; return self."""
        # Folds are positional indices; a pandas Series would be indexed by
        # label instead, so work on a plain array.
        y = np.asarray(y)
        self.saved_model = [list() for i in self.mod]
        oof_train = np.zeros((X.shape[0], len(self.mod)))

        for i, model in enumerate(self.mod):
            for train_index, val_index in self.kf.split(X, y):
                renew_model = clone(model)
                renew_model.fit(X[train_index], y[train_index])
                self.saved_model[i].append(renew_model)
                # Each sample is predicted by the one model that did not see it.
                oof_train[val_index, i] = renew_model.predict(X[val_index])

        self.meta_model.fit(oof_train, y)
        return self

    def predict(self, X):
        """Predict with the meta-model on the fold-averaged base predictions."""
        # np.column_stack needs a real sequence; a bare generator is rejected
        # by recent NumPy versions, hence the list comprehensions.
        whole_test = np.column_stack([
            np.column_stack([model.predict(X) for model in single_model]).mean(axis=1)
            for single_model in self.saved_model
        ])
        return self.meta_model.predict(whole_test)

    def get_oof(self, X, y, test_X):
        """Return stacking features for the training and the test data.

        Returns
        -------
        oof : ndarray of shape (X.shape[0], len(mod))
            Out-of-fold predictions of each base model on ``X``.
        test_mean : ndarray of shape (test_X.shape[0], len(mod))
            Each base model's predictions on ``test_X``, averaged over folds.
        """
        y = np.asarray(y)
        oof = np.zeros((X.shape[0], len(self.mod)))
        # One column per fold, sized from the splitter instead of a fixed 5.
        test_single = np.zeros((test_X.shape[0], self.kf.get_n_splits()))
        test_mean = np.zeros((test_X.shape[0], len(self.mod)))
        for i, model in enumerate(self.mod):
            for j, (train_index, val_index) in enumerate(self.kf.split(X, y)):
                clone_model = clone(model)
                clone_model.fit(X[train_index], y[train_index])
                oof[val_index, i] = clone_model.predict(X[val_index])
                test_single[:, j] = clone_model.predict(test_X)
            test_mean[:, i] = test_single.mean(axis=1)
        return oof, test_mean

+ __运行这个方法有点慢，因为这个过程很复杂.__

In [None]:
# must do imputer first, otherwise stacking won't work.
# Imputer is a transformer that fills in missing values.
# NOTE(review): fit_transform also hands back plain NumPy arrays here, which
# is presumably what stacking's positional indexing needs - confirm.
a = Imputer().fit_transform(X_scaled)
# y_log is reshaped to a single column for the imputer, then flattened again.
b = Imputer().fit_transform(y_log.values.reshape(-1,1)).ravel()

In [None]:
stack_model = stacking(mod=[lasso,ridge, ..., ..., ..., ...], meta_model=ker)

In [None]:
print(rmse_cv(stack_model,a,b))
print(rmse_cv(stack_model,a,b).mean())

+ __接下来我们提取stacking生成的特征，然后与原始特征结合.__

In [None]:
X_train_stack, X_test_stack = stack_model.get_oof(a,b,test_X_scaled)  # test_X_scale 的 也需要

In [None]:
X_train_stack.shape, a.shape  # 分析 shape 变化的状态轨迹

In [None]:
X_train_add = np.hstack((..., ...)) # 和训练集的原始特征在特征维度拼接 注意 原始特征在前

In [None]:
X_test_add = np.hstack((..., ...)) # 和测试集的原始特征在特征维度拼接

In [None]:
X_train_add.shape, X_test_add.shape

In [None]:
print(rmse_cv(stack_model,X_train_add,b))
print(rmse_cv(stack_model,X_train_add,b).mean())

+ __甚至可以在获得“X_train_stack”后对您的元模型进行参数调优，或者结合原始特征进行调参, 可以改进的地方还有很多, 有兴趣的同学可以自行探索。__

In [None]:
### 参考映射函数

def map_values():
    """Add ordinal-encoded copies of categorical columns to the global ``full``.

    For every source column listed below, the category -> integer table is
    applied with ``Series.map`` and the result is stored under the source name
    prefixed with "o" (e.g. ``MSZoning`` -> ``oMSZoning``). The original
    columns are left untouched.

    Returns the string "Done!".
    """
    ordinal_codes = {
        "MSSubClass": {
            '180': 1,
            '30': 2, '45': 2,
            '190': 3, '50': 3, '90': 3,
            '85': 4, '40': 4, '160': 4,
            '70': 5, '20': 5, '75': 5, '80': 5, '150': 5,
            '120': 6, '60': 6,
        },
        "MSZoning": {'C (all)': 1, 'RH': 2, 'RM': 2, 'RL': 3, 'FV': 4},
        "Neighborhood": {
            'MeadowV': 1,
            'IDOTRR': 2, 'BrDale': 2,
            'OldTown': 3, 'Edwards': 3, 'BrkSide': 3,
            'Sawyer': 4, 'Blueste': 4, 'SWISU': 4, 'NAmes': 4,
            'NPkVill': 5, 'Mitchel': 5,
            'SawyerW': 6, 'Gilbert': 6, 'NWAmes': 6,
            'Blmngtn': 7, 'CollgCr': 7, 'ClearCr': 7, 'Crawfor': 7,
            'Veenker': 8, 'Somerst': 8, 'Timber': 8,
            'StoneBr': 9,
            'NoRidge': 10, 'NridgHt': 10,
        },
        "Condition1": {
            'Artery': 1,
            'Feedr': 2, 'RRAe': 2,
            'Norm': 3, 'RRAn': 3,
            'PosN': 4, 'RRNe': 4,
            'PosA': 5, 'RRNn': 5,
        },
        "BldgType": {'2fmCon': 1, 'Duplex': 1, 'Twnhs': 1, '1Fam': 2, 'TwnhsE': 2},
        "HouseStyle": {
            '1.5Unf': 1,
            '1.5Fin': 2, '2.5Unf': 2, 'SFoyer': 2,
            '1Story': 3, 'SLvl': 3,
            '2Story': 4, '2.5Fin': 4,
        },
        "Exterior1st": {
            'BrkComm': 1,
            'AsphShn': 2, 'CBlock': 2, 'AsbShng': 2,
            'WdShing': 3, 'Wd Sdng': 3, 'MetalSd': 3, 'Stucco': 3, 'HdBoard': 3,
            'BrkFace': 4, 'Plywood': 4,
            'VinylSd': 5,
            'CemntBd': 6,
            'Stone': 7, 'ImStucc': 7,
        },
        "MasVnrType": {'BrkCmn': 1, 'None': 1, 'BrkFace': 2, 'Stone': 3},
        "ExterQual": {'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
        "Foundation": {
            'Slab': 1,
            'BrkTil': 2, 'CBlock': 2, 'Stone': 2,
            'Wood': 3, 'PConc': 4,
        },
        "BsmtQual": {'Fa': 2, 'None': 1, 'TA': 3, 'Gd': 4, 'Ex': 5},
        "BsmtExposure": {'None': 1, 'No': 2, 'Av': 3, 'Mn': 3, 'Gd': 4},
        "Heating": {'Floor': 1, 'Grav': 1, 'Wall': 2, 'OthW': 3, 'GasW': 4, 'GasA': 5},
        "HeatingQC": {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
        "KitchenQual": {'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
        "Functional": {'Maj2': 1, 'Maj1': 2, 'Min1': 2, 'Min2': 2, 'Mod': 2, 'Sev': 2, 'Typ': 3},
        "FireplaceQu": {'None': 1, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
        "GarageType": {
            'CarPort': 1, 'None': 1,
            'Detchd': 2,
            '2Types': 3, 'Basment': 3,
            'Attchd': 4, 'BuiltIn': 5,
        },
        "GarageFinish": {'None': 1, 'Unf': 2, 'RFn': 3, 'Fin': 4},
        "PavedDrive": {'N': 1, 'P': 2, 'Y': 3},
        "SaleType": {
            'COD': 1, 'ConLD': 1, 'ConLI': 1, 'ConLw': 1, 'Oth': 1, 'WD': 1,
            'CWD': 2, 'Con': 3, 'New': 3,
        },
        "SaleCondition": {'AdjLand': 1, 'Abnorml': 2, 'Alloca': 2, 'Family': 2, 'Normal': 3, 'Partial': 4},
    }

    # Dict order is the order in which the new "o" columns are appended.
    for column, codes in ordinal_codes.items():
        full["o" + column] = full[column].map(codes)

    return "Done!"

In [None]:
class add_feature(BaseEstimator, TransformerMixin):
    """Transformer that appends combined and interaction features (reference).

    Parameters
    ----------
    additional : int, default 1
        1 adds only ``TotalHouse`` and ``TotalArea``; any other value also adds
        the interaction features and the extra area/room totals.
    """

    def __init__(self, additional=1):
        self.additional = additional

    def fit(self, X, y=None):
        """Nothing is learned here; return self."""
        return self

    def transform(self, X):
        """Add the new columns to ``X`` in place and return ``X``."""
        # These two totals are wanted in both modes.
        X["TotalHouse"] = X["TotalBsmtSF"] + X["1stFlrSF"] + X["2ndFlrSF"]
        X["TotalArea"] = X["TotalBsmtSF"] + X["1stFlrSF"] + X["2ndFlrSF"] + X["GarageArea"]

        if self.additional != 1:
            # Combinations named with a "+_" prefix.
            X["+_TotalHouse_OverallQual"] = X["TotalHouse"] * X["OverallQual"]
            X["+_GrLivArea_OverallQual"] = X["GrLivArea"] * X["OverallQual"]
            X["+_oMSZoning_TotalHouse"] = X["oMSZoning"] * X["TotalHouse"]
            X["+_oMSZoning_OverallQual"] = X["oMSZoning"] + X["OverallQual"]
            X["+_oMSZoning_YearBuilt"] = X["oMSZoning"] + X["YearBuilt"]
            X["+_oNeighborhood_TotalHouse"] = X["oNeighborhood"] * X["TotalHouse"]
            X["+_oNeighborhood_OverallQual"] = X["oNeighborhood"] + X["OverallQual"]
            X["+_oNeighborhood_YearBuilt"] = X["oNeighborhood"] + X["YearBuilt"]
            X["+_BsmtFinSF1_OverallQual"] = X["BsmtFinSF1"] * X["OverallQual"]

            # Combinations named with a "-_" prefix.
            X["-_oFunctional_TotalHouse"] = X["oFunctional"] * X["TotalHouse"]
            X["-_oFunctional_OverallQual"] = X["oFunctional"] + X["OverallQual"]
            X["-_LotArea_OverallQual"] = X["LotArea"] * X["OverallQual"]
            X["-_TotalHouse_LotArea"] = X["TotalHouse"] + X["LotArea"]
            X["-_oCondition1_TotalHouse"] = X["oCondition1"] * X["TotalHouse"]
            X["-_oCondition1_OverallQual"] = X["oCondition1"] + X["OverallQual"]

            X["Bsmt"] = X["BsmtFinSF1"] + X["BsmtFinSF2"] + X["BsmtUnfSF"]
            X["Rooms"] = X["FullBath"]+X["TotRmsAbvGrd"]
            X["PorchArea"] = X["OpenPorchSF"]+X["EnclosedPorch"]+X["3SsnPorch"]+X["ScreenPorch"]
            X["TotalPlace"] = X["TotalBsmtSF"] + X["1stFlrSF"] + X["2ndFlrSF"] + X["GarageArea"] + X["OpenPorchSF"]+X["EnclosedPorch"]+X["3SsnPorch"]+X["ScreenPorch"]

        # Return for every mode: previously the return sat inside the else
        # branch, so additional=1 yielded None and broke the next pipeline step.
        return X

In [None]:
w1 = 0.02
w2 = 0.2
w3 = 0.25
w4 = 0.3
w5 = 0.03
w6 = 0.2