In [None]:
import numpy as np  
import pandas as pd  
from datetime import datetime
from sklearn.linear_model import ElasticNet, Lasso, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from mlxtend.regressor import StackingCVRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# 读取预处理好的数据，设置训练集、测试集

In [None]:
# Load the tables written by the preprocessing step.
# `numerical_df` is not referenced again in the cells visible here; it is kept
# in case other cells rely on it.
numerical_df = pd.read_csv("preprocessed/numerical.csv")
cate_with_num_df = pd.read_csv("preprocessed/df_final.csv")

# Work on a copy so the frame as read from disk stays available unchanged.
df_final = cate_with_num_df.copy()

In [None]:
# Columns that are not model inputs: the target, the split flag and the row id.
_non_feature_cols = ['SalePrice', 'train', 'Id']

# The 'train' flag column selects the split: 1 -> training rows, 0 -> test rows.
Train = df_final.loc[df_final['train'] == 1].copy()
Test = df_final.loc[df_final['train'] == 0].copy()

# Target vector and feature matrices as plain NumPy arrays.
Y_train = Train['SalePrice'].to_numpy()
X_train = Train.drop(columns=_non_feature_cols).to_numpy()
X_test = Test.drop(columns=_non_feature_cols).to_numpy()

print(X_test.shape)

# 初始化模型

In [None]:
# Seed for the components that would otherwise draw fresh randomness on every
# run (fold shuffling, GBR feature subsampling, the stacker's internal CV).
# Without it each model is cross-validated on different folds, so the scores
# used for weighting are not comparable and a re-run gives different results.
SEED = 42

# Shuffled 10-fold splitter; passed as `cv` to RidgeCV below and to
# cross_val_score in cv_rmse.
kfolds = KFold(n_splits=10, shuffle=True, random_state=SEED)

# Linear models. Ridge picks its regularisation strength by cross-validation
# on robust-scaled features.
ridge = make_pipeline(RobustScaler(), RidgeCV(cv=kfolds))

lasso = Lasso(alpha=0.5)

elastic = ElasticNet(alpha=0.5, l1_ratio=0.5)

# Support vector regression on robust-scaled features.
svr = make_pipeline(RobustScaler(), SVR(C=20, epsilon=0.008, gamma=0.0003))

# Gradient boosting with Huber loss. max_features='sqrt' samples features at
# each split, hence the explicit random_state.
gbr = GradientBoostingRegressor(
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
    random_state=SEED,
)

# LightGBM: bagging and feature sampling already carry their own seeds.
lightgbm = LGBMRegressor(
    objective='regression',
    num_leaves=4,
    learning_rate=0.01,
    n_estimators=5000,
    max_bin=200,
    bagging_fraction=0.75,
    bagging_freq=5,
    bagging_seed=7,
    feature_fraction=0.2,
    feature_fraction_seed=7,
    verbose=-1,
)

xgboost = XGBRegressor(
    learning_rate=0.01,
    n_estimators=3460,
    max_depth=4,
    min_child_weight=0,
    gamma=0,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='reg:squarederror',
    nthread=-1,
    scale_pos_weight=1,
    reg_alpha=0.00006,
)

# Stacked ensemble: five base regressors, with xgboost also configured as the
# meta-regressor. use_features_in_secondary=True feeds the original features to
# the meta-regressor alongside the base predictions.
# random_state fixes the stacker's own shuffled CV splits.
stack_gen = StackingCVRegressor(
    regressors=(ridge, lasso, elastic, xgboost, lightgbm),
    meta_regressor=xgboost,
    use_features_in_secondary=True,
    random_state=SEED,
)

# 对模型进行评估

In [None]:
def cv_rmse(model):
    """Cross-validate ``model`` on the training data and return per-fold RMSE.

    Uses the notebook-level ``X_train``, ``Y_train`` and ``kfolds`` objects.

    Args:
        model: Estimator accepted by ``cross_val_score``.

    Returns:
        Array with one RMSE value per fold of ``kfolds``.
    """
    neg_mse_per_fold = cross_val_score(
        model,
        X_train,
        Y_train,
        scoring="neg_mean_squared_error",
        cv=kfolds,
    )
    # The scorer yields negated MSE, so flip the sign before taking the root.
    return np.sqrt(-neg_mse_per_fold)

In [None]:
def _report_cv(label, model):
    """Cross-validate ``model``, print its mean/std RMSE plus a timestamp, and
    return the per-fold RMSE array."""
    fold_rmse = cv_rmse(model)
    summary = "{}得分: {:.4f} ({:.4f})\n".format(label, fold_rmse.mean(), fold_rmse.std())
    print(summary, datetime.now())
    return fold_rmse


# One call per model, in the order the later weighting cells expect
# (score0 .. score7).
score0 = _report_cv("Ridge", ridge)
score1 = _report_cv("LASSO", lasso)
score2 = _report_cv("ElasticNet", elastic)
score3 = _report_cv("SVR", svr)
score4 = _report_cv("gbr", gbr)
score5 = _report_cv("lightgbm", lightgbm)
score6 = _report_cv("xgboost", xgboost)
score7 = _report_cv("stack_gen", stack_gen)

# 训练模型

In [None]:
# Fit every model on the full training set, announcing each one as it finishes.
# The order matches the weight indices used later (ridge first, stack_gen last).
_fit_order = (
    ("Ridge", ridge),
    ("LASSO", lasso),
    ("ElasticNet", elastic),
    ("SVM", svr),
    ("GBR", gbr),
    ("Lightgbm", lightgbm),
    ("Xgboost", xgboost),
    ("stack_gen", stack_gen),
)

for _label, _estimator in _fit_order:
    _estimator.fit(X_train, Y_train)
    print("Finish Training " + _label)

print("Finish Training All")

# 根据每个模型的表现，确定最终每个模型的权重

In [None]:
def rmsle(y, y_pred):
    """Return the square root of the mean squared error between ``y`` and ``y_pred``.

    No logarithm is applied here despite the name, so this is a plain RMSE.
    NOTE(review): it equals an RMSLE only if the targets were log-transformed
    upstream -- confirm against the preprocessing step.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

def normalize(scores):
    """Turn error scores into inverse-proportional weights, in place.

    Each non-zero entry ``s`` is replaced by ``(1/s) / sum(1/s_j)`` taken over
    the non-zero entries, so smaller scores receive larger weights and the
    non-zero weights sum to 1. Zero entries are left as they are.

    The input sequence itself is modified and also returned. Applying the
    function a second time takes reciprocals again, which yields weights
    proportional to the original scores rather than to their inverses.

    Args:
        scores: Mutable, indexable sequence of numbers.

    Returns:
        The same ``scores`` object, after modification.
    """
    # Sum of reciprocals, read before any element is overwritten.
    inverse_total = sum(1/value for value in scores if value != 0)

    for idx in range(len(scores)):
        current = scores[idx]
        if current == 0:
            continue
        scores[idx] = (1/current)/inverse_total
    return scores

def ensemble_models_predict(X, weights):
    """Blend the predictions of the eight fitted notebook-level models.

    Args:
        X: Feature matrix handed to every model's ``predict``.
        weights: Indexable of at least eight numbers, in the order
            ridge, lasso, elastic, svr, gbr, lightgbm, xgboost, stack_gen.

    Returns:
        The weighted sum of the eight prediction arrays. Every model is asked
        to predict, including those whose weight is 0.
    """
    members = (ridge, lasso, elastic, svr, gbr, lightgbm, xgboost, stack_gen)

    blended = None
    for position, member in enumerate(members):
        contribution = weights[position] * member.predict(X)
        # Accumulate left to right, starting from the first model's term.
        blended = contribution if blended is None else blended + contribution
    return blended

In [None]:
# Mean CV RMSE per model, in weight-index order (ridge ... stack_gen).
_cv_results = (score0, score1, score2, score3, score4, score5, score6, score7)
scores = [fold_scores.mean() for fold_scores in _cv_results]
print(scores)

# Convert the errors to inverse-proportional weights (lower RMSE -> larger weight).
scores = normalize(scores)
print(scores)

In [None]:
# Weight index legend:
# 0 ridge
# 1 lasso
# 2 elasticnet
# 3 svm
# 4 gbr
# 5 lightgbm
# 6 xgboost
# 7 stack_gen

# `scores` already holds inverse-RMSE weights at this point (normalize() was
# applied when it was built), so a plain copy is the correct starting point.
# Running normalize() again would take reciprocals a second time and produce
# weights proportional to the RMSE, i.e. favouring the worst models.
weights_final = scores.copy()

# Indices of the models left out of the blend; edit this tuple to include or
# exclude a model (see the legend above).
_excluded = (1, 2, 3)
for _idx in _excluded:
    weights_final[_idx] = 0

# Manual override, if hand-tuned weights are preferred:
# weights_final = [0.1, 0.05, 0.1, 0, 0.21, 0.19, 0.22, 0.20]

# Rescale the remaining weights so they sum to 1.
total = sum(weights_final)
weights_final = [weight / total for weight in weights_final]
print(weights_final)

In [None]:
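# Brute-force search over the weight space (disabled).
# NOTE(review): kept for reference only. As written it steps four weights in
# increments of 0.001, scores each combination on the training set, and
# rewrites result/5_models.csv whenever the training score improves.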

# min_score = 10000000
# for a in np.arange(0,1,0.001):
#     for b in np.arange(0,1,0.001):
#         if a+b>1:
#             break
#         for c in np.arange(0,1,0.001):
#             if a+b+c>1:
#                 break
#             for d in np.arange(0,1,0.001):
#                 if a+b+c+d>1:
#                     break
#                 e = 1-a-b-c-d
#                 weights_final = [a,0,0,0,b,c,d,e]
#                 score = rmsle(Y_train, ensemble_models_predict(X_train,weights_final))
#                 if score < min_score:
#                     min_score = score
#                     pred = ensemble_models_predict(X_test, weights_final)

#                     submission = pd.DataFrame({
#                             "Id": Test["Id"],
#                             "SalePrice": pred
#                         })
#                     result_path="result/5_models.csv"
#                     submission.to_csv(result_path, index=False)

# 在训练集上进行评估

In [None]:
# In-sample error of the blend; the models were fitted on this same data, so
# this is an optimistic figure.
train_pred = ensemble_models_predict(X_train, weights_final)
score = rmsle(Y_train, train_pred)
print(score)

# The same error expressed relative to the mean target value.
relative_score = score/Y_train.mean()
print(relative_score)

# 输出并保存预测结果

In [None]:
# Blend the fitted models on the test features.
# NOTE(review): predictions are written as-is; if SalePrice was transformed
# during preprocessing, the inverse transform would belong here -- confirm.
pred = ensemble_models_predict(X_test, weights_final)

# Pair every test Id with its predicted price.
submission_columns = {"Id": Test["Id"], "SalePrice": pred}
submission = pd.DataFrame(submission_columns)

# Write the submission file without the DataFrame index.
result_path = "result/5_models.csv"
submission.to_csv(result_path, index=False)