In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# in the order os.walk visits them, then print one path per line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
BASE = "/kaggle/input/house-prices-advanced-regression-techniques"

# Read the two competition splits from the dataset directory.
train_csv = f"{BASE}/train.csv"
test_csv = f"{BASE}/test.csv"
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

# Report (rows, columns) for each split, then preview the first training rows.
for label, frame in (("Train shape:", train), ("Test shape:", test)):
    print(label, frame.shape)

train.head(3)

In [None]:
# Step 1.2 - Missing-value summary
# Count NaNs per column and keep only the columns that have any, largest first.
null_counts = train.isna().sum()
missing = null_counts.loc[null_counts > 0].sort_values(ascending=False)

# Put the absolute counts next to their share of all training rows.
missing_df = missing.to_frame(name="Missing Count")
missing_df["Missing %"] = (100 * missing) / len(train)

missing_df.head(30)

In [None]:
# Step 1.3 - Summary statistics of the numeric columns
# Select the numeric part of the training frame once, remember its column labels,
# and show describe() with one row per feature (first ten features only).
numeric_part = train.select_dtypes(include=[np.number])
num_features = numeric_part.columns
numeric_part.describe().transpose().head(10)

In [None]:
# Step 1.4 - Distribution of the target (SalePrice)
# Draw the histogram with a KDE overlay on an explicitly created 8x4 axes.
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(train["SalePrice"], kde=True, color="skyblue", ax=ax)
ax.set_title("Distribution of SalePrice")
plt.show()

In [None]:
# =====================================
# Step 1: Smart Missing Value Imputation
# =====================================
# Fills missing values in `train` and `test` (both frames are reassigned / modified here).
# Every statistic used for filling (medians, mode, regression model, KNN neighbours) is
# learned from `train` and then applied to both frames, so the two splits are treated
# consistently. The cell is written so that it can be re-run without raising.

from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import pandas as pd

# 1) Drop the columns whose missing rate is too high.
#    errors="ignore" keeps the cell re-runnable once the columns are already gone.
cols_to_drop = ["PoolQC", "MiscFeature", "Alley"]
train = train.drop(columns=cols_to_drop, errors="ignore")
test = test.drop(columns=cols_to_drop, errors="ignore")

# 2) "House does not have this facility" -> fill with the label "None".
none_cols = [
    "Fence", "MasVnrType", "FireplaceQu", "GarageType",
    "GarageFinish", "GarageQual", "GarageCond",
    "BsmtFinType1", "BsmtFinType2", "BsmtExposure",
    "BsmtCond", "BsmtQual"
]
for col in none_cols:
    train[col] = train[col].fillna("None")
    test[col]  = test[col].fillna("None")

# 3) LotFrontage -> median of the house's Neighborhood.
#    The medians come from train only and are reused for test; a neighbourhood with no
#    usable median falls back to the overall train median so no NaN is left behind.
lot_median_by_neighborhood = train.groupby("Neighborhood")["LotFrontage"].median()
lot_median_overall = train["LotFrontage"].median()
for df in [train, test]:
    fallback = df["Neighborhood"].map(lot_median_by_neighborhood).fillna(lot_median_overall)
    df["LotFrontage"] = df["LotFrontage"].fillna(fallback)

# 4) GarageYrBlt missing -> YearBuilt of the same row.
train["GarageYrBlt"] = train["GarageYrBlt"].fillna(train["YearBuilt"])
test["GarageYrBlt"]  = test["GarageYrBlt"].fillna(test["YearBuilt"])

# 5) Electrical -> most frequent train value, applied to both splits.
electrical_mode = train["Electrical"].mode()[0]
train["Electrical"] = train["Electrical"].fillna(electrical_mode)
test["Electrical"]  = test["Electrical"].fillna(electrical_mode)

# 6) Predict missing MasVnrArea with a regression model.
features = ["MasVnrType", "OverallQual", "YearBuilt", "TotalBsmtSF", "1stFlrSF"]

# Encode train and test together so both get the same dummy columns and the same
# dropped baseline category (encoding each split separately with drop_first=True can
# drop a different category per split). Missing numeric predictors are filled with the
# train median so rf.predict never receives NaN inputs; MasVnrArea itself is untouched
# because it is not part of `features`.
predictor_medians = train[features].median(numeric_only=True).to_dict()
combined = pd.concat(
    [train[features + ["MasVnrArea"]], test[features + ["MasVnrArea"]]],
    keys=["train", "test"],
).fillna(predictor_medians)
encoded = pd.get_dummies(combined, drop_first=True)
train_temp = encoded.loc["train"]   # indexed like `train`
test_temp  = encoded.loc["test"]    # indexed like `test`

# Fit the random forest on the train rows where MasVnrArea is known.
notnull_mask = train_temp["MasVnrArea"].notnull()
rf = RandomForestRegressor(n_estimators=200, random_state=42)
rf.fit(train_temp.loc[notnull_mask].drop(columns=["MasVnrArea"]),
       train_temp.loc[notnull_mask, "MasVnrArea"])

# Fill the missing train values (skipped when nothing is missing, because predict()
# rejects an empty input).
null_mask = train_temp["MasVnrArea"].isnull()
if null_mask.any():
    train.loc[null_mask, "MasVnrArea"] = rf.predict(
        train_temp.loc[null_mask].drop(columns=["MasVnrArea"])
    )
# Same for test.
null_mask_test = test_temp["MasVnrArea"].isnull()
if null_mask_test.any():
    test.loc[null_mask_test, "MasVnrArea"] = rf.predict(
        test_temp.loc[null_mask_test].drop(columns=["MasVnrArea"])
    )

# 7) KNNImputer for whatever numeric gaps remain.
#    The target (SalePrice) and the row identifier (Id) are excluded: Id carries no
#    information about a house and would otherwise take part in the neighbour distance
#    and be cast to float. Note: the imputed columns come back as floats.
num_cols = [
    c for c in train.select_dtypes(include=[np.number]).columns
    if c not in ("SalePrice", "Id")
]
imputer = KNNImputer(n_neighbors=5)
train[num_cols] = imputer.fit_transform(train[num_cols])
test[num_cols]  = imputer.transform(test[num_cols])

print("✅ Smart Missing Value Imputation complete!")

# Verification: list the train columns that still contain NaN, if any.
missing_after = train.isnull().sum()
missing_after = missing_after[missing_after > 0].sort_values(ascending=False)
print("仍存在缺失列:")
print(missing_after if len(missing_after) > 0 else "无缺失值 ✅")

In [None]:
# ========================================
# Step 2: Feature Engineering
# ========================================
# Adds four derived columns to `train` and `test` and converts the ordinal quality
# labels to integers-as-floats. The cell is safe to re-run: derived columns are simply
# recomputed and already-encoded quality columns are left alone.

# ------- 1) Derived features (same formulas for both splits) -------
for df in (train, test):
    # Basement + first floor + second floor area.
    df["TotalSF"] = df["TotalBsmtSF"] + df["1stFlrSF"] + df["2ndFlrSF"]
    # Years between construction / last remodel and the sale.
    df["Age"] = df["YrSold"] - df["YearBuilt"]
    df["RemodAge"] = df["YrSold"] - df["YearRemodAdd"]
    # Garage area per car; the small constant avoids division by zero when GarageCars is 0.
    df["GarageAreaPerCar"] = df["GarageArea"] / (df["GarageCars"] + 1e-3)

# ------- 2) Ordinal quality labels -> numbers -------
qual_map = {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "None": 0}

quality_cols = [
    "ExterQual", "ExterCond", "HeatingQC", "KitchenQual",
    "FireplaceQu", "GarageQual", "GarageCond",
    "BsmtQual", "BsmtCond"
]

# Important: a missing quality can show up in two forms:
#   - a real NaN, or
#   - the string "None".
# map(qual_map) only turns "None" into 0; a NaN cell stays NaN after the mapping, so
# fillna(0) is applied afterwards to send both forms to 0.
#
# Columns that are already numeric are skipped: mapping them a second time would find no
# matching key, produce NaN everywhere, and fillna(0) would then silently overwrite every
# quality score with 0 when the cell is re-run.
for col in quality_cols:
    for df in (train, test):
        if col not in df.columns:  # the column may have been dropped earlier
            continue
        if pd.api.types.is_numeric_dtype(df[col]):  # already encoded by a previous run
            continue
        df[col] = df[col].map(qual_map).fillna(0)

# ------- 3) Inspect the result -------
new_features = ["TotalSF", "Age", "RemodAge", "GarageAreaPerCar"] + quality_cols
print("✅ 特征工程完成！新增列：")
print(new_features)

train[new_features].head(5)

In [None]:
# =====================================
# Step 3: Model Comparison (Ridge / Lasso / ElasticNet)
# =====================================
# Cross-validates three regularised linear models on the log1p-transformed target and
# collects mean/std of the fold RMSE in `results_df`. Also defines `X`, `y` and the
# `preprocess` ColumnTransformer.

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge, Lasso, ElasticNet

# Target on the log1p scale, so RMSE on `y` corresponds to RMSLE on SalePrice.
y = np.log1p(train["SalePrice"])
# Predictors: everything except the target and the row identifier. "Id" only numbers the
# rows, so it must not be fed to the models as a numeric feature.
X = train.drop(columns=["SalePrice", "Id"], errors="ignore")

# Split the predictors into numeric and non-numeric (categorical) columns.
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

# --- Preprocessing: numeric and categorical columns are handled separately ---
# Numeric: median imputation, then standardisation.
numeric_tf = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical: most-frequent imputation, then one-hot encoding; categories not seen
# during fit are ignored at transform time instead of raising.
categorical_tf = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocess = ColumnTransformer([
    ("num", numeric_tf, num_cols),
    ("cat", categorical_tf, cat_cols)
])

# --- Candidate models ---
models = {
    "Ridge": Ridge(alpha=10.0, random_state=42),
    "Lasso": Lasso(alpha=0.001, random_state=42, max_iter=10000),
    "ElasticNet": ElasticNet(alpha=0.001, l1_ratio=0.5, random_state=42, max_iter=10000)
}

# --- Cross-validation scheme ---
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# --- Evaluate each model inside a full pipeline so preprocessing is fit per fold ---
results = []
for name, model in models.items():
    pipe = Pipeline([
        ("prep", preprocess),
        ("model", model)
    ])
    scores = cross_val_score(
        pipe, X, y,
        scoring="neg_root_mean_squared_error",
        cv=cv,
        n_jobs=-1
    )
    # The scorer returns negated RMSE, hence the sign flip on the mean.
    results.append({
        "Model": name,
        "Mean_RMSLE": -scores.mean(),
        "Std": scores.std()
    })

# --- Results, best (lowest error) first ---
results_df = pd.DataFrame(results).sort_values("Mean_RMSLE")
print("✅ Model comparison complete!")
results_df

In [None]:
# # =====================================
# # Step 4: XGBoost Hyperparameter Tuning
# # =====================================
# from xgboost import XGBRegressor
# from sklearn.model_selection import RandomizedSearchCV, KFold
# from sklearn.pipeline import Pipeline
# from scipy.stats import uniform, randint

# # 1️⃣ 定义模型
# xgb_model = XGBRegressor(
#     objective="reg:squarederror",
#     random_state=42,
#     n_jobs=-1
# )

# # 2️⃣ 定义参数搜索空间
# param_dist = {
#     "model__n_estimators": randint(400, 1500),
#     "model__learning_rate": uniform(0.01, 0.09),
#     "model__max_depth": randint(3, 8),
#     "model__min_child_weight": randint(1, 8),
#     "model__subsample": uniform(0.6, 0.4),
#     "model__colsample_bytree": uniform(0.6, 0.4),
#     "model__reg_alpha": uniform(0, 0.5),
#     "model__reg_lambda": uniform(0.5, 2.0)
# }

# # 3️⃣ 构建 Pipeline
# pipe_xgb = Pipeline([
#     ("prep", preprocess),
#     ("model", xgb_model)
# ])

# # 4️⃣ 交叉验证方案
# cv = KFold(n_splits=5, shuffle=True, random_state=42)

# # 5️⃣ 随机搜索（每次尝试 30 组参数）
# random_search = RandomizedSearchCV(
#     estimator=pipe_xgb,
#     param_distributions=param_dist,
#     n_iter=30,                     # 搜索 30 次，兼顾精度与时间
#     scoring="neg_root_mean_squared_error",
#     cv=cv,
#     verbose=2,
#     random_state=42,
#     n_jobs=-1
# )

# # 6️⃣ 开始搜索
# random_search.fit(X, y)

# # 7️⃣ 输出最佳结果
# print("✅ Best CV RMSLE-like score:", -random_search.best_score_)
# print("Best Parameters:")
# for k, v in random_search.best_params_.items():
#     print(f"   {k}: {v}")

In [None]:
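# Step 4: (Optional) Hyperparameter Tuning - Skipped in Final Version
# The values below match the XGBRegressor settings hard-coded in Step 5; they appear to
# come from an earlier run of the commented-out random search in the previous cell.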
# Best CV RMSLE: 0.1259
# Best Parameters:
#   n_estimators = 512
#   learning_rate = 0.0356
#   max_depth = 3
#   min_child_weight = 3
#   subsample = 0.613
#   colsample_bytree = 0.720
#   reg_alpha = 0.488
#   reg_lambda = 1.322

In [None]:
# =====================================
# Step 5: Model Ensemble (Weighted Averaging)
# =====================================
# Trains three base models with 5-fold CV, collects out-of-fold (OOF) predictions for the
# training rows and fold-averaged predictions for the test rows, blends them with fixed
# weights, reports the OOF error of every model and of the blend, and writes the
# submission file. Relies on `X`, `y`, `preprocess`, `train` and `test` from earlier cells.
from sklearn.linear_model import Ridge, ElasticNet, LassoCV
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd

# ---- 1) Base models ----
ridge = Ridge(alpha=10.0, random_state=42)
elastic = ElasticNet(alpha=0.001, l1_ratio=0.5, random_state=42, max_iter=10000)
# Hyperparameters are the ones recorded in the Step 4 notes.
xgb_best = XGBRegressor(
    n_estimators=512,
    learning_rate=0.0356,
    max_depth=3,
    min_child_weight=3,
    subsample=0.613,
    colsample_bytree=0.720,
    reg_alpha=0.488,
    reg_lambda=1.322,
    objective="reg:squarederror",
    random_state=42,
    n_jobs=-1
)

# ---- 2) Data and prediction containers ----
from sklearn.pipeline import Pipeline

models = {
    "ridge": ridge,
    "elastic": elastic,
    "xgb": xgb_best
}

# Predict on exactly the columns the models are trained on, in the same order, so the
# result does not depend on extra columns that may be present in `test`.
test_features = test[X.columns]

pred_train = pd.DataFrame(index=X.index)
pred_test = pd.DataFrame(index=test.index)

cv = KFold(n_splits=5, shuffle=True, random_state=42)

# ---- 3) 5-fold cross prediction used for blending ----
for name, model in models.items():
    pipe = Pipeline([
        ("prep", preprocess),
        ("model", model)
    ])
    oof_pred = np.zeros(len(X))   # Out-Of-Fold prediction
    test_pred = np.zeros(len(test_features))

    for train_idx, val_idx in cv.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train = y.iloc[train_idx]

        pipe.fit(X_train, y_train)
        oof_pred[val_idx] = pipe.predict(X_val)
        # Each fold model contributes 1/n_splits of the final test prediction.
        test_pred += pipe.predict(test_features) / cv.n_splits

    pred_train[name] = oof_pred
    pred_test[name] = test_pred
    print(f"✅ {name} done")
    # OOF error on the log1p scale, i.e. comparable to the Step 3 scores.
    oof_rmse = np.sqrt(np.mean((y - pred_train[name]) ** 2))
    print(f"   OOF RMSE (log1p scale): {oof_rmse:.4f}")

# ---- 4) Simple weighted average ----
# The weights can be tuned, e.g. 0.2, 0.3, 0.5.
weights = {"ridge": 0.2, "elastic": 0.3, "xgb": 0.5}
# A missing model or weights that do not sum to 1 would silently shift every prediction.
if set(weights) != set(models) or not np.isclose(sum(weights.values()), 1.0):
    raise ValueError("Blend weights must cover every base model and sum to 1.")
pred_train["blend"] = sum(pred_train[k] * w for k, w in weights.items())
pred_test["blend"] = sum(pred_test[k] * w for k, w in weights.items())

print("✅ Blending complete!")
# Score the blend on the OOF predictions so the chosen weights can be compared with the
# individual models printed above.
blend_rmse = np.sqrt(np.mean((y - pred_train["blend"]) ** 2))
print(f"   Blend OOF RMSE (log1p scale): {blend_rmse:.4f}")


# ---- 5) Write the submission ----
y_pred_final = np.expm1(pred_test["blend"])  # undo the log1p transform of the target
submission = pd.DataFrame({
    "Id": test["Id"],
    "SalePrice": y_pred_final
})
# Id may have been turned into a float by earlier numeric processing; the file needs ints.
submission["Id"] = submission["Id"].astype(int)
submission.to_csv("submission_ensemble.csv", index=False)
print("✅ submission_ensemble.csv generated successfully!")
submission.head()

In [None]:
# # =====================================
# # Step 6: Model Ensemble (Stacking with Meta-Model)
# # =====================================

# from sklearn.linear_model import Ridge, ElasticNet, LassoCV, LinearRegression
# from xgboost import XGBRegressor
# from sklearn.model_selection import KFold
# from sklearn.pipeline import Pipeline
# import numpy as np
# import pandas as pd

# # ---- 1️⃣ 定义基础模型 ----
# ridge = Ridge(alpha=10.0, random_state=42)
# elastic = ElasticNet(alpha=0.001, l1_ratio=0.5, random_state=42, max_iter=10000)
# xgb_best = XGBRegressor(
#     n_estimators=512,
#     learning_rate=0.0356,
#     max_depth=3,
#     min_child_weight=3,
#     subsample=0.613,
#     colsample_bytree=0.720,
#     reg_alpha=0.488,
#     reg_lambda=1.322,
#     objective="reg:squarederror",
#     random_state=42,
#     n_jobs=-1
# )

# models = {
#     "ridge": ridge,
#     "elastic": elastic,
#     "xgb": xgb_best
# }

# pred_train = pd.DataFrame(index=X.index)
# pred_test = pd.DataFrame(index=test.index)

# cv = KFold(n_splits=5, shuffle=True, random_state=42)

# # ---- 2️⃣ 5 折交叉预测，生成 meta 模型输入 ----
# for name, model in models.items():
#     pipe = Pipeline([
#         ("prep", preprocess),
#         ("model", model)
#     ])
#     oof_pred = np.zeros(len(X))   # Out-Of-Fold 预测
#     test_pred = np.zeros(len(test))

#     for train_idx, val_idx in cv.split(X, y):
#         X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
#         y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

#         pipe.fit(X_train, y_train)
#         oof_pred[val_idx] = pipe.predict(X_val)
#         test_pred += pipe.predict(test) / cv.n_splits

#     pred_train[name] = oof_pred
#     pred_test[name] = test_pred
#     print(f"✅ Base model {name} done")

# # ---- 3️⃣ 二层模型：用 OOF 结果训练 meta 模型 ----
# from sklearn.ensemble import GradientBoostingRegressor
# meta_model = GradientBoostingRegressor(random_state=42)
# meta_model.fit(pred_train, y)

# # ---- 4️⃣ 预测 ----
# pred_train["stacked"] = meta_model.predict(pred_train)
# pred_test["stacked"] = meta_model.predict(pred_test)

# # ---- 5️⃣ 输出最终结果 ----
# y_pred_final = np.expm1(pred_test["stacked"])  # 还原 log1p
# submission = pd.DataFrame({
#     "Id": test["Id"].astype(int),
#     "SalePrice": y_pred_final
# })
# submission.to_csv("submission_stacking.csv", index=False)
# print("✅ submission_stacking.csv generated successfully!")
# submission.head()

In [None]:
# import matplotlib.pyplot as plt

# # 取出 base model 列（不包括 'stacked'）
# base_cols = [c for c in pred_train.columns if c != "stacked"]

# # 对齐维度
# importances = meta_model.feature_importances_

# # 构建 DataFrame
# importance_df = pd.DataFrame({
#     "Base_Model": base_cols,
#     "Importance": importances
# }).sort_values("Importance", ascending=False)

# # 打印结果
# print(importance_df)

# # 可视化
# plt.bar(importance_df["Base_Model"], importance_df["Importance"])
# plt.title("Meta Model Feature Importance (GradientBoosting)")
# plt.ylabel("Importance")
# plt.show()