# In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

# Step 1: load the data
train_data = pd.read_csv('kaggle_house_pred_train.csv')
test_data = pd.read_csv('kaggle_house_pred_test.csv')

# Step 2: preprocessing
# Fill missing values in the numeric columns with each column's mean.
# numeric_only=True is required: the frames are expected to hold object-dtype
# columns as well (they are selected as categorical features further down),
# and on pandas >= 2.0 DataFrame.mean() raises TypeError on such columns
# unless it is told to skip them. Object-dtype columns are left untouched,
# so any missing values in them remain.
train_data.fillna(train_data.mean(numeric_only=True), inplace=True)
test_data.fillna(test_data.mean(numeric_only=True), inplace=True)

# Feature selection
# 'SalePrice' is the regression target and 'Id' is the row identifier written
# to the submission file; neither may be used as a model input. Leaving
# 'SalePrice' in the feature list leaks the target into training and makes
# test_data[numeric_features] fail when the test CSV has no such column
# (presumably the case for the Kaggle test split -- confirm).
non_feature_columns = ['Id', 'SalePrice']
numeric_features = (
    train_data.select_dtypes(include=['int64', 'float64'])
    .columns
    .drop(non_feature_columns, errors='ignore')  # tolerate either column being absent
)
categorical_features = train_data.select_dtypes(include=['object']).columns

# Column transformer: standardise numeric columns, one-hot encode categorical ones.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# handle_unknown='ignore' lets categories that were not seen during fit be
# encoded as all-zero rows instead of raising at transform time.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# NOTE(review): the fit/predict calls later in this script index the raw
# DataFrames with numeric_features and do not reference `preprocessor`;
# wire it into a Pipeline with the model if scaling/encoding is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Step 3: feature engineering (add your own feature-engineering steps here as needed)

# Step 4: model tuning
# Base estimator: an XGBoost regressor with library-default settings; the
# values actually tried come from the grid below.
xgb_model = xgb.XGBRegressor()

# Hyper-parameter grid. Only learning_rate and max_depth have more than one
# candidate, so the search covers 3 x 3 = 9 combinations.
parameters = dict(
    objective=['reg:squarederror'],
    learning_rate=[0.03, 0.05, 0.07],
    max_depth=[5, 6, 7],
    min_child_weight=[4],
    subsample=[0.7],
    colsample_bytree=[0.7],
    n_estimators=[500],
)

# Exhaustive search over the grid with 2-fold cross-validation, using 5
# parallel jobs. No `scoring` is given, so the estimator's own default
# scorer ranks the candidates.
xgb_grid = GridSearchCV(
    estimator=xgb_model,
    param_grid=parameters,
    cv=2,
    n_jobs=5,
    verbose=True,
)

# Step 5: train the model
# Run the grid search on the selected numeric columns against the sale price.
train_features = train_data[numeric_features]
train_target = train_data['SalePrice']
xgb_grid.fit(train_features, train_target)

# Report the best cross-validated score, then the parameters that produced it.
for search_result in (xgb_grid.best_score_, xgb_grid.best_params_):
    print(search_result)

# Step 6: predict and submit
# Predict using the same column selection that was used for training.
test_features = test_data[numeric_features]
predictions = xgb_grid.predict(test_features)

# Build the submission file: one row per test Id with its predicted SalePrice.
submission_columns = {'Id': test_data['Id'], 'SalePrice': predictions}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)

# Notebook cell output: NameError: name 'download' is not defined