In [None]:
# ======= Step 0: Import the required libraries =======
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    confusion_matrix,
    classification_report,
)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

# ======= Step 1: Data preparation + parameter settings =======

data = pd.read_csv("xgboost_data.csv")

# Configuration
TARGET_VARIABLE = 'PCT_PRICE_CHANGE_DETRENDED (%)'  # column created below
RAW_TARGET_VARIABLE = 'PCT_PRICE_CHANGE (%)'        # column read from the CSV
YEAR_COLUMN = 'YEAR'
ZIP_COLUMN = 'ZIP_CODE'
TRAIN_YEAR_CUTOFF = 2020  # rows with YEAR <= cutoff are training data

# XGBoost specific params
XGB_PARAMS = {
    'objective': 'multi:softprob',
    'num_class': 3,
    'eval_metric': 'merror',
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 5,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'n_jobs': -1,
}

# Step 1: Detrending Target Variable
print("--- Detrending Target Variable ---")
# Ensure YEAR is numeric; unparseable values become NaN and are dropped below.
data[YEAR_COLUMN] = pd.to_numeric(data[YEAR_COLUMN], errors='coerce')
# Drop rows where year or target is missing
data = data.dropna(subset=[YEAR_COLUMN, RAW_TARGET_VARIABLE])

# Fit the linear year trend on the training years only. Fitting on every row
# would let the held-out years (YEAR > TRAIN_YEAR_CUTOFF) influence the
# detrended target, and therefore the class labels derived from it.
trend_fit_rows = data[data[YEAR_COLUMN] <= TRAIN_YEAR_CUTOFF]
if trend_fit_rows.empty:
    raise ValueError(
        f"No rows with {YEAR_COLUMN} <= {TRAIN_YEAR_CUTOFF}; cannot fit the trend model."
    )

trend_model = LinearRegression()
# Double brackets keep YEAR as a 2-D frame, as sklearn estimators expect.
trend_model.fit(trend_fit_rows[[YEAR_COLUMN]], trend_fit_rows[RAW_TARGET_VARIABLE])
# The trend learned from the training years is applied to all rows.
predicted_trend = trend_model.predict(data[[YEAR_COLUMN]])
data[TARGET_VARIABLE] = data[RAW_TARGET_VARIABLE] - predicted_trend
print(f"Target variable '{TARGET_VARIABLE}' created.")
print(f"Target variable mean: {data[TARGET_VARIABLE].mean():.4f}, std: {data[TARGET_VARIABLE].std():.4f}")



# Step 2: Create the new severity class labels + split the data

def assign_damage_severity(pct_change):
    """Map a percentage price change to a damage-severity class id.

    Returns:
        0 (Small Drop)  when ``pct_change`` is greater than -5,
        1 (Medium Drop) when ``pct_change`` is in the interval (-10, -5],
        2 (Severe Drop) otherwise, i.e. at -10 or below.

    A NaN input fails both comparisons and therefore also yields 2.
    """
    # Guard clauses run from the mildest band to the most severe one.
    if pct_change > -5:
        return 0
    if pct_change > -10:
        return 1
    return 2

# Label every row with its severity class before splitting.
data['DAMAGE_SEVERITY_CLASS'] = data[TARGET_VARIABLE].apply(assign_damage_severity)
# Train/Test Split based on YEAR: rows up to and including the cutoff year
# are used for training, later years for testing.
# .copy() makes each partition an independent DataFrame, so the column
# assignments made on train_df / test_df further down do not trigger pandas'
# chained-assignment warning.
train_df = data[data[YEAR_COLUMN] <= TRAIN_YEAR_CUTOFF].copy()
test_df = data[data[YEAR_COLUMN] > TRAIN_YEAR_CUTOFF].copy()


NameError: name 'train_df' is not defined

In [None]:

# Apply to both train and test data
# The detrended target is stored under TARGET_VARIABLE
# ('PCT_PRICE_CHANGE_DETRENDED (%)'); the bare name without the " (%)" suffix
# is not a column created by this file and would raise KeyError.
# DataFrame.assign returns a new frame, so this is safe even when train_df /
# test_df are filtered slices of `data`.
train_df = train_df.assign(
    DAMAGE_SEVERITY_CLASS=train_df[TARGET_VARIABLE].apply(assign_damage_severity)
)
test_df = test_df.assign(
    DAMAGE_SEVERITY_CLASS=test_df[TARGET_VARIABLE].apply(assign_damage_severity)
)

# ======= Step 2: Prepare features and labels =======

# Feature columns fed to the model (assumed here to be the baseline set).
feature_cols = [
    "Median_Household_Income",
    "Total_Population",
    "Avg_Household_Size",
    "Gini_Index",
    "Employment_Rate",
    "Below_Poverty_Rate",
    "Rate_College_or_Higher",
    "Black_Population_Share",
    "White_Population_Share",
    "Asian_Population_Share",
    "Native_Population_Share",
    "HOME_PRICE_LAG1",
    "PRICE_CHANGE_LAG1",
    "PRICE_CHANGE_DIFF",
    "ROLLING_1yr_PRICE_CHANGE",
    "ROLLING_3yr_PRICE_CHANGE_STD",
]

# Feature matrix and label vector for each partition.
X_train, y_train = train_df[feature_cols], train_df["DAMAGE_SEVERITY_CLASS"]
X_test, y_test = test_df[feature_cols], test_df["DAMAGE_SEVERITY_CLASS"]

# Standardize features: the scaler is fitted on the training features only
# and then applied unchanged to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ======= Step 3: Train the XGBoost multi-class model =======

# Build the classifier from the shared XGB_PARAMS configuration instead of a
# second, hard-coded parameter set, so the hyperparameters declared at the top
# of the file are the ones actually used. The deprecated `use_label_encoder`
# argument is no longer passed.
model = XGBClassifier(**XGB_PARAMS)

model.fit(X_train_scaled, y_train)

# ======= Step 4: Predict and evaluate the model =======

y_pred = model.predict(X_test_scaled)

# Class ids produced by assign_damage_severity, in display order. Passing them
# explicitly keeps the report and the confusion matrix at three classes even
# when a class is absent from y_test / y_pred; without `labels`,
# classification_report raises ValueError because target_names no longer
# matches the number of observed classes, and the confusion matrix would not
# line up with the hard-coded tick labels.
severity_labels = [0, 1, 2]

# Accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.4f}")

# Macro F1
macro_f1 = f1_score(y_test, y_pred, average="macro")
print(f"Macro F1 Score: {macro_f1:.4f}")

# Classification Report
print("\nClassification Report:")
print(
    classification_report(
        y_test,
        y_pred,
        labels=severity_labels,
        target_names=["Small Drop", "Medium Drop", "Severe Drop"],
    )
)

# Confusion matrix visualization (rows: true label, columns: predicted label)
cm = confusion_matrix(y_test, y_pred, labels=severity_labels)
plt.figure(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=["Small", "Medium", "Severe"],
    yticklabels=["Small", "Medium", "Severe"],
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

# ======= Step 5: Optional - feature importance =======

# Pair each feature name with the fitted model's importance score and order
# the table from most to least important.
importances = model.feature_importances_
importance_df = pd.DataFrame(
    {"Feature": feature_cols, "Importance": importances}
).sort_values(by="Importance", ascending=False)

# Horizontal bar chart: one bar per feature.
plt.figure(figsize=(8, 6))
sns.barplot(data=importance_df, x="Importance", y="Feature")
plt.title("Feature Importance")
plt.show()