# 城乡收入预测与解释性建模实验
> 融合机器学习 + 政策模拟 + 社会治理


## 📘 实验背景
城乡收入差距长期存在，借助数据建模评估政策效果、预测趋势，是推动实现“共同富裕”目标的重要手段。

## 📊 步骤 1：模拟数据生成

In [None]:
import pandas as pd
import numpy as np
# Seed NumPy's global generator so the synthetic panel is reproducible.
np.random.seed(42)

columns = ['city', 'year', 'region', 'gdp_pc', 'net_access', 'edu_exp', 'med_exp', 'urban_ratio', 'policy_index', 'infra_score', 'unemp_rate', 'rural_income', 'urban_income']

# 100 synthetic cities, each assigned to one of three regions.
cities = [f"City_{idx}" for idx in range(1, 101)]
regions = np.random.choice(['东部', '中部', '西部'], size=100, p=[0.4, 0.3, 0.3])


def _simulate_row(city, year, region):
    """Draw one city-year observation, in the same order as ``columns``.

    The random draws happen in a fixed sequence (GDP, network access,
    education, medical, urbanisation, policy, infrastructure noise,
    unemployment, rural share, urban share); changing that sequence would
    change the generated data for a given seed.
    """
    is_east = region == '东部'
    if is_east:
        gdp_mean = 80000
    elif region == '中部':
        gdp_mean = 50000
    else:
        gdp_mean = 30000
    net_mean = 0.9 if is_east else 0.75
    urban_mean = 0.7 if is_east else 0.6

    gdp_pc = np.random.normal(loc=gdp_mean, scale=5000)
    net_access = np.clip(np.random.normal(net_mean, 0.1), 0.5, 1.0)
    edu_exp = np.random.normal(loc=15000, scale=2000)
    med_exp = np.random.normal(loc=10000, scale=1500)
    urban_ratio = np.clip(np.random.normal(urban_mean, 0.1), 0.3, 0.95)
    policy_index = np.random.uniform(0, 1)

    # Infrastructure score: weighted network access and urbanisation plus noise.
    infra_noise = np.random.normal(0, 5)
    infra_score = net_access * 50 + urban_ratio * 30 + infra_noise

    unemp_rate = np.clip(np.random.normal(0.08, 0.03), 0.03, 0.15)

    # Rural income: a random share of GDP per capita, raised by the policy
    # index and lowered by unemployment.
    rural_share = np.random.uniform(0.3, 0.5)
    rural_income = gdp_pc * rural_share + policy_index * 5000 - unemp_rate * 2000

    urban_share = np.random.uniform(0.6, 0.9)
    urban_income = gdp_pc * urban_share

    return [city, year, region, gdp_pc, net_access, edu_exp, med_exp, urban_ratio, policy_index, infra_score, unemp_rate, rural_income, urban_income]


# One row per city per year for 2018-2022, cities in the outer loop.
data = [
    _simulate_row(city, year, region)
    for city, region in zip(cities, regions)
    for year in range(2018, 2023)
]

df = pd.DataFrame(data, columns=columns)
df.head()

## 🔧 步骤 2：特征工程与标准化

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Predictors used by every model in this notebook; the target is rural income.
feature_columns = [
    'gdp_pc',
    'net_access',
    'edu_exp',
    'med_exp',
    'urban_ratio',
    'policy_index',
    'infra_score',
    'unemp_rate',
]
X = df[feature_columns]
y = df['rural_income']

# Hold out 20% of the rows for testing, with a fixed seed for a stable split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 🤖 步骤 3：模型构建与性能评估

In [None]:
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score
import time

def train_and_evaluate(model, name):
    """Fit ``model`` on the scaled training split and print its test metrics.

    Reads the notebook-level ``X_train_scaled``, ``y_train``,
    ``X_test_scaled`` and ``y_test`` variables.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        name: Label printed in front of the metrics.

    Returns:
        None. RMSE, R² and the fit duration in seconds are printed.
    """
    # Time the fit only: the printed label is "训练时间" (training time), so
    # prediction must stay outside the measured interval. perf_counter() is
    # monotonic, so the interval cannot be skewed by system clock adjustments.
    start = time.perf_counter()
    model.fit(X_train_scaled, y_train)
    fit_seconds = time.perf_counter() - start

    y_pred = model.predict(X_test_scaled)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    print(f"🔎 {name} | RMSE: {rmse:.2f} | R²: {r2:.3f} | 训练时间: {fit_seconds:.2f}s")

# Run every candidate regressor through the shared evaluation routine,
# in the order they should be reported.
candidates = [
    ("Ridge回归", Ridge()),
    ("随机森林", RandomForestRegressor(n_estimators=100)),
    ("XGBoost", XGBRegressor(n_estimators=100, verbosity=0)),
    ("CatBoost", CatBoostRegressor(verbose=0)),
]
for label, estimator in candidates:
    train_and_evaluate(estimator, label)

## 🔍 步骤 4：SHAP 模型解释

In [None]:
import shap
# Fit a dedicated random forest on the scaled training data to explain.
model = RandomForestRegressor(n_estimators=100).fit(X_train_scaled, y_train)

# The scaled matrix comes out of StandardScaler without column labels, so
# pass the feature names explicitly. Otherwise the bar plot identifies
# features by position rather than by name.
explainer = shap.Explainer(
    model.predict,
    X_test_scaled,
    feature_names=list(X.columns),
)
shap_values = explainer(X_test_scaled)

# Global importance: mean absolute SHAP value per feature.
shap.plots.bar(shap_values)

## 🌍 步骤 5：区域异质性建模

In [None]:
# Train and score one random forest per region to expose regional heterogeneity.
for region in df['region'].unique():
    sub_df = df[df['region'] == region]
    X_r = sub_df[X.columns]
    y_r = sub_df['rural_income']

    # Seed the split and the forest so the per-region R² is the same on every
    # re-run, matching the fixed seed used for the main train/test split.
    X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
        X_r, y_r, test_size=0.2, random_state=0
    )
    model_r = RandomForestRegressor(random_state=0)
    model_r.fit(X_train_r, y_train_r)

    # score() returns R² on the held-out rows of this region.
    print(f"📍 {region} 区模型 R²: {model_r.score(X_test_r, y_test_r):.3f}")

## 💡 步骤 6：政策模拟实验

In [None]:
# Counterfactual: strengthen policy support by 0.2 for every test observation.
X_sim = X_test.copy()
# policy_index is drawn from uniform(0, 1) in the data-generation step, so cap
# the boosted value at 1.0 to keep the scenario inside the index's range.
X_sim['policy_index'] = (X_sim['policy_index'] + 0.2).clip(upper=1.0)
# Reuse the scaler fitted on the training split so both scenarios share one scale.
X_sim_scaled = scaler.transform(X_sim)

model_final = RandomForestRegressor().fit(X_train_scaled, y_train)
y_before = model_final.predict(X_test_scaled)  # baseline predictions
y_after = model_final.predict(X_sim_scaled)    # predictions under the policy boost

# Mean per-observation change in predicted rural income.
print(f"💡 模拟后，农村收入平均提升：{np.mean(y_after - y_before):.2f} 元")