# In [None]:
# %% [markdown]
# # 城市功能区分析 - 特征工程
# 
# 作者: 张笔弈
# 日期: 2026-01-20
# 
# 本笔记本展示特征工程的完整流程

# %%
import os
import sys

# Make the project root importable before the local `src` imports below.
sys.path.append("..")

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import yaml

from src.feature_engineering.feature_selection import FeatureSelector
from src.feature_engineering.spatial_features import SpatialFeatureExtractor
from src.feature_engineering.temporal_features import TemporalFeatureExtractor

# Configure matplotlib so Chinese labels render correctly.
# 'SimHei' stays first, so machines that have it behave exactly as before.
# The remaining entries are CJK-capable fonts commonly found on Windows,
# macOS and Linux; matplotlib walks this list in order and uses the first
# family it can find, so the plots stay readable where SimHei is missing.
plt.rcParams['font.sans-serif'] = [
    'SimHei',
    'Microsoft YaHei',
    'PingFang SC',
    'Heiti TC',
    'WenQuanYi Micro Hei',
    'Noto Sans CJK SC',
    'Arial Unicode MS',
]
# Draw the minus sign as an ASCII hyphen; CJK fonts frequently lack the
# Unicode minus glyph, which would otherwise show up as an empty box.
plt.rcParams['axes.unicode_minus'] = False

# %%
# Load the project configuration.
# The encoding is pinned to UTF-8 so that decoding the YAML file does not
# depend on the platform's default locale encoding (e.g. GBK on a Chinese
# Windows install), which would break on non-ASCII settings values.
with open("../config/settings.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# %%
# Load the input layers: the processed POI file and the integrated grid file.
poi_path = "../data/processed/guangzhou_pois.geojson"
grid_path = "../data/processed/integrated_grid.geojson"
poi_data = gpd.read_file(poi_path)
grid_data = gpd.read_file(grid_path)

# Report how many records each layer holds.
n_pois = len(poi_data)
n_grids = len(grid_data)
print(f"POI数据: {n_pois}条记录")
print(f"网格数据: {n_grids}个网格")

# Show the number of POIs per value of the 'category' column.
print("POI类别分布:")
category_counts = poi_data['category'].value_counts()
print(category_counts)

# %%
# 1. Spatial feature extraction
# Build the extractor from the loaded configuration, then derive the POI
# feature table from the grid layer and the POI layer.
spatial_extractor = SpatialFeatureExtractor(config)
poi_features = spatial_extractor.extract_poi_features(grid_data, poi_data)

# Confirm completion and preview the first rows of the result.
print("POI特征提取完成")
preview = poi_features.head()
print(preview)

# %%
# Visualize the POI features as a 2x2 panel figure and save it to disk.
plt.figure(figsize=(12, 8))

# Panel 1: histogram of the total POI count
plt.subplot(2, 2, 1)
plt.hist(poi_features['poi_count'], bins=30, edgecolor='black', alpha=0.7)
plt.xlabel('POI数量')
plt.ylabel('频数')
plt.title('POI数量分布')

# Panel 2: histogram of the POI diversity index
plt.subplot(2, 2, 2)
plt.hist(poi_features['poi_diversity'], bins=30, edgecolor='black', alpha=0.7)
plt.xlabel('POI多样性指数')
plt.ylabel('频数')
plt.title('POI多样性分布')

# Panel 3: bar chart of the column totals for every per-category column,
# i.e. every 'poi_*' column except the two aggregate columns.
plt.subplot(2, 2, 3)
category_cols = [
    col for col in poi_features.columns
    if col.startswith('poi_') and col not in ('poi_count', 'poi_diversity')
]
category_data = poi_features[category_cols].sum().sort_values(ascending=False)
# Strip only the leading 'poi_' prefix for the tick labels. A plain
# str.replace would also remove any later 'poi_' occurrence inside a
# category name.
category_labels = [name[len('poi_'):] for name in category_data.index]
plt.bar(range(len(category_data)), category_data.values)
plt.xticks(range(len(category_data)), category_labels, rotation=45)
plt.xlabel('POI类别')
plt.ylabel('总数')
plt.title('POI类别分布')

# Panel 4: scatter of grid centroids coloured by POI count.
# Drawn only when the grid carries centroid columns; otherwise the panel is
# left empty.
# NOTE(review): the colouring assumes poi_features rows are in the same order
# as grid_data rows - confirm against extract_poi_features.
plt.subplot(2, 2, 4)
if 'centroid_lon' in grid_data.columns and 'centroid_lat' in grid_data.columns:
    plt.scatter(grid_data['centroid_lon'], grid_data['centroid_lat'],
                c=poi_features['poi_count'], s=10, cmap='viridis', alpha=0.6)
    plt.colorbar(label='POI数量')
    plt.xlabel('经度')
    plt.ylabel('纬度')
    plt.title('POI空间分布')

plt.tight_layout()
# savefig does not create missing directories, so make sure the output
# folder exists before writing the figure.
os.makedirs("../output", exist_ok=True)
plt.savefig("../output/poi_features_visualization.png", dpi=300, bbox_inches='tight')
plt.show()

# %%
# 2. Temporal feature extraction (using simulated data)
# Seed NumPy's global random generator so the simulated values drawn below
# are repeatable when this cell runs from a fresh seed.
np.random.seed(42)
time_points = pd.date_range('2023-01-01', '2023-12-31', freq='D')
temporal_data = []

# Use up to 20 grids. Series.sample raises ValueError when asked for more
# rows than exist (sampling without replacement), so cap the sample size at
# the number of available grids.
n_sampled_grids = min(20, len(grid_data))

for grid_id in grid_data['grid_id'].sample(n_sampled_grids):
    # Every 30th day of the daily range: 13 roughly-monthly timestamps.
    for timestamp in time_points[::30]:
        temporal_data.append({
            'grid_id': grid_id,
            'timestamp': timestamp,
            'poi_count': np.random.poisson(10) + np.random.randint(0, 10),
            'ndvi_mean': np.random.uniform(0.1, 0.8),
            'temperature': np.random.normal(25, 5)
        })

# One row per (grid_id, timestamp) pair.
temporal_df = pd.DataFrame(temporal_data)

temporal_extractor = TemporalFeatureExtractor(config)
temporal_features = temporal_extractor.extract_time_series_features(temporal_df)

print("时序特征提取完成")
print(temporal_features.head())

# %%
# 3. Feature selection
# Columns that form the feature matrix: the two aggregate POI columns
# followed by six per-category POI columns.
feature_cols = [
    'poi_count',
    'poi_diversity',
    'poi_餐饮',
    'poi_购物',
    'poi_交通',
    'poi_教育',
    'poi_医疗',
    'poi_办公',
]

# Work on a copy so the simulated columns do not touch poi_features.
feature_data = poi_features.copy()
n_rows = len(feature_data)

# Simulated NDVI and road-network columns.
# NOTE(review): these two columns are not listed in feature_cols, so they do
# not take part in the selection below - confirm whether that is intended.
feature_data['ndvi_mean'] = np.random.uniform(0, 0.8, n_rows)
feature_data['road_density'] = np.random.exponential(0.01, n_rows)

# Simulated target variable: one of four zone labels per row, drawn with the
# fixed probabilities below.
zone_labels = ['商业区', '居住区', '办公区', '绿地']
zone_probabilities = [0.3, 0.4, 0.2, 0.1]
feature_data['zone_type'] = np.random.choice(
    zone_labels, n_rows, p=zone_probabilities
)

# Run the project's selector, asking the 'kbest' method for 5 features.
# NOTE(review): unlike the feature-importance cell, missing values are not
# filled here - verify that select_features copes with NaN input.
feature_selector = FeatureSelector(config)
selection_inputs = feature_data[feature_cols]
selection_target = feature_data['zone_type']
selected_features = feature_selector.select_features(
    selection_inputs,
    selection_target,
    method='kbest',
    n_features=5,
)

print(f"选择的特征: {selected_features}")

# %%
# 4. Feature importance analysis
# Only runs when the target column is present in the feature table.
if 'zone_type' in feature_data.columns:
    from sklearn.ensemble import RandomForestClassifier

    # Feature matrix with missing values replaced by 0, plus the target.
    X = feature_data[feature_cols].fillna(0)
    y = feature_data['zone_type']

    # 100 trees with a fixed random_state; fit() returns the estimator itself.
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X, y)

    # Hand the fitted model and the output path to the project's plotting
    # helper; its result is printed below when it is not None.
    importance_path = "../output/feature_importance.png"
    importance_df = feature_selector.create_feature_importance_plot(
        X, y, rf_model, importance_path
    )

    if importance_df is not None:
        print("特征重要性排名:")
        top_rows = importance_df.head(10)
        print(top_rows)

# %%
# 5. Feature correlation analysis
plt.figure(figsize=(12, 10))

# Pairwise correlation matrix of the selected feature columns
correlation_matrix = feature_data[feature_cols].corr()

# Annotated heatmap; center=0 puts zero correlation at the midpoint of the
# diverging 'coolwarm' colormap.
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm',
            center=0, square=True, linewidths=0.5, cbar_kws={"shrink": 0.8})
plt.title('特征相关性矩阵')
plt.tight_layout()
# savefig does not create missing directories, so make sure the output
# folder exists before writing the figure.
os.makedirs("../output", exist_ok=True)
plt.savefig("../output/feature_correlation.png", dpi=300, bbox_inches='tight')
plt.show()

# %%
# 6. Feature distribution analysis
# One histogram per feature on a 3x3 grid of axes.
fig, axes = plt.subplots(3, 3, figsize=(15, 12))
axes = axes.flatten()

# Plot at most as many features as there are axes (9).
plotted_cols = feature_cols[:len(axes)]
for ax, col in zip(axes, plotted_cols):
    # Missing values are dropped before binning.
    ax.hist(feature_data[col].dropna(), bins=30, edgecolor='black', alpha=0.7)
    ax.set_title(f'{col}分布')
    ax.set_xlabel('值')
    ax.set_ylabel('频数')

# Hide axes that received no feature, so the saved figure does not contain
# empty framed panels when there are fewer features than grid cells.
for ax in axes[len(plotted_cols):]:
    ax.set_visible(False)

plt.tight_layout()
# savefig does not create missing directories, so make sure the output
# folder exists before writing the figure.
os.makedirs("../output", exist_ok=True)
plt.savefig("../output/feature_distributions.png", dpi=300, bbox_inches='tight')
plt.show()

# %%
# 7. Save the feature data
# Write the full feature table (including the simulated columns and the
# target) as UTF-8 CSV without the index column.
feature_output_path = "../data/processed/features.csv"
feature_data.to_csv(feature_output_path, encoding='utf-8', index=False)

# Report the output location, then the end of the workflow.
for message in (f"特征数据已保存: {feature_output_path}", "特征工程流程完成!"):
    print(message)