# 酒店选址与客流预测基础模型 (修复版)

本笔记本演示酒店选址和客流预测的基础模型实现。

## 注意：模型更新

原始模型在处理没有节点索引的数据时出现错误：
```
TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'NoneType'>
```

我们已修复`SpatialTemporalDataset`类和`SpatialTemporalModel.forward`方法，使其能够正确处理没有节点索引的情况。修复包括：

1. 将`None`值替换为空张量
2. 在模型的`forward`方法中添加适当的检查

如果您遇到这个错误，请确保使用最新版本的模型文件。

## 1. 导入必要的库

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import torch
import sys
import os
import importlib

# 添加项目根目录到路径
sys.path.append('..')

# 导入自定义模块前确保重新加载以使用最新版本
if 'src.models.spatial_temporal_model' in sys.modules:
    importlib.reload(sys.modules['src.models.spatial_temporal_model'])

# 导入自定义模块
from src.models.spatial_temporal_model import SpatialTemporalModel, SpatialTemporalTrainer, SpatialTemporalDataset, prepare_sequence_data
from src.models.location_selection_model import LocationSelectionModel
from src.utils.data_collection import download_hotel_booking_data, fetch_poi_data
from src.utils.data_preprocessing import preprocess_hotel_data, create_features, integrate_poi_data

# 设置绘图样式
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 8)

# 设置随机种子
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x1e01c6a39b0>

## 2. 数据加载与预处理

In [12]:
# 加载原始数据（可选）
try:
    hotel_df = pd.read_csv('../data/raw/hotel-booking-demand/hotel_bookings.csv')
    print(f"原始数据加载成功！形状: {hotel_df.shape}")
except FileNotFoundError:
    print("原始数据文件不存在，尝试下载数据...")
    try:
        # 尝试从项目根目录导入下载函数
        from download_data import download_hotel_data
        download_success = download_hotel_data()
        if download_success:
            print("数据下载成功，正在加载...")
            hotel_df = pd.read_csv('../data/raw/hotel-booking-demand/hotel_bookings.csv')
            print(f"原始数据加载成功！形状: {hotel_df.shape}")
        else:
            print("数据下载失败，请手动下载数据")
            hotel_df = pd.DataFrame()  # 创建空DataFrame
    except (ImportError, ModuleNotFoundError):
        print("未找到下载模块，请手动下载数据或运行项目根目录的download_data.py")
        hotel_df = pd.DataFrame()  # 创建空DataFrame

NameError: name 'pd' is not defined

In [None]:
# 直接加载预处理后的数据
try:
    processed_df = pd.read_csv('../data/processed/hotel_processed.csv')
    print(f"预处理数据加载成功！形状: {processed_df.shape}")
    processed_df.head()
except FileNotFoundError:
    print("预处理数据文件不存在，请先运行fix_preprocessing.py脚本")

预处理数据加载成功！形状: (5000, 43)


In [None]:
# 加载特征工程后的数据
try:
    features_df = pd.read_csv('../data/processed/hotel_features.csv')
    print(f"特征数据加载成功！形状: {features_df.shape}")
    features_df.head()
except FileNotFoundError:
    print("特征数据文件不存在，请先运行fix_preprocessing.py脚本")
    # 如果预处理数据存在但特征数据不存在，可以创建特征
    if 'processed_df' in locals() and not processed_df.empty:
        print("从预处理数据创建特征...")
        features_df = create_features(processed_df)
        # 保存处理后的数据
        os.makedirs('../data/processed', exist_ok=True)
        features_df.to_csv('../data/processed/hotel_features.csv', index=False)
        print("特征数据已保存到 ../data/processed/hotel_features.csv")

特征数据加载成功！形状: (5000, 63)


## 3. 空间时间序列模型

In [None]:
# 准备时间序列数据
if 'features_df' in locals() and not features_df.empty:
    # 选择特征列和目标列
    # 注意: 实际列名应根据实际数据调整
    feature_cols = ['lead_time', 'arrival_date_month_num', 'stays_in_weekend_nights', 
                     'stays_in_week_nights', 'adults', 'children', 'is_repeated_guest', 
                     'previous_cancellations', 'previous_bookings_not_canceled', 
                     'booking_changes', 'required_car_parking_spaces', 'total_of_special_requests']
    
    # 确保所有特征列都存在
    for col in feature_cols:
        if col not in features_df.columns:
            print(f"警告：列 {col} 不在特征数据中")
    
    # 只使用存在的特征列
    available_feature_cols = [col for col in feature_cols if col in features_df.columns]
    
    target_cols = ['adr']  # 平均每日房价
    
    # 准备序列数据
    X, y, _ = prepare_sequence_data(features_df, available_feature_cols, target_cols, seq_len=7, stride=1)
    
    print(f"序列数据形状: X={X.shape}, y={y.shape}")
    
    # 划分训练集和测试集
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"训练集: X={X_train.shape}, y={y_train.shape}")
    print(f"测试集: X={X_test.shape}, y={y_test.shape}")

序列数据形状: X=(4994, 7, 12), y=(4994, 1)
训练集: X=(3995, 7, 12), y=(3995, 1)
测试集: X=(999, 7, 12), y=(999, 1)


In [None]:
# 创建数据集和数据加载器
if 'X_train' in locals():
    batch_size = 32
    
    train_dataset = SpatialTemporalDataset(X_train, y_train)
    test_dataset = SpatialTemporalDataset(X_test, y_test)
    
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
    print(f"训练数据批次数: {len(train_loader)}")
    print(f"测试数据批次数: {len(test_loader)}")

训练数据批次数: 125
测试数据批次数: 32


In [None]:
# 创建并训练模型
if 'train_loader' in locals():
    # 模型参数
    input_dim = X_train.shape[2]  # 特征维度
    hidden_dim = 64
    output_dim = y_train.shape[1]  # 目标维度
    num_nodes = 100  # 空间节点数量
    seq_len = X_train.shape[1]  # 序列长度
    
    # 创建模型
    model = SpatialTemporalModel(
        input_dim=input_dim, 
        hidden_dim=hidden_dim, 
        output_dim=output_dim, 
        num_nodes=num_nodes, 
        seq_len=seq_len
    )
    
    # 创建训练器
    trainer = SpatialTemporalTrainer(model, learning_rate=0.001)
    
    # 训练模型
    print("开始训练空间时间模型...")
    # 注意: 实际训练时应使用更多的epoch
    history = trainer.train(train_loader, test_loader, epochs=5, patience=3)

2025-04-19 21:43:09,120 - src.models.spatial_temporal_model - INFO - 模型初始化完成，使用设备: cpu
2025-04-19 21:43:09,121 - src.models.spatial_temporal_model - INFO - 开始训练，总epochs: 5
2025-04-19 21:43:09,152 - src.models.spatial_temporal_model - INFO - 批次 0/125, 损失: nan
2025-04-19 21:43:09,278 - src.models.spatial_temporal_model - INFO - 批次 10/125, 损失: nan


开始训练空间时间模型...


2025-04-19 21:43:09,390 - src.models.spatial_temporal_model - INFO - 批次 20/125, 损失: nan
2025-04-19 21:43:09,480 - src.models.spatial_temporal_model - INFO - 批次 30/125, 损失: nan
2025-04-19 21:43:09,576 - src.models.spatial_temporal_model - INFO - 批次 40/125, 损失: nan
2025-04-19 21:43:09,669 - src.models.spatial_temporal_model - INFO - 批次 50/125, 损失: nan
2025-04-19 21:43:09,761 - src.models.spatial_temporal_model - INFO - 批次 60/125, 损失: nan
2025-04-19 21:43:09,858 - src.models.spatial_temporal_model - INFO - 批次 70/125, 损失: nan
2025-04-19 21:43:09,950 - src.models.spatial_temporal_model - INFO - 批次 80/125, 损失: nan
2025-04-19 21:43:10,059 - src.models.spatial_temporal_model - INFO - 批次 90/125, 损失: nan
2025-04-19 21:43:10,152 - src.models.spatial_temporal_model - INFO - 批次 100/125, 损失: nan
2025-04-19 21:43:10,252 - src.models.spatial_temporal_model - INFO - 批次 110/125, 损失: nan
2025-04-19 21:43:10,347 - src.models.spatial_temporal_model - INFO - 批次 120/125, 损失: nan


ValueError: Input contains NaN.

In [None]:
# 可视化训练历史
if 'history' in locals():
    plt.figure(figsize=(12, 5))
    
    plt.subplot(1, 2, 1)
    plt.plot(history['train_losses'], label='训练损失')
    plt.plot(history['valid_losses'], label='验证损失')
    plt.title('训练过程中的损失')
    plt.xlabel('Epoch')
    plt.ylabel('损失')
    plt.legend()
    
    plt.subplot(1, 2, 2)
    metrics = [m['rmse'] for m in history['valid_metrics']]
    plt.plot(metrics, label='验证RMSE')
    plt.title('验证集RMSE')
    plt.xlabel('Epoch')
    plt.ylabel('RMSE')
    plt.legend()
    
    plt.tight_layout()
    plt.show()

## 4. 选址评分模型

In [None]:
# 创建位置特征数据集
# 注意: 实际应用中需要真实的位置数据
# 这里仅为演示，创建一些假数据

def create_dummy_location_data(n_locations=50):
    """创建模拟位置数据"""
    np.random.seed(42)
    
    location_data = {
        'location_id': [f'LOC_{i:03d}' for i in range(n_locations)],
        'latitude': np.random.uniform(31.1, 31.3, n_locations),
        'longitude': np.random.uniform(121.4, 121.6, n_locations),
        'poi_restaurant_count': np.random.randint(5, 50, n_locations),
        'poi_shopping_count': np.random.randint(3, 30, n_locations),
        'poi_entertainment_count': np.random.randint(2, 20, n_locations),
        'poi_transport_count': np.random.randint(1, 15, n_locations),
        'distance_to_city_center': np.random.uniform(0.5, 15.0, n_locations),
        'distance_to_airport': np.random.uniform(5.0, 50.0, n_locations),
        'distance_to_subway': np.random.uniform(0.1, 3.0, n_locations),
        'population_density': np.random.uniform(5000, 25000, n_locations),
        'income_per_capita': np.random.uniform(80000, 150000, n_locations),
        'unemployment_rate': np.random.uniform(2.0, 8.0, n_locations),
        'competitor_count': np.random.randint(0, 10, n_locations),
        'area_km2': np.random.uniform(0.5, 5.0, n_locations)
    }
    
    # 添加一个模拟目标值(可以视为历史业绩)
    # 假设业绩与多个因素有关，并添加一些随机噪声
    score = (
        0.3 * location_data['poi_restaurant_count'] + 
        0.2 * location_data['poi_shopping_count'] + 
        0.2 * location_data['poi_entertainment_count'] + 
        0.3 * location_data['poi_transport_count'] - 
        0.5 * location_data['distance_to_city_center'] - 
        0.2 * location_data['distance_to_subway'] + 
        0.1 * location_data['population_density'] / 1000 + 
        0.1 * location_data['income_per_capita'] / 10000 - 
        0.3 * location_data['competitor_count'] + 
        np.random.normal(0, 5, n_locations)
    )
    # 转换为正值并归一化到0-100范围
    score = score - min(score)
    score = 100 * score / max(score)
    location_data['historical_score'] = score
    
    return pd.DataFrame(location_data)

# 创建位置数据
location_df = create_dummy_location_data(100)
print(f"生成模拟位置数据: {location_df.shape}")
location_df.head()

In [None]:
# 准备选址模型数据
feature_cols = [
    'poi_restaurant_count', 'poi_shopping_count', 'poi_entertainment_count', 'poi_transport_count',
    'distance_to_city_center', 'distance_to_airport', 'distance_to_subway',
    'population_density', 'income_per_capita', 'unemployment_rate', 'competitor_count', 'area_km2'
]

X = location_df[feature_cols].values
y = location_df['historical_score'].values

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"训练集: X={X_train.shape}, y={y_train.shape}")
print(f"测试集: X={X_test.shape}, y={y_test.shape}")

In [None]:
# 训练选址评分模型
print("开始训练选址评分模型...")

# 使用XGBoost模型
location_model = LocationSelectionModel(model_type='xgboost')
location_model.fit(X_train, y_train, feature_names=feature_cols)

# 在测试集上评估
y_pred = location_model.predict(X_test)

# 计算指标
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"选址模型评估指标:")
print(f"MSE:  {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE:  {mae:.4f}")
print(f"R²:   {r2:.4f}")

In [None]:
# 查看特征重要性
feature_importance = location_model.get_feature_importance(plot=True)
feature_importance

In [None]:
# 为候选位置评分（使用测试集作为候选位置）
test_df = location_df.iloc[list(np.random.choice(len(location_df), 20, replace=False))].copy()
scored_locations = location_model.score_locations(test_df, feature_cols)

print("候选位置评分结果:")
scored_locations[['location_id', 'predicted_score'] + feature_cols].head(10)

In [None]:
# 可视化候选位置得分
plt.figure(figsize=(12, 6))
sns.barplot(x='location_id', y='predicted_score', data=scored_locations.head(10))
plt.title('前10个候选位置得分')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 5. 保存模型

In [None]:
# 创建模型目录
os.makedirs('../data/models', exist_ok=True)

# 保存选址模型
if 'location_model' in locals():
    location_model.save_model('../data/models/location_model.pkl')
    print("选址模型已保存到 ../data/models/location_model.pkl")

# 保存空间时间模型
if 'trainer' in locals() and 'model' in locals():
    torch.save({
        'model_state_dict': model.state_dict(),
        'input_dim': input_dim,
        'hidden_dim': hidden_dim,
        'output_dim': output_dim,
        'num_nodes': num_nodes,
        'seq_len': seq_len
    }, '../data/models/spatial_temporal_model.pt')
    print("空间时间模型已保存到 ../data/models/spatial_temporal_model.pt")

## 6. 总结

在本笔记本中，我们实现了酒店选址与客流预测的两个基础模型：

1. **空间时间注意力机制模型**：用于预测酒店入住率和每日平均房价，结合了空间和时间两个维度的信息。
2. **选址评分模型**：使用XGBoost算法，基于位置特征为候选地址打分，帮助酒店选择最佳位置。

这些模型可以进一步优化和扩展：

- 收集更多真实数据，特别是位置相关的POI数据和历史入住率数据
- 优化模型超参数
- 增加更多特征，如季节性特征、天气数据、事件数据等
- 实现冷启动策略，处理新酒店没有历史数据的情况
- 部署模型并提供API服务