# MyTrain

## DataSetInitialization

In [2]:
import sys
import logging
from logging import getLogger
from recbole.utils import init_logger, init_seed
from mamba4poi import Mamba4POI
from recbole.config import Config
from utils import *
from recbole.data.transform import construct_transform
from recbole.utils import (
    init_logger,
    get_model,
    get_trainer,
    init_seed,
    set_color,
    get_flops,
    get_environment,
)
import torch

config = Config(model=Mamba4POI, config_file_list=['config.yaml'])
dataset = create_dataset(config)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
print(dataset.item_feat)

       venue_id  venue_category_id      x      y
0             0                0.0  16308  24789
1             1                1.0  13416  23128
2             2                2.0   4580  22009
3             3                3.0  15948  32880
4             4                4.0  15652  24245
...         ...                ...    ...    ...
38329     38329              122.0  20358  32667
38330     38330              312.0   7326   9691
38331     38331              119.0  17896  26165
38332     38332               59.0  29282  28075
38333     38333              119.0  14448  23140

[38334 rows x 4 columns]


# CreatingLoactionHashMap

In [20]:
import torch
import numpy as np
import pandas as pd

# 1. 加载数据（假设 dataset 已经是 DataFrame 格式）
locations = dataset.item_feat.to_numpy()  # 获取 numpy 数组
venue_id = locations[:, 0]  
venue_category = locations[:, 1]  
x = locations[:, 2]  
y = locations[:, 3]

# 2. 将数据转为 PyTorch Tensor 并移动到 GPU 上
venue_id = torch.tensor(venue_id, dtype=torch.long).cuda()  
venue_category = torch.tensor(venue_category, dtype=torch.long).cuda()
x = torch.tensor(x, dtype=torch.float32).cuda()  
y = torch.tensor(y, dtype=torch.float32).cuda()

# 3. 设置网格的尺寸（例如：每个网格大小为500x500）
grid_width = 500  # 网格宽度
grid_height = 500  # 网格高度

# 4. 计算网格的数量（基于坐标的最大最小值）
x_min, x_max = x.min(), x.max()
y_min, y_max = y.min(), y.max()

# 计算网格行列数
num_x_grids = int((x_max - x_min) // grid_width) + 1
num_y_grids = int((y_max - y_min) // grid_height) + 1


# 5. 分批次处理
batch_size = 10000  # 每批处理的样本数量
num_batches = len(venue_id) // batch_size + 1  # 批次数量

# 创建稠密张量来暂存更新，最后将其转换为稀疏张量
venue_density_matrix = torch.zeros((num_y_grids, num_x_grids), dtype=torch.int32).cuda()
category_density_matrix = torch.zeros((num_y_grids, num_x_grids, len(torch.unique(venue_category))), dtype=torch.int32).cuda()

for batch_idx in range(num_batches):
    # 计算每个批次的索引范围
    start_idx = batch_idx * batch_size
    end_idx = min((batch_idx + 1) * batch_size, len(venue_id))
    
    # 获取当前批次数据
    batch_venue_id = venue_id[start_idx:end_idx]
    batch_venue_category = venue_category[start_idx:end_idx]
    batch_x = x[start_idx:end_idx]
    batch_y = y[start_idx:end_idx]

    # 计算每个数据点所属的网格
    grid_x = ((batch_x - x_min) / grid_width).floor().long()  # 计算 x 对应的网格位置
    grid_y = ((batch_y - y_min) / grid_height).floor().long()  # 计算 y 对应的网格位置

    # 确保网格索引不超出边界
    grid_x = torch.clamp(grid_x, 0, num_x_grids - 1)
    grid_y = torch.clamp(grid_y, 0, num_y_grids - 1)

    # 使用 scatter_add_ 更新稠密张量的计数
    venue_density_matrix.index_put_((grid_y, grid_x), torch.ones(len(grid_y), dtype=torch.int32).cuda(), accumulate=True)
    
    # 更新类别密度矩阵
    for i in range(len(batch_venue_category)):
        category_density_matrix[grid_y[i], grid_x[i], batch_venue_category[i]] += 1

# 7. 将稠密张量转换为稀疏张量
venue_density_matrix_sparse = venue_density_matrix.to_sparse()
category_density_matrix_sparse = category_density_matrix.to_sparse()

# 8. 打印结果
print("Venue Density Matrix Sparse:")
print(venue_density_matrix_sparse)
print("Category Density Matrix Sparse:")
print(category_density_matrix_sparse)


Venue Density Matrix Sparse:
tensor(indices=tensor([[ 0,  0,  0,  ..., 96, 96, 97],
                       [20, 24, 25,  ..., 45, 47, 41]]),
       values=tensor([1, 5, 1,  ..., 4, 1, 2]),
       device='cuda:0', size=(98, 69), nnz=3227, dtype=torch.int32,
       layout=torch.sparse_coo)
Category Density Matrix Sparse:
tensor(indices=tensor([[ 0,  0,  0,  ..., 96, 97, 97],
                       [20, 24, 24,  ..., 47, 41, 41],
                       [27, 36, 63,  ..., 35, 15, 67]]),
       values=tensor([1, 1, 1,  ..., 1, 1, 1]),
       device='cuda:0', size=(98, 69, 399), nnz=25802, dtype=torch.int32,
       layout=torch.sparse_coo)


In [None]:
import plotly.graph_objects as go
import plotly.colors as pc

# 将 x_min 和 y_min 移回 CPU
xmin = x_min.cpu().item()
ymin = y_min.cpu().item()

# 选择多种颜色以区分不同类别
category_colors = pc.qualitative.Set3 + pc.qualitative.Pastel1 + pc.qualitative.Pastel2

# 创建图形对象
fig = go.Figure()

# 添加网格图层
for i in range(num_y_grids):
    for j in range(num_x_grids):
        # 计算网格的边界
        grid_x_min = xmin + j * grid_width
        grid_x_max = grid_x_min + grid_width
        grid_y_min = ymin + i * grid_height
        grid_y_max = grid_y_min + grid_height

        # 随机选择颜色或交替颜色方案
        fillcolor = f"rgba({(i+j)%255}, {(j*5)%255}, {(i*5)%255}, 0.1)"  # 生成不同的半透明颜色

        # 绘制矩形网格，设置透明度
        fig.add_trace(go.Scatter(
            x=[grid_x_min, grid_x_min, grid_x_max, grid_x_max, grid_x_min],
            y=[grid_y_min, grid_y_max, grid_y_max, grid_y_min, grid_y_min],
            fill='toself',
            fillcolor=fillcolor,
            line=dict(width=0),
            mode='lines'
        ))

# 添加数据点，使用离散颜色方案区分不同类别
fig.add_trace(go.Scatter(
    x=x.cpu().numpy(),
    y=y.cpu().numpy(),
    mode='markers',
    marker=dict(
        size=5,
        color=[category_colors[c % len(category_colors)] for c in venue_category.cpu().numpy()],  # 使用离散颜色方案
        showscale=False
    )
))

# 设置布局
fig.update_layout(
    title="Venue Points with Distinct Category Colors and Grid Layer at the Bottom",
    xaxis_title="Longitude",
    yaxis_title="Latitude",
    showlegend=False
)

# 在浏览器中显示
fig.show(renderer="browser")


TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

地点：偏远中心稀有丰富
街区：偏远中心密集稀疏
用500m*500m作为基本地理块进行聚合
街区密度阈值 街区偏远度阈值
中心和密集相关
丰富地点一定是便利店之类的常用地点
重点是稀有地点区分
偏远但密集的是乡镇 偏远密集街区的丰富地点是便民设施 稀有地点大概率是学校 政府等大场地 小概率是景点
偏远稀疏的是无人区 偏远稀疏街区的丰富地点可能就是便利店 但是稀有地点很可能是景点
中心稀疏是城郊 景点的概率较大
中心密集的是市中心 这是推荐的重点

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import plotly.graph_objects as go
import plotly.io as pio


class DynamicGridModel(nn.Module):
    def __init__(self, x, y, venue_category, num_categories, grid_size=50):
        super(DynamicGridModel, self).__init__()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        # 将坐标和类目信息移至设备上
        self.x = torch.tensor(x, dtype=torch.float32).to(self.device)
        self.y = torch.tensor(y, dtype=torch.float32).to(self.device)
        self.venue_category = torch.tensor(venue_category, dtype=torch.long).to(self.device)
        self.num_categories = num_categories  # 将类目总数保存为模型属性
        # 计算全体类目的比例
        category_counts = torch.bincount(self.venue_category)
        self.category_proportions = category_counts.float() / category_counts.sum()
        
        # 初始化网格边界和网格大小
        self.grid_size = grid_size
        self.grid_boundaries = nn.Parameter(torch.rand((grid_size, 2), dtype=torch.float32).to(self.device))  # 每个网格的左下角坐标
        self.grid_sizes = nn.Parameter(torch.rand((grid_size, 2), dtype=torch.float32).to(self.device))  # 每个网格的宽度和高度

    def compute_grid_category_proportions(self):
        grid_category_proportions = torch.zeros(self.grid_size, len(self.category_proportions)).to(self.device)
        grid_item_count = torch.zeros(self.grid_size).to(self.device)
        grid_area = torch.zeros(self.grid_size).to(self.device)
        
        for i in range(self.grid_size):
            x_min, y_min = self.grid_boundaries[i, 0], self.grid_boundaries[i, 1]
            x_max, y_max = x_min + self.grid_sizes[i, 0], y_min + self.grid_sizes[i, 1]

            in_grid = (self.x >= x_min) & (self.x < x_max) & (self.y >= y_min) & (self.y < y_max)
            grid_category_count = torch.bincount(self.venue_category[in_grid], minlength=len(self.category_proportions))
            grid_item_count[i] = in_grid.sum().float()
            grid_area[i] = self.grid_sizes[i, 0] * self.grid_sizes[i, 1]

            if grid_category_count.sum() > 0:
                grid_category_proportions[i] = grid_category_count.float() / grid_category_count.sum()

        return grid_category_proportions, grid_item_count, grid_area

    def compute_mse(self, predictions, target):
        return ((predictions - target) ** 2).mean()

    def compute_density_loss(self, grid_item_count, grid_area):
        grid_density = grid_item_count / grid_area
        target_density = grid_item_count.sum() / grid_area.sum()
        return ((grid_density - target_density) ** 2).mean()

    def forward(self):
        grid_category_proportions, grid_item_count, grid_area = self.compute_grid_category_proportions()
        category_loss = self.compute_mse(grid_category_proportions, self.category_proportions)
        density_loss = self.compute_density_loss(grid_item_count, grid_area)
        total_loss = category_loss + density_loss
        return total_loss, category_loss, density_loss

    def visualize_grid(self):
        """
        使用 Plotly 可视化网格分布。
        """
        grid_boundaries = self.grid_boundaries.detach().cpu().numpy()
        grid_sizes = self.grid_sizes.detach().cpu().numpy()
        grid_colors = []
        grid_text = []

        # 遍历每个网格，计算密度与类目比例
        for i in range(self.grid_size):
            x_min, y_min = grid_boundaries[i]
            x_max, y_max = x_min + grid_sizes[i, 0], y_min + grid_sizes[i, 1]
            grid_points = ((self.x >= x_min) & (self.x < x_max) &
                           (self.y >= y_min) & (self.y < y_max))
            category_counts = torch.bincount(self.venue_category[grid_points], minlength=len(self.category_proportions)).cpu().numpy()
            density = grid_points.sum().item() / ((x_max - x_min) * (y_max - y_min) + 1e-6)

            # 计算网格颜色，基于类目比例和密度
            category_ratio = category_counts / (category_counts.sum() + 1e-6)
            color = f"rgba({category_ratio[0] * 255}, {category_ratio[1] * 255}, {category_ratio[2] * 255}, 0.5)"
            grid_colors.append(color)

            # 添加文本信息（密度和类目比例）
            grid_text.append(f"Density: {density:.2f}<br>Category Ratios: {category_ratio}")

        # 使用 Plotly 绘制网格边界和中心点
        fig = go.Figure()

        # 绘制网格
        for i in range(self.grid_size):
            x_min, y_min = grid_boundaries[i]
            x_max, y_max = x_min + grid_sizes[i, 0], y_min + grid_sizes[i, 1]
            fig.add_trace(go.Scatter(
                x=[x_min, x_min, x_max, x_max, x_min],
                y=[y_min, y_max, y_max, y_min, y_min],
                fill='toself',
                fillcolor=grid_colors[i],
                line=dict(color='black', width=1),
                mode='lines',
                hoverinfo="text",
                text=grid_text[i]
            ))

        # 绘制中心点
        fig.add_trace(go.Scatter(
            x=self.x.cpu(),
            y=self.y.cpu(),
            mode='markers',
            marker=dict(
                color=[f"rgba({c * 255}, {c * 255}, 150, 0.5)" for c in self.venue_category.cpu().numpy() / self.num_categories],
                size=5,
            ),
            hoverinfo="skip"
        ))

        fig.update_layout(
            title="Dynamic Grid Visualization with Density and Category Ratios",
            xaxis_title="Longitude",
            yaxis_title="Latitude",
            showlegend=False
        )

        pio.renderers.default = 'browser'
        fig.show()

# 使用模型
# 假设 dataset 是包含经纬度和类目信息的 DataFrame
locations = dataset.item_feat.to_numpy()  # 获取 numpy 数组
venue_id = locations[:, 0]
venue_category = locations[:, 1]
x = locations[:, 2]
y = locations[:, 3]

num_categories = len(np.unique(venue_category))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DynamicGridModel(x, y, venue_category, num_categories, grid_size=50).to(device)

optimizer = optim.Adam(model.parameters(), lr=0.01)
num_epochs = 100

# 训练模型
for epoch in range(num_epochs):
    optimizer.zero_grad()
    total_loss, category_loss, density_loss = model()
    total_loss.backward()
    optimizer.step()
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Total Loss: {total_loss.item()}, Category Loss: {category_loss.item()}, Density Loss: {density_loss.item()}")

# 可视化结果
model.visualize_grid()


Epoch [1/100], Total Loss: 2.4291703084600158e-05, Category Loss: 2.4291703084600158e-05, Density Loss: 0.0
Epoch [2/100], Total Loss: 2.4291703084600158e-05, Category Loss: 2.4291703084600158e-05, Density Loss: 0.0
Epoch [3/100], Total Loss: 2.4291703084600158e-05, Category Loss: 2.4291703084600158e-05, Density Loss: 0.0
Epoch [4/100], Total Loss: 2.4291703084600158e-05, Category Loss: 2.4291703084600158e-05, Density Loss: 0.0
Epoch [5/100], Total Loss: 2.4291703084600158e-05, Category Loss: 2.4291703084600158e-05, Density Loss: 0.0
Epoch [6/100], Total Loss: 2.4291703084600158e-05, Category Loss: 2.4291703084600158e-05, Density Loss: 0.0
Epoch [7/100], Total Loss: 2.4291703084600158e-05, Category Loss: 2.4291703084600158e-05, Density Loss: 0.0
Epoch [8/100], Total Loss: 2.4291703084600158e-05, Category Loss: 2.4291703084600158e-05, Density Loss: 0.0
Epoch [9/100], Total Loss: 2.4291703084600158e-05, Category Loss: 2.4291703084600158e-05, Density Loss: 0.0
Epoch [10/100], Total Loss: 

In [None]:
import utils
importlib.reload(utils)
from utils import *
# dataset splitting
train_data, valid_data, test_data = data_preparation(config, dataset)


In [None]:
print(dataset.inter_feat)
for batch_idx, interaction in enumerate(train_data):
    print(interaction)
    break

In [None]:

import importlib
import mamba4poi
import utils
importlib.reload(mamba4poi)
importlib.reload(utils)
from utils import *
from mamba4poi import Mamba4POI
import os
# 设置 CUDA_LAUNCH_BLOCKING 为 1
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
if __name__ == '__main__':
    config = Config(model=Mamba4POI, config_file_list=['config.yaml'])
    init_seed(config['seed'], config['reproducibility'])
    
    # logger initialization
    init_logger(config)
    logger = getLogger()
    logger.info(sys.argv)
    logger.info(config)

    logger.info(dataset)

    # model loading and initialization
    init_seed(config["seed"] + config["local_rank"], config["reproducibility"])
    model = Mamba4POI(config, train_data.dataset).to(config['device'])
    logger.info(model)
    
    transform = construct_transform(config)
    flops = get_flops(model, dataset, config["device"], logger, transform)
    logger.info(set_color("FLOPs", "blue") + f": {flops}")

    # trainer loading and initialization
    trainer = Trainer(config, model)

    best_valid_score, best_valid_result = trainer.fit(
    train_data,
    valid_data,  # 可以保留验证数据集
    verbose=True,    # 保留详细信息，打印结果
    saved=True,      # 根据需要决定是否保存模型参数
    show_progress=True,  # 不显示进度条
    callback_fn=None  # 如果不需要回调函数，可以设置为 None
)



    # model evaluation
    test_result = trainer.evaluate(
        test_data, show_progress=config["show_progress"]
    )
    
    environment_tb = get_environment(config)
    logger.info(
        "The running environment of this training is as follows:\n"
        + environment_tb.draw()
    )

    logger.info(set_color("best valid ", "yellow") + f": {best_valid_result}")
    logger.info(set_color("test result", "yellow") + f": {test_result}")