In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
根据《Benchmarking Spatial Joins À La Carte》中的思路生成随机矩形集合：
- 矩形边与坐标轴平行
- 用随机变量 (x, y, t, a) 控制位置、形状和面积
- N = 100000
- 保证一定覆盖率，从而产生适度重叠
- 将结果写入 CSV 文件
"""

import math
import random
import csv
from dataclasses import dataclass
from typing import List, Tuple


@dataclass
class Rectangle:
    """简单表示一个矩形：左下角 + 右上角坐标"""
    x_min: float
    y_min: float
    x_max: float
    y_max: float


# ====================== 随机分布采样函数 ======================

def sample_truncated_normal(mu: float,
                            sigma: float,
                            low: float,
                            high: float,
                            max_tries: int = 100) -> float:
    """截断正态分布采样，超出范围就重采"""
    for _ in range(max_tries):
        x = random.gauss(mu, sigma)
        if low < x < high:
            return x
    # 多次失败就退而求其次，用均匀分布兜底
    return random.uniform(low, high)


def sample_truncated_exponential(mean: float,
                                 min_factor: float,
                                 max_factor: float,
                                 max_tries: int = 1000) -> float:
    """
    截断指数分布采样：
    - 期望值 ≈ mean（lambda = 1 / mean）
    - 实际取值范围 [min_factor * mean, max_factor * mean]
    """
    lam = 1.0 / mean
    a_min = min_factor * mean
    a_max = max_factor * mean

    for _ in range(max_tries):
        x = random.expovariate(lam)
        if a_min <= x <= a_max:
            return x

    # 兜底：强行截断
    return max(a_min, min(a_max, random.expovariate(lam)))


# ====================== 核心生成逻辑 ======================

def generate_rectangles(n: int,
                        universe: Tuple[float, float] = (1.0, 1.0),
                        coverage: float = 0.7,
                        angle_mu: float = math.pi / 4,
                        angle_sigma: float = 0.5,
                        area_min_factor: float = 0.05,
                        area_max_factor: float = 40.0,
                        seed: int = 42) -> List[Rectangle]:
    """
    按论文思路生成 n 个矩形：
    - 宇宙空间为 [0, Ux] × [0, Uy]
    - coverage: 所有矩形面积之和 / 宇宙面积
    - angle_mu, angle_sigma: 控制 t 的正态分布（主对角线与 x 轴夹角）
    - area_*_factor: 控制面积指数分布的截断范围
    """
    random.seed(seed)

    Ux, Uy = universe
    U = Ux * Uy  # 宇宙面积
    mean_area = coverage * U / n  # 论文中的 μ_a = C * U / N

    rects: List[Rectangle] = []

    while len(rects) < n:
        # 1) 采样角度 t（控制长宽比）
        t = sample_truncated_normal(
            mu=angle_mu,
            sigma=angle_sigma,
            low=0.01,          # 避免 tan(t) 过大/过小
            high=math.pi / 2 - 0.01
        )

        # 2) 采样面积 a（指数分布，长尾，大小“自由”）
        a = sample_truncated_exponential(
            mean=mean_area,
            min_factor=area_min_factor,
            max_factor=area_max_factor
        )

        # 3) 由 (a, t) 计算宽高:
        #    tan(t) = h / w,  a = w * h
        tan_t = math.tan(t)
        w = math.sqrt(a / tan_t)
        h = math.sqrt(a * tan_t)

        # 太大的矩形直接丢弃（极少发生）
        if w >= Ux or h >= Uy:
            continue

        # 4) 在能放下矩形的范围内均匀采样左下角坐标
        x_min = random.uniform(0.0, Ux - w)
        y_min = random.uniform(0.0, Uy - h)
        x_max = x_min + w
        y_max = y_min + h

        rects.append(Rectangle(x_min, y_min, x_max, y_max))

    return rects


# ====================== 存储函数 ======================

def save_rectangles_to_csv(rects: List[Rectangle],
                           path: str) -> None:
    """
    将矩形保存为 CSV 文件，列为：
    id, x_min, y_min, x_max, y_max, width, height, area
    """
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(
            ["id", "x_min", "y_min", "x_max", "y_max", "width", "height", "area"]
        )
        for i, r in enumerate(rects):
            width = r.x_max - r.x_min
            height = r.y_max - r.y_min
            area = width * height
            writer.writerow([i, r.x_min, r.y_min, r.x_max, r.y_max, width, height, area])


# ====================== 主函数 ======================

def main():
    N = 100_00
    universe = (1.0, 1.0)
    coverage = 0.7  # 覆盖率（矩形总面积 / 宇宙面积），越大重叠越多

    rects = generate_rectangles(
        n=N,
        universe=universe,
        coverage=coverage,
        angle_mu=math.pi / 4,   # 以接近正方形为主
        angle_sigma=0.5,        # 波动大一些，保证形状多样
        area_min_factor=0.05,   # 面积下截断：0.05 * mean_area
        area_max_factor=40.0,   # 面积上截断：40 * mean_area
        seed=42                 # 固定随机种子，保证可复现
    )

    save_rectangles_to_csv(rects, "rectangles_100k.csv")
    print(f"生成 {len(rects)} 个矩形，已保存到 rectangles_100k.csv")


if __name__ == "__main__":
    main()


生成 10000 个矩形，已保存到 rectangles_100k.csv
