In [28]:
# Milestone 1.2.1

import numpy as np
from collections import defaultdict

class Rectangle:
    """Axis-aligned rectangle together with the (x, y) points attached to it.

    Attributes:
        x_min, x_max: bounds of the rectangle along the x axis.
        y_min, y_max: bounds of the rectangle along the y axis.
        points: list of (x, y) tuples associated with this rectangle.
    """

    def __init__(self, x_min, x_max, y_min, y_max, points):
        self.x_min, self.x_max = x_min, x_max
        self.y_min, self.y_max = y_min, y_max
        self.points = points  # (x, y) tuples belonging to this rectangle

    @staticmethod
    def _gap_to_interval(value, low, high):
        """Return the distance from value to the interval [low, high] (0 if inside)."""
        if low <= value <= high:
            return 0
        return min(abs(value - low), abs(value - high))

    def min_distance_to_point(self, x, y):
        """Return the smallest l-infinity distance from (x, y) to the rectangle."""
        gap_x = self._gap_to_interval(x, self.x_min, self.x_max)
        gap_y = self._gap_to_interval(y, self.y_min, self.y_max)
        return max(gap_x, gap_y)

    def max_distance_to_point(self, x, y):
        """Return the largest l-infinity distance from (x, y) to the rectangle."""
        # Along each axis the farthest location is one of the two bounds.
        reach_x = max(abs(x - self.x_min), abs(x - self.x_max))
        reach_y = max(abs(y - self.y_min), abs(y - self.y_max))
        return max(reach_x, reach_y)


def calculate_radius_suna(R, T, k, join_key):
    """Estimate a k-nearest-neighbour radius for each point of the hypothetical join.

    For every join key present in both tables, the ``'x'`` values of ``R`` and
    the ``'y'`` values of ``T`` are crossed into (x, y) points and wrapped in a
    ``Rectangle`` spanning their min/max. For each such point, the *other*
    rectangles are ranked by their minimum l-infinity distance to the point and
    taken in that order until together they hold at least ``k`` points. The
    estimate is half of the maximum l-infinity distance from the point to the
    last rectangle taken.

    Args:
        R: DataFrame with a ``join_key`` column and an ``'x'`` column.
        T: DataFrame with a ``join_key`` column and a ``'y'`` column.
        k: number of points the selected rectangles should hold. If the other
            rectangles hold fewer than ``k`` points in total, all of them are
            used and the farthest one (by minimum distance) sets the radius.
        join_key: label of the key column present in both frames.

    Returns:
        1-D numpy array with one radius per joined point. Rectangles are
        visited in sorted key order and points in x-major order within a
        rectangle, so the result is reproducible from run to run. The array
        is empty when the tables share no key.
    """
    # One grouping pass per table instead of a full boolean-mask scan per key.
    # groupby also yields keys in sorted order and drops missing keys; the
    # rectangle order matters because it breaks ties between rectangles that
    # are equally close to a point.
    x_by_key = {key: vals for key, vals in R.groupby(join_key, sort=True)['x']}
    y_by_key = {key: vals for key, vals in T.groupby(join_key, sort=True)['y']}

    # Form one rectangle per key that occurs in both tables.
    rectangles = {}
    for key, x_vals in x_by_key.items():
        y_vals = y_by_key.get(key)
        if y_vals is None or x_vals.empty or y_vals.empty:
            continue  # a rectangle needs at least one x and one y
        rectangles[key] = Rectangle(
            x_min=x_vals.min(),
            x_max=x_vals.max(),
            y_min=y_vals.min(),
            y_max=y_vals.max(),
            points=[(x, y) for x in x_vals for y in y_vals],
        )

    point_counts = {key: len(rect.points) for key, rect in rectangles.items()}

    radii = []
    for own_key, own_rect in rectangles.items():
        # The competing rectangles are the same for every point of own_rect.
        others = [(key, rect) for key, rect in rectangles.items() if key != own_key]

        for x, y in own_rect.points:
            if not others:
                # Only one rectangle exists: fall back to its own extent.
                radii.append(own_rect.max_distance_to_point(x, y) / 2)
                continue

            # Rank the other rectangles by minimum distance; the sort is
            # stable, so ties keep the rectangle (key) order.
            ranked = sorted(
                (
                    (rect.min_distance_to_point(x, y),
                     rect.max_distance_to_point(x, y),
                     key)
                    for key, rect in others
                ),
                key=lambda entry: entry[0],
            )

            # Take rectangles until they hold at least k points.
            covered = 0
            for _, max_dist, key in ranked:
                reach = max_dist
                covered += point_counts[key]
                if covered >= k:
                    break

            # NOTE(review): the radius comes from the last rectangle taken
            # only. A rectangle taken earlier can have a larger maximum
            # distance, so this is a heuristic estimate rather than a
            # guaranteed upper bound on the k-NN distance -- confirm intended.
            radii.append(reach / 2)

    return np.array(radii)

In [29]:
import pandas as pd

def generate_sample_data(seed, size=100, n_keys=10):
    """Build two reproducible random tables that share a ``join_key`` column.

    Args:
        seed: seed of the pseudo-random generator; the same seed always
            produces the same pair of tables.
        size: number of rows in each table (default 100).
        n_keys: join keys are drawn from the integers ``0 .. n_keys - 1``
            (default 10).

    Returns:
        Tuple ``(R, T)`` of DataFrames. ``R`` has columns ``'join_key'`` and
        ``'x'``; ``T`` has columns ``'join_key'`` and ``'y'``. ``x`` and ``y``
        are uniform samples from [0, 1).
    """
    # A private RandomState produces the same stream as np.random.seed(seed)
    # followed by the module-level calls, but leaves the global generator
    # untouched so calling this function does not disturb other random code.
    rng = np.random.RandomState(seed)

    # The draw order (R keys, R x, T keys, T y) determines which values end
    # up in which column, so it is kept explicit.
    r_keys = rng.randint(0, n_keys, size)
    r_x = rng.rand(size)
    t_keys = rng.randint(0, n_keys, size)
    t_y = rng.rand(size)

    R = pd.DataFrame({'join_key': r_keys, 'x': r_x})
    T = pd.DataFrame({'join_key': t_keys, 'y': t_y})
    return R, T


# Scenarios to evaluate: each entry is ((R, T), k) with one seeded sample per k
test_cases = [
    (generate_sample_data(seed), k)
    for seed, k in [
        (0, 5),    # small k
        (1, 20),   # medium k
        (2, 50),   # large k
        (4, 80),   # largest k tried
    ]
]

# Compute the radius array of every scenario
results = []
for (R, T), k in test_cases:
    results.append(calculate_radius_suna(R, T, k, 'join_key'))

# Summary statistics of the radii, one report per scenario
for i, radii in enumerate(results):
    k = test_cases[i][1]
    print(f"Test case {i+1} (k={k}):")
    print(f"  Minimum radius: {np.min(radii)}")
    print(f"  Maximum radius: {np.max(radii)}")
    print(f"  Average radius: {np.mean(radii)}")
    print(f"  Median radius: {np.median(radii)}")
    print(f"  Standard deviation of radii: {np.std(radii)}")
    print("-" * 20)

Test case 1 (k=5):
  Minimum radius: 0.2458437374225118
  Maximum radius: 0.49014066398984785
  Average radius: 0.3818738386966848
  Median radius: 0.3740189524571936
  Standard deviation of radii: 0.06622769094399238
--------------------
Test case 2 (k=20):
  Minimum radius: 0.19096355140698418
  Maximum radius: 0.48637432211279424
  Average radius: 0.33293733053887875
  Median radius: 0.3397777153069938
  Standard deviation of radii: 0.06486930507992161
--------------------
Test case 3 (k=50):
  Minimum radius: 0.2282481534200067
  Maximum radius: 0.49274994291494695
  Average radius: 0.38175288988939776
  Median radius: 0.38665774247672624
  Standard deviation of radii: 0.06635901086500919
--------------------
Test case 4 (k=80):
  Minimum radius: 0.21873622038059742
  Maximum radius: 0.48347702843172397
  Average radius: 0.3753109291397963
  Median radius: 0.3795074083975241
  Standard deviation of radii: 0.0641483196938243
--------------------
