In [1]:
!pip install apache-sedona[spark]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting apache-sedona[spark]
  Downloading apache_sedona-1.3.0-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 1.0 MB/s 
Collecting pyspark>=2.3.0
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 54 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 98.9 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=06c9390c4bd29a2755b3a84724b08a3f785d2fac03f5245ac584b644908a2be1
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspark, apache-sedon

In [2]:
from pyspark.sql import SparkSession

from sedona.register import SedonaRegistrator
from sedona.utils import SedonaKryoRegistrator, KryoSerializer

# Build (or reuse) the Spark session used by every cell below.
# Sedona requires Kryo serialization with its own registrator, plus the JVM-side
# jars fetched through spark.jars.packages. The install log above shows the
# Python package at 1.3.0, so the jars are pinned to the matching 1.3.0 release.
spark = (
    SparkSession.builder
    .appName('appName')
    .config("spark.serializer", KryoSerializer.getName)
    .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
    .config(
        'spark.jars.packages',
        'org.apache.sedona:sedona-python-adapter-3.0_2.12:1.3.0-incubating,'
        'org.datasyslab:geotools-wrapper:1.3.0-27.2',
    )
    .getOrCreate()
)

# Register Sedona with the session; the recorded run returned True.
SedonaRegistrator.registerAll(spark)

True

In [3]:
from google.colab import drive
# Mount Google Drive so the input files under DATA_PATH are readable.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [5]:
# Directory on the mounted Google Drive that holds the input files read by this
# notebook (all_points_100K.wkt, all_source_10K.wkt, knn_polygon.tsv).
DATA_PATH = '/content/drive/MyDrive/sedona_osm_data'

In [6]:
# SparkContext of the session created above; passed to WktReader when loading data.
sc = spark.sparkContext

In [7]:
from sedona.core.formatMapper import WktReader
import csv, sys, pprint, hashlib
from shapely import wkt
import numpy as np

## Points: kNN query benchmark on the 100K-point dataset (`all_points_100K.wkt`)

In [8]:
# Load the point dataset from a WKT file into a Sedona SpatialRDD.
# NOTE(review): the positional arguments 1, True, False are presumably the WKT
# column index, allowInvalidGeometries and skipSyntacticallyInvalidGeometries --
# confirm against the WktReader.readToGeometryRDD signature.
points_rdd = WktReader.readToGeometryRDD(sc, DATA_PATH + '/all_points_100K.wkt', 1, True, False)

In [None]:
# Display the object's repr to confirm the SpatialRDD was created.
points_rdd

<sedona.core.SpatialRDD.spatial_rdd.SpatialRDD at 0x7f9625ddb460>

In [None]:
from sedona.core.enums import GridType, IndexType
from sedona.utils.adapter import Adapter

# Run Sedona's analyze() step on the point RDD (returned True in the recorded run).
# NOTE(review): presumably this computes the dataset boundary and count that
# spatialPartitioning relies on -- confirm against the Sedona RDD docs.
points_rdd.analyze()

True

In [None]:
# Number of geometries in the point dataset; the benchmark cell skips any k above it.
# NOTE(review): this name rebinds the builtin `len` for the rest of the kernel
# session, so any later `len(...)` call will fail. It is kept because the
# benchmark cell reads it by this name.
len = 100_000

In [None]:
# x, y coordinates of the kNN query location; wrapped in a shapely Point by the
# benchmark cell below.
coords = [73.512247, 4.083805]

In [None]:
# Spatially partition the point RDD using a quad-tree grid.
# NOTE(review): the benchmark builds its index with
# build_on_spatial_partitioned_rdd=False, so the kNN queries presumably do not
# use this partitioning -- confirm whether this step is needed here.
points_rdd.spatialPartitioning(GridType.QUADTREE)

True

In [None]:
from sedona.core.spatialOperator import KNNQuery
from shapely.geometry import Polygon, Point
import time

# Query geometry: a single point built from the `coords` cell.
point = Point(coords)

# Neighbour counts to benchmark; identical for both runs, so defined once.
k_vals = [ 1, 5, 10, 20, 30, 50, 100, 500, 1000, 5000, 10000, 20000, 30000, 50000, 75000 ]

# Dataset size as reported by the RDD itself (populated by the earlier analyze()
# call). Used instead of the notebook-level `len` variable, which shadows the builtin.
total_points = points_rdd.approximateTotalCount

# First pass runs without an index, second pass with an R-tree index.
for using_index in (False, True):
  if using_index:
    # using R-tree index
    print("'R-tree index'")

    build_on_spatial_partitioned_rdd = False ## Set to TRUE only if run join query
    points_rdd.buildIndex(IndexType.RTREE, build_on_spatial_partitioned_rdd)
    # NOTE(review): if buildIndex is lazy, the first indexed query below also pays
    # for building the index (the recorded k=1 indexed run is the slowest of the
    # small-k runs) -- confirm.
  else:
    print("'No index'")

  print("'k,time(s)'")
  for k in k_vals:
    # Skip any k larger than the dataset.
    if k > total_points:
      continue
    # perf_counter is a monotonic, high-resolution clock intended for timing.
    start = time.perf_counter()
    result = KNNQuery.SpatialKnnQuery(points_rdd, point, k, using_index)
    elapsed = time.perf_counter() - start
    print(f"'{k},{round(elapsed, 3)}'")


'No index'
'k,time(s)'
'1,0.213'
'5,0.291'
'10,0.248'
'20,0.197'
'30,0.197'
'50,0.21'
'100,0.273'
'500,0.297'
'1000,0.436'
'5000,1.636'
'10000,2.736'
'20000,5.37'
'30000,7.999'
'50000,12.769'
'75000,12.777'
'R-tree index'
'k,time(s)'
'1,1.144'
'5,0.274'
'10,0.255'
'20,0.281'
'30,0.224'
'50,0.238'
'100,0.257'
'500,0.344'
'1000,0.434'
'5000,1.405'
'10000,2.929'
'20000,5.229'
'30000,7.361'
'50000,12.443'
'75000,18.052'


In [None]:
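# Disabled inspection cell: `polygons_nearby` is not defined anywhere in the
# visible notebook, so the output shown below is presumably stale (left over
# from an earlier session).
# polygons_nearby[0].geom.wkt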

'POINT (73.512247 4.083805)'

## Polygons: kNN query benchmark on the 10K-geometry dataset (`all_source_10K.wkt`)

In [9]:
# Load the second dataset from a WKT file into a Sedona SpatialRDD.
# NOTE(review): the positional arguments 1, True, False are presumably the WKT
# column index, allowInvalidGeometries and skipSyntacticallyInvalidGeometries --
# confirm against the WktReader.readToGeometryRDD signature.
polygon_rdd = WktReader.readToGeometryRDD(sc, DATA_PATH + '/all_source_10K.wkt', 1, True, False)

In [10]:
# Display the object's repr to confirm the SpatialRDD was created.
polygon_rdd

<sedona.core.SpatialRDD.spatial_rdd.SpatialRDD at 0x7f5be791f760>

In [11]:
from sedona.core.enums import GridType, IndexType
from sedona.utils.adapter import Adapter

# Run Sedona's analyze() step on the polygon RDD (returned True in the recorded run).
# NOTE(review): presumably this computes the dataset boundary and count that
# spatialPartitioning relies on -- confirm against the Sedona RDD docs.
polygon_rdd.analyze()

True

In [12]:
# Number of geometries in the polygon dataset; the benchmark cell skips any k above it.
# NOTE(review): this name rebinds the builtin `len` for the rest of the kernel
# session, so any later `len(...)` call will fail. It is kept because the
# benchmark cell reads it by this name.
len = 10_000

In [13]:
# Read the WKT text of the query polygon. The `with` block closes the file
# handle deterministically instead of leaving it to the garbage collector.
with open(DATA_PATH + "/knn_polygon.tsv", "r") as wkt_file:
    polygon = wkt_file.read()

# Parse the WKT into a shapely geometry.
shape = wkt.loads(polygon)

# Pair the boundary's x and y arrays into a list of [x, y] vertices and drop the
# last one. NOTE(review): presumably the dropped vertex is the ring's closing
# point, and the geometry is assumed to have a single boundary ring -- confirm.
coords = np.dstack(shape.boundary.xy).tolist()[0][:-1]

In [14]:
# Spatially partition the polygon RDD using a quad-tree grid.
# NOTE(review): the benchmark builds its index with
# build_on_spatial_partitioned_rdd=False, so the kNN queries presumably do not
# use this partitioning -- confirm whether this step is needed here.
polygon_rdd.spatialPartitioning(GridType.QUADTREE)

True

In [15]:
from sedona.core.spatialOperator import KNNQuery
from shapely.geometry import Polygon, Point
import time

# Query geometry: polygon rebuilt from the vertex list in `coords`.
polygon = Polygon(coords)

# Neighbour counts to benchmark; identical for both runs, so defined once.
k_vals = [ 1000 ]

# Dataset size as reported by the RDD itself (populated by the earlier analyze()
# call). Used instead of the notebook-level `len` variable, which shadows the builtin.
total_polygons = polygon_rdd.approximateTotalCount

# First pass runs without an index, second pass with an R-tree index.
for using_index in (False, True):
  if using_index:
    # using R-tree index
    print("'R-tree index'")

    build_on_spatial_partitioned_rdd = False ## Set to TRUE only if run join query
    polygon_rdd.buildIndex(IndexType.RTREE, build_on_spatial_partitioned_rdd)
  else:
    print("'No index'")

  print("'k,time(s)'")
  for k in k_vals:
    # Skip any k larger than the dataset.
    if k > total_polygons:
      continue
    # perf_counter is a monotonic, high-resolution clock intended for timing.
    start = time.perf_counter()
    result = KNNQuery.SpatialKnnQuery(polygon_rdd, polygon, k, using_index)
    elapsed = time.perf_counter() - start
    print(f"'{k},{round(elapsed, 3)}'")


'No index'
'k,time(s)'
'1000,858.343'
'R-tree index'
'k,time(s)'
'1000,635.147'
