In [1]:
!pip install apache-sedona[spark]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting apache-sedona[spark]
  Downloading apache_sedona-1.3.0-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 1.5 MB/s 
Collecting pyspark>=2.3.0
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 42 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 59.4 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=fb967643e9fbaaaeb42805299d8b8bd69c0ade603b209102e1f3379ec82a34aa
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspark, apache-sedon

In [2]:
from pyspark.sql import SparkSession

from sedona.register import SedonaRegistrator
from sedona.utils import SedonaKryoRegistrator, KryoSerializer

spark = SparkSession. \
    builder. \
    appName('appName'). \
    config("spark.serializer", KryoSerializer.getName). \
    config("spark.kryo.registrator", SedonaKryoRegistrator.getName). \
    config('spark.jars.packages',
           'org.apache.sedona:sedona-python-adapter-3.0_2.12:1.2.0-incubating,'
           'org.datasyslab:geotools-wrapper:1.1.0-25.2'). \
    getOrCreate()

SedonaRegistrator.registerAll(spark)

True

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
DATA_PATH = '/content/drive/MyDrive/sedona_osm_data'

In [5]:
sc = spark.sparkContext

In [6]:
from sedona.core.formatMapper import WktReader
import csv, sys, pprint, hashlib
from shapely import wkt
import numpy as np

### Points

In [7]:
points_rdd = WktReader.readToGeometryRDD(sc, DATA_PATH + '/all_points_1K.wkt', 1, True, False)
polygon_rdd = WktReader.readToGeometryRDD(sc, DATA_PATH + '/all_source_1K.wkt', 1, True, False)

In [8]:
points_rdd
polygon_rdd

<sedona.core.SpatialRDD.spatial_rdd.SpatialRDD at 0x7f59172378e0>

In [9]:
len = 1000

In [10]:
from sedona.core.enums import GridType, IndexType
from sedona.utils.adapter import Adapter

points_rdd.analyze()
polygon_rdd.analyze()

True

In [11]:
points_rdd.spatialPartitioning(GridType.QUADTREE)
polygon_rdd.spatialPartitioning(points_rdd.getPartitioner())

In [12]:
import time
from sedona.core.spatialOperator import JoinQuery

for i in range(3):
  if i == 0:
    print("'No Index'")
    using_index = False
  elif i == 1:
    print("'R-Tree Index'")
    # using R-tree index
    using_index = True

    build_on_spatial_partitioned_rdd = True ## Set to TRUE only if run join query
    polygon_rdd.buildIndex(IndexType.RTREE, build_on_spatial_partitioned_rdd)
  elif i == 2:
    print("'Quad-Tree Index'")
    # using Quad-tree index
    using_index = True

    build_on_spatial_partitioned_rdd = True ## Set to TRUE only if run join query
    polygon_rdd.buildIndex(IndexType.QUADTREE, build_on_spatial_partitioned_rdd)


  print("'n,time(s)'")
  s = time.time()
  result = JoinQuery.SpatialJoinQuery(points_rdd, polygon_rdd, True, using_index)
  result.count()
  d = time.time() - s
  print("'" + str(len) + "," + str(round(d, 3)) + "'")

'No Index'
'n,time(s)'
'1000,8.519'
'R-Tree Index'
'n,time(s)'
'1000,2.435'
'Quad-Tree Index'
'n,time(s)'
'1000,2.85'
