In [1]:
""" Filter India data

aws emr add-steps --cluster-id <Your EMR cluster id> --steps Type=spark,Name=TestJob,Args=[--deploy-mode,cluster,--master,yarn,--conf,spark.yarn.submit.waitAppCompletion=true,s3a://your-source-bucket/code/pythonjob.py,s3a://your-source-bucket/data/data.csv,s3a://your-destination-bucket/test-output/],ActionOnFailure=CONTINUE
"""

from collections import namedtuple
import logging
import sys

from geopy.distance import great_circle
import pandas as pd
import geopandas as gpd
import numpy as np

from datetime import timedelta, date, datetime
from statistics import *


from pyspark import SparkContext
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.window import Window
from pyspark.sql.types import (
    StructType,
    LongType,
    StructField,
    IntegerType,
    StringType,
    DoubleType,
    TimestampType,
    ArrayType,
    BinaryType
)
import pyspark.sql.functions as F
from math import *
import time
import json
import boto3

from sedona.register import SedonaRegistrator  
from sedona.utils import SedonaKryoRegistrator, KryoSerializer
from pyspark.sql.functions import udf
from sedona.utils.adapter import Adapter
from sedona.core.formatMapper.shapefileParser import ShapefileReader
from sedona.core.SpatialRDD import PointRDD, SpatialRDD, CircleRDD
from sedona.sql.types import GeometryType
from sedona.core.enums import GridType, IndexType
from sedona.core.spatialOperator import JoinQueryRaw
from sedona.core.spatialOperator import JoinQuery
from sedona.core.enums import IndexType
from sedona.core.formatMapper.disc_utils import load_spatial_rdd_from_disc, GeoType
from sedona.core.formatMapper import WktReader, GeoJsonReader

from shapely.wkt import loads as wkt_loads
from shapely.geometry import Point, Polygon, shape
from shapely.ops import transform
import shapely

import s3fs

# s3fs filesystem handle (anon=False, i.e. not anonymous access).
s3 = s3fs.S3FileSystem(anon=False)

# Create (or attach to) the Spark session, configured with Sedona's Kryo
# serializer/registrator and a 3g cap on results collected to the driver.
spark = (
    SparkSession.builder
    .appName("sedona")
    .config("spark.serializer", KryoSerializer.getName)
    .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
    .config("spark.driver.maxResultSize", "3g")
    .getOrCreate()
)

# Register Sedona UDTs and UDFs
SedonaRegistrator.registerAll(spark)
# Ship the project's helper module to the cluster.
spark.sparkContext.addPyFile("s3://ipsos-dvd/scripts/utils.py")

# Base locations on S3 and the raw ping dataset for September 2021.
bsdir = "s3://ipsos-dvd/fdd/"
data_dir = f"{bsdir}data/"
fn = f"{data_dir}pings_IN_2021-09-01_2021-09-30/"




VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
0,application_1699933210764_0001,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [None]:
# Load the raw ping dataset (parquet) from the path built in the setup cell.
india = spark.read.parquet(fn)


In [None]:
# Bounding box in degrees, used as a coarse spatial filter.
minx, maxx = 68.716667, 80.983333 # Longitude
miny, maxy = 22.166667, 30.916667 # Latitude

# Keep only pings inside the bounding box
# (presumably chosen to cover Rajasthan -- TODO confirm the extent).
india = india.filter(F.col('longitude').between(minx, maxx) & F.col('latitude').between(miny, maxy))

# Convert the epoch-seconds UTC timestamp to Indian local time.
# The full zone ID is used instead of the short name "IST": short names are
# ambiguous and discouraged by Spark; "Asia/Kolkata" is UTC+05:30.
india = india.withColumn("date", F.from_utc_timestamp(F.col("utc_timestamp").cast(TimestampType()), tz = "Asia/Kolkata"))

# Keep only the selected days of September 2021.
# Sundays variant (5, 12, 19, 26 Sep), kept for reference:
# india = india.filter(F.to_date(F.col("date")).isin(["2021-09-05", "2021-09-12", "2021-09-19", "2021-09-26"]))
# Active variant: Mondays (6, 13, 20, 27 Sep).
india = india.filter(F.to_date(F.col("date")).isin(["2021-09-06", "2021-09-13", "2021-09-20", "2021-09-27"]))

# NOTE(review): this writes the "monday" dataset, while the cell that loads
# `pings` further down reads ".../rajasatan_sundays_sep_2021" -- confirm which
# variant the downstream analysis is meant to use.
india.write.mode("overwrite").parquet("s3://ipsos-dvd/fdd/data/rajasatan_monday_sep_2021")

## Intersect with shops and grid for subsampling

In [2]:
### First intersect shops

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# read data
# NOTE(review): this loads the "sundays" dataset, whereas the filter cell above
# currently writes "rajasatan_monday_sep_2021" -- confirm the intended input.
pings = spark.read.parquet("s3://ipsos-dvd/fdd/data/rajasatan_sundays_sep_2021")



VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Preview a few rows to check the schema (caid, latitude, longitude, utc_timestamp, altitude, date).
pings.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+---------+---------+-------------+--------+-------------------+
|                caid| latitude|longitude|utc_timestamp|altitude|               date|
+--------------------+---------+---------+-------------+--------+-------------------+
|f2be0cf8cbca54449...|  28.6542|  77.2373|   1631428195|    null|2021-09-12 11:59:55|
|f2be0cf8cbca54449...|  28.6542|  77.2373|   1631434184|    null|2021-09-12 13:39:44|
|fc0b23d7eab67cad7...|30.725517|76.651452|   1631414310|    null|2021-09-12 08:08:30|
|fc0b23d7eab67cad7...| 30.72552| 76.65146|   1631414325|    null|2021-09-12 08:08:45|
|59b38e4c4d40a7bc1...| 28.62679| 77.37367|   1631413870|    null|2021-09-12 08:01:10|
|59b38e4c4d40a7bc1...| 28.62679| 77.37367|   1631413873|    null|2021-09-12 08:01:13|
|59b38e4c4d40a7bc1...| 28.62679| 77.37367|   1631413884|    null|2021-09-12 08:01:24|
|d5270b5203bf1448d...| 28.70826| 77.28874|   1631470995|    null|2021-09-12 23:53:15|
|d5270b5203bf1448d...| 28.70825| 77.28873|   163147100

In [5]:
def spatialIntersection(pings, poly, build_on_spatial_partitioned_rdd = True, using_index = True, crs="epsg:4326", num_partitions=1000):
    """Spatially join ping points against polygons with Sedona.

    Parameters
    ----------
    pings : Spark DataFrame
        Must expose ``longitude`` and ``latitude`` columns; the SQL below
        treats them as EPSG:4326 coordinates.
    poly : Spark DataFrame
        Must expose a ``geometry`` column.
        NOTE(review): assumed to be expressed in ``crs`` already -- this
        function only reprojects the points.
    build_on_spatial_partitioned_rdd : bool
        Forwarded to ``points_rdd.buildIndex``.
    using_index : bool
        Forwarded to ``JoinQueryRaw.SpatialJoinQueryFlat``.
    crs : str
        Target CRS the points are transformed to before the join.
    num_partitions : int
        Number of partitions the point DataFrame is repartitioned into before
        it is cached. Defaults to 1000, the value that used to be hard-coded.

    Returns
    -------
    Spark DataFrame
        Result of ``Adapter.toDf`` on the join result, built with the polygon
        field names followed by the point field names.

    Raises
    ------
    ValueError
        If ``num_partitions`` is smaller than 1.

    Notes
    -----
    Relies on the notebook-global ``spark`` session. As side effects it
    (re)registers the temp view ``pings`` and caches the repartitioned points,
    which are not unpersisted here.
    """
    # Fail fast with a clear message instead of deep inside the Spark job.
    if num_partitions < 1:
        raise ValueError(f"num_partitions must be >= 1, got {num_partitions!r}")

    poly_rdd = Adapter.toSpatialRdd(poly, "geometry")

    # Expose the input as a temp view so the point geometry can be built in SQL.
    pings.createOrReplaceTempView("pings")

    # Build a point from (longitude, latitude), flip it, transform it from
    # EPSG:4326 to `crs`, then flip it back; all original columns are kept.
    # NOTE(review): the flips presumably accommodate the axis order expected by
    # ST_Transform in this Sedona version -- confirm.
    pings_with_point = spark.sql(
          f"""SELECT ST_FlipCoordinates(ST_Transform(ST_FlipCoordinates(ST_Point(cast(pings.longitude as Decimal(24,20)), 
          cast(pings.latitude as Decimal(24,20)))), "epsg:4326", "{crs}")) AS point, 
          *
          FROM pings;
          """
    )
    pings_with_point = pings_with_point.repartition(num_partitions)
    pings_with_point = pings_with_point.cache()

    # Quadtree partitioning was found to work much better for skewed data.
    grid_type = GridType.QUADTREE

    points_rdd = Adapter.toSpatialRdd(pings_with_point, "point")
    points_rdd.analyze()
    points_rdd.spatialPartitioning(grid_type)

    # The polygons must share the points' partitioner for the join.
    poly_rdd.analyze()
    poly_rdd.spatialPartitioning(points_rdd.getPartitioner())

    # Index the points; the flag says whether to build on the spatially
    # partitioned RDD (the original note: set to True only for join queries).
    points_rdd.buildIndex(IndexType.QUADTREE, build_on_spatial_partitioned_rdd)

    # Flat join: one output pair per (polygon, point) match.
    result = JoinQueryRaw.SpatialJoinQueryFlat(points_rdd, poly_rdd, using_index, True)

    return Adapter.toDf(result, poly_rdd.fieldNames, points_rdd.fieldNames, spark)



VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
# downloading geojson from s3 is much quicker
# NOTE(review): this rebinds `s3`, which the setup cell bound to an
# s3fs.S3FileSystem, to a boto3 client.
s3 = boto3.client('s3')
s3.download_file('ipsos-dvd', 'fdd/data/shops.geojson', 'shops_s3.geojson')
poly_raw = gpd.read_file("shops_s3.geojson")
# Reproject the shop geometries; the same `crs` is later passed to
# spatialIntersection so that points and polygons share one CRS.
crs = "EPSG:7774"
poly_raw = poly_raw.to_crs(crs)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
# Drop the "tags" column before handing the frame to Spark.
# Rebinding with errors="ignore" (instead of inplace=True) keeps this cell
# idempotent: re-running it no longer raises KeyError once "tags" is gone.
poly_raw = poly_raw.drop(columns="tags", errors="ignore")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
# Convert the (reprojected) shop GeoDataFrame to a cached Spark DataFrame for the spatial join.
poly = spark.createDataFrame(poly_raw).cache()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
# Join pings against the shop polygons; the points are transformed to `crs` inside the function.
joined = spatialIntersection(pings, poly, crs=crs)


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[('3.0', '2.12', '1.4.0')]

In [10]:
# Stack every original ping (with null shop attributes) on top of the pings
# that matched a shop polygon. The union is positional, so both sides expose
# the ping columns followed by "leftgeometry" and "id". Duplicates are
# collapsed by the aggregation in the next cell.
test = (
    pings
    .withColumns({"leftgeometry": F.lit(None), "id": F.lit(None)})
    .union(joined.select(*pings.columns, "leftgeometry", "id"))
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [11]:
# Keep daytime pings (local hour 6..18 inclusive), flag the post-period day and
# shop hits, then collapse to one row per ping.
test = (
    test
    .filter(F.hour(F.col("date")).between(6, 18))
    .withColumns({
        # "date" is a timestamp: comparing it directly to "2021-09-26" only
        # matches exactly midnight, which the hour filter above excludes, so
        # "post" was always false. Compare on the calendar date instead.
        "post": F.to_date(F.col("date")) == F.to_date(F.lit("2021-09-26")),
        # 1 when the row came from a shop match (non-null shop id), else 0.
        "shop": F.col("id").isNotNull().cast(IntegerType()),
    })
    # max("shop") is 1 if any duplicate of the ping matched a shop.
    .groupBy("caid", "latitude", "longitude", "utc_timestamp", "date", "post")
    .agg(F.max("shop").alias("shop"))
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Now intersect with grid

In [12]:
def create_grid(min_x, max_x, min_y, max_y, step):
    """Return a list of square polygons of side `step` tiling the given extent.

    Cell origins run over np.arange(min_x, max_x, step) and
    np.arange(min_y, max_y, step); cells are emitted column by column
    (x outer, y inner). An empty range in either direction yields [].
    """
    return [
        Polygon([
            (left, bottom),
            (left + step, bottom),
            (left + step, bottom + step),
            (left, bottom + step),
        ])
        for left in np.arange(min_x, max_x, step)
        for bottom in np.arange(min_y, max_y, step)
    ]

# create the grid over the bounding box of the shop polygons (in poly_raw's CRS)
min_x, min_y, max_x, max_y = poly_raw.geometry.total_bounds
grid = create_grid(min_x, max_x, min_y, max_y, step=40000) # step is 40,000 CRS units, i.e. 40 km cells if the CRS is in metres; the previous "5km" note did not match this value -- TODO confirm intended cell size

# create a GeoDataFrame from the grid
grid_gdf = gpd.GeoDataFrame(geometry=grid)
# x / y hold the centroid coordinates of each grid cell
grid_gdf['x'] = grid_gdf.geometry.centroid.x
grid_gdf['y'] = grid_gdf.geometry.centroid.y

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [13]:
# The grid was built from poly_raw's bounds, so tag it with the same CRS.
grid_gdf.crs = poly_raw.crs
# Convert the grid to a cached Spark DataFrame for the spatial join.
grid_poly = spark.createDataFrame(grid_gdf).cache()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [14]:
# Display the grid (geometry plus centroid x / y) as a sanity check.
grid_gdf

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

                                              geometry  ...             y
0    POLYGON ((480540.601 477591.750, 520540.601 47...  ...  4.975917e+05
1    POLYGON ((480540.601 517591.750, 520540.601 51...  ...  5.375917e+05
2    POLYGON ((480540.601 557591.750, 520540.601 55...  ...  5.775917e+05
3    POLYGON ((480540.601 597591.750, 520540.601 59...  ...  6.175917e+05
4    POLYGON ((480540.601 637591.750, 520540.601 63...  ...  6.575917e+05
..                                                 ...  ...           ...
795  POLYGON ((1720540.601 1277591.750, 1760540.601...  ...  1.297592e+06
796  POLYGON ((1720540.601 1317591.750, 1760540.601...  ...  1.337592e+06
797  POLYGON ((1720540.601 1357591.750, 1760540.601...  ...  1.377592e+06
798  POLYGON ((1720540.601 1397591.750, 1760540.601...  ...  1.417592e+06
799  POLYGON ((1720540.601 1437591.750, 1760540.601...  ...  1.457592e+06

[800 rows x 3 columns]

In [15]:
# Join the per-ping table against the grid cells; note this rebinds `joined`,
# which previously held the shop join result.
joined = spatialIntersection(test, grid_poly, crs=crs)


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [16]:
# Cache the grid join result, then reduce the "date" timestamp to a calendar date.
joined = joined.cache().withColumn("date", F.to_date("date"))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [17]:
# Remove the geometries and the ping-level identifiers/coordinates before export.
joined = joined.drop(
    "rightgeometry",
    "leftgeometry",
    "caid",
    "latitude",
    "longitude",
    "utc_timestamp",
)

# Write everything as a single gzip-compressed CSV part file with a header row.
(
    joined.repartition(1)
    .write.mode("overwrite")
    .option("header", "true")
    .csv(data_dir + "grid_subsampling", compression="gzip")
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [48]:
# convert to geojson -- first step: load the exported CSV back into pandas.
# Spark gives the part file a fresh UUID-based name on every (overwrite) write,
# so locate it by pattern instead of hard-coding one run's file name.
part_files = s3fs.S3FileSystem(anon=False).glob(data_dir + "grid_subsampling/part-*.csv.gz")
if len(part_files) != 1:
    raise FileNotFoundError(
        f"expected exactly one part file under {data_dir}grid_subsampling/, found {len(part_files)}"
    )
# s3fs returns keys without the URL scheme; add it back for pandas.
gdf = pd.read_csv("s3://" + part_files[0])

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…