In [6]:
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import col, explode, json_tuple, regexp_replace, udf
from pyspark.sql.functions import sum as col_sum
from pyspark.sql.types import BooleanType

from math import radians, cos, sin, asin, sqrt
from typing import List

import re
import os

## Read the stop data

In [3]:
# Read in the file (as JSON). Each input record carries a 'haltes' array
# holding one struct per stop.
stops_raw = spark.read.json("data/stops.txt")

stops = (
    stops_raw
    # One row per stop; the exploded 'haltes' column gets renamed to 'stops'
    .select(explode("haltes").alias("stops"))
    # Promote every field of the 'stops' struct to its own top-level column.
    # This stays inside Spark SQL and keeps the schema read from the JSON,
    # instead of round-tripping each row through Python (rdd.map + toDF),
    # which re-infers the schema from sampled rows.
    .select("stops.*")
    # Drop unnecessary data
    .drop('links', 'gemeentenummer', 'entiteitnummer')
    # Rename the remaining (Dutch) columns to English names
    .withColumnRenamed('haltenummer', 'stop_number')
    .withColumnRenamed('omschrijving', 'desc')
    .withColumnRenamed('geoCoordinaat', 'coord')
    .withColumnRenamed('omschrijvingGemeente', 'village')
)

# Show the first 20 entries of the dataframe
stops.show()

+--------------------+-----------+--------------------+---------+
|               coord|stop_number|                desc|  village|
+--------------------+-----------+--------------------+---------+
|[51.1638893702134...|     101000| A. Chantrainestraat|  Wilrijk|
|[51.2062496902375...|     101001|           Zurenborg|Antwerpen|
|[51.1660665941742...|     101002|Verenigde Natieslaan|  Hoboken|
|[51.1660216374063...|     101003|Verenigde Natieslaan|  Hoboken|
|[51.1740548394127...|     101004|     D. Baginierlaan|  Hoboken|
|[51.1630084393468...|     101005| A. Chantrainestraat|  Wilrijk|
|[51.1597748887066...|     101006|      Fotografielaan|  Wilrijk|
|[51.1599636330007...|     101007|      Fotografielaan|  Wilrijk|
|[51.1629556669243...|     101008|            Moerelei|  Wilrijk|
|[51.1634592883462...|     101009|            Moerelei|  Wilrijk|
|[51.1887431659368...|     101010|        J. De Voslei|Antwerpen|
|[51.1829725415369...|     101011|   Middelheim Vijver|Antwerpen|
|[51.16220

## Haversine function
A function to calculate the great-circle distance (in kilometers) between two geocoordinates. <br>
Found at https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points

In [7]:
def haversine(lon1, lat1, lon2, lat2, earth_radius=6371):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    Parameters
    ----------
    lon1, lat1 : float
        Longitude and latitude of the first point, in decimal degrees.
    lon2, lat2 : float
        Longitude and latitude of the second point, in decimal degrees.
    earth_radius : float, optional
        Radius of the sphere; the result is expressed in the same unit.
        Defaults to 6371 (kilometers). Use 3956 for miles.

    Returns
    -------
    float
        Distance between the two points along the sphere's surface.
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally above 1 for (nearly)
    # antipodal points, which would make asin() raise a domain error.
    a = min(1.0, a)
    c = 2 * asin(sqrt(a))
    return c * earth_radius

## Create filter function and parameters

In [15]:
# Radius to check for (in km)
radius = 1

# Position to check from, as [latitude, longitude] in decimal degrees.
# The location provided here is the location of the A. Chantrainestraat in Wilrijk.
latlng = [51.16388937021345, 4.392073389160737]

@udf(returnType=BooleanType())
def in_radius(latlng_col: List):
    """
    Spark UDF: tell whether a coordinate lies within `radius` km of `latlng`.

    Parameters
    ----------
    latlng_col
        Coordinate value of one row, indexed as [latitude, longitude]
        in decimal degrees.

    Returns
    -------
    bool or None
        True when the haversine distance to `latlng` is at most `radius`,
        False otherwise. None (SQL null) when the coordinate or one of its
        components is missing, so a filter on the result drops the row
        instead of failing the whole job with a TypeError.
    """
    # Spark hands nulls to Python UDFs as None; guard before indexing.
    if latlng_col is None:
        return None

    lat1 = latlng_col[0]
    lon1 = latlng_col[1]
    if lat1 is None or lon1 is None:
        return None

    lat2 = latlng[0]
    lon2 = latlng[1]

    # haversine() takes longitude first, then latitude.
    return haversine(lon1, lat1, lon2, lat2) <= radius
    

## Filter the data

In [17]:
# Keep only the stops for which in_radius() holds and print up to 100 of them
stops.where(in_radius(col('coord'))).show(n=100)

+--------------------+-----------+--------------------+-------+
|               coord|stop_number|                desc|village|
+--------------------+-----------+--------------------+-------+
|[51.1638893702134...|     101000| A. Chantrainestraat|Wilrijk|
|[51.1630084393468...|     101005| A. Chantrainestraat|Wilrijk|
|[51.1629556669243...|     101008|            Moerelei|Wilrijk|
|[51.1634592883462...|     101009|            Moerelei|Wilrijk|
|[51.1585508443406...|     101137|           Geleegweg|Wilrijk|
|[51.1588924647597...|     101138|           Geleegweg|Wilrijk|
|[51.1561506927094...|     101141|     Planetariumlaan|Wilrijk|
|[51.1561237501239...|     101142|     Planetariumlaan|Wilrijk|
|[51.1676829178755...|     101445|          Boekstraat|Wilrijk|
|[51.1683390726348...|     101450|          Boekstraat|Wilrijk|
|[51.1624349973914...|     101699|         Legerstraat|Wilrijk|
|[51.1626237301586...|     101701|         Legerstraat|Wilrijk|
|[51.1597168043640...|     102035|      