In [5]:
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import col, explode, json_tuple, regexp_replace, udf
from pyspark.sql.functions import sum as col_sum
from pyspark.sql.types import BooleanType, StructType, StructField, StringType, IntegerType, FloatType

from math import radians, cos, sin, asin, sqrt
from typing import List

import re
import os

## Read the stop data

In [3]:
# Load the stops file (JSON), turn every element of the 'haltes' array into
# its own row, flatten it, and give the columns English names.
stops = (
    spark.read.json("data/stops.txt")
    # One row per stop; each row holds a single struct column named 'stops'
    .select(explode("haltes").alias("stops"))
    # Promote the struct's fields to top-level columns. Done inside Spark so
    # the schema read from the file is kept as-is (no RDD round trip through
    # Python and no schema re-inference).
    .select("stops.*")
    # Drop unnecessary data
    .drop('links', 'gemeentenummer', 'entiteitnummer')
    # Rename the remaining columns to English names
    .withColumnRenamed('haltenummer', 'stop_number')
    .withColumnRenamed('omschrijving', 'desc')
    .withColumnRenamed('geoCoordinaat', 'coord')
    .withColumnRenamed('omschrijvingGemeente', 'village')
)

# Show the first 20 entries of the dataframe
stops.show()

+--------------------+-----------+--------------------+---------+
|               coord|stop_number|                desc|  village|
+--------------------+-----------+--------------------+---------+
|[51.1638893702134...|     101000| A. Chantrainestraat|  Wilrijk|
|[51.2062496902375...|     101001|           Zurenborg|Antwerpen|
|[51.1660665941742...|     101002|Verenigde Natieslaan|  Hoboken|
|[51.1660216374063...|     101003|Verenigde Natieslaan|  Hoboken|
|[51.1740548394127...|     101004|     D. Baginierlaan|  Hoboken|
|[51.1630084393468...|     101005| A. Chantrainestraat|  Wilrijk|
|[51.1597748887066...|     101006|      Fotografielaan|  Wilrijk|
|[51.1599636330007...|     101007|      Fotografielaan|  Wilrijk|
|[51.1629556669243...|     101008|            Moerelei|  Wilrijk|
|[51.1634592883462...|     101009|            Moerelei|  Wilrijk|
|[51.1887431659368...|     101010|        J. De Voslei|Antwerpen|
|[51.1829725415369...|     101011|   Middelheim Vijver|Antwerpen|
|[51.16220

## Find centers of villages

In [6]:
# Read the zipcode file line by line and normalise every line before
# splitting it into its ';'-separated fields.
data = (
    sc.textFile("data/zipcodes.csv")
    # Remove a space followed by a parenthesised remark
    .map(lambda line: re.sub(r' \(.*\)', '', line))
    # Collapse runs of whitespace (other than newline/tab) into one space
    .map(lambda line: re.sub(r"[^\S\n\t]+", ' ', line))
    .map(lambda line: line.split(';'))
)

# Every field is read as a non-nullable string; the coordinates are cast below.
schema = StructType([
    StructField(field_name, StringType(), False)
    for field_name in ('postal code', 'Village', 'lat_center', 'lon_center', 'url')
])

# Build the dataframe, discard the url column and make the coordinates numeric.
data = (
    data.toDF(schema=schema)
    .drop("url")
    .withColumn("lat_center", col("lat_center").cast(FloatType()))
    .withColumn("lon_center", col("lon_center").cast(FloatType()))
)

# Keep only the rows for which both coordinates are present after the cast.
has_latitude = col("lat_center").isNotNull()
has_longitude = col("lon_center").isNotNull()
centers = data.where(has_latitude & has_longitude)

centers.show()

+-----------+--------------------+----------+----------+
|postal code|             Village|lat_center|lon_center|
+-----------+--------------------+----------+----------+
|       1000|             Brussel|  50.84275|   4.35155|
|       1000|           Bruxelles|  50.84275|   4.35155|
|       1005|Brusselse Hoofdst...| 50.844875| 4.3514333|
|       1005|Conseil Region Br...|  50.84786|  4.367408|
|       1008|Chambre des Repr�...| 50.846558|  4.364662|
|       1008|Kamer van Volksve...| 50.846558|  4.364662|
|       1009|   Senat de Belgique|  50.79834|   4.39565|
|       1010|Rijksadministrati...|  50.82433|  4.513954|
|       1012|Parlement de la C...| 50.846638| 4.3619924|
|       1020|             Brussel| 50.884216| 4.3580003|
|       1020|           Bruxelles| 50.884216| 4.3580003|
|       1020|              Laeken| 50.884216| 4.3580003|
|       1020|               Laken| 50.884216| 4.3580003|
|       1030|             Brussel|  50.86744|   4.37727|
|       1030|           Bruxell

## Haversine function
A function to calculate the great-circle distance (in kilometers) between two geographic coordinates. <br>
Found at https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points

In [7]:
def haversine(lon1, lat1, lon2, lat2, earth_radius=6371):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    Note the argument order: longitude comes before latitude.

    :param lon1: longitude of the first point, in decimal degrees
    :param lat1: latitude of the first point, in decimal degrees
    :param lon2: longitude of the second point, in decimal degrees
    :param lat2: latitude of the second point, in decimal degrees
    :param earth_radius: radius of the sphere; it determines the unit of the
        result. Defaults to 6371 (kilometers). Use 3956 for miles.
    :return: the distance between the two points, in the unit of earth_radius
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally above 1 for
    # (near-)antipodal points, which would make asin raise ValueError.
    # A comparison is used instead of min() so that NaN still propagates.
    if a > 1.0:
        a = 1.0
    c = 2 * asin(sqrt(a))
    return c * earth_radius

## Create filter function and parameters

In [21]:
# Search radius around the reference position (in km)
radius = 3

# Reference position as [latitude, longitude]: the A. Chantrainestraat in Wilrijk.
latlng = [51.16388937021345, 4.392073389160737]

@udf(returnType=BooleanType())
def in_radius(lat1, lon1):
    """Tell whether the point (lat1, lon1) lies within `radius` km of `latlng`."""
    ref_lat, ref_lon = latlng[0], latlng[1]
    # haversine expects longitude before latitude
    distance_km = haversine(lon1, lat1, ref_lon, ref_lat)
    return distance_km <= radius

## Villages within the radius

In [24]:
# Keep the village centers that fall inside the search radius, reduce the
# result to the unique village names and list them alphabetically.
cities_in_radius = (
    centers
    .filter(in_radius('lat_center', 'lon_center'))
    .drop("postal code", "lat_center", "lon_center")
    .distinct()
    .sort(col("Village").asc())
)

cities_in_radius.show()

+---------+
|  Village|
+---------+
|Antwerpen|
|  Wilrijk|
+---------+

