## Setting up the environment
This section performs all the required imports and creates the Spark session.

In [1]:
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import col, explode, json_tuple, regexp_replace
from pyspark.sql.functions import sum as col_sum
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, ArrayType, FloatType

import re
import os

# Build the notebook's Spark session on the "local" master; getOrCreate()
# hands back an already-running session when there is one.
spark = (
    SparkSession.builder
    .appName("Notebook 2")
    .master("local")
    .getOrCreate()
)

## Data preparation
In this section, the data gets prepared and cleaned so that it is easy to use in the future.

#### Stops
In this section the stops get converted to a usable dataframe.

In [2]:
# Read the stops file as JSON and turn every element of its 'haltes' array
# into a separate row, held in a single column named 'stops'.
stops_raw = spark.read.json("data/stops.txt")
stops_exploded = stops_raw.select(explode("haltes").alias("stops"))

# Promote the fields of each stop to top-level columns. Expanding the struct
# with "stops.*" stays inside Spark SQL: there is no round trip through a
# Python RDD and the schema inferred by the JSON reader is kept as-is instead
# of being inferred a second time from sampled rows.
stops_flat = stops_exploded.select("stops.*")

# Drop the unnecessary columns and give the remaining ones better names.
# The chain is wrapped in parentheses so that no trailing backslash is needed.
stops = (
    stops_flat
    .drop('links', 'gemeentenummer', 'entiteitnummer')
    .withColumnRenamed('haltenummer', 'stop_number')
    .withColumnRenamed('omschrijving', 'desc')
    .withColumnRenamed('geoCoordinaat', 'coord')
    .withColumnRenamed('omschrijvingGemeente', 'village')
)

# Show the first 20 entries of the dataframe
stops.show()

+--------------------+-----------+--------------------+---------+
|               coord|stop_number|                desc|  village|
+--------------------+-----------+--------------------+---------+
|[51.1638893702134...|     101000| A. Chantrainestraat|  Wilrijk|
|[51.2062496902375...|     101001|           Zurenborg|Antwerpen|
|[51.1660665941742...|     101002|Verenigde Natieslaan|  Hoboken|
|[51.1660216374063...|     101003|Verenigde Natieslaan|  Hoboken|
|[51.1740548394127...|     101004|     D. Baginierlaan|  Hoboken|
|[51.1630084393468...|     101005| A. Chantrainestraat|  Wilrijk|
|[51.1597748887066...|     101006|      Fotografielaan|  Wilrijk|
|[51.1599636330007...|     101007|      Fotografielaan|  Wilrijk|
|[51.1629556669243...|     101008|            Moerelei|  Wilrijk|
|[51.1634592883462...|     101009|            Moerelei|  Wilrijk|
|[51.1887431659368...|     101010|        J. De Voslei|Antwerpen|
|[51.1829725415369...|     101011|   Middelheim Vijver|Antwerpen|
|[51.16220

### The number of stops in each village
In this section, a DataFrame is created that holds the number of stops in each village.
To achieve this, the stops are grouped by village, after which the rows in each group are counted.
This count is the number of stops located in that particular village.

The `select` call below only serves to rename the second column of the DataFrame from 'count' to '# stops'.

In [25]:
# Group the stops by village and count the rows in every group.
village_counts = stops.groupBy("village").count()

# Keep both columns, exposing 'count' under the name '# stops'.
per_town = village_counts.select("village", col("count").alias("# stops"))
per_town.show()

+-------------------+-------+
|            village|# stops|
+-------------------+-------+
|              Bever|     28|
|           Waanrode|     40|
|              Duras|      2|
|           Ettelgem|      8|
|          Zillebeke|     25|
|Sint-Job-in-'t-Goor|     24|
|               Lint|     24|
|          Merelbeke|     85|
|             Essene|     31|
|             Parike|     10|
|          Huizingen|     21|
| Glabbeek-Zuurbemde|     16|
|             Gorsem|      6|
|          Harelbeke|     81|
|                Mol|    140|
|        Hoogstraten|     38|
|       Oud-Turnhout|     47|
|           Hoevenen|     24|
|         Rupelmonde|      6|
|            Edingen|     12|
+-------------------+-------+
only showing top 20 rows

