## Setup of environnement

In [1]:
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import col, explode, json_tuple, regexp_replace
from pyspark.sql.functions import sum as col_sum
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, ArrayType, FloatType
import pyspark
from pyspark import SparkConf

import re
import os

## The amount of citizens per city

In [2]:
citizens = sc.textFile("data/citizens.txt")
citizens.collect()

# Remove unnecessary headers
citizens = citizens.map(lambda x: re.sub(
    r'^KONINKRIJK.*|^BRUSSELS.*|^ARR.*|^ARRONDISSEMENT.*|^PROVINC.*|^VLAAMS.*|^REGION.*', '', x))

# Replace 'village / village-in-french' with 'village'
citizens = citizens.map(lambda x: re.sub(r'/ .* ', '', x))

# Remove everything in between parentheses
citizens = citizens.map(lambda x: re.sub(r'\(.*\)', '', x))

# Remove excess whitespaces
citizens = citizens.map(lambda x: re.sub(r"[^\S\n\t]+", ' ', x))

# Remove empty lines
citizens = citizens.filter(lambda x: x != '')

# Remove '.' from the numbers
citizens = citizens.map(lambda x: re.sub(r'\.', '', x))

# Split on space: gives a list of lists [[village, amount], [village, amount]...]
citizens = citizens.map(lambda x: x.rsplit(' ', 1))


for test in citizens.collect():
    if len(test) > 2:
        print(test)
        

# Create schema for dataframe
# For some reason, the citizens field needs to be StringType here, else .show() will not work.
s = StructType([
    StructField('village', StringType(), False),
    StructField('citizens', StringType(), False)
])

# Create schema and cast the citizens column to IntegerType
citizens = citizens.toDF(schema=s)

# Cast to citizens column to integers
citizens = citizens.withColumn("citizens", citizens['citizens'].cast(IntegerType()))

    
citizens.show()
citizens.printSchema()

+--------------------+--------+
|             village|citizens|
+--------------------+--------+
|          Anderlecht|  117724|
|             Brussel|  177112|
|              Elsene|   86336|
|           Etterbeek|   47410|
|               Evere|   41016|
|           Ganshoren|   24794|
|               Jette|   52144|
|          Koekelberg|   21765|
|            Oudergem|   33725|
|          Schaarbeek|  132097|
| Sint‐Agatha‐Berchem|   24831|
|         Sint‐Gillis|   49361|
| Sint‐Jans‐Molenbeek|   95455|
| Sint‐Joost‐ten‐Node|   26813|
|Sint‐Lambrechts‐W...|   56212|
| Sint‐Pieters‐Woluwe|   41513|
|               Ukkel|   82038|
|               Vorst|   55694|
| Watermaal‐Bosvoorde|   25001|
|          Aartselaar|   14298|
+--------------------+--------+
only showing top 20 rows

root
 |-- village: string (nullable = false)
 |-- citizens: integer (nullable = true)



## Stops

In [6]:
# Read in the file (as json) and convert to a single column
# The 'haltes' column gets renamed to 'stops'
stops = spark.read.json("data/stops.txt")
stops = stops.select((explode("haltes").alias("stops")))

# Map each entry in the dataframe to its own stop
stops = stops.select('stops').rdd.map(lambda x: x.stops).toDF()
# Drop unnecessary data
stops = stops.drop('links', 'gemeentenummer', 'entiteitnummer')

# Rename columns of dataframe to better name
stops = stops \
            .withColumnRenamed('haltenummer', 'stop_number') \
            .withColumnRenamed('omschrijving', 'desc') \
            .withColumnRenamed('geoCoordinaat', 'coord') \
            .withColumnRenamed('omschrijvingGemeente', 'vill') \

# Get the amount of stops per town
per_town = stops.groupBy("vill").count().select(col("vill"), col("count").alias("# stops"))
per_town.show()

+-------------------+-------+
|               vill|# stops|
+-------------------+-------+
|              Bever|     28|
|           Waanrode|     40|
|              Duras|      2|
|           Ettelgem|      8|
|          Zillebeke|     25|
|Sint-Job-in-'t-Goor|     24|
|               Lint|     24|
|          Merelbeke|     85|
|             Essene|     31|
|             Parike|     10|
|          Huizingen|     21|
| Glabbeek-Zuurbemde|     16|
|             Gorsem|      6|
|          Harelbeke|     81|
|                Mol|    140|
|        Hoogstraten|     38|
|       Oud-Turnhout|     47|
|           Hoevenen|     24|
|         Rupelmonde|      6|
|            Edingen|     12|
+-------------------+-------+
only showing top 20 rows



## Combining the dataframes

In [12]:
data = per_town.join(citizens, citizens.village == per_town.vill)
data = data.drop('vill')
data.show()
data.printSchema()

+-------+-------------+--------+
|# stops|      village|citizens|
+-------+-------------+--------+
|     28|        Bever|    2205|
|     81|    Harelbeke|   27879|
|     24|         Lint|    8776|
|     85|    Merelbeke|   24629|
|     17|   Alveringem|    5087|
|     38|  Hoogstraten|   21293|
|    140|          Mol|   36506|
|     60|     Oostkamp|   23577|
|      4|        Oreye|    3916|
|     53|   Aartselaar|   14298|
|     30|   Huldenberg|    9882|
|     98|      Torhout|   20504|
|     48|   Liedekerke|   13181|
|     47|   Wuustwezel|   20663|
|     25|     Kruibeke|   16661|
|    129|     Overijse|   25169|
|     30|    Boutersem|    8165|
|    159|   Diepenbeek|   19133|
|     44|    Houthulst|   10033|
|     71|Nieuwerkerken|    6957|
+-------+-------------+--------+
only showing top 20 rows

root
 |-- # stops: long (nullable = false)
 |-- village: string (nullable = false)
 |-- citizens: integer (nullable = true)



## Calculating the amount of stops per citizen of each village

In [13]:
result = data.withColumn("Result", col("# stops")/col("citizens"))
result = result.drop('# stops', 'citizens')
result.show()

+-------------+--------------------+
|      village|              Result|
+-------------+--------------------+
|        Bever|0.012698412698412698|
|    Harelbeke|0.002905412676207...|
|         Lint|0.002734731084776...|
|    Merelbeke|0.003451216046124...|
|   Alveringem|0.003341851779044...|
|  Hoogstraten|0.001784624054853...|
|          Mol|0.003834986029693749|
|     Oostkamp|0.002544853034737244|
|        Oreye|0.001021450459652...|
|   Aartselaar|0.003706812141558...|
|   Huldenberg|0.003035822707953...|
|      Torhout|0.004779555208739758|
|   Liedekerke|0.003641605341021...|
|   Wuustwezel|0.002274597105938...|
|     Kruibeke|0.001500510173458976|
|     Overijse|0.005125352616313719|
|    Boutersem|0.003674219228413962|
|   Diepenbeek|0.008310249307479225|
|    Houthulst|0.004385527758397289|
|Nieuwerkerken|0.010205548368549663|
+-------------+--------------------+
only showing top 20 rows

