In [1]:
from pyspark.sql import SparkSession, types, functions as F
import datetime

In [2]:
spark = SparkSession.builder\
          .master('local[*]')\
          .appName('test')\
          .getOrCreate()

25/03/28 13:08:39 WARN Utils: Your hostname, Bastiens-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.243.216.250 instead (on interface en0)
25/03/28 13:08:39 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/28 13:08:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Measurements

In [None]:
def get_datetime(day, s):
  delta = datetime.timedelta(seconds=s)
  return day + delta

get_datetime_udf = F.udf(get_datetime, returnType=types.TimestampType())

In [None]:
def get_datetime_hour(d):
  return d.hour

get_datetime_hour_udf = F.udf(get_datetime_hour, returnType=types.IntegerType())

In [62]:
measurements_schema = types.StructType([
  types.StructField('day', types.TimestampType(), False),
  types.StructField('interval', types.IntegerType(), False),
  types.StructField('detid', types.StringType(), False),
  types.StructField('flow', types.IntegerType(), True),
  types.StructField('occ', types.FloatType(), True),
  types.StructField('error', types.FloatType(), True),
  types.StructField('city', types.StringType(), False),
  types.StructField('speed', types.FloatType(), True)
])

In [None]:
measurements_df = spark.read\
                        .option('header', True)\
                        .schema(measurements_schema)\
                        .csv('./data/measurements_test.csv')

In [None]:
measurements_df = measurements_df\
  .withColumn('error', F.col('error').cast('integer'))\
  .withColumn('datetime', get_datetime_udf('day', 'interval'))\
  .withColumn('hour', get_datetime_hour_udf('datetime'))\
  .drop('day', 'interval')

In [None]:
measurements_df.printSchema()

root
 |-- detid: string (nullable = true)
 |-- flow: integer (nullable = true)
 |-- occ: float (nullable = true)
 |-- error: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- speed: float (nullable = true)
 |-- datetime: timestamp (nullable = true)



In [None]:
measurements_df.writ

[Stage 0:>                                                          (0 + 1) / 1]

+--------+----+----+-----+--------+-----+-------------------+
|   detid|flow| occ|error|    city|speed|           datetime|
+--------+----+----+-----+--------+-----+-------------------+
|06.X-2li|  12| 0.0|    1|augsburg| NULL|2017-05-06 00:00:00|
|06.X-2li|  12| 0.0|    1|augsburg| NULL|2017-05-06 00:05:00|
|06.X-2li|  12| 0.0|    1|augsburg| NULL|2017-05-06 00:10:00|
|06.X-2li|  16| 0.0|    1|augsburg| NULL|2017-05-06 00:15:00|
|06.X-2li|  16| 0.0|    1|augsburg| NULL|2017-05-06 00:20:00|
|06.X-2li|  16| 0.0|    1|augsburg| NULL|2017-05-06 00:25:00|
|06.X-2li|  20| 0.0|    1|augsburg| NULL|2017-05-06 00:30:00|
|06.X-2li|  20| 0.0|    1|augsburg| NULL|2017-05-06 00:35:00|
|06.X-2li|  20| 0.0|    1|augsburg| NULL|2017-05-06 00:40:00|
|06.X-2li|   8|0.01|    1|augsburg| NULL|2017-05-06 00:45:00|
|06.X-2li|   8|0.01|    1|augsburg| NULL|2017-05-06 00:50:00|
|06.X-2li|   8|0.01|    1|augsburg| NULL|2017-05-06 00:55:00|
|06.X-2li|   4| 0.0|    1|augsburg| NULL|2017-05-06 01:00:00|
|06.X-2l

                                                                                

## Detectors

In [None]:
detectors_schema = types.StructType([
  types.StructField('detid', types.StringType(), False),
  types.StructField('length', types.DoubleType(), False),
  types.StructField('pos', types.DoubleType(), False),
  types.StructField('fclass', types.StringType(), False),
  types.StructField('road', types.StringType(), False),
  types.StructField('limit', types.IntegerType(), False),
  types.StructField('citycode', types.StringType(), False),
  types.StructField('lanes', types.IntegerType(), False),
  types.StructField('linkid', types.IntegerType(), False),
  types.StructField('long', types.DoubleType(), False),
  types.StructField('lat', types.DoubleType(), False)
])

In [13]:
detectors_df = spark.read\
                .option("header", True)\
                .schema(detectors_schema)\
                .csv('./data/detectors_public.csv')

In [14]:
detectors_df.printSchema()

root
 |-- detid: string (nullable = true)
 |-- length: double (nullable = true)
 |-- pos: double (nullable = true)
 |-- fclass: string (nullable = true)
 |-- road: string (nullable = true)
 |-- limit: integer (nullable = true)
 |-- citycode: string (nullable = true)
 |-- lanes: integer (nullable = true)
 |-- linkid: integer (nullable = true)
 |-- long: double (nullable = true)
 |-- lat: double (nullable = true)



In [15]:
detectors_df.show(20)

+------+-----------------+-----------------+-----------+----------------+-----+--------+-----+------+----------+----------+
| detid|           length|              pos|     fclass|            road|limit|citycode|lanes|linkid|      long|       lat|
+------+-----------------+-----------------+-----------+----------------+-----+--------+-----+------+----------+----------+
|U1-52G|0.196036689548578|0.005511921838942|  secondary|Gögginger Straße|   50|augsburg|    1|    72|10.8895527| 48.359957|
|U1-51G|0.130039071784385|0.004013209518671|  secondary|Gögginger Straße|   50|augsburg|    1|    73| 10.889601|48.3599454|
|U1-52L|0.155863214591286|0.022227823915315|  secondary|Gögginger Straße|   50|augsburg|    1|    70|10.8893555|48.3598759|
|U1-51L|0.197675412740605|0.021889369247481|  secondary|Gögginger Straße|   50|augsburg|    1|    71|10.8893958|48.3598617|
| U1-62|0.065182655061287|0.024465133197684|  secondary|   Rosenaustraße|   50|augsburg|    1|    68|10.8893609|48.3605781|
| U1-61|

## Links

In [24]:
links_schema = types.StructType([
  types.StructField('long', types.DoubleType(), False),
  types.StructField('lat', types.DoubleType(), False),
  types.StructField('order', types.IntegerType(), False),
  types.StructField('piece', types.IntegerType(), False),
  types.StructField('linkid', types.IntegerType(), False),
  types.StructField('group', types.FloatType(), False),
  types.StructField('citycode', types.StringType(), False)
])

In [26]:
links_df = spark.read\
            .option("header", True)\
            .schema(links_schema)\
            .csv('./data/links.csv')

In [27]:
links_df.printSchema()

root
 |-- long: double (nullable = true)
 |-- lat: double (nullable = true)
 |-- order: integer (nullable = true)
 |-- piece: integer (nullable = true)
 |-- linkid: integer (nullable = true)
 |-- group: float (nullable = true)
 |-- citycode: string (nullable = true)



In [28]:
links_df.show(10)

+----------+----------+-----+-----+------+-----+--------+
|      long|       lat|order|piece|linkid|group|citycode|
+----------+----------+-----+-----+------+-----+--------+
|10.8910158|48.3610789|    1|    1|     0|  0.1|augsburg|
|10.8908938|48.3609933|    2|    1|     0|  0.1|augsburg|
|10.8906417|48.3608526|    3|    1|     0|  0.1|augsburg|
|10.8904043|48.3607189|    4|    1|     0|  0.1|augsburg|
|10.8899939| 48.360497|    5|    1|     0|  0.1|augsburg|
|10.8909796|48.3611074|    1|    1|     1|  1.1|augsburg|
|10.8908589|48.3610201|    2|    1|     1|  1.1|augsburg|
| 10.890588|48.3608561|    3|    1|     1|  1.1|augsburg|
|10.8903842|48.3607421|    4|    1|     1|  1.1|augsburg|
|10.8901803|48.3606378|    5|    1|     1|  1.1|augsburg|
+----------+----------+-----+-----+------+-----+--------+
only showing top 10 rows



In [68]:
spark.stop()