In [58]:
from pyspark.sql import SparkSession, types, functions as F
import datetime

In [59]:
# Local Spark session using every available core ('local[*]'); reuses an
# already-running session if one exists (getOrCreate).
spark = (
  SparkSession.builder
  .master('local[*]')
  .appName('test')
  .getOrCreate()
)

## Measurements

In [60]:
def get_datetime(day, s):
  """Return the timestamp that lies `s` seconds after `day`.

  Args:
    day: a `datetime.datetime` to offset, or None.
    s: offset in seconds, or None.

  Returns:
    `day + timedelta(seconds=s)`, or None when either argument is None.
  """
  # This function is wrapped as a Spark UDF, and the loaded columns are
  # nullable, so SQL NULLs reach it as None. Propagate the null instead of
  # raising TypeError and failing the whole job.
  if day is None or s is None:
    return None
  return day + datetime.timedelta(seconds=s)

In [61]:
# Expose get_datetime to Spark as a column-level UDF that yields timestamps.
get_datetime_udf = F.udf(
  f=get_datetime,
  returnType=types.TimestampType(),
)

In [62]:
# Explicit column types for the measurements CSV, in file column order.
# The third argument is the declared nullability; note that the printSchema
# output further down reports every loaded column as nullable = true, so the
# False flags here are not reflected in the resulting DataFrame.
measurements_schema = (
  types.StructType()
  .add('day', types.TimestampType(), False)
  .add('interval', types.IntegerType(), False)
  .add('detid', types.StringType(), False)
  .add('flow', types.IntegerType(), True)
  .add('occ', types.FloatType(), True)
  .add('error', types.FloatType(), True)
  .add('city', types.StringType(), False)
  .add('speed', types.FloatType(), True)
)

In [63]:
# Load the measurements CSV (first row is a header) using the explicit schema.
# NOTE(review): this rebinds `measurements_schema` from the StructType to the
# loaded DataFrame, so re-running this cell without first re-running the
# schema cell would pass a DataFrame to .schema(). Consider a separate name.
measurements_schema = (
  spark.read
  .schema(measurements_schema)
  .option('header', True)
  .csv('./data/measurements_test.csv')
)

In [65]:
# Cast `error` to an integer column, combine `day` and `interval` into a single
# `datetime` column via the UDF, then drop the two source columns.
measurements_schema = (
  measurements_schema
  .withColumn('error', F.col('error').cast(types.IntegerType()))
  .withColumn('datetime', get_datetime_udf(F.col('day'), F.col('interval')))
  .drop('day', 'interval')
)

In [66]:
# Print the column names, types and nullability of the transformed DataFrame.
measurements_schema.printSchema()

root
 |-- detid: string (nullable = true)
 |-- flow: integer (nullable = true)
 |-- occ: float (nullable = true)
 |-- error: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- speed: float (nullable = true)
 |-- datetime: timestamp (nullable = true)



In [67]:
# Preview the first 20 rows of the transformed DataFrame.
measurements_schema.show(20)

[Stage 0:>                                                          (0 + 1) / 1]

+--------+----+----+-----+--------+-----+-------------------+
|   detid|flow| occ|error|    city|speed|           datetime|
+--------+----+----+-----+--------+-----+-------------------+
|06.X-2li|  12| 0.0|    1|augsburg| NULL|2017-05-06 00:00:00|
|06.X-2li|  12| 0.0|    1|augsburg| NULL|2017-05-06 00:05:00|
|06.X-2li|  12| 0.0|    1|augsburg| NULL|2017-05-06 00:10:00|
|06.X-2li|  16| 0.0|    1|augsburg| NULL|2017-05-06 00:15:00|
|06.X-2li|  16| 0.0|    1|augsburg| NULL|2017-05-06 00:20:00|
|06.X-2li|  16| 0.0|    1|augsburg| NULL|2017-05-06 00:25:00|
|06.X-2li|  20| 0.0|    1|augsburg| NULL|2017-05-06 00:30:00|
|06.X-2li|  20| 0.0|    1|augsburg| NULL|2017-05-06 00:35:00|
|06.X-2li|  20| 0.0|    1|augsburg| NULL|2017-05-06 00:40:00|
|06.X-2li|   8|0.01|    1|augsburg| NULL|2017-05-06 00:45:00|
|06.X-2li|   8|0.01|    1|augsburg| NULL|2017-05-06 00:50:00|
|06.X-2li|   8|0.01|    1|augsburg| NULL|2017-05-06 00:55:00|
|06.X-2li|   4| 0.0|    1|augsburg| NULL|2017-05-06 01:00:00|
|06.X-2l

                                                                                

In [68]:
# Stop the Spark session created at the top of the notebook.
spark.stop()