In [38]:
from pyspark.sql import SparkSession, types, functions as F
import datetime

In [8]:
# Start (or reuse) a Spark session running in local mode on all available cores.
spark = (
  SparkSession.builder
  .master('local[*]')
  .appName('test')
  .getOrCreate()
)

## Measurements

In [52]:
def get_datetime(day, s):
  """Return ``day`` shifted forward by ``s`` seconds.

  Args:
    day: A ``datetime.datetime`` marking the start of the measurement day,
      or None.
    s: Offset from ``day`` in seconds, or None.

  Returns:
    ``day + datetime.timedelta(seconds=s)``, or None when either argument
    is None.
  """
  # When used as a Spark UDF, SQL NULLs arrive as None. Without this guard
  # timedelta(seconds=None) raises TypeError and fails the whole job.
  if day is None or s is None:
    return None
  return day + datetime.timedelta(seconds=s)

In [53]:
# Expose get_datetime to Spark as a UDF whose result column is a timestamp.
get_datetime_udf = F.udf(
  f=get_datetime,
  returnType=types.TimestampType(),
)

In [48]:
# # date_python = datetime.datetime(year=1991, month=2, day=20, hour=0, minute=0, second=0)
# date_python = datetime.date(year=2020, month=2, day=20)
# seconds = 200
# print(date_python)
# print(get_datetime(date_python, seconds))

In [49]:
# Explicit schema for the measurements CSV.
# Each entry is (column name, Spark type, nullable).
measurements_schema = types.StructType([
  types.StructField(column, dtype, nullable)
  for column, dtype, nullable in (
    ('day', types.TimestampType(), False),
    ('interval', types.IntegerType(), False),
    ('detid', types.StringType(), False),
    ('flow', types.IntegerType(), True),
    ('occ', types.FloatType(), True),
    ('error', types.FloatType(), True),
    ('city', types.StringType(), False),
    ('speed', types.FloatType(), True),
  )
])

In [50]:
# Load the test CSV, treating the first row as a header and applying the
# explicit schema instead of inferring types.
# NOTE(review): this rebinds `measurements_schema` from the StructType to the
# resulting DataFrame, so re-running this cell without first re-running the
# schema cell would pass a DataFrame as `schema`.
measurements_schema = spark.read.csv(
  './data/measurements_test.csv',
  schema=measurements_schema,
  header=True,
)

In [54]:
# Cast `error` to an integer column, then derive `datetime` by applying the
# UDF to the `day` and `interval` columns.
measurements_schema = (
  measurements_schema
  .withColumn('error', F.col('error').cast('integer'))
  .withColumn('datetime', get_datetime_udf(F.col('day'), F.col('interval')))
)

In [55]:
# Print each column's name, type and nullability for the transformed DataFrame.
# NOTE(review): the captured output reports nullable = true for every column,
# including those declared non-nullable in the explicit schema.
measurements_schema.printSchema()

root
 |-- day: timestamp (nullable = true)
 |-- interval: integer (nullable = true)
 |-- detid: string (nullable = true)
 |-- flow: integer (nullable = true)
 |-- occ: float (nullable = true)
 |-- error: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- speed: float (nullable = true)
 |-- datetime: timestamp (nullable = true)



In [56]:
# Preview up to 20 rows to check the derived `datetime` column against
# `day` and `interval`.
measurements_schema.show(20)

[Stage 8:>                                                          (0 + 1) / 1]

+-------------------+--------+--------+----+----+-----+--------+-----+-------------------+
|                day|interval|   detid|flow| occ|error|    city|speed|           datetime|
+-------------------+--------+--------+----+----+-----+--------+-----+-------------------+
|2017-05-06 00:00:00|       0|06.X-2li|  12| 0.0|    1|augsburg| NULL|2017-05-06 00:00:00|
|2017-05-06 00:00:00|     300|06.X-2li|  12| 0.0|    1|augsburg| NULL|2017-05-06 00:05:00|
|2017-05-06 00:00:00|     600|06.X-2li|  12| 0.0|    1|augsburg| NULL|2017-05-06 00:10:00|
|2017-05-06 00:00:00|     900|06.X-2li|  16| 0.0|    1|augsburg| NULL|2017-05-06 00:15:00|
|2017-05-06 00:00:00|    1200|06.X-2li|  16| 0.0|    1|augsburg| NULL|2017-05-06 00:20:00|
|2017-05-06 00:00:00|    1500|06.X-2li|  16| 0.0|    1|augsburg| NULL|2017-05-06 00:25:00|
|2017-05-06 00:00:00|    1800|06.X-2li|  20| 0.0|    1|augsburg| NULL|2017-05-06 00:30:00|
|2017-05-06 00:00:00|    2100|06.X-2li|  20| 0.0|    1|augsburg| NULL|2017-05-06 00:35:00|

                                                                                

In [7]:
# Shut down the Spark session once the notebook is finished with it.
spark.stop()