In [129]:
from pyspark.sql import SparkSession, types
from pyspark.sql.functions import pandas_udf, broadcast
import pandas as pd

In [130]:
# Create the Spark session for this notebook, or reuse one that already exists.
# 'local[*]' runs Spark in local mode on this machine.
spark = (
  SparkSession.builder
  .master('local[*]')
  .appName('test')
  .getOrCreate()
)

## Measurements

In [131]:
# Column layout of the measurements CSV as (name, Spark type, nullable) triples,
# listed in file order.
# NOTE(review): the printSchema() output further down reports every column as
# nullable, so the nullable=False flags do not appear to be enforced by the CSV
# reader — confirm before relying on them.
_measurement_columns = [
  ('day', types.DateType(), False),
  ('interval', types.IntegerType(), False),
  ('detid', types.StringType(), False),
  ('flow', types.IntegerType(), True),
  ('occ', types.FloatType(), True),
  ('error', types.IntegerType(), True),
  ('city', types.StringType(), False),
  ('speed', types.FloatType(), True),
]

measurements_schema = types.StructType([
  types.StructField(col_name, col_type, nullable=is_nullable)
  for col_name, col_type, is_nullable in _measurement_columns
])

In [132]:
# Load the measurements CSV (first row is a header) using the explicit schema
# instead of letting Spark infer column types.
measurements_df = (
  spark.read
  .option("header", True)
  .schema(measurements_schema)
  .csv('./data/measurements_test.csv')
)

In [133]:
@pandas_udf(types.IntegerType())
def get_hour(intervals: pd.Series) -> pd.Series:
  """Floor-divide every value in ``intervals`` by 3600.

  Declared as a pandas UDF whose result is a Spark integer column.

  NOTE(review): presumably ``intervals`` holds seconds since midnight, which
  would make the result the hour of day — confirm against the data source.

  Args:
    intervals: batch of ``interval`` values handed over by Spark.

  Returns:
    A Series of the same length holding ``intervals // 3600``.
  """
  seconds_per_hour = 3600
  return intervals // seconds_per_hour

In [134]:
# Partition count of the freshly loaded DataFrame, before any repartitioning.
measurements_df.rdd.getNumPartitions()

8

In [None]:
# Repartition by (day, city), derive an `hour` column from `interval`, then
# drop `interval`.
#
# This cell overwrites `measurements_df` with a frame that no longer has the
# `interval` column, so running it a second time would fail when resolving
# get_hour('interval'). The guard turns a re-run into a no-op, keeping the
# notebook re-runnable without renaming the variable later cells rely on.
if 'interval' in measurements_df.columns:
  measurements_df = (
    measurements_df
    .repartition(20, 'day', 'city')
    .withColumn('hour', get_hour('interval'))
    .drop('interval')
  )

In [136]:
# Partition count after the repartition / hour-derivation cell.
# NOTE(review): the recorded output below shows 8 although repartition(20, ...)
# was requested, and that cell has no execution count — re-run from a fresh
# kernel to confirm the value.
measurements_df.rdd.getNumPartitions()



8

In [137]:
# Print column names, types and nullability of the transformed measurements
# (`interval` has been replaced by `hour` at this point).
measurements_df.printSchema()

root
 |-- day: date (nullable = true)
 |-- detid: string (nullable = true)
 |-- flow: integer (nullable = true)
 |-- occ: float (nullable = true)
 |-- error: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- speed: float (nullable = true)
 |-- hour: integer (nullable = true)



## Detectors

In [138]:
# Column layout of the detectors CSV as (name, Spark type) pairs, in file
# order. Nullability is left at StructField's default.
_detector_columns = [
  ('detid', types.StringType()),
  ('length', types.DoubleType()),
  ('pos', types.DoubleType()),
  ('fclass', types.StringType()),
  ('road', types.StringType()),
  ('limit', types.IntegerType()),
  ('citycode', types.StringType()),
  ('lanes', types.IntegerType()),
  ('linkid', types.IntegerType()),
  ('long', types.DoubleType()),
  ('lat', types.DoubleType()),
]

detectors_schema = types.StructType([
  types.StructField(col_name, col_type)
  for col_name, col_type in _detector_columns
])

In [139]:
# Load the detectors CSV (first row is a header) with the explicit schema.
detectors_df = (
  spark.read
  .option("header", True)
  .schema(detectors_schema)
  .csv('./data/detectors_public.csv')
)

In [140]:
# Rename `citycode` to `city` so the detectors share the column name that the
# measurements use; the join further down matches on ['detid', 'city'].
detectors_df = detectors_df.withColumnRenamed(existing='citycode', new='city')

In [141]:
# Preview the first 5 detector rows to sanity-check the parsed columns.
detectors_df.show(5)

+------+-----------------+-----------------+---------+----------------+-----+--------+-----+------+----------+----------+
| detid|           length|              pos|   fclass|            road|limit|    city|lanes|linkid|      long|       lat|
+------+-----------------+-----------------+---------+----------------+-----+--------+-----+------+----------+----------+
|U1-52G|0.196036689548578|0.005511921838942|secondary|Gögginger Straße|   50|augsburg|    1|    72|10.8895527| 48.359957|
|U1-51G|0.130039071784385|0.004013209518671|secondary|Gögginger Straße|   50|augsburg|    1|    73| 10.889601|48.3599454|
|U1-52L|0.155863214591286|0.022227823915315|secondary|Gögginger Straße|   50|augsburg|    1|    70|10.8893555|48.3598759|
|U1-51L|0.197675412740605|0.021889369247481|secondary|Gögginger Straße|   50|augsburg|    1|    71|10.8893958|48.3598617|
| U1-62|0.065182655061287|0.024465133197684|secondary|   Rosenaustraße|   50|augsburg|    1|    68|10.8893609|48.3605781|
+------+----------------

                                                                                

In [142]:
# Attach detector attributes to every measurement. The detectors frame is
# wrapped in a broadcast hint, and rows are matched on detector id and city.
# The join type is spelled out; 'inner' is also what join() uses by default.
detectors_hinted = broadcast(detectors_df)
df = measurements_df.join(detectors_hinted, on=['detid', 'city'], how='inner')

In [None]:
# Persist the joined data as Parquet, replacing any previous output.
# NOTE(review): the target directory ends in `.csv` although it holds Parquet
# files — consider renaming once all readers of this path are known.
df.write.mode('overwrite').parquet('./data/pq/measurements.csv')

25/04/09 11:16:09 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
                                                                                

## Links

In [144]:
# Column layout of the links CSV: column name -> Spark type, in file order
# (dicts preserve insertion order). Nullability is left at StructField's default.
_link_columns = {
  'long': types.DoubleType(),
  'lat': types.DoubleType(),
  'order': types.IntegerType(),
  'piece': types.IntegerType(),
  'linkid': types.IntegerType(),
  'group': types.FloatType(),
  'citycode': types.StringType(),
}

links_schema = types.StructType([
  types.StructField(col_name, col_type)
  for col_name, col_type in _link_columns.items()
])

In [145]:
# Load the links CSV (first row is a header) with the explicit schema.
links_df = (
  spark.read
  .option("header", True)
  .schema(links_schema)
  .csv('./data/links.csv')
)

In [146]:
# Preview the links data; show() without arguments prints the first 20 rows.
links_df.show()

+----------+----------+-----+-----+------+-----+--------+
|      long|       lat|order|piece|linkid|group|citycode|
+----------+----------+-----+-----+------+-----+--------+
|10.8910158|48.3610789|    1|    1|     0|  0.1|augsburg|
|10.8908938|48.3609933|    2|    1|     0|  0.1|augsburg|
|10.8906417|48.3608526|    3|    1|     0|  0.1|augsburg|
|10.8904043|48.3607189|    4|    1|     0|  0.1|augsburg|
|10.8899939| 48.360497|    5|    1|     0|  0.1|augsburg|
|10.8909796|48.3611074|    1|    1|     1|  1.1|augsburg|
|10.8908589|48.3610201|    2|    1|     1|  1.1|augsburg|
| 10.890588|48.3608561|    3|    1|     1|  1.1|augsburg|
|10.8903842|48.3607421|    4|    1|     1|  1.1|augsburg|
|10.8901803|48.3606378|    5|    1|     1|  1.1|augsburg|
|10.8899577|48.3605166|    6|    1|     1|  1.1|augsburg|
|10.8935881|48.3648605|    1|    1|     2|  2.1|augsburg|
|10.8935827| 48.364693|    2|    1|     2|  2.1|augsburg|
|10.8934379|48.3644453|    3|    1|     2|  2.1|augsburg|
|10.8928666|48

In [147]:
# Shut down the Spark session now that all reads and writes are finished.
spark.stop()