In [52]:
from pyspark.sql import SparkSession, types
from pyspark.sql.functions import pandas_udf
import pandas as pd

In [53]:
# Start (or reuse) a local Spark session named 'test'; 'local[*]' runs Spark
# in-process on the local machine.
spark = (
  SparkSession.builder
    .master('local[*]')
    .appName('test')
    .getOrCreate()
)

## Measurements

In [54]:
# Column layout of the measurements CSV, listed as (name, Spark type, nullable).
# NOTE(review): the printSchema() output further down reports every column as
# `nullable = true`, so the nullable=False flags appear not to be enforced by
# the CSV reader -- confirm before relying on them.
measurements_schema = types.StructType([
  types.StructField(name, dtype, nullable=nullable)
  for name, dtype, nullable in [
    ('day', types.DateType(), False),
    ('interval', types.IntegerType(), False),
    ('detid', types.StringType(), False),
    ('flow', types.IntegerType(), True),
    ('occ', types.FloatType(), True),
    ('error', types.IntegerType(), True),
    ('city', types.StringType(), False),
    ('speed', types.FloatType(), True),
  ]
])

In [55]:
# Read the measurements CSV (first line is a header row) with the explicit
# schema defined above rather than letting Spark infer the column types.
measurements_df = (
  spark.read
    .option("header", True)
    .schema(measurements_schema)
    .csv('./data/measurements_test.csv')
)

In [56]:
@pandas_udf(types.IntegerType())
def get_hour(intervals: pd.Series) -> pd.Series:
  """Vectorised UDF: floor-divide every interval value by 3600.

  Presumably `interval` is a number of seconds since midnight, which makes the
  result the hour of day -- TODO confirm against the dataset description.

  Args:
    intervals: batch of interval values handed over by Spark as a pandas Series.

  Returns:
    A Series of the same length holding `interval // 3600` for each row.
  """
  return intervals // 3600

In [57]:
# Partition count of the freshly loaded measurements DataFrame, i.e. before
# the repartition by ('day', 'city') performed in the next cell.
measurements_df.rdd.getNumPartitions()

8

In [58]:
# Repartition by (day, city), then replace the raw `interval` column with the
# derived `hour` column.
# NOTE: this cell rebinds `measurements_df` and drops `interval`, so running it
# a second time without re-reading the CSV fails on the missing column.
measurements_df = (
  measurements_df
    .repartition('day', 'city')
    .withColumn('hour', get_hour('interval'))
    .drop('interval')
)

In [59]:
# Partition count after the repartition by ('day', 'city') above.
measurements_df.rdd.getNumPartitions()



8

In [60]:
# Inspect the resulting columns: `interval` should be gone and `hour` present.
measurements_df.printSchema()

root
 |-- day: date (nullable = true)
 |-- detid: string (nullable = true)
 |-- flow: integer (nullable = true)
 |-- occ: float (nullable = true)
 |-- error: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- speed: float (nullable = true)
 |-- hour: integer (nullable = true)



## Detectors

In [61]:
# Column layout of the detectors CSV, listed as (name, Spark type). Nullability
# is left at the StructField default for every column.
detectors_schema = types.StructType([
  types.StructField(name, dtype)
  for name, dtype in [
    ('detid', types.StringType()),
    ('length', types.DoubleType()),
    ('pos', types.DoubleType()),
    ('fclass', types.StringType()),
    ('road', types.StringType()),
    ('limit', types.IntegerType()),
    ('citycode', types.StringType()),
    ('lanes', types.IntegerType()),
    ('linkid', types.IntegerType()),
    ('long', types.DoubleType()),
    ('lat', types.DoubleType()),
  ]
])

In [62]:
# Read the detector metadata CSV (header row present) with the explicit schema.
detectors_df = (
  spark.read
    .option("header", True)
    .schema(detectors_schema)
    .csv('./data/detectors_public.csv')
)

In [63]:
# Align the city column name with the measurements data ('city') so both
# frames can be joined on identically named keys.
detectors_df = detectors_df.withColumnRenamed(existing='citycode', new='city')

In [64]:
# Peek at the first five detector rows.
detectors_df.show(n=5)

+------+-----------------+-----------------+---------+----------------+-----+--------+-----+------+----------+----------+
| detid|           length|              pos|   fclass|            road|limit|    city|lanes|linkid|      long|       lat|
+------+-----------------+-----------------+---------+----------------+-----+--------+-----+------+----------+----------+
|U1-52G|0.196036689548578|0.005511921838942|secondary|Gögginger Straße|   50|augsburg|    1|    72|10.8895527| 48.359957|
|U1-51G|0.130039071784385|0.004013209518671|secondary|Gögginger Straße|   50|augsburg|    1|    73| 10.889601|48.3599454|
|U1-52L|0.155863214591286|0.022227823915315|secondary|Gögginger Straße|   50|augsburg|    1|    70|10.8893555|48.3598759|
|U1-51L|0.197675412740605|0.021889369247481|secondary|Gögginger Straße|   50|augsburg|    1|    71|10.8893958|48.3598617|
| U1-62|0.065182655061287|0.024465133197684|secondary|   Rosenaustraße|   50|augsburg|    1|    68|10.8893609|48.3605781|
+------+----------------

                                                                                

In [65]:
# Attach detector metadata to every measurement. This is an inner join, so
# measurements without a matching (detid, city) detector row are dropped.
# NOTE: the name `df` is rebound to a pandas DataFrame later in this notebook.
df = measurements_df.join(
  other=detectors_df,
  on=['detid', 'city'],
  how='inner',
)

## Links

In [68]:
# Column layout of the links CSV, listed as (name, Spark type). Nullability is
# left at the StructField default for every column.
links_schema = types.StructType([
  types.StructField(name, dtype)
  for name, dtype in [
    ('long', types.DoubleType()),
    ('lat', types.DoubleType()),
    ('order', types.IntegerType()),
    ('piece', types.IntegerType()),
    ('linkid', types.IntegerType()),
    ('group', types.FloatType()),
    ('citycode', types.StringType()),
  ]
])

In [69]:
# Read the links CSV (header row present) with the explicit schema.
links_df = (
  spark.read
    .option("header", True)
    .schema(links_schema)
    .csv('./data/links.csv')
)

In [70]:
# Peek at the first rows of the links table (20 is show()'s default row count).
links_df.show(n=20)

+----------+----------+-----+-----+------+-----+--------+
|      long|       lat|order|piece|linkid|group|citycode|
+----------+----------+-----+-----+------+-----+--------+
|10.8910158|48.3610789|    1|    1|     0|  0.1|augsburg|
|10.8908938|48.3609933|    2|    1|     0|  0.1|augsburg|
|10.8906417|48.3608526|    3|    1|     0|  0.1|augsburg|
|10.8904043|48.3607189|    4|    1|     0|  0.1|augsburg|
|10.8899939| 48.360497|    5|    1|     0|  0.1|augsburg|
|10.8909796|48.3611074|    1|    1|     1|  1.1|augsburg|
|10.8908589|48.3610201|    2|    1|     1|  1.1|augsburg|
| 10.890588|48.3608561|    3|    1|     1|  1.1|augsburg|
|10.8903842|48.3607421|    4|    1|     1|  1.1|augsburg|
|10.8901803|48.3606378|    5|    1|     1|  1.1|augsburg|
|10.8899577|48.3605166|    6|    1|     1|  1.1|augsburg|
|10.8935881|48.3648605|    1|    1|     2|  2.1|augsburg|
|10.8935827| 48.364693|    2|    1|     2|  2.1|augsburg|
|10.8934379|48.3644453|    3|    1|     2|  2.1|augsburg|
|10.8928666|48

In [71]:
# Shut down the Spark session; the cells that follow in this section use
# pandas only.
spark.stop()

In [72]:
# Load the detector metadata with pandas to check how unique `detid` is.
# `detid` is pinned to str so that numeric-looking ids (e.g. '349') can never
# be inferred as integers; this matches the StringType used for `detid` in the
# Spark schema and the string comparison `df.detid == '349'` further down.
# NOTE: this rebinds `df`, which previously held the Spark join result.
df = pd.read_csv('./data/detectors_public.csv', dtype={'detid': str})

In [73]:
# (rows, columns) of the detector table.
df.shape

(23626, 11)

In [74]:
# First five detector rows.
df.head()

Unnamed: 0,detid,length,pos,fclass,road,limit,citycode,lanes,linkid,long,lat
0,U1-52G,0.196037,0.005512,secondary,Gögginger Straße,50,augsburg,1.0,72.0,10.889553,48.359957
1,U1-51G,0.130039,0.004013,secondary,Gögginger Straße,50,augsburg,1.0,73.0,10.889601,48.359945
2,U1-52L,0.155863,0.022228,secondary,Gögginger Straße,50,augsburg,1.0,70.0,10.889356,48.359876
3,U1-51L,0.197675,0.021889,secondary,Gögginger Straße,50,augsburg,1.0,71.0,10.889396,48.359862
4,U1-62,0.065183,0.024465,secondary,Rosenaustraße,50,augsburg,1.0,68.0,10.889361,48.360578


In [75]:
# Number of distinct detector ids; compare with the row count from df.shape to
# see whether `detid` alone identifies a detector. dropna=False keeps a missing
# id counted as its own value, exactly like len(unique()).
df['detid'].nunique(dropna=False)

21996

In [76]:
# For every detid, count the distinct values found in each other column, and
# list the ids with the most distinct `length` values first.
(
  df.groupby('detid', as_index=False)
    .nunique()
    .sort_values(by='length', ascending=False)
)

Unnamed: 0,detid,length,pos,fclass,road,limit,citycode,lanes,linkid,long,lat
6543,423,6,6,3,6,2,6,2,6,6,6
7541,687,5,5,3,5,3,5,2,5,5,5
4528,215,5,5,4,5,2,5,3,5,5,5
6065,349,5,5,3,4,2,5,2,5,5,5
4968,224,5,5,3,4,3,5,2,5,5,5
...,...,...,...,...,...,...,...,...,...,...,...
39,00055PMA0003,1,1,1,1,1,1,1,1,1,1
40,00055PMA0004,1,1,1,1,1,1,1,1,1,1
41,00056PMA0003,1,1,1,1,1,1,1,1,1,1
42,00056PMA0004,1,1,1,1,1,1,1,1,1,1


In [77]:
# Show every row that carries detid '349', one of the ids with several
# distinct `length` values in the table above.
df[df['detid'] == '349']

Unnamed: 0,detid,length,pos,fclass,road,limit,citycode,lanes,linkid,long,lat
14764,349,0.698863,0.0,tertiary,,,melbourne,1.0,349.0,144.963819,-37.81872
16198,349,0.183964,0.166846,tertiary,Rue de Sèvres,,paris,1.0,156.0,2.327383,48.851629
17533,349,0.138773,0.039354,secondary,é???å??äº??,0.0,taipeh,2.0,203.0,121.527066,25.032555
20754,349,0.274305,0.265567,residential,Rue Pierrette Louin,0.0,toulouse,1.0,487.0,1.452,43.627787
22352,349,0.199713,0.035497,secondary,Naugarduko g.,50.0,vilnius,1.0,419.0,25.271106,54.675191


In [78]:
# The distinct detector ids, in order of first appearance.
df['detid'].unique()

array(['U1-52G', 'U1-51G', 'U1-52L', ..., 'K426D11', 'K437D17', 'K38D11'],
      dtype=object)