In [1]:
import pyspark

In [2]:
# Shell escape: print the raw CSV so the reader can see the header row and the NaN entries.
!cat example_data.csv

n,g
15.0,b
23.0,c
6.0,c
NaN,c
26.0,b
12.0,b
8.0,a
18.0,c
14.0,a
20.0,c
22.0,a
21.0,a
1.0,c
0.0,a
17.0,b
2.0,a
7.0,a
16.0,b
24.0,b
10.0,a
4.0,c
27.0,c
25.0,b
NaN,a
11.0,a
5.0,a
19.0,c
29.0,c
28.0,a
13.0,b


In [4]:
# Get the active SparkSession, or create one if none exists yet.
spark = pyspark.sql.SparkSession.builder.getOrCreate()
# Bare expression so the notebook displays the session.
spark

In [13]:
# header=True takes the column names (n, g) from the first line of the file.
# inferSchema is not requested here; the numeric column is cast in a later cell.
df = spark.read.csv('./example_data.csv', header=True)

In [17]:
# Three ways to get typed columns out of a CSV:
#   1. infer the schema while reading
#   2. specify a schema explicitly
#   3. cast the columns after reading   <- the approach used here

df = df.select(df.n.cast('float'), df.g)

In [18]:
# The result of select() is still a Spark DataFrame.
type(df)

pyspark.sql.dataframe.DataFrame

## More DataFrame Manipulation

In [22]:
# Give the columns descriptive names; every later cell in this section uses
# `number` and `group`.
df = (df
 .withColumnRenamed('n', 'number')
 .withColumnRenamed('g', 'group'))

In [26]:
# Add a derived boolean column using attribute access on the DataFrame.
# `%` binds tighter than `==`, so this is (number % 2) == 0.
# NaN rows evaluate to false (see the output below).
df.withColumn('n_is_even', df.number % 2 == 0).show()

+------+-----+---------+
|number|group|n_is_even|
+------+-----+---------+
|  15.0|    b|    false|
|  23.0|    c|    false|
|   6.0|    c|     true|
|   NaN|    c|    false|
|  26.0|    b|     true|
|  12.0|    b|     true|
|   8.0|    a|     true|
|  18.0|    c|     true|
|  14.0|    a|     true|
|  20.0|    c|     true|
|  22.0|    a|     true|
|  21.0|    a|    false|
|   1.0|    c|    false|
|   0.0|    a|     true|
|  17.0|    b|    false|
|   2.0|    a|     true|
|   7.0|    a|    false|
|  16.0|    b|     true|
|  24.0|    b|     true|
|  10.0|    a|     true|
+------+-----+---------+
only showing top 20 rows



In [27]:
from pyspark.sql.functions import col, expr

In [31]:
# Same derived column, but referencing the column by name with col()
# instead of through the DataFrame object.
df.withColumn('n_is_even', col('number') % 2 == 0)

DataFrame[number: float, group: string, n_is_even: boolean]

In [29]:
# Same derived column again, written as a SQL expression string
# (note the single `=` for equality inside expr()).
df.withColumn('n_is_even', expr('number % 2 = 0'))

DataFrame[number: float, group: string, n_is_even: boolean]

In [35]:
# selectExpr() takes SQL expression strings directly. The second expression has
# no alias, so Spark generates the column name shown in the output header.
df.selectExpr('number + 1 as incremented', 'number % 2 = 0').show()

+-----------+------------------+
|incremented|((number % 2) = 0)|
+-----------+------------------+
|       16.0|             false|
|       24.0|             false|
|        7.0|              true|
|        NaN|             false|
|       27.0|              true|
|       13.0|              true|
|        9.0|              true|
|       19.0|              true|
|       15.0|              true|
|       21.0|              true|
|       23.0|              true|
|       22.0|             false|
|        2.0|             false|
|        1.0|              true|
|       18.0|             false|
|        3.0|              true|
|        8.0|             false|
|       17.0|              true|
|       25.0|              true|
|       11.0|              true|
+-----------+------------------+
only showing top 20 rows



In [38]:
# '*' keeps all existing columns; the second expression appends an aliased one.
df.selectExpr('*', 'number % 2 = 0 as number_is_even').show()

+------+-----+--------------+
|number|group|number_is_even|
+------+-----+--------------+
|  15.0|    b|         false|
|  23.0|    c|         false|
|   6.0|    c|          true|
|   NaN|    c|         false|
|  26.0|    b|          true|
|  12.0|    b|          true|
|   8.0|    a|          true|
|  18.0|    c|          true|
|  14.0|    a|          true|
|  20.0|    c|          true|
|  22.0|    a|          true|
|  21.0|    a|         false|
|   1.0|    c|         false|
|   0.0|    a|          true|
|  17.0|    b|         false|
|   2.0|    a|          true|
|   7.0|    a|         false|
|  16.0|    b|          true|
|  24.0|    b|          true|
|  10.0|    a|          true|
+------+-----+--------------+
only showing top 20 rows



In [46]:
# drop() returns a new DataFrame without the named column; `df` itself is unchanged.
df.drop('group').show()

+------+
|number|
+------+
|  15.0|
|  23.0|
|   6.0|
|   NaN|
|  26.0|
|  12.0|
|   8.0|
|  18.0|
|  14.0|
|  20.0|
|  22.0|
|  21.0|
|   1.0|
|   0.0|
|  17.0|
|   2.0|
|   7.0|
|  16.0|
|  24.0|
|  10.0|
+------+
only showing top 20 rows



In [48]:
# limit(n) keeps only n rows of the DataFrame.
df.limit(10).show()

+------+-----+
|number|group|
+------+-----+
|  15.0|    b|
|  23.0|    c|
|   6.0|    c|
|   NaN|    c|
|  26.0|    b|
|  12.0|    b|
|   8.0|    a|
|  18.0|    c|
|  14.0|    a|
|  20.0|    c|
+------+-----+



In [50]:
# Row filter with a Column expression. The NaN rows do not satisfy `< 10`
# and are absent from the output.
df.where(df.number < 10).show()

+------+-----+
|number|group|
+------+-----+
|   6.0|    c|
|   8.0|    a|
|   1.0|    c|
|   0.0|    a|
|   2.0|    a|
|   7.0|    a|
|   4.0|    c|
|   5.0|    a|
+------+-----+



In [53]:
# where() accepts either a SQL string or a Column expression; chained calls
# combine as AND (here: 4 < number < 10).
df.where('number < 10').where(col('number') > 4).show()

+------+-----+
|number|group|
+------+-----+
|   6.0|    c|
|   8.0|    a|
|   7.0|    a|
|   5.0|    a|
+------+-----+



In [58]:
# between() is inclusive on both ends: 4.0 and 10.0 both appear in the output.
df.where(df.number.between(4, 10)).show()

+------+-----+
|number|group|
+------+-----+
|   6.0|    c|
|   8.0|    a|
|   7.0|    a|
|  10.0|    a|
|   4.0|    c|
|   5.0|    a|
+------+-----+



In [60]:
# filter() is used here the same way as where() above.
# Gotcha: the NaN row for group 'c' passes `number > 10` (see output), because
# Spark orders NaN above every other numeric value. Drop or fill NaN first if
# that row should not be included.
df.filter(df.group == 'c').filter(df.number > 10).show()

+------+-----+
|number|group|
+------+-----+
|  23.0|    c|
|   NaN|    c|
|  18.0|    c|
|  20.0|    c|
|  27.0|    c|
|  19.0|    c|
|  29.0|    c|
+------+-----+



In [66]:
# Draw a random sample in which each row is kept with probability 0.6, so the
# number of rows returned is approximate rather than exactly 60%.
# A fixed seed makes the sample identical on every Restart & Run All; without
# one, this cell shows different rows each time it is executed.
df.sample(fraction=0.6, seed=42).show()

+------+-----+
|number|group|
+------+-----+
|  15.0|    b|
|  23.0|    c|
|   6.0|    c|
|   NaN|    c|
|  26.0|    b|
|   8.0|    a|
|  18.0|    c|
|  14.0|    a|
|  20.0|    c|
|  21.0|    a|
|   1.0|    c|
|   0.0|    a|
|  17.0|    b|
|   2.0|    a|
|  16.0|    b|
|  24.0|    b|
|   4.0|    c|
|  27.0|    c|
|  25.0|    b|
|   5.0|    a|
+------+-----+
only showing top 20 rows



In [67]:
# Ascending sort on a single column (ascending is the default).
df.sort(df.number).show()

+------+-----+
|number|group|
+------+-----+
|   0.0|    a|
|   1.0|    c|
|   2.0|    a|
|   4.0|    c|
|   5.0|    a|
|   6.0|    c|
|   7.0|    a|
|   8.0|    a|
|  10.0|    a|
|  11.0|    a|
|  12.0|    b|
|  13.0|    b|
|  14.0|    a|
|  15.0|    b|
|  16.0|    b|
|  17.0|    b|
|  18.0|    c|
|  19.0|    c|
|  20.0|    c|
|  21.0|    a|
+------+-----+
only showing top 20 rows



In [70]:
# Sort by group first, then by number within each group.
# In ascending order NaN sorts after every other value (see group 'a' in the output).
df.orderBy(df.group, df.number).show()

+------+-----+
|number|group|
+------+-----+
|   0.0|    a|
|   2.0|    a|
|   5.0|    a|
|   7.0|    a|
|   8.0|    a|
|  10.0|    a|
|  11.0|    a|
|  14.0|    a|
|  21.0|    a|
|  22.0|    a|
|  28.0|    a|
|   NaN|    a|
|  12.0|    b|
|  13.0|    b|
|  15.0|    b|
|  16.0|    b|
|  17.0|    b|
|  24.0|    b|
|  25.0|    b|
|  26.0|    b|
+------+-----+
only showing top 20 rows



In [74]:
from pyspark.sql.functions import desc

# Two equivalent ways to request descending order: the Column.desc() method
# and the desc() function applied to a column name.
df.orderBy(df.group.desc(), desc('number')).show()

+------+-----+
|number|group|
+------+-----+
|   NaN|    c|
|  29.0|    c|
|  27.0|    c|
|  23.0|    c|
|  20.0|    c|
|  19.0|    c|
|  18.0|    c|
|   6.0|    c|
|   4.0|    c|
|   1.0|    c|
|  26.0|    b|
|  25.0|    b|
|  24.0|    b|
|  17.0|    b|
|  16.0|    b|
|  15.0|    b|
|  13.0|    b|
|  12.0|    b|
|   NaN|    a|
|  28.0|    a|
+------+-----+
only showing top 20 rows



In [81]:
# Descending sort on number; the NaN rows come first because NaN is treated
# as the largest value.
df.orderBy(df.number.desc()).show()

+------+-----+
|number|group|
+------+-----+
|   NaN|    c|
|   NaN|    a|
|  29.0|    c|
|  28.0|    a|
|  27.0|    c|
|  26.0|    b|
|  25.0|    b|
|  24.0|    b|
|  23.0|    c|
|  22.0|    a|
|  21.0|    a|
|  20.0|    c|
|  19.0|    c|
|  18.0|    c|
|  17.0|    b|
|  16.0|    b|
|  15.0|    b|
|  14.0|    a|
|  13.0|    b|
|  12.0|    b|
+------+-----+
only showing top 20 rows



In [83]:
# Same descending sort, expressed with the ascending=False keyword instead of desc().
df.sort('number', ascending=False).show()

+------+-----+
|number|group|
+------+-----+
|   NaN|    c|
|   NaN|    a|
|  29.0|    c|
|  28.0|    a|
|  27.0|    c|
|  26.0|    b|
|  25.0|    b|
|  24.0|    b|
|  23.0|    c|
|  22.0|    a|
|  21.0|    a|
|  20.0|    c|
|  19.0|    c|
|  18.0|    c|
|  17.0|    b|
|  16.0|    b|
|  15.0|    b|
|  14.0|    a|
|  13.0|    b|
|  12.0|    b|
+------+-----+
only showing top 20 rows



In [87]:
# NOTE(review): the subset here is the float column `number`, while the values
# being replaced are strings. The output below is unchanged ('a' is still 'a'),
# i.e. the replacement is silently a no-op for a column of a non-matching type.
# To actually replace 'a' with 'D', the subset would need to be ['group'].
df.replace('a', 'D', ['number']).show()

+------+-----+
|number|group|
+------+-----+
|  15.0|    b|
|  23.0|    c|
|   6.0|    c|
|   NaN|    c|
|  26.0|    b|
|  12.0|    b|
|   8.0|    a|
|  18.0|    c|
|  14.0|    a|
|  20.0|    c|
|  22.0|    a|
|  21.0|    a|
|   1.0|    c|
|   0.0|    a|
|  17.0|    b|
|   2.0|    a|
|   7.0|    a|
|  16.0|    b|
|  24.0|    b|
|  10.0|    a|
+------+-----+
only showing top 20 rows



In [90]:
# Replace several values at once: the two lists are matched position by
# position ('a' -> 'A', 'b' -> 'B'), restricted to the `group` column.
# Values not listed ('c') are left as they are.
df.replace(['a', 'b'], ['A', 'B'], ['group']).show()

+------+-----+
|number|group|
+------+-----+
|  15.0|    B|
|  23.0|    c|
|   6.0|    c|
|   NaN|    c|
|  26.0|    B|
|  12.0|    B|
|   8.0|    A|
|  18.0|    c|
|  14.0|    A|
|  20.0|    c|
|  22.0|    A|
|  21.0|    A|
|   1.0|    c|
|   0.0|    A|
|  17.0|    B|
|   2.0|    A|
|   7.0|    A|
|  16.0|    B|
|  24.0|    B|
|  10.0|    A|
+------+-----+
only showing top 20 rows



In [97]:
# Row count before and after dropping rows with missing values:
# the two NaN rows from the CSV account for the difference (30 vs 28).
df.count(), df.na.drop().count()

(30, 28)

In [101]:
# Replace missing values in `number` with 0 (the NaN in the 4th row becomes 0.0).
df.na.fill(0, ['number']).show()

+------+-----+
|number|group|
+------+-----+
|  15.0|    b|
|  23.0|    c|
|   6.0|    c|
|   0.0|    c|
|  26.0|    b|
|  12.0|    b|
|   8.0|    a|
|  18.0|    c|
|  14.0|    a|
|  20.0|    c|
|  22.0|    a|
|  21.0|    a|
|   1.0|    c|
|   0.0|    a|
|  17.0|    b|
|   2.0|    a|
|   7.0|    a|
|  16.0|    b|
|  24.0|    b|
|  10.0|    a|
+------+-----+
only showing top 20 rows



In [127]:
# Compute the column mean over the rows that have a value, so it can be used
# as the fill value for the missing entries in the next cell.

# agg() yields a one-row DataFrame; pull that single Row back to the driver.
row = df.dropna().agg(expr('avg(number)')).head()

# The aggregate has no alias, so the Row field is named after the expression.
the_average = row['avg(number)']

In [131]:
# Fill missing `number` values with the mean computed in the previous cell,
# then sort so the two filled rows (15.107142) are visible within the top 20.
df.na.fill(the_average, ['number']).sort('number').show()

+---------+-----+
|   number|group|
+---------+-----+
|      0.0|    a|
|      1.0|    c|
|      2.0|    a|
|      4.0|    c|
|      5.0|    a|
|      6.0|    c|
|      7.0|    a|
|      8.0|    a|
|     10.0|    a|
|     11.0|    a|
|     12.0|    b|
|     13.0|    b|
|     14.0|    a|
|     15.0|    b|
|15.107142|    c|
|15.107142|    a|
|     16.0|    b|
|     17.0|    b|
|     18.0|    c|
|     19.0|    c|
+---------+-----+
only showing top 20 rows



In [135]:
# Bring the first five rows to the driver as Row objects and inspect them:
# fields are reachable as attributes and come back as plain Python values.
for row in df.take(5):
    row_type, group_type = type(row), type(row.group)
    print('type(row): {}'.format(row_type))
    print('type(row.group): {}'.format(group_type))
    print('row.group:', row.group)
    print('---')

type(row): <class 'pyspark.sql.types.Row'>
type(row.group): <class 'str'>
row.group: b
---
type(row): <class 'pyspark.sql.types.Row'>
type(row.group): <class 'str'>
row.group: c
---
type(row): <class 'pyspark.sql.types.Row'>
type(row.group): <class 'str'>
row.group: c
---
type(row): <class 'pyspark.sql.types.Row'>
type(row.group): <class 'str'>
row.group: c
---
type(row): <class 'pyspark.sql.types.Row'>
type(row.group): <class 'str'>
row.group: b
---


## Built-in Functions

In [148]:
# Note that this will override some built-in python functions
from pyspark.sql.functions import *
import pyspark.sql.functions as F

In [223]:
# Load the case data, letting Spark infer column types this time.
# NOTE(review): this rebinds `df` from the example data to a different dataset;
# the cells above that use `number`/`group` will fail if re-run after this one
# unless the earlier load/cast/rename cells are executed again first.
df = spark.read.csv('./sa311/case.csv', header=True, inferSchema=True)

In [142]:
# Show the inferred column names and types. The date columns were inferred as
# plain strings and are parsed explicitly further down.
df.printSchema()

root
 |-- case_id: integer (nullable = true)
 |-- case_opened_date: string (nullable = true)
 |-- case_closed_date: string (nullable = true)
 |-- SLA_due_date: string (nullable = true)
 |-- case_late: string (nullable = true)
 |-- num_days_late: double (nullable = true)
 |-- case_closed: string (nullable = true)
 |-- dept_division: string (nullable = true)
 |-- service_request_type: string (nullable = true)
 |-- SLA_days: double (nullable = true)
 |-- case_status: string (nullable = true)
 |-- source_id: string (nullable = true)
 |-- request_address: string (nullable = true)
 |-- council_district: integer (nullable = true)



In [168]:
# printf-style formatting of a column: zero-pad the district number to 10 digits.
formatted_district = format_string('%010d', df['council_district'])

# Sort on the raw integer first (highest district on top), then swap the
# column for its formatted representation.
(df
 .select('council_district')
 .sort(desc('council_district'))
 .select(formatted_district.alias('district'))
 .show())

+----------+
|  district|
+----------+
|0000000010|
|0000000010|
|0000000010|
|0000000010|
|0000000010|
|0000000010|
|0000000010|
|0000000010|
|0000000010|
|0000000010|
|0000000010|
|0000000010|
|0000000010|
|0000000010|
|0000000010|
|0000000010|
|0000000010|
|0000000010|
|0000000010|
|0000000010|
+----------+
only showing top 20 rows



In [181]:
# Case conversion, once through the Python function API and once through a SQL
# expression string. vertical=True prints one field per line, which is easier to
# read for long strings; truncate=False shows the full value.
(df
 .select(upper(col('request_address')),
         expr('lower(request_address)'))
 .show(3, truncate=False, vertical=True))

-RECORD 0-------------------------------------------------------
 upper(request_address) | 2315  EL PASO ST, SAN ANTONIO, 78207  
 lower(request_address) | 2315  el paso st, san antonio, 78207  
-RECORD 1-------------------------------------------------------
 upper(request_address) | 2215  GOLIAD RD, SAN ANTONIO, 78223   
 lower(request_address) | 2215  goliad rd, san antonio, 78223   
-RECORD 2-------------------------------------------------------
 upper(request_address) | 102  PALFREY ST W, SAN ANTONIO, 78223 
 lower(request_address) | 102  palfrey st w, san antonio, 78223 
only showing top 3 rows



In [191]:
(df
 # substring(column, start, length): start is 1-based, so this takes
 # 6 characters beginning at the 8th character of the address.
 .select(df.request_address, substring(df.request_address, 8, 6))
 .show(3, truncate=False))

+-------------------------------------+--------------------------------+
|request_address                      |substring(request_address, 8, 6)|
+-------------------------------------+--------------------------------+
|2315  EL PASO ST, San Antonio, 78207 |L PASO                          |
|2215  GOLIAD RD, San Antonio, 78223  |OLIAD                           |
|102  PALFREY ST W, San Antonio, 78223|LFREY                           |
+-------------------------------------+--------------------------------+
only showing top 3 rows



In [204]:
# Addresses look like '<street>, <city>, <zip>'.
# Capture group 1 is the city, group 2 the trailing run of digits (zip code).
address_re = r'^.+,\s(.+),\s+(\d+)$'

# Rows that do not match the pattern (e.g. intersections without a city/zip)
# produce empty strings for both extracted columns.
(df
 .select(df['request_address'],
         regexp_extract(df['request_address'], address_re, 1).alias('city'),
         regexp_extract(df['request_address'], address_re, 2).alias('zip'))
 .show(truncate=False))

+----------------------------------------+-----------+-----+
|request_address                         |city       |zip  |
+----------------------------------------+-----------+-----+
|2315  EL PASO ST, San Antonio, 78207    |San Antonio|78207|
|2215  GOLIAD RD, San Antonio, 78223     |San Antonio|78223|
|102  PALFREY ST W, San Antonio, 78223   |San Antonio|78223|
|114  LA GARDE ST, San Antonio, 78223    |San Antonio|78223|
|734  CLEARVIEW DR, San Antonio, 78228   |San Antonio|78228|
|BANDERA RD and BRESNAHAN                |           |     |
|10133  FIGARO CANYON, San Antonio, 78251|San Antonio|78251|
|10133  FIGARO CANYON, San Antonio, 78251|San Antonio|78251|
|10133  FIGARO CANYON, San Antonio, 78251|San Antonio|78251|
|10133  FIGARO CANYON, San Antonio, 78251|San Antonio|78251|
|10133  FIGARO CANYON, San Antonio, 78251|San Antonio|78251|
|10133  FIGARO CANYON, San Antonio, 78251|San Antonio|78251|
|10129  BOXING PASS, San Antonio, 78251  |San Antonio|78251|
|10129  BOXING PASS, San

In [210]:
# Look at the raw date strings before parsing them: month/day/2-digit-year
# followed by an hour without zero padding and minutes.
(df
 .select(df.case_opened_date)
 .show())

+----------------+
|case_opened_date|
+----------------+
|     1/1/18 0:42|
|     1/1/18 0:46|
|     1/1/18 0:48|
|     1/1/18 1:29|
|     1/1/18 1:34|
|     1/1/18 6:28|
|     1/1/18 6:57|
|     1/1/18 6:58|
|     1/1/18 6:58|
|     1/1/18 6:59|
|     1/1/18 7:00|
|     1/1/18 7:02|
|     1/1/18 7:02|
|     1/1/18 7:03|
|     1/1/18 7:04|
|     1/1/18 7:04|
|     1/1/18 7:05|
|     1/1/18 7:06|
|     1/1/18 7:06|
|     1/1/18 7:07|
+----------------+
only showing top 20 rows



In [224]:
# Parse strings such as '1/1/18 0:42' into real timestamps.
# The data carries a two-digit year, so the pattern uses 'yy'. A single 'y'
# only yields 2018 under the legacy (Spark 2.x) parser; the Spark 3 datetime
# patterns treat a lone 'y' as a literal year, so '18' would not become 2018.
# 'yy' gives the intended result with both.
(df
 .select(to_timestamp(df.case_opened_date, 'M/d/yy H:mm').alias('timestamp'))
 .show())

+-------------------+
|          timestamp|
+-------------------+
|2018-01-01 00:42:00|
|2018-01-01 00:46:00|
|2018-01-01 00:48:00|
|2018-01-01 01:29:00|
|2018-01-01 01:34:00|
|2018-01-01 06:28:00|
|2018-01-01 06:57:00|
|2018-01-01 06:58:00|
|2018-01-01 06:58:00|
|2018-01-01 06:59:00|
|2018-01-01 07:00:00|
|2018-01-01 07:02:00|
|2018-01-01 07:02:00|
|2018-01-01 07:03:00|
|2018-01-01 07:04:00|
|2018-01-01 07:04:00|
|2018-01-01 07:05:00|
|2018-01-01 07:06:00|
|2018-01-01 07:06:00|
|2018-01-01 07:07:00|
+-------------------+
only showing top 20 rows



In [227]:
# Parse the opened date, then compute how many days ago each case was opened.
# 'yy' (not a single 'y') is used because the data has two-digit years; a lone
# 'y' is only interpreted as 20xx by the legacy Spark 2.x parser.
# datediff(end, start) returns whole days; because it is measured against
# current_timestamp(), the numbers shown depend on the day the cell is run.
(df
 .select(to_timestamp(df.case_opened_date, 'M/d/yy H:mm').alias('timestamp'))
 .select(col('timestamp'),
     datediff(current_timestamp(), col('timestamp')).alias('days_since_now'))
 .show())

+-------------------+--------------+
|          timestamp|days_since_now|
+-------------------+--------------+
|2018-01-01 00:42:00|           499|
|2018-01-01 00:46:00|           499|
|2018-01-01 00:48:00|           499|
|2018-01-01 01:29:00|           499|
|2018-01-01 01:34:00|           499|
|2018-01-01 06:28:00|           499|
|2018-01-01 06:57:00|           499|
|2018-01-01 06:58:00|           499|
|2018-01-01 06:58:00|           499|
|2018-01-01 06:59:00|           499|
|2018-01-01 07:00:00|           499|
|2018-01-01 07:02:00|           499|
|2018-01-01 07:02:00|           499|
|2018-01-01 07:03:00|           499|
|2018-01-01 07:04:00|           499|
|2018-01-01 07:04:00|           499|
|2018-01-01 07:05:00|           499|
|2018-01-01 07:06:00|           499|
|2018-01-01 07:06:00|           499|
|2018-01-01 07:07:00|           499|
+-------------------+--------------+
only showing top 20 rows



In [233]:
# Conditional column: keep num_days_late when it is non-negative, otherwise 0
# (i.e. clamp cases that were closed early to zero days late).
# NOTE(review): a null num_days_late presumably falls into otherwise() and
# becomes 0 here -- confirm this is the desired handling.
my_col = when(df.num_days_late >= 0, df.num_days_late).otherwise(0)

# The expression has no alias, so the output header is the generated CASE WHEN text.
(df
 .select(df.num_days_late)
 .select(my_col)
 .show())

+------------------------------------------------------------+
|CASE WHEN (num_days_late >= 0) THEN num_days_late ELSE 0 END|
+------------------------------------------------------------+
|                                                         0.0|
|                                                         0.0|
|                                                         0.0|
|                                                         0.0|
|                                         0.37216435200000003|
|                                                         0.0|
|                                                         0.0|
|                                                         0.0|
|                                                         0.0|
|                                                         0.0|
|                                                         0.0|
|                                                         0.0|
|                                                      

In [234]:
# The same clamp written with the condition inverted: negative values become 0,
# everything else passes through.
# NOTE(review): unlike the `>= 0` form, a null num_days_late presumably stays
# null here (it falls through to otherwise()) -- confirm which is wanted.
my_col = when(df.num_days_late < 0, 0).otherwise(df.num_days_late)

(df
 .select(df.num_days_late)
 .select(my_col)
 .show())

+-----------------------------------------------------------+
|CASE WHEN (num_days_late < 0) THEN 0 ELSE num_days_late END|
+-----------------------------------------------------------+
|                                                        0.0|
|                                                        0.0|
|                                                        0.0|
|                                                        0.0|
|                                        0.37216435200000003|
|                                                        0.0|
|                                                        0.0|
|                                                        0.0|
|                                                        0.0|
|                                                        0.0|
|                                                        0.0|
|                                                        0.0|
|                                                        0.0|
|       

In [248]:
# Turn the 'YES'/other string flags into real booleans (keeping the original
# column names via alias), then keep only the late cases. A boolean column can
# be passed to filter() directly.
(df
 .select(df.case_closed, df.case_late, df.num_days_late)
 .select((df.case_closed == 'YES').alias('case_closed'),
         (df.case_late == 'YES').alias('case_late'),
         df.num_days_late)
 .filter(col('case_late'))
 .show())

+-----------+---------+-------------------+
|case_closed|case_late|      num_days_late|
+-----------+---------+-------------------+
|       true|     true|0.37216435200000003|
|       true|     true|         0.03150463|
|       true|     true|        80.74537037|
|       true|     true|0.38280092600000004|
|       true|     true|        0.376655093|
|       true|     true|        46.41153935|
|       true|     true|        0.048368056|
|       true|     true|         36.1630787|
|       true|     true|        25.36005787|
|       true|     true| 1.8262268519999998|
|       true|     true|         46.3819213|
|       true|     true|        46.38175926|
|       true|     true|        72.39403935|
|       true|     true| 113.73300929999999|
|       true|     true|        79.13157407|
|       true|     true|        3.450983796|
|       true|     true|        73.16055556|
|       true|     true|        1.339675926|
|       true|     true|        68.02585648|
|       true|     true|        7

In [253]:
# Convert the string flags to booleans in place (withColumn with an existing
# name overwrites that column), then keep rows that are late OR closed.
# `|` is the element-wise OR for Column expressions.
(df
 .select('case_closed', 'case_late', 'num_days_late')
 .withColumn('case_closed', col('case_closed') == 'YES')
 .withColumn('case_late', col('case_late') == 'YES')
 .where(col('case_late') | col('case_closed'))
 .show())

+-----------+---------+-------------------+
|case_closed|case_late|      num_days_late|
+-----------+---------+-------------------+
|       true|    false| -998.5087616000001|
|       true|    false|-2.0126041669999997|
|       true|    false|       -3.022337963|
|       true|    false|       -15.01148148|
|       true|     true|0.37216435200000003|
|       true|    false|       -29.74398148|
|       true|    false|       -14.70673611|
|       true|    false|       -14.70662037|
|       true|    false|       -14.70662037|
|       true|    false|       -14.70649306|
|       true|    false|       -14.70649306|
|       true|    false|       -14.70636574|
|       true|    false|          -14.70625|
|       true|    false|       -14.70636574|
|       true|    false|       -14.70623843|
|       true|    false|-14.705891199999998|
|       true|    false|       -14.70600694|
|       true|    false|       -14.70576389|
|       true|    false|       -14.70576389|
|       true|    false|       -1

In [257]:
# NOTE(review): this cell is identical to the preceding one (same flags
# conversion, same late-OR-closed filter) and produces the same output; it looks
# like a leftover re-run and is a candidate for removal or for a different
# condition (e.g. `&` instead of `|`).
(df
 .select(df.case_closed, df.case_late, df.num_days_late)
 .withColumn('case_closed', df.case_closed == 'YES')
 .withColumn('case_late', df.case_late == 'YES')
 .filter(col('case_late') | col('case_closed'))
 .show())

+-----------+---------+-------------------+
|case_closed|case_late|      num_days_late|
+-----------+---------+-------------------+
|       true|    false| -998.5087616000001|
|       true|    false|-2.0126041669999997|
|       true|    false|       -3.022337963|
|       true|    false|       -15.01148148|
|       true|     true|0.37216435200000003|
|       true|    false|       -29.74398148|
|       true|    false|       -14.70673611|
|       true|    false|       -14.70662037|
|       true|    false|       -14.70662037|
|       true|    false|       -14.70649306|
|       true|    false|       -14.70649306|
|       true|    false|       -14.70636574|
|       true|    false|          -14.70625|
|       true|    false|       -14.70636574|
|       true|    false|       -14.70623843|
|       true|    false|-14.705891199999998|
|       true|    false|       -14.70600694|
|       true|    false|       -14.70576389|
|       true|    false|       -14.70576389|
|       true|    false|       -1

## Joining

## Aggregating