%md
## This demonstrates how to pick the first/last record's data within a window function partition

In [41]:
# Sample rows, one per (stay_id, date), each carrying a `tmpr` reading.
# `ci` and `co` repeat on every row of a stay (presumably check-in / check-out
# dates); all date values are kept as plain ISO-formatted strings.
columns = ["stay_id", "ci", "co", "date", "tmpr"]

myData = (
    (1, "2017-09-12", "2017-10-28", "2017-09-12", 0),
    (1, "2017-09-12", "2017-10-28", "2017-09-13", 41),
    (1, "2017-09-12", "2017-10-28", "2017-09-14", 42),
    (1, "2017-09-12", "2017-10-28", "2017-09-15", 43),
    (1, "2017-09-12", "2017-10-28", "2017-09-16", 44),
    (1, "2017-09-12", "2017-10-28", "2017-09-17", 5),
    (2, "2017-09-12", "2017-10-28", "2017-09-15", 101),
    (2, "2017-09-12", "2017-10-28", "2017-09-16", 102),
    (2, "2017-09-12", "2017-10-28", "2017-09-17", 103),
)

# Column types are inferred from the Python values; only the names are supplied.
df = spark.createDataFrame(myData, schema=columns)
df.printSchema()
df.show(truncate=False)

root
 |-- stay_id: long (nullable = true)
 |-- ci: string (nullable = true)
 |-- co: string (nullable = true)
 |-- date: string (nullable = true)
 |-- tmpr: long (nullable = true)

+-------+----------+----------+----------+----+
|stay_id|ci        |co        |date      |tmpr|
+-------+----------+----------+----------+----+
|1      |2017-09-12|2017-10-28|2017-09-12|0   |
|1      |2017-09-12|2017-10-28|2017-09-13|41  |
|1      |2017-09-12|2017-10-28|2017-09-14|42  |
|1      |2017-09-12|2017-10-28|2017-09-15|43  |
|1      |2017-09-12|2017-10-28|2017-09-16|44  |
|1      |2017-09-12|2017-10-28|2017-09-17|5   |
|2      |2017-09-12|2017-10-28|2017-09-15|101 |
|2      |2017-09-12|2017-10-28|2017-09-16|102 |
|2      |2017-09-12|2017-10-28|2017-09-17|103 |
+-------+----------+----------+----------+----+



In [42]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, col, lag, avg, first

In [44]:
# Three windows over the same per-stay partition:
#   windowSpecAgg  - unordered, so an aggregate sees every row of the stay
#   windowSpec     - ordered by date ascending  (earliest row first)
#   windowSpecDesc - ordered by date descending (latest row first)
windowSpecAgg = Window.partitionBy("stay_id")
windowSpec = windowSpecAgg.orderBy(col("date").asc())
windowSpecDesc = windowSpecAgg.orderBy(col("date").desc())

# Collapse each stay to a single summary row.
# "last_tmpr" is taken as first() over the descending window rather than
# last() over the ascending one: with an ordered window the default frame ends
# at the current row, so last() would just return the current row's value.
(
    df
    .withColumn("first_tmpr", first("tmpr").over(windowSpec))
    .withColumn("last_tmpr", first("tmpr").over(windowSpecDesc))
    .withColumn("avg_tmpr", avg("tmpr").over(windowSpecAgg))
    # Number the rows per stay so exactly one (the earliest) can be kept.
    .withColumn("row", row_number().over(windowSpec))
    .where("row = 1")
    .select("stay_id", "ci", "co", "first_tmpr", "last_tmpr", "avg_tmpr")
    .show()
)



+-------+----------+----------+----------+---------+------------------+
|stay_id|        ci|        co|first_tmpr|last_tmpr|          avg_tmpr|
+-------+----------+----------+----------+---------+------------------+
|      1|2017-09-12|2017-10-28|         0|        5|29.166666666666668|
|      2|2017-09-12|2017-10-28|       101|      103|             102.0|
+-------+----------+----------+----------+---------+------------------+



                                                                                

%md
## This is an Azure blob write/read test

In [46]:
# Base URI that the write/read test cells prepend to their Parquet path.
# It is in ABFS form: abfss://<container>@<storage-account>.dfs.core.windows.net
data_path = "abfss://data@styakovdwesteurope.dfs.core.windows.net"

In [48]:
# Persist the sample DataFrame as Parquet under data_path.
# mode("overwrite") keeps this cell re-runnable: the default save mode raises
# an error when the target path already exists, which would break every run
# after the first (including Restart Kernel -> Run All).
# NOTE: the "ouput" spelling is what the read-back cell uses as well; if the
# path is ever renamed, change both cells together.
df.write.mode("overwrite").parquet(data_path + "/ouput/test1.parquet")

23/11/22 23:49:18 WARN SSLSocketFactoryEx: Failed to load OpenSSL. Falling back to the JSSE default.
                                                                                

In [49]:
# Load the Parquet data written by the previous cell and display it to confirm
# the round trip. Row order is not guaranteed to match the original DataFrame.
test_df = spark.read.format("parquet").load(data_path + "/ouput/test1.parquet")
test_df.show()

                                                                                

+-------+----------+----------+----------+----+
|stay_id|        ci|        co|      date|tmpr|
+-------+----------+----------+----------+----+
|      2|2017-09-12|2017-10-28|2017-09-15| 101|
|      2|2017-09-12|2017-10-28|2017-09-16| 102|
|      2|2017-09-12|2017-10-28|2017-09-17| 103|
|      1|2017-09-12|2017-10-28|2017-09-12|   0|
|      1|2017-09-12|2017-10-28|2017-09-13|  41|
|      1|2017-09-12|2017-10-28|2017-09-14|  42|
|      1|2017-09-12|2017-10-28|2017-09-15|  43|
|      1|2017-09-12|2017-10-28|2017-09-16|  44|
|      1|2017-09-12|2017-10-28|2017-09-17|   5|
+-------+----------+----------+----------+----+



                                                                                

%md
## This demonstrates SQL syntax magic in Jupyter

In [2]:
# Load the sparksql_magic IPython extension; the %%sparksql cell below relies on it.
%load_ext sparksql_magic

In [3]:
%%sparksql
-- Minimal smoke-test query: confirms the cell magic can run SQL.
SELECT 1

                                                                                

0
1
1
