# FIRST AND LAST WINDOW FUNCTIONS

## Create DataFrame

In [1]:
import findspark
# Initialise findspark before the pyspark import in the following cell.
# NOTE(review): findspark.init() is expected to locate the local Spark
# installation and make `pyspark` importable — confirm against the findspark docs.
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Obtain the notebook's SparkSession through the builder; no extra
# configuration is applied here.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/30 19:43:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Sample payments, one tuple per payment: (customer id, payment day, amount).
# The payment day is kept as a "yyyy-MM-dd" string here and converted to a
# date column in a later cell. Row order is intentionally not sorted.
data_payments = [
    ("C1", "2024-08-12", 150),
    ("C1", "2024-08-13", 33),
    ("C2", "2024-06-01", 45),
    ("C2", "2024-07-29", 110),
    ("C3", "2024-01-01", 20),
    ("C3", "2024-01-02", 23),
    ("C3", "2024-02-24", 78),
    ("C3", "2024-05-11", 91),
    ("C1", "2024-08-14", 75),
    ("C1", "2024-08-15", 120),
    ("C2", "2024-08-01", 60),
    ("C2", "2024-08-05", 95),
    ("C3", "2024-05-12", 85),
    ("C3", "2024-06-01", 88),
    ("C3", "2024-06-15", 30),
    ("C3", "2024-07-01", 99),
    ("C1", "2024-08-16", 130),
    ("C1", "2024-08-17", 200),
    ("C2", "2024-08-10", 72),
    ("C2", "2024-08-20", 110),
    ("C3", "2024-07-02", 105),
    ("C3", "2024-08-01", 140),
    ("C3", "2024-08-15", 50),
    ("C3", "2024-08-20", 115),
]

# DDL-style schema string matching the tuple layout above.
schema_payments = "id_customer STRING, payment_day STRING, amount INTEGER"

In [4]:
# Build the raw DataFrame from the in-memory rows using the DDL schema string.
df_payments_original = spark.createDataFrame(
    data=data_payments,
    schema=schema_payments,
)

# Print the resulting column names and types.
df_payments_original.printSchema()

root
 |-- id_customer: string (nullable = true)
 |-- payment_day: string (nullable = true)
 |-- amount: integer (nullable = true)



In [5]:
from pyspark.sql.functions import to_date

In [6]:
# Parse the "yyyy-MM-dd" string column into a new date column `payment_date`,
# then drop the original string column `payment_day`.
df_payments = (
    df_payments_original
    .withColumn("payment_date", to_date("payment_day", "yyyy-MM-dd"))
    .drop("payment_day")
)

df_payments.printSchema()

root
 |-- id_customer: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- payment_date: date (nullable = true)



In [7]:
# Preview the converted DataFrame; these are show()'s default arguments
# (first 20 rows, long cell values truncated), spelled out explicitly.
df_payments.show(n=20, truncate=True)

                                                                                

+-----------+------+------------+
|id_customer|amount|payment_date|
+-----------+------+------------+
|         C1|   150|  2024-08-12|
|         C1|    33|  2024-08-13|
|         C2|    45|  2024-06-01|
|         C2|   110|  2024-07-29|
|         C3|    20|  2024-01-01|
|         C3|    23|  2024-01-02|
|         C3|    78|  2024-02-24|
|         C3|    91|  2024-05-11|
|         C1|    75|  2024-08-14|
|         C1|   120|  2024-08-15|
|         C2|    60|  2024-08-01|
|         C2|    95|  2024-08-05|
|         C3|    85|  2024-05-12|
|         C3|    88|  2024-06-01|
|         C3|    30|  2024-06-15|
|         C3|    99|  2024-07-01|
|         C1|   130|  2024-08-16|
|         C1|   200|  2024-08-17|
|         C2|    72|  2024-08-10|
|         C2|   110|  2024-08-20|
+-----------+------+------------+
only showing top 20 rows



## Window Function

In [8]:
from pyspark.sql.window import Window

# Window over each customer's payments, used by first()/last() below.
#
# * orderBy("payment_date"): first()/last() return whatever row happens to come
#   first/last inside the partition, so without an explicit ordering their
#   result is non-deterministic and only looks right on small local data.
# * rowsBetween(unboundedPreceding, unboundedFollowing): once a window is
#   ordered, the default frame ends at the current row, which would make
#   last() return the current row's date. Spanning the whole partition makes
#   first()/last() yield the customer's earliest/latest payment on every row.
window_function = (
    Window
    .partitionBy("id_customer")
    .orderBy("payment_date")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)

## Generate Columns for First and Last Payment

In [9]:
from pyspark.sql.functions import first, last

In [10]:
# Add two columns to every payment row, both evaluated over `window_function`:
#   first_payment -> first("payment_date") within the customer's window
#   last_payment  -> last("payment_date") within the customer's window
df_payments_calculate = (
    df_payments
    .withColumn("first_payment", first("payment_date").over(window_function))
    .withColumn("last_payment", last("payment_date").over(window_function))
)

df_payments_calculate.show()

+-----------+------+------------+-------------+------------+
|id_customer|amount|payment_date|first_payment|last_payment|
+-----------+------+------------+-------------+------------+
|         C1|   150|  2024-08-12|   2024-08-12|  2024-08-17|
|         C1|    33|  2024-08-13|   2024-08-12|  2024-08-17|
|         C1|    75|  2024-08-14|   2024-08-12|  2024-08-17|
|         C1|   120|  2024-08-15|   2024-08-12|  2024-08-17|
|         C1|   130|  2024-08-16|   2024-08-12|  2024-08-17|
|         C1|   200|  2024-08-17|   2024-08-12|  2024-08-17|
|         C2|    45|  2024-06-01|   2024-06-01|  2024-08-20|
|         C2|   110|  2024-07-29|   2024-06-01|  2024-08-20|
|         C2|    60|  2024-08-01|   2024-06-01|  2024-08-20|
|         C2|    95|  2024-08-05|   2024-06-01|  2024-08-20|
|         C2|    72|  2024-08-10|   2024-06-01|  2024-08-20|
|         C2|   110|  2024-08-20|   2024-06-01|  2024-08-20|
|         C3|    20|  2024-01-01|   2024-01-01|  2024-08-20|
|         C3|    23|  20

## Get specific values

In [11]:
# Remove the per-payment columns and de-duplicate, leaving the distinct
# (id_customer, first_payment, last_payment) combinations.
(
    df_payments_calculate
    .drop("payment_date", "amount")
    .distinct()
    .show()
)

+-----------+-------------+------------+
|id_customer|first_payment|last_payment|
+-----------+-------------+------------+
|         C1|   2024-08-12|  2024-08-17|
|         C2|   2024-06-01|  2024-08-20|
|         C3|   2024-01-01|  2024-08-20|
+-----------+-------------+------------+

