In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Row
from datetime import date

# Attach to the already-running Spark session, or start a new one if none exists.
spark = (
    SparkSession.builder
    .getOrCreate()
)
spark  # bare expression so the notebook renders the session as the cell output

In [2]:
# Sample app snapshots as (pk, id, title, rating, last_update_date) tuples.
# Several ids appear more than once, each time with a different update date.
app_records = [
    (1, "com.facebook.katana", "Facebook", 4.0, date(2016, 9, 12)),
    (2, "com.whatsapp", "WhatsApp", 4.5, date(2016, 9, 11)),
    (3, "com.whatsapp", "WhatsApp", 4.4, date(2016, 9, 12)),
    (4, "com.nianticlabs.pokemongo", "Pokémon GO", 4.6, date(2016, 9, 5)),
    (5, "com.nianticlabs.pokemongo", "Pokémon GO", 4.3, date(2016, 9, 6)),
    (6, "com.nianticlabs.pokemongo", "Pokémon GO", 4.1, date(2016, 9, 7)),
]

# Wrap every tuple in a keyword-built Row so Spark infers the column names
# and types from the data.
df = spark.createDataFrame([
    Row(pk=pk, id=app_id, title=title, rating=rating, last_update_date=updated)
    for pk, app_id, title, rating, updated in app_records
])

In [3]:
# Print the sample rows; truncate=False keeps long values such as the app ids fully visible.
df.show(truncate=False)

+-------------------------+----------------+---+------+----------+
|id                       |last_update_date|pk |rating|title     |
+-------------------------+----------------+---+------+----------+
|com.facebook.katana      |2016-09-12      |1  |4.0   |Facebook  |
|com.whatsapp             |2016-09-11      |2  |4.5   |WhatsApp  |
|com.whatsapp             |2016-09-12      |3  |4.4   |WhatsApp  |
|com.nianticlabs.pokemongo|2016-09-05      |4  |4.6   |Pokémon GO|
|com.nianticlabs.pokemongo|2016-09-06      |5  |4.3   |Pokémon GO|
|com.nianticlabs.pokemongo|2016-09-07      |6  |4.1   |Pokémon GO|
+-------------------------+----------------+---+------+----------+



Note that the cost of the following cell is dominated by the sort performed within each partition (presumably O(n log n)), and that this window-based solution is generally more efficient than alternatives such as those based on join operations.

In [4]:
from pyspark.sql.functions import row_number
from pyspark.sql.window import Window

# Keep only the most recent row for each app id.
# Rows are ranked inside each `id` partition, newest `last_update_date` first.
# `pk` acts as a secondary sort key: without it, two rows sharing the same id
# and date would be ranked in arbitrary order, so the surviving row could
# change from one run to the next.
w = Window.partitionBy("id").orderBy(
    F.col("last_update_date").desc(),
    F.col("pk").desc(),
)
result = (
    df.withColumn("rownum", F.row_number().over(w))
    .where("rownum = 1")  # rank 1 is the latest row of its partition
    .drop("rownum")       # helper column is not part of the result
)

result.show(truncate=False)

+-------------------------+----------------+---+------+----------+
|id                       |last_update_date|pk |rating|title     |
+-------------------------+----------------+---+------+----------+
|com.whatsapp             |2016-09-12      |3  |4.4   |WhatsApp  |
|com.facebook.katana      |2016-09-12      |1  |4.0   |Facebook  |
|com.nianticlabs.pokemongo|2016-09-07      |6  |4.1   |Pokémon GO|
+-------------------------+----------------+---+------+----------+

