# Data de Update duplicado e as datas como string

## Usando drop_duplicates()

In [56]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [3]:
# Build (or reuse) a local-mode Spark session for this exercise.
spark = (
    SparkSession.builder
    .master("local")
    # -1 turns automatic broadcast joins off.
    .config("spark.sql.autoBroadcastJoinThreshold", -1)
    .config("spark.executor.memory", "500mb")
    .appName("Ex6")
    .getOrCreate()
)

In [19]:
# Load the CSV: the first line supplies the column names and Spark infers the column types.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('./data/data.csv')
)

In [20]:
# Preview the raw rows (show()'s defaults written out: at most 20 rows, long cells truncated).
df.show(n=20, truncate=True)

+---+------+------+--------------+
| Id|  Nome|Estado|Data de Update|
+---+------+------+--------------+
|  1| Pedro|    SP|    01-01-2022|
|  1| Pedro|    MG|    01-02-2022|
|  1| Pedro|    RJ|    01-03-2022|
|  2|   Ana|    SC|    01-01-2022|
|  2|   Ana|    PR|    15-01-2022|
|  3| Lucas|    SP|    03-01-2020|
|  4| Maria|    RS|    01-01-2010|
|  5|Marcos|    RN|    01-01-2022|
|  4| Maria|    RJ|    31-12-2021|
+---+------+------+--------------+



In [21]:
# Inspect the column types Spark assigned when reading the file.
df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- Nome: string (nullable = true)
 |-- Estado: string (nullable = true)
 |-- Data de Update: string (nullable = true)



In [22]:
# Two problems in this data set: the update date is stored as text, and each Id
# appears more than once. The date is parsed with to_date below; the repeated
# Ids are resolved afterwards by keeping the most recent update.

## Fixing the date: parse the 'dd-MM-yyyy' text into a real date column.

df = df.withColumn(
    'Data de Update',
    to_date(col('Data de Update'), 'dd-MM-yyyy'),
)

In [23]:
# Check the rows after the date conversion (show()'s defaults written out).
df.show(n=20, truncate=True)

+---+------+------+--------------+
| Id|  Nome|Estado|Data de Update|
+---+------+------+--------------+
|  1| Pedro|    SP|    2022-01-01|
|  1| Pedro|    MG|    2022-02-01|
|  1| Pedro|    RJ|    2022-03-01|
|  2|   Ana|    SC|    2022-01-01|
|  2|   Ana|    PR|    2022-01-15|
|  3| Lucas|    SP|    2020-01-03|
|  4| Maria|    RS|    2010-01-01|
|  5|Marcos|    RN|    2022-01-01|
|  4| Maria|    RJ|    2021-12-31|
+---+------+------+--------------+



In [24]:
# Confirm the type of 'Data de Update' after the conversion.
df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- Nome: string (nullable = true)
 |-- Estado: string (nullable = true)
 |-- Data de Update: date (nullable = true)



In [38]:
# Whole-row de-duplication does not help: rows that share an Id differ in
# state and date, so none of them is an exact copy of another.

df.dropDuplicates().show()

+---+------+------+--------------+
| Id|  Nome|Estado|Data de Update|
+---+------+------+--------------+
|  2|   Ana|    SC|    2022-01-01|
|  1| Pedro|    SP|    2022-01-01|
|  3| Lucas|    SP|    2020-01-03|
|  2|   Ana|    PR|    2022-01-15|
|  4| Maria|    RS|    2010-01-01|
|  4| Maria|    RJ|    2021-12-31|
|  1| Pedro|    RJ|    2022-03-01|
|  5|Marcos|    RN|    2022-01-01|
|  1| Pedro|    MG|    2022-02-01|
+---+------+------+--------------+



In [40]:
# De-duplicating on 'Id' alone leaves one row per Id, but the surviving row is
# not chosen according to the update date.

df.dropDuplicates(['Id']).show()

+---+------+------+--------------+
| Id|  Nome|Estado|Data de Update|
+---+------+------+--------------+
|  1| Pedro|    SP|    2022-01-01|
|  3| Lucas|    SP|    2020-01-03|
|  5|Marcos|    RN|    2022-01-01|
|  4| Maria|    RS|    2010-01-01|
|  2|   Ana|    SC|    2022-01-01|
+---+------+------+--------------+



In [48]:
# Sorting is ascending unless told otherwise (written out here with asc()), so
# de-duplicating after this sort does not surface the latest update per Id.

df.sort(col('Data de Update').asc()).dropDuplicates(['Id']).show()



+---+------+------+--------------+
| Id|  Nome|Estado|Data de Update|
+---+------+------+--------------+
|  1| Pedro|    SP|    2022-01-01|
|  3| Lucas|    SP|    2020-01-03|
|  5|Marcos|    RN|    2022-01-01|
|  4| Maria|    RS|    2010-01-01|
|  2|   Ana|    SC|    2022-01-01|
+---+------+------+--------------+



In [54]:
# Keep only the most recent update of each Id, then sort by Id for display.
#
# Sorting by date descending and then calling drop_duplicates(['Id']) is not
# reliable: drop_duplicates keeps an arbitrary row per key once the data has
# been shuffled, so the preceding sort is not guaranteed to decide which row
# survives. Instead, aggregate per Id and take the greatest struct. Structs
# compare field by field, so placing 'Data de Update' first selects the latest
# update (ties fall through to Nome, then Estado, keeping the result
# deterministic).
# Note: `max` and `struct` here are the Spark functions brought in by the
# `pyspark.sql.functions` star import, not Python built-ins.

(
    df.groupBy('Id')
    .agg(max(struct('Data de Update', 'Nome', 'Estado')).alias('latest'))
    # Expand the winning struct back into top-level columns.
    .select('Id', 'latest.*')
    # Restore the original column order.
    .select('Id', 'Nome', 'Estado', 'Data de Update')
    .orderBy('Id')
    .show()
)



+---+------+------+--------------+
| Id|  Nome|Estado|Data de Update|
+---+------+------+--------------+
|  1| Pedro|    RJ|    2022-03-01|
|  2|   Ana|    PR|    2022-01-15|
|  3| Lucas|    SP|    2020-01-03|
|  4| Maria|    RJ|    2021-12-31|
|  5|Marcos|    RN|    2022-01-01|
+---+------+------+--------------+





## Usando window function com row_number()

In [60]:
# Add a 'rowid' column: within each Id, rows are numbered from the newest
# update to the oldest, so rowid == 1 marks the latest update of that Id.

df = df.withColumn(
    'rowid',
    row_number().over(
        Window.partitionBy('Id').orderBy(desc('Data de Update'))
    ),
)

In [58]:
# Show every row together with its rank inside its Id (show()'s defaults written out).
df.show(n=20, truncate=True)

                                                                                

+---+------+------+--------------+-----+
| Id|  Nome|Estado|Data de Update|rowid|
+---+------+------+--------------+-----+
|  1| Pedro|    RJ|    2022-03-01|    1|
|  1| Pedro|    MG|    2022-02-01|    2|
|  1| Pedro|    SP|    2022-01-01|    3|
|  3| Lucas|    SP|    2020-01-03|    1|
|  5|Marcos|    RN|    2022-01-01|    1|
|  4| Maria|    RJ|    2021-12-31|    1|
|  4| Maria|    RS|    2010-01-01|    2|
|  2|   Ana|    PR|    2022-01-15|    1|
|  2|   Ana|    SC|    2022-01-01|    2|
+---+------+------+--------------+-----+





In [64]:
# Keep the rows ranked first within their Id and list them in Id order.

df.where(col('rowid') == 1).sort('Id').show()



+---+------+------+--------------+-----+
| Id|  Nome|Estado|Data de Update|rowid|
+---+------+------+--------------+-----+
|  1| Pedro|    RJ|    2022-03-01|    1|
|  2|   Ana|    PR|    2022-01-15|    1|
|  3| Lucas|    SP|    2020-01-03|    1|
|  4| Maria|    RJ|    2021-12-31|    1|
|  5|Marcos|    RN|    2022-01-01|    1|
+---+------+------+--------------+-----+



                                                                                