In [232]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, to_timestamp, month, year
# Local Spark session for this notebook ("local[*]" master), with explicit
# driver/executor memory, result-size and port-retry settings.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("airflow_prod")
    .config("spark.executor.memory", "6g")
    .config("spark.driver.memory", "6g")
    .config("spark.driver.maxResultSize", "1048MB")
    .config("spark.port.maxRetries", "100")
    .getOrCreate()
)

In [76]:
def realiza_pivot(df, tipo):
    """Unpivot the wide per-date columns of ``df`` into long format.

    Every column from index 4 onward is treated as a value column. A Spark SQL
    ``stack`` expression is built that emits, for each of those columns, the
    column name as a string literal (output column ``Date``) together with the
    column's value (output column named by ``tipo``).

    Parameters:
        df: object exposing ``columns`` and ``selectExpr``; it must contain the
            columns "Province_State", "Country_Region", "Lat" and "Long".
        tipo: name given to the value column produced by the unpivot.

    Returns:
        Whatever ``df.selectExpr`` returns for the four fixed columns plus the
        generated ``stack`` expression.
    """
    # Everything after the first four columns is unpivoted
    value_columns = df.columns[4:]

    # One "'<name>', `<name>`" pair per column: literal label, then the value
    label_value_pairs = []
    for name in value_columns:
        label_value_pairs.append("'{}', `{}`".format(name, name))

    stack_sql = "stack({}, {}) as (Date, {})".format(
        len(value_columns),
        ", ".join(label_value_pairs),
        tipo,
    )

    return df.selectExpr("Province_State", "Country_Region", "Lat", "Long", stack_sql)

In [78]:
### Load the recovered, confirmed and deaths time series
def _read_time_series(path):
    """Read one COVID-19 time-series CSV and normalise its key column names.

    The file is read with a header row and schema inference, then the
    "Province/State" and "Country/Region" columns are renamed to
    "Province_State" and "Country_Region" so they can be referenced without
    the slash in later expressions.
    """
    return (
        spark.read.options(header='true', inferSchema=True)
        .csv(path)
        .withColumnRenamed("Province/State", "Province_State")
        .withColumnRenamed("Country/Region", "Country_Region")
    )


# Each frame is built in a single step from its own source file, so no
# statement depends on a previously (mis)named intermediate variable.
df_recovered = _read_time_series('/home/jovyan/work/dados/covid19/time_series_covid19_recovered_global.csv')

df_confirmed = _read_time_series('/home/jovyan/work/dados/covid19/time_series_covid19_confirmed_global.csv')

df_deaths = _read_time_series('/home/jovyan/work/dados/covid19/time_series_covid19_deaths_global.csv')

In [79]:
# Unpivot all three tables; each one gets its own value-column name
recovered = realiza_pivot(df=df_recovered, tipo="Recovered")

confirmed = realiza_pivot(df=df_confirmed, tipo="Confirmed")

deaths = realiza_pivot(df=df_deaths, tipo="Deaths")

In [244]:
#deaths.count()

In [245]:
#recovered.count()

In [183]:
# Inner join of `confirmed` with `deaths` on coordinates, date and country;
# the result keeps every `deaths` column plus the confirmed count.
# NOTE(review): plain `==` does not match when Lat/Long are null on both sides,
# so any such rows would be absent from the result — confirm this is intended.
join = confirmed.join(
    other=deaths,
    on=(
        (confirmed["Lat"] == deaths["Lat"])
        & (confirmed["Long"] == deaths["Long"])
        & (confirmed["Date"] == deaths["Date"])
        & (confirmed["Country_Region"] == deaths["Country_Region"])
    ),
    how="inner",
).select(deaths["*"], confirmed["confirmed"])


In [None]:
# Row count of the confirmed/deaths join (displayed as the cell output)
join.count()

In [201]:
# Right join of `recovered` against `join`: every row of `join` (the right
# side) is kept, and the recovered count is attached where the coordinates,
# date and country all match.
join2 = recovered.join(
    other=join,
    on=(
        (recovered["Lat"] == join["Lat"])
        & (recovered["Long"] == join["Long"])
        & (recovered["Date"] == join["Date"])
        & (recovered["Country_Region"] == join["Country_Region"])
    ),
    how="right",
).select(join["*"], recovered["recovered"])

In [240]:
# Convert column types: parse Date with the "M/d/yy" pattern, cast the three
# counters to long, then derive month ("mes") and year ("ano") from the date.
join3 = (
    join2
    .withColumn("Date", to_timestamp(col("Date"), format="M/d/yy"))
    .withColumn("deaths", col("deaths").cast("long"))
    .withColumn("confirmed", col("confirmed").cast("long"))
    .withColumn("recovered", col("recovered").cast("long"))
    .withColumn("mes", month(col("Date")))
    .withColumn("ano", year(col("Date")))
)

In [241]:
# Display the DataFrame repr to check the resulting column names and types
join3

DataFrame[Province_State: string, Country_Region: string, Lat: double, Long: double, Date: timestamp, deaths: bigint, confirmed: bigint, recovered: bigint, mes: int, ano: int]

In [247]:
# Persist the typed, joined dataset as Parquet, replacing any previous output
# at this path. The "header" option is not passed: it belongs to the CSV
# source and has no meaning for Parquet, which stores its schema in the files.
join3.write.mode("overwrite").parquet("/home/jovyan/work/dados/Trusted")