# Data Transformation and Aggregation by Identifier "Ano e Faixa Salarial"

## Environment Configuration

In [0]:
from pyspark.sql.utils import AnalysisException
from src.utils.udfs import functions_for_df_structure_management as ffdsm

## Data Ingestion from Bronze Layer

In [0]:
# Load the bronze-layer Delta table into a DataFrame.
df_range_of_gross_taxable_income_in_minimum_wages = spark.read.table(
    "brazilian_tax_big_numbers.bronze_layer.delta_faixa_de_rendimento_tributavel_bruto_em_salarios_minimos"
)

In [0]:
# Load the bronze-layer Delta table into a DataFrame.
df_range_of_total_income_in_minimum_wages = spark.read.table(
    "brazilian_tax_big_numbers.bronze_layer.delta_faixa_de_rendimentos_totais_em_salarios_minimos"
)

In [0]:
# Load the bronze-layer Delta table into a DataFrame.
df_taxable_income_range_exclusive_taxation_on_minimum_wages = spark.read.table(
    "brazilian_tax_big_numbers.bronze_layer.delta_faixa_de_rendimentos_tributaveis_tributacao_exclusiva_em_salarios_minimos"
)

In [0]:
# Load the bronze-layer Delta table into a DataFrame.
df_profit_dividend_recipients_and_partner_owner_income_by_total_income_range_in_minimum_wages = spark.read.table(
    "brazilian_tax_big_numbers.bronze_layer.delta_recebedores_de_lucros_e_dividendos_rend_socio_e_titular_microempresa_por_faixa_de_rendimento_total_em_salarios_minimos"
)

In [0]:
# Load the bronze-layer Delta table into a DataFrame.
df_income_subject_to_exclusive_definitive_taxation = spark.read.table(
    "brazilian_tax_big_numbers.bronze_layer.delta_rendimentos_sujeitos_a_tributacao_exclusiva_definitiva"
)

In [0]:
# Load the bronze-layer Delta table into a DataFrame.
df_taxable_income_by_minimum_wage_range = spark.read.table(
    "brazilian_tax_big_numbers.bronze_layer.delta_rendimentos_tributaveis_por_faixa_de_salarios_minimos"
)

In [0]:
# Load the bronze-layer Delta table into a DataFrame.
df_tax_exempt_and_non_taxable_income = spark.read.table(
    "brazilian_tax_big_numbers.bronze_layer.delta_rendimentos_isentos_e_nao_tributaveis"
)

## Data Transformation

### "Faixa de Rendimento Tributável Bruto em Salários-Mínimos"

In [0]:
# Discard rows in which every column is null.
df_range_of_gross_taxable_income_in_minimum_wages = (
    df_range_of_gross_taxable_income_in_minimum_wages
    .na.drop(how="all")
)

In [0]:
# Apply the shared float-casting helper. The list holds the two columns later
# used as join keys; how the helper treats them is defined in ffdsm (confirm there).
df_casted_range_of_gross_taxable_income_in_minimum_wages = ffdsm.cast_columns_to_float(
    df_range_of_gross_taxable_income_in_minimum_wages,
    ["AnoCalendario", "FaixaSmMensal"],
)

In [0]:
# Tag the columns with this dataset's name through the shared helper; the
# key-column list is forwarded unchanged (presumably so the keys keep their names).
# NOTE(review): the result rebinds the input name, so re-running this cell
# feeds an already-renamed frame back into the helper.
df_casted_range_of_gross_taxable_income_in_minimum_wages = ffdsm.rename_columns_with_df_name(
    df_casted_range_of_gross_taxable_income_in_minimum_wages,
    "FaixaDeRendimentoTributavelBrutoEmSalariosMinimos",
    ["AnoCalendario", "FaixaSmMensal"],
)

In [0]:
# Inspection only: render the Databricks summary of the casted frame.
dbutils.data.summarize(
    df_casted_range_of_gross_taxable_income_in_minimum_wages
)

In [0]:
# Fill nulls through the shared helper; the key-column list is forwarded
# as-is (its exact role is defined in ffdsm — confirm there).
df_filled_range_of_gross_taxable_income_in_minimum_wages = ffdsm.fill_nulls(
    df_casted_range_of_gross_taxable_income_in_minimum_wages,
    ["AnoCalendario", "FaixaSmMensal"],
)

### "Faixa de Rendimentos Tributáveis Tributação Exclusiva em Salários-Mínimos"

In [0]:
# Discard rows in which every column is null.
df_taxable_income_range_exclusive_taxation_on_minimum_wages = (
    df_taxable_income_range_exclusive_taxation_on_minimum_wages
    .na.drop(how="all")
)

In [0]:
# Apply the shared float-casting helper. The list holds the two columns later
# used as join keys; how the helper treats them is defined in ffdsm (confirm there).
df_casted_taxable_income_range_exclusive_taxation_on_minimum_wages = ffdsm.cast_columns_to_float(
    df_taxable_income_range_exclusive_taxation_on_minimum_wages,
    ["AnoCalendario", "FaixaSmMensal"],
)

In [0]:
# Tag the columns with this dataset's name through the shared helper; the
# key-column list is forwarded unchanged (presumably so the keys keep their names).
# NOTE(review): the result rebinds the input name, so re-running this cell
# feeds an already-renamed frame back into the helper.
df_casted_taxable_income_range_exclusive_taxation_on_minimum_wages = ffdsm.rename_columns_with_df_name(
    df_casted_taxable_income_range_exclusive_taxation_on_minimum_wages,
    "FaixaDeRendimentosTributaveisTributacaoExclusivaEmSalariosMinimos",
    ["AnoCalendario", "FaixaSmMensal"],
)

In [0]:
# Inspection only: render the Databricks summary of the casted frame.
dbutils.data.summarize(
    df_casted_taxable_income_range_exclusive_taxation_on_minimum_wages
)

In [0]:
# Fill nulls through the shared helper; the key-column list is forwarded
# as-is (its exact role is defined in ffdsm — confirm there).
df_filled_taxable_income_range_exclusive_taxation_on_minimum_wages = ffdsm.fill_nulls(
    df_casted_taxable_income_range_exclusive_taxation_on_minimum_wages,
    ["AnoCalendario", "FaixaSmMensal"],
)

### "Faixa de Rendimentos Totais em Salários-Mínimos"

In [0]:
# Discard rows in which every column is null.
df_range_of_total_income_in_minimum_wages = (
    df_range_of_total_income_in_minimum_wages
    .na.drop(how="all")
)

In [0]:
# Apply the shared float-casting helper. The list holds the two columns later
# used as join keys; how the helper treats them is defined in ffdsm (confirm there).
df_casted_range_of_total_income_in_minimum_wages = ffdsm.cast_columns_to_float(
    df_range_of_total_income_in_minimum_wages,
    ["AnoCalendario", "FaixaSmMensal"],
)

In [0]:
# Tag the columns with this dataset's name through the shared helper; the
# key-column list is forwarded unchanged (presumably so the keys keep their names).
# NOTE(review): the result rebinds the input name, so re-running this cell
# feeds an already-renamed frame back into the helper.
df_casted_range_of_total_income_in_minimum_wages = ffdsm.rename_columns_with_df_name(
    df_casted_range_of_total_income_in_minimum_wages,
    "FaixaDeRendimentosTotaisEmSalariosMinimos",
    ["AnoCalendario", "FaixaSmMensal"],
)

In [0]:
# Inspection only: render the Databricks summary of the casted frame.
dbutils.data.summarize(
    df_casted_range_of_total_income_in_minimum_wages
)

In [0]:
# Fill nulls through the shared helper; the key-column list is forwarded
# as-is (its exact role is defined in ffdsm — confirm there).
df_filled_casted_range_of_total_income_in_minimum_wages = ffdsm.fill_nulls(
    df_casted_range_of_total_income_in_minimum_wages,
    ["AnoCalendario", "FaixaSmMensal"],
)

### "Recebedores de Lucros e Dividendos Rend Sócio e Titular Microempresa por Faixa de Rendimento Total em Salários-Mínimos"

In [0]:
# Discard rows in which every column is null.
df_profit_dividend_recipients_and_partner_owner_income_by_total_income_range_in_minimum_wages = (
    df_profit_dividend_recipients_and_partner_owner_income_by_total_income_range_in_minimum_wages
    .na.drop(how="all")
)

In [0]:
# Apply the shared float-casting helper. The list holds the two columns later
# used as join keys; how the helper treats them is defined in ffdsm (confirm there).
df_casted_profit_dividend_recipients_and_partner_owner_income_by_total_income_range_in_minimum_wages = ffdsm.cast_columns_to_float(
    df_profit_dividend_recipients_and_partner_owner_income_by_total_income_range_in_minimum_wages,
    ["AnoCalendario", "FaixaSmMensal"],
)

In [0]:
# Tag the columns with this dataset's name through the shared helper; the
# key-column list is forwarded unchanged (presumably so the keys keep their names).
# NOTE(review): the result rebinds the input name, so re-running this cell
# feeds an already-renamed frame back into the helper.
df_casted_profit_dividend_recipients_and_partner_owner_income_by_total_income_range_in_minimum_wages = ffdsm.rename_columns_with_df_name(
    df_casted_profit_dividend_recipients_and_partner_owner_income_by_total_income_range_in_minimum_wages,
    "RecebedoresDeLucrosEDividendosRendSocioETitularMicroempresaPorFaixaDeRendimentoTotalEmSalariosMinimos",
    ["AnoCalendario", "FaixaSmMensal"],
)

In [0]:
# Inspection only: render the Databricks summary of the casted frame.
dbutils.data.summarize(
    df_casted_profit_dividend_recipients_and_partner_owner_income_by_total_income_range_in_minimum_wages
)

In [0]:
# Fill nulls through the shared helper; the key-column list is forwarded
# as-is (its exact role is defined in ffdsm — confirm there).
df_filled_casted_profit_dividend_recipients_and_partner_owner_income_by_total_income_range_in_minimum_wages = ffdsm.fill_nulls(
    df_casted_profit_dividend_recipients_and_partner_owner_income_by_total_income_range_in_minimum_wages,
    ["AnoCalendario", "FaixaSmMensal"],
)

### "Rendimentos Sujeitos à Tributação Exclusiva Definitiva"

In [0]:
# Discard rows in which every column is null.
df_income_subject_to_exclusive_definitive_taxation = (
    df_income_subject_to_exclusive_definitive_taxation
    .na.drop(how="all")
)

In [0]:
# Align the range column with the "FaixaSmMensal" name used as a join key
# across the other datasets.
df_income_subject_to_exclusive_definitive_taxation = (
    df_income_subject_to_exclusive_definitive_taxation
    .withColumnRenamed("FaixaDeSalariosMinimos", "FaixaSmMensal")
)

In [0]:
# Apply the shared float-casting helper. The list holds the two columns later
# used as join keys; how the helper treats them is defined in ffdsm (confirm there).
df_casted_income_subject_to_exclusive_definitive_taxation = ffdsm.cast_columns_to_float(
    df_income_subject_to_exclusive_definitive_taxation,
    ["AnoCalendario", "FaixaSmMensal"],
)

In [0]:
# Tag the columns with this dataset's name through the shared helper; the
# key-column list is forwarded unchanged (presumably so the keys keep their names).
# NOTE(review): the result rebinds the input name, so re-running this cell
# feeds an already-renamed frame back into the helper.
df_casted_income_subject_to_exclusive_definitive_taxation = ffdsm.rename_columns_with_df_name(
    df_casted_income_subject_to_exclusive_definitive_taxation,
    "RendimentosSujeitosATributacaoExclusivaDefinitiva",
    ["AnoCalendario", "FaixaSmMensal"],
)

In [0]:
# Inspection only: render the Databricks summary of the casted frame.
dbutils.data.summarize(
    df_casted_income_subject_to_exclusive_definitive_taxation
)

In [0]:
# Fill nulls through the shared helper; the key-column list is forwarded
# as-is (its exact role is defined in ffdsm — confirm there).
df_filled_income_subject_to_exclusive_definitive_taxation = ffdsm.fill_nulls(
    df_casted_income_subject_to_exclusive_definitive_taxation,
    ["AnoCalendario", "FaixaSmMensal"],
)

### "Rendimentos Tributáveis por Faixa de Salários-Mínimos"

In [0]:
# Discard rows in which every column is null.
df_taxable_income_by_minimum_wage_range = (
    df_taxable_income_by_minimum_wage_range
    .na.drop(how="all")
)

In [0]:
# Align the range column with the "FaixaSmMensal" name used as a join key
# across the other datasets.
df_taxable_income_by_minimum_wage_range = (
    df_taxable_income_by_minimum_wage_range
    .withColumnRenamed("FaixaDeSmMensal", "FaixaSmMensal")
)

In [0]:
# Apply the shared float-casting helper. The list holds the two columns later
# used as join keys; how the helper treats them is defined in ffdsm (confirm there).
df_casted_taxable_income_by_minimum_wage_range = ffdsm.cast_columns_to_float(
    df_taxable_income_by_minimum_wage_range,
    ["AnoCalendario", "FaixaSmMensal"],
)

In [0]:
# Tag the columns with this dataset's name through the shared helper; the
# key-column list is forwarded unchanged (presumably so the keys keep their names).
# NOTE(review): the result rebinds the input name, so re-running this cell
# feeds an already-renamed frame back into the helper.
df_casted_taxable_income_by_minimum_wage_range = ffdsm.rename_columns_with_df_name(
    df_casted_taxable_income_by_minimum_wage_range,
    "RendimentosTributaveisPorFaixaDeSalariosMinimos",
    ["AnoCalendario", "FaixaSmMensal"],
)

In [0]:
# Inspection only: render the Databricks summary of the casted frame.
dbutils.data.summarize(
    df_casted_taxable_income_by_minimum_wage_range
)

In [0]:
# Fill nulls through the shared helper; the key-column list is forwarded
# as-is (its exact role is defined in ffdsm — confirm there).
df_filled_taxable_income_by_minimum_wage_range = ffdsm.fill_nulls(
    df_casted_taxable_income_by_minimum_wage_range,
    ["AnoCalendario", "FaixaSmMensal"],
)

### "Rendimentos Isentos e Não Tributáveis"

In [0]:
# Discard rows in which every column is null.
df_tax_exempt_and_non_taxable_income = (
    df_tax_exempt_and_non_taxable_income
    .na.drop(how="all")
)

In [0]:
# Align the range column with the "FaixaSmMensal" name used as a join key
# across the other datasets.
df_tax_exempt_and_non_taxable_income = (
    df_tax_exempt_and_non_taxable_income
    .withColumnRenamed("FaixaDeSalariosMinimos", "FaixaSmMensal")
)

In [0]:
# Apply the shared float-casting helper. The list holds the two columns later
# used as join keys; how the helper treats them is defined in ffdsm (confirm there).
df_casted_tax_exempt_and_non_taxable_income = ffdsm.cast_columns_to_float(
    df_tax_exempt_and_non_taxable_income,
    ["AnoCalendario", "FaixaSmMensal"],
)

In [0]:
# Tag the columns with this dataset's (abbreviated) name through the shared
# helper; the key-column list is forwarded unchanged (presumably so the keys
# keep their names).
# NOTE(review): the result rebinds the input name, so re-running this cell
# feeds an already-renamed frame back into the helper.
df_casted_tax_exempt_and_non_taxable_income = ffdsm.rename_columns_with_df_name(
    df_casted_tax_exempt_and_non_taxable_income,
    "RendIsenENTrib",
    ["AnoCalendario", "FaixaSmMensal"],
)

In [0]:
# Inspection only: render the Databricks summary of the casted frame.
dbutils.data.summarize(
    df_casted_tax_exempt_and_non_taxable_income
)

In [0]:
# Fill nulls through the shared helper; the key-column list is forwarded
# as-is (its exact role is defined in ffdsm — confirm there).
df_filled_tax_exempt_and_non_taxable_income = ffdsm.fill_nulls(
    df_casted_tax_exempt_and_non_taxable_income,
    ["AnoCalendario", "FaixaSmMensal"],
)

In [0]:
# Inspection only: show the null-filled frame before it enters the joins.
display(
    df_filled_tax_exempt_and_non_taxable_income
)

## Aggregation By "Ano e Faixa Salarial"

In [0]:
# Join 1 of 6: gross-taxable-income ranges with exclusive-taxation ranges.
# A full outer join keeps key pairs that appear on only one side.
df1 = df_filled_range_of_gross_taxable_income_in_minimum_wages.join(
    df_filled_taxable_income_range_exclusive_taxation_on_minimum_wages,
    on=["AnoCalendario", "FaixaSmMensal"],
    how="full_outer",
)

In [0]:
# Join 2 of 6: add the total-income ranges on the same two key columns.
df2 = df1.join(
    df_filled_casted_range_of_total_income_in_minimum_wages,
    on=["AnoCalendario", "FaixaSmMensal"],
    how="full_outer",
)

In [0]:
# Join 3 of 6: add the profit/dividend recipients dataset on the same keys.
df3 = df2.join(
    df_filled_casted_profit_dividend_recipients_and_partner_owner_income_by_total_income_range_in_minimum_wages,
    on=["AnoCalendario", "FaixaSmMensal"],
    how="full_outer",
)

In [0]:
# Join 4 of 6: add income subject to exclusive/definitive taxation.
df4 = df3.join(
    df_filled_income_subject_to_exclusive_definitive_taxation,
    on=["AnoCalendario", "FaixaSmMensal"],
    how="full_outer",
)

In [0]:
# Join 5 of 6: add taxable income by minimum-wage range.
df5 = df4.join(
    df_filled_taxable_income_by_minimum_wage_range,
    on=["AnoCalendario", "FaixaSmMensal"],
    how="full_outer",
)

In [0]:
# Join 6 of 6: add tax-exempt / non-taxable income, completing the combined
# earnings frame keyed by calendar year and salary range.
df_silver_earnings = df5.join(
    df_filled_tax_exempt_and_non_taxable_income,
    on=["AnoCalendario", "FaixaSmMensal"],
    how="full_outer",
)

In [0]:
# Inspection only: show the fully joined frame.
display(
    df_silver_earnings
)

In [0]:
# Run the null-filling helper again on the joined frame: the full outer joins
# leave nulls wherever a key pair is missing from one of the inputs.
df_filled_silver_earnings = ffdsm.fill_nulls(
    df_silver_earnings,
    ["AnoCalendario", "FaixaSmMensal"],
)

In [0]:
# Check on the fill step: compute the helper's null counts for the final
# frame and show them.
df_nulls = ffdsm.count_nulls(
    df_filled_silver_earnings
)
display(df_nulls)

## Save as Delta in Silver Layer

In [0]:
# Make sure the target schema exists before the silver table is written.
# Plain string literal: the statement has no placeholders, so the f-prefix
# was unnecessary.
spark.sql("CREATE SCHEMA IF NOT EXISTS brazilian_tax_big_numbers.silver_layer")

In [0]:
# Persist the aggregated frame as the silver-layer table, replacing any
# previous contents. `error` holds the failure message, or None on success,
# so it can be inspected after the cell has run.
error = None

try:
    (
        df_filled_silver_earnings.write
        .mode("overwrite")
        .saveAsTable("brazilian_tax_big_numbers.silver_layer.delta_rendimentos")
    )
except Exception as e:
    error = str(e)
    print(error)
    # Re-raise so a failed write stops the run instead of letting the
    # notebook or job finish as if the table had been saved.
    raise