# Data Sources Advanced Exploration

## Environment Configuration

In [0]:
from pyspark.sql import functions as F

## Ingestion of Tables and Basic Data Analysis

### "bens_e_direitos" Table

In [0]:
# Read the "delta_bens_e_direitos" table from the bronze_layer schema and render it.
df_assets_and_rights = spark.table(
    "brazilian_tax_big_numbers.bronze_layer.delta_bens_e_direitos"
)
df_assets_and_rights.display()

### "capital_de_estado_de_residencia_do_declarante" Table

In [0]:
# Read the "delta_capital_de_estado_de_residencia_do_declarante" table and render it.
df_state_of_residence_capital_of_the_declarant = spark.table(
    "brazilian_tax_big_numbers.bronze_layer.delta_capital_de_estado_de_residencia_do_declarante"
)
df_state_of_residence_capital_of_the_declarant.display()

In [0]:
# Number of rows per "CapitalEstado" value.
# The alias is applied to the aggregate Column so the result column is named
# "count". Calling .alias() on the aggregated DataFrame (as before) only aliases
# the DataFrame itself and leaves the column with its auto-generated name.
display(
    df_state_of_residence_capital_of_the_declarant
    .groupBy("CapitalEstado")
    .agg(F.count("*").alias("count"))
)

### "dividas_e_onus" Table

In [0]:
# Read the "delta_dividas_e_onus" table from the bronze_layer schema and render it.
df_debts_and_liabilities = spark.table(
    "brazilian_tax_big_numbers.bronze_layer.delta_dividas_e_onus"
)
df_debts_and_liabilities.display()

### "estado_de_residencia_do_declarante" Table

In [0]:
# Read the "delta_estado_de_residencia_do_declarante" table and render it.
df_state_of_residence_of_the_declarant = spark.table(
    "brazilian_tax_big_numbers.bronze_layer.delta_estado_de_residencia_do_declarante"
)
df_state_of_residence_of_the_declarant.display()

In [0]:
# Number of rows per "Estado" value.
# The alias is applied to the aggregate Column so the result column is named
# "count". Calling .alias() on the aggregated DataFrame (as before) only aliases
# the DataFrame itself and leaves the column with its auto-generated name.
display(
    df_state_of_residence_of_the_declarant
    .groupBy("Estado")
    .agg(F.count("*").alias("count"))
)

### "faixa_etaria_do_declarante_e_genero" Table

In [0]:
# Read the "delta_faixa_etaria_do_declarante_e_genero" table and render it.
df_declarant_age_range_and_gender = spark.table(
    "brazilian_tax_big_numbers.bronze_layer.delta_faixa_etaria_do_declarante_e_genero"
)
df_declarant_age_range_and_gender.display()

In [0]:
# Sum "QuantidadeDeDeclarantes" per "AnoCalendario" in the age-range/gender table;
# the summed column is named "TotalAnoG".
df_sum = (
    df_declarant_age_range_and_gender
    .groupBy("AnoCalendario")
    .agg(F.sum("QuantidadeDeDeclarantes").alias("TotalAnoG"))
)
display(df_sum)

### "faixa_de_base_de_calculo_anual" Table

In [0]:
# Read the "delta_faixa_de_base_de_calculo_anual" table and render it.
df_annual_calculation_base_range = spark.table(
    "brazilian_tax_big_numbers.bronze_layer.delta_faixa_de_base_de_calculo_anual"
)
df_annual_calculation_base_range.display()

### Base Range for Calculation in Minimum Wages and Gender Table

In [0]:
# Read the "delta_faixa_de_base_de_calculo_em_salarios_minimos_e_genero" table and render it.
df_base_range_for_calculation_in_minimum_wages_and_gender = spark.table(
    "brazilian_tax_big_numbers.bronze_layer.delta_faixa_de_base_de_calculo_em_salarios_minimos_e_genero"
)
df_base_range_for_calculation_in_minimum_wages_and_gender.display()

In [0]:
# Sum "QuantidadeDeDeclarantes" per "AnoCalendario" in the minimum-wage/gender
# table; the summed column is named "TotalAnoSM".
agg_df = (
    df_base_range_for_calculation_in_minimum_wages_and_gender
    .groupBy("AnoCalendario")
    .agg(F.sum("QuantidadeDeDeclarantes").alias("TotalAnoSM"))
)

# Left-join the new total onto the per-year summary built earlier.
# "TotalAnoSM" is dropped first so that re-running this cell does not append a
# second, duplicate "TotalAnoSM" column to df_sum (DataFrame.drop is a no-op
# when the column is not present, so the first run is unaffected).
df_sum = df_sum.drop("TotalAnoSM").join(agg_df, on="AnoCalendario", how="left")
df_sum.display()

### "faixa_de_doacoes_e_herancas" Table

In [0]:
# Read the "delta_faixa_de_doacoes_e_herancas" table and render it.
df_donation_and_inheritance_range = spark.table(
    "brazilian_tax_big_numbers.bronze_layer.delta_faixa_de_doacoes_e_herancas"
)
df_donation_and_inheritance_range.display()

### "faixa_de_rendimento_tributavel_bruto_em_salarios_minimos" Table

In [0]:
# Read the "delta_faixa_de_rendimento_tributavel_bruto_em_salarios_minimos" table and render it.
df_range_of_gross_taxable_income_in_minimum_wages = spark.table(
    "brazilian_tax_big_numbers.bronze_layer.delta_faixa_de_rendimento_tributavel_bruto_em_salarios_minimos"
)
df_range_of_gross_taxable_income_in_minimum_wages.display()

### Taxable Income Range Exclusive Taxation in Minimum Wages Table

In [0]:
# Read the "delta_faixa_de_rendimentos_tributaveis_tributacao_exclusiva_em_salarios_minimos"
# table and render it.
df_taxable_income_range_exclusive_taxation_on_minimum_wages = spark.table(
    "brazilian_tax_big_numbers.bronze_layer.delta_faixa_de_rendimentos_tributaveis_tributacao_exclusiva_em_salarios_minimos"
)
df_taxable_income_range_exclusive_taxation_on_minimum_wages.display()

### Range of Total Income in Minimum Wages Table

In [0]:
# Read the "delta_faixa_de_rendimentos_totais_em_salarios_minimos" table and render it.
df_range_of_total_income_in_minimum_wages = spark.table(
    "brazilian_tax_big_numbers.bronze_layer.delta_faixa_de_rendimentos_totais_em_salarios_minimos"
)
df_range_of_total_income_in_minimum_wages.display()

### Gender and Type of Declaration Table

In [0]:
# Read the "delta_genero_e_tipo_de_declaracao" table and render it.
df_gender_and_declaration_type = spark.table(
    "brazilian_tax_big_numbers.bronze_layer.delta_genero_e_tipo_de_declaracao"
)
df_gender_and_declaration_type.display()

### Municipality of Residence of the Declarant Table

In [0]:
# Read the "delta_municipio_de_residencia_do_declarante_e_tipo_de_formulario" table and render it.
df_municipality_of_residence = spark.table(
    "brazilian_tax_big_numbers.bronze_layer.delta_municipio_de_residencia_do_declarante_e_tipo_de_formulario"
)
df_municipality_of_residence.display()

In [0]:
# Total number of rows in the municipality DataFrame (shown as the cell's result).
df_municipality_of_residence.count()

In [0]:
# Number of rows per "Municipio" value, in a column named "qtd".
(
    df_municipality_of_residence
    .groupBy("Municipio")
    .agg(F.count("*").alias("qtd"))
    .display()
)

In [0]:
# Municipalities whose row count ("qtd") differs from 10.
# The previous predicate, (qtd > 10) | (qtd < 10), is the same filter written
# the long way; count(*) is never null, so `!= 10` selects identical rows.
# NOTE(review): presumably 10 is the expected number of rows per municipality -
# TODO confirm and consider naming the constant.
display(
    df_municipality_of_residence
    .groupBy("Municipio")
    .agg(F.count("*").alias("qtd"))
    .where(F.col("qtd") != 10)
)

In [0]:
# Number of rows per "TipoDeFormularioDeDeclaracao" value, in a column named "qtd".
(
    df_municipality_of_residence
    .groupBy("TipoDeFormularioDeDeclaracao")
    .agg(F.count("*").alias("qtd"))
    .display()
)

In [0]:
# Number of rows per "AnoCalendario" value, in a column named "qtd".
(
    df_municipality_of_residence
    .groupBy("AnoCalendario")
    .agg(F.count("*").alias("qtd"))
    .display()
)

In [0]:
# Number of rows per "Municipio" value, in a column named "qtd".
# NOTE(review): the same per-"Municipio" count is already displayed earlier in
# this section; consider removing one of the two cells.
(
    df_municipality_of_residence
    .groupBy("Municipio")
    .agg(F.count("*").alias("qtd"))
    .display()
)

### Nature of Occupation Table

In [0]:
# Read the "delta_natureza_de_ocupacao" table from the bronze_layer schema and render it.
df_nature_of_occupation = spark.table(
    "brazilian_tax_big_numbers.bronze_layer.delta_natureza_de_ocupacao"
)
df_nature_of_occupation.display()

In [0]:
# Number of rows per "AnoCalendario" value, in a column named "qtd".
(
    df_nature_of_occupation
    .groupBy("AnoCalendario")
    .agg(F.count("*").alias("qtd"))
    .display()
)

### Main Occupation of the Declarant Table

In [0]:
# Read the "delta_ocupacao_principal_do_declarante" table and render it.
df_main_occupation = spark.table(
    "brazilian_tax_big_numbers.bronze_layer.delta_ocupacao_principal_do_declarante"
)
df_main_occupation.display()

In [0]:
# Number of rows per "AnoCalendario" value, in a column named "qtd".
(
    df_main_occupation
    .groupBy("AnoCalendario")
    .agg(F.count("*").alias("qtd"))
    .display()
)

### Payments and Donations Table

In [0]:
# Read the "delta_pagamentos_e_doacoes" table from the bronze_layer schema and render it.
df_payments_and_donations = spark.table(
    "brazilian_tax_big_numbers.bronze_layer.delta_pagamentos_e_doacoes"
)
df_payments_and_donations.display()

### Recipients of Profits and Dividends + Income of Partner and Microenterprise Owner by Total Income Range (in minimum wages) Table

In [0]:
# Read the profits/dividends recipients table broken down by total income range
# (per the table name, "por_faixa_de_rendimento_total_em_salarios_minimos") and render it.
df_profit_dividend_recipients_and_partner_owner_income_by_total_income_range_in_minimum_wages = spark.table(
    "brazilian_tax_big_numbers.bronze_layer.delta_recebedores_de_lucros_e_dividendos_rend_socio_e_titular_microempresa_por_faixa_de_rendimento_total_em_salarios_minimos"
)
df_profit_dividend_recipients_and_partner_owner_income_by_total_income_range_in_minimum_wages.display()

### Recipients of Profits and Dividends + Partner and Microenterprise Owner Income by Main Occupation Table

In [0]:
# Read the profits/dividends recipients table broken down by main occupation
# (per the table name, "por_ocupacao_principal") and render it.
df_profit_dividend_recipients_and_partner_owner_income_by_main_occupation = spark.table(
    "brazilian_tax_big_numbers.bronze_layer.delta_recebedores_de_lucros_e_dividendos_rend_socio_e_titular_microempresa_por_ocupacao_principal"
)
df_profit_dividend_recipients_and_partner_owner_income_by_main_occupation.display()

In [0]:
# Number of rows per "OcupacaoPrincipalDoDeclarante" value, in a column named "qtd".
(
    df_profit_dividend_recipients_and_partner_owner_income_by_main_occupation
    .groupBy("OcupacaoPrincipalDoDeclarante")
    .agg(F.count("*").alias("qtd"))
    .display()
)

### Income Subject to Exclusive/Definitive Taxation Table

In [0]:
# Read the "delta_rendimentos_sujeitos_a_tributacao_exclusiva_definitiva" table and render it.
df_income_subject_to_exclusive_definitive_taxation = spark.table(
    "brazilian_tax_big_numbers.bronze_layer.delta_rendimentos_sujeitos_a_tributacao_exclusiva_definitiva"
)
df_income_subject_to_exclusive_definitive_taxation.display()

### Taxable Income by Minimum Wage Range Table

In [0]:
# Read the "delta_rendimentos_tributaveis_por_faixa_de_salarios_minimos" table and render it.
df_taxable_income_by_minimum_wage_range = spark.table(
    "brazilian_tax_big_numbers.bronze_layer.delta_rendimentos_tributaveis_por_faixa_de_salarios_minimos"
)
df_taxable_income_by_minimum_wage_range.display()

### Exempt and Non-Taxable Income Table

In [0]:
# Read the "delta_rendimentos_isentos_e_nao_tributaveis" table and render it.
df_tax_exempt_and_non_taxable_income = spark.table(
    "brazilian_tax_big_numbers.bronze_layer.delta_rendimentos_isentos_e_nao_tributaveis"
)
df_tax_exempt_and_non_taxable_income.display()

### Tax Status Table

In [0]:
# Read the "delta_situacao_fiscal" table from the bronze_layer schema and render it.
df_tax_status = spark.table(
    "brazilian_tax_big_numbers.bronze_layer.delta_situacao_fiscal"
)
df_tax_status.display()

### Type of Form Table

In [0]:
# Read the "delta_tipo_de_formulario" table from the bronze_layer schema and render it.
df_form_type = spark.table(
    "brazilian_tax_big_numbers.bronze_layer.delta_tipo_de_formulario"
)
df_form_type.display()