# 1 - Importing libraries

In [109]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql.functions import *
import numpy as np
import basedosdados as bd
import os

# 2 - Initializing Spark Session

In [110]:
spark = SparkSession.builder \
    .appName("Revenue By State ETL") \
    .config("spark.executor.memory", "4g") \
    .config("spark.executor.cores", "2") \
    .config("spark.driver.memory", "4g") \
    .config("spark.sql.shuffle.partitions", "200") \
    .config("spark.default.parallelism", "100") \
    .getOrCreate()

# 3 - Extracting

## 3.1 - Scraping file

In [None]:
from script.arrecadacao_estado_scrap_file import FileHandlerArrecadacaoEstado
scrap = FileHandlerArrecadacaoEstado("C:\RevenueByState\data")

paths = os.listdir("C:\RevenueByState")
name_path = scrap.download_dir[scrap.download_dir.rfind('\\') + 1:]

if name_path not in paths:
    scrap.download_file()
elif "arrecadacao-estado.csv" not in os.listdir(scrap.download_dir):
    scrap.download_file()
else:
    print("Already exists!")

## 3.2 - Getting another resources (from https://basedosdados.org/)

In [111]:
population_states_df = bd.read_table(
dataset_id='br_ibge_populacao',
table_id='uf',
billing_project_id="revenue-etl"
)

Downloading: 100%|██████████| 837/837 [00:00<00:00, 1494.65rows/s]


In [112]:
print(population_states_df)

    sigla_uf   ano  populacao  populacao_economicamente_ativa
0         AC  1991     417102                            <NA>
1         AL  1991    2512658                            <NA>
2         AM  1991    2102766                            <NA>
3         AP  1991     289035                            <NA>
4         BA  1991   11867336                            <NA>
..       ...   ...        ...                             ...
832       RS  2021   11466630                            9533
833       SC  2021    7338473                            5947
834       SE  2021    2338474                            1873
835       SP  2021   46649132                           38573
836       TO  2021    1607363                            1258

[837 rows x 4 columns]


In [113]:
inflation_df = bd.read_table(dataset_id='br_ibge_inpc',
table_id='mes_brasil',
billing_project_id="revenue-etl"
) 

Downloading: 100%|██████████| 538/538 [00:00<00:00, 1577.70rows/s]


In [114]:
print(inflation_df)

      ano  mes        indice  variacao_mensal  variacao_trimestral  \
0    1979    3  5.576500e-09              NaN                  NaN   
1    1979    4  5.768900e-09             3.45                  NaN   
2    1979    5  5.870400e-09             1.76                  NaN   
3    1979    6  6.046500e-09             3.00                 8.43   
4    1979    7  6.370500e-09             5.36                10.43   
..    ...  ...           ...              ...                  ...   
533  2023    7  6.880170e+03            -0.09                 0.17   
534  2023    9  6.901510e+03             0.11                 0.22   
535  2023   10  6.909790e+03             0.12                 0.43   
536  2023   11  6.916700e+03             0.10                 0.33   
537  2023   11  6.916700e+03             0.10                 0.33   

     variacao_semestral  variacao_anual  variacao_doze_meses  
0                   NaN             NaN                  NaN  
1                   NaN          

## 3.3 - Reading resources

### 3.3.1 - Listing file paths

In [115]:
data = {
    "arrecadacao_estado": "C:\RevenueByState\data\\arrecadacao-estado.csv"
}

### 3.3.2 - Defining Schemas

In [117]:
# Defining the schema (translating columns from Portuguese to English)
collection_state_schema = StructType(
    [
        StructField(name="year", dataType=IntegerType(), nullable=True),
        StructField(name="month", dataType=StringType(), nullable=True),
        StructField(name="state", dataType=StringType(), nullable=True),
        StructField(name="import_tax", dataType=DoubleType(), nullable=True),
        StructField(name="export_tax", dataType=DoubleType(), nullable=True),
        StructField(name="ipi_tobacco", dataType=DoubleType(), nullable=True),
        StructField(name="ipi_beverages", dataType=DoubleType(), nullable=True),
        StructField(name="ipi_automobiles", dataType=DoubleType(), nullable=True),
        StructField(name="ipi_linked_imports", dataType=DoubleType(), nullable=True),
        StructField(name="ipi_others", dataType=DoubleType(), nullable=True),
        StructField(name="individual_income_tax", dataType=DoubleType(), nullable=True),
        StructField(name="corporate_income_tax_financial_entities", dataType=DoubleType(), nullable=True),
        StructField(name="corporate_income_tax_other_companies", dataType=DoubleType(), nullable=True),
        StructField(name="withholding_income_tax_employment_income", dataType=DoubleType(), nullable=True),
        StructField(name="withholding_income_tax_capital_income", dataType=DoubleType(), nullable=True),
        StructField(name="withholding_income_tax_overseas_remittances", dataType=DoubleType(), nullable=True),
        StructField(name="withholding_income_tax_other_income", dataType=DoubleType(), nullable=True),
        StructField(name="tax_on_financial_operations", dataType=DoubleType(), nullable=True),
        StructField(name="rural_land_tax", dataType=DoubleType(), nullable=True),
        StructField(name="provisional_tax_on_financial_transactions", dataType=DoubleType(), nullable=True),
        StructField(name="provisional_contribution_on_financial_transactions", dataType=DoubleType(), nullable=True),
        StructField(name="cofins", dataType=DoubleType(), nullable=True),
        StructField(name="cofins_financial_institutions", dataType=DoubleType(), nullable=True),
        StructField(name="cofins_other_sectors", dataType=DoubleType(), nullable=True),
        StructField(name="contribution_for_social_integration_program", dataType=DoubleType(), nullable=True),
        StructField(name="contribution_for_social_integration_program_financial_institutions", dataType=DoubleType(), nullable=True),
        StructField(name="contribution_for_social_integration_program_other_sectors", dataType=DoubleType(), nullable=True),
        StructField(name="social_contribution_on_net_profit", dataType=DoubleType(), nullable=True),
        StructField(name="social_contribution_on_net_profit_financial_institutions", dataType=DoubleType(), nullable=True),
        StructField(name="social_contribution_on_net_profit_other_sectors", dataType=DoubleType(), nullable=True),
        StructField(name="contribution_for_intervention_in_economic_domain_fuels_non_deductible", dataType=DoubleType(), nullable=True),
        StructField(name="contribution_for_intervention_in_economic_domain_fuels", dataType=DoubleType(), nullable=True),
        StructField(name="contribution_to_social_security_plan_public_servants", dataType=DoubleType(), nullable=True),
        StructField(name="contributions_to_social_security_plan_public_servants", dataType=DoubleType(), nullable=True),
        StructField(name="contributions_to_special_fund_development_inspection_activities", dataType=DoubleType(), nullable=True),
        StructField(name="fiscal_recovery_program", dataType=DoubleType(), nullable=True),
        StructField(name="special_installment_payment_program", dataType=DoubleType(), nullable=True),
        StructField(name="withholding_tax_law_10833_article_30", dataType=DoubleType(), nullable=True),
        StructField(name="unified_payment", dataType=DoubleType(), nullable=True),
        StructField(name="other_administered_revenues", dataType=DoubleType(), nullable=True),
        StructField(name="other_revenues", dataType=DoubleType(), nullable=True),
        StructField(name="social_security_revenue", dataType=DoubleType(), nullable=True),
        StructField(name="social_security_revenue_individual_contributors", dataType=DoubleType(), nullable=True),
        StructField(name="social_security_revenue_other_sources", dataType=DoubleType(), nullable=True),
        StructField(name="administered_by_other_agencies", dataType=DoubleType(), nullable=True)
    ]
)

In [118]:
population_states_schema = StructType([
    StructField(name="state_uf", dataType=StringType(), nullable=True),
    StructField(name="year", dataType=IntegerType(), nullable=True),
    StructField(name="population_state", dataType=IntegerType(), nullable=True),
    StructField(name="economically_active_population", dataType=IntegerType(), nullable=True)
])

In [119]:
inflation_schema = StructType([
    StructField("year", IntegerType(), nullable=True),
    StructField("month", IntegerType(), nullable=True),
    StructField("index", FloatType(), nullable=True),
    StructField("monthly_variation", FloatType(), nullable=True),
    StructField("quarterly_variation", FloatType(), nullable=True),
    StructField("semiannual_variation", FloatType(), nullable=True),
    StructField("annual_variation", FloatType(), nullable=True),
    StructField("twelve_months_variation", FloatType(), nullable=True)
])

### 3.3.3 - Reading all files

In [120]:
try:
    collection_state_df = spark.read \
    .format("csv") \
    .option("header", "true") \
    .option("printSchema", "true") \
    .option("delimiter", ";") \
    .schema(collection_state_schema) \
    .load(data.get("arrecadacao_estado"))
except Exception as e:
    print(f"Error occurred while loading DataFrame: {e}")

In [121]:
# changing np.nan to None (for Spark) 
population_states_df = population_states_df.replace([np.nan], [None])
inflation_df = inflation_df.replace([np.nan], [None])

In [122]:
try:
    population_states_df = spark.createDataFrame(population_states_df, schema=population_states_schema)
    inflation_df = spark.createDataFrame(inflation_df, schema=inflation_schema)
except Exception as e:
    print(f"Error occurred while loading DataFrame: {e}")

In [None]:
collection_state_df.printSchema()
population_states_df.printSchema()
inflation_df.printSchema()

In [None]:
collection_state_df.show()

In [123]:
collection_state_df = collection_state_df.cache()
population_states_df = population_states_df.cache()
inflation_df = inflation_df.cache()

# 4 - Transformation

## 4.1 - Validate and Cleanse Data

### 4.1.1 - Joining tables (population_states and collection_state)

In [124]:
# alias for the tables
collection_state_df = collection_state_df.alias("cs")
population_states_df = population_states_df.alias("ps")

In [125]:
# joinin and selecting columns
collection_state_df = collection_state_df.join(
    population_states_df,
    (population_states_df.year == collection_state_df.year)
    &
    (collection_state_df.state == population_states_df.state_uf),
    "left"
).select(
    "cs.*",
    "ps.population_state",
    "ps.economically_active_population"
)

### 4.1.2 - Solving ASCII problem

In [127]:
# we have a non ASCII character (ç), so we need to translate
special_char = "�"

correct_char = "c"

collection_state_df = collection_state_df.withColumn("month", translate(col("month"), special_char, correct_char))

In [128]:
# changing month names from Portuguese to English
collection_state_df = collection_state_df.withColumn(
    "month",
    when(col("month") == "Janeiro", "january")
    .when(col("month") == "Fevereiro", "february")
    .when(col("month") == "Marco", "march")
    .when(col("month") == "Abril", "april")
    .when(col("month") == "Maio", "may")
    .when(col("month") == "Junho", "june")
    .when(col("month") == "Julho", "july")
    .when(col("month") == "Agosto", "august")
    .when(col("month") == "Setembro", "september")
    .when(col("month") == "Outubro", "october")
    .when(col("month") == "Novembro", "november")
    .when(col("month") == "Dezembro", "december")
    .otherwise(col("month"))
)

In [None]:
collection_state_df.show()

### 4.1.3 - Checking count grouped by month

In [129]:
months_counts_names = collection_state_df.groupBy("month").count()
months_counts_names.show()

+---------+-----+
|    month|count|
+---------+-----+
|  october|  648|
|      may|  648|
|september|  648|
|   august|  648|
|    april|  675|
| november|  648|
|     july|  648|
|  january|  675|
| february|  675|
|    march|  675|
|     june|  648|
| december|  648|
+---------+-----+



### 4.1.4 - Adding month in numeric representation

In [130]:
months = {
    "january": 1,
    "february": 2,
    "march": 3,
    "april": 4,
    "may": 5,
    "june": 6,
    "july": 7,
    "august": 8,
    "september": 9,
    "october": 10,
    "november": 11,
    "december": 12
}

In [132]:
collection_state_df = collection_state_df.withColumn(
    "month_numeric",
    when(col("month") == "january", lit(months["january"]))
    .when(col("month") == "february", lit(months["february"]))
    .when(col("month") == "march", lit(months["march"]))
    .when(col("month") == "april", lit(months["april"]))
    .when(col("month") == "may", lit(months["may"]))
    .when(col("month") == "june", lit(months["june"]))
    .when(col("month") == "july", lit(months["july"]))
    .when(col("month") == "august", lit(months["august"]))
    .when(col("month") == "september", lit(months["september"]))
    .when(col("month") == "october", lit(months["october"]))
    .when(col("month") == "november", lit(months["november"]))
    .when(col("month") == "december", lit(months["december"]))
    .otherwise(None)  # Handle any unexpected values
)

### 4.1.5 - Checking count grouped by month in numeric representation

In [134]:
numeric_month_count = collection_state_df.groupBy("month_numeric").count()
numeric_month_count.show()

+-------------+-----+
|month_numeric|count|
+-------------+-----+
|           12|  648|
|            1|  675|
|            6|  648|
|            3|  675|
|            5|  648|
|            9|  648|
|            4|  675|
|            8|  648|
|            7|  648|
|           10|  648|
|           11|  648|
|            2|  675|
+-------------+-----+



### 4.1.6 - Joining tables (collection_state and inflation_df)

#### 4.1.6.1 - Tables alias

In [135]:
collection_state_df = collection_state_df.alias("cs")
inflation_df = inflation_df.alias("inf")

### 4.1.6.1 - Joining both tables with alias

In [72]:
collection_state_df = collection_state_df.join(
    inflation_df,
    (collection_state_df.year == inflation_df.year)
    &
    (collection_state_df.month_numeric == inflation_df.month),
    "left"
).select(
    "cs.*",
    "inf.index",
    "inf.monthly_variation",
    "inf.quarterly_variation",
    "inf.semiannual_variation",
    "inf.annual_variation"
)

In [None]:
collection_state_df.show()

### 4.1.7 - Removing duplicates

In [136]:
collection_state_df.dropDuplicates()

DataFrame[year: int, month: string, state: string, import_tax: double, export_tax: double, ipi_tobacco: double, ipi_beverages: double, ipi_automobiles: double, ipi_linked_imports: double, ipi_others: double, individual_income_tax: double, corporate_income_tax_financial_entities: double, corporate_income_tax_other_companies: double, withholding_income_tax_employment_income: double, withholding_income_tax_capital_income: double, withholding_income_tax_overseas_remittances: double, withholding_income_tax_other_income: double, tax_on_financial_operations: double, rural_land_tax: double, provisional_tax_on_financial_transactions: double, provisional_contribution_on_financial_transactions: double, cofins: double, cofins_financial_institutions: double, cofins_other_sectors: double, contribution_for_social_integration_program: double, contribution_for_social_integration_program_financial_institutions: double, contribution_for_social_integration_program_other_sectors: double, social_contributio

### 4.1.8 - Handling missing values 

In [137]:
# string(will be removed), numeric(flag 0)
def fill_values_na_with_zero_or_drop(df: DataFrame) -> DataFrame:
    for col, dtype in df.dtypes:
        if isinstance(df.schema[col].dataType, NumericType):
            df = df.fillna(0, subset=[col])
        elif isinstance(df.schema[col].dataType, StringType):
            df = df.dropna(subset=[col])
    return df

In [138]:
collection_state_df = fill_values_na_with_zero_or_drop(df = collection_state_df)

## 4.2 - Adding some columns

### 4.2.1 - Regions in Brazil

In [139]:
regions_uf = {
    "north": ("AC", "AP", "AM", "PA", "RO", "RR", "TO"),
    "north_east": ("AL", "BA", "CE", "MA", "PB", "PE", "PI", "RN", "SE"),
    "midwest": ("DF", "GO", "MT", "MS"),
    "south": ("PR", "RS", "SC"),
    "south_east": ("ES", "MG", "RJ", "SP")
}

In [140]:
# function to add column "region" based on regions_uf
def add_region_column(df: DataFrame, regions_uf: dict) -> DataFrame:
    df = df.withColumn(
        "region",
        when(col("state").isin(list(regions_uf["north"])), "north")
        .when(col("state").isin(list(regions_uf["north_east"])), "north_east")
        .when(col("state").isin(list(regions_uf["midwest"])), "midwest")
        .when(col("state").isin(list(regions_uf["south"])), "south")
        .when(col("state").isin(list(regions_uf["south_east"])), "south_east")
        .otherwise("unknown")
    )
    return df

In [141]:
collection_state_df = add_region_column(df=collection_state_df, regions_uf=regions_uf)

### 4.2.1 - Adding 'id' column

In [142]:
windowSpec = Window.orderBy("year")
collection_state_df = collection_state_df.withColumn("id", F.row_number().over(windowSpec))

# Reordering columns to have 'id' as the first column
id_first_columns = ['id'] + [col for col in collection_state_df.columns if col != 'id']
collection_state_final_df = collection_state_df.select(id_first_columns)

In [None]:
collection_state_final_df.show()