# 1 - Importing libraries

In [1]:
from pyspark.sql import SparkSession
import os
import logging

# 2 - Initializing Spark Session

In [2]:
# Location of the MySQL Connector/J jar. It can be overridden through the
# MYSQL_CONNECTOR_JAR environment variable so the notebook is not tied to one
# machine's directory layout; the default keeps the original location.
mysql_connector_jar = os.getenv(
    "MYSQL_CONNECTOR_JAR",
    r"C:\Connectors\mysql-connector-j-8.4.0.jar",
)

# The jar is added to the driver classpath; the JDBC read further down names
# "com.mysql.cj.jdbc.Driver" as its driver class.
spark = (
    SparkSession.builder
    .appName("Revenue By State EDA")
    .config("spark.driver.extraClassPath", mysql_connector_jar)
    .getOrCreate()
)

# 3 - Reading Data

## 3.1 - Set up logging

In [6]:
from loggings.log_config import LoggerConfig

# Instantiate LoggerConfig once, then fetch the analysis and ETL loggers from it
# (analysis logger first, ETL logger second).
logger_config = LoggerConfig()
analysis_logger, etl_logger = (
    logger_config.get_analysis_logger(),
    logger_config.get_etl_logger(),
)

## 3.2 - MySQL config

In [13]:
# Fixed identifiers of the source database and table.
database_name = "revenue_by_state"
table_name = "collection_state"

# Connection settings are read from the environment, with local fallbacks.
host = os.environ.get('DB_HOST', 'localhost')
user = os.environ.get('DB_USER', 'root')
password = os.environ.get('DB_PASSWORD', '')

# JDBC URL of the form jdbc:mysql://<host>/<database>.
jdbc_url = f"jdbc:mysql://{host}/{database_name}"

# Options handed to the JDBC reader: credentials plus the driver class name.
jdbc_properties = dict(
    user=user,
    password=password,
    driver="com.mysql.cj.jdbc.Driver",
)

In [14]:
# Read the source table from MySQL over JDBC into `df` and print its schema.
try:
    df = (
        spark.read
        .format("jdbc")
        .option("url", jdbc_url)
        .option("dbtable", table_name)
        .options(**jdbc_properties)
        .load()
    )
    df.printSchema()

except Exception as e:
    # `logging.WARNING` is the integer level constant, not a function; calling
    # it raised TypeError inside this handler and hid the real failure.
    logging.error(f"An error occurred while loading data from MySQL: {e}")
    # Re-raise: the following cells use `df`, so continuing without it would
    # only fail later with a less helpful NameError.
    raise

root
 |-- id: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- state: string (nullable = true)
 |-- region: string (nullable = true)
 |-- import_tax: double (nullable = true)
 |-- export_tax: double (nullable = true)
 |-- ipi_tobacco: double (nullable = true)
 |-- ipi_beverages: double (nullable = true)
 |-- ipi_auto: double (nullable = true)
 |-- ipi_linked_imports: double (nullable = true)
 |-- ipi_others: double (nullable = true)
 |-- income_tax_individual: double (nullable = true)
 |-- income_tax_financial: double (nullable = true)
 |-- income_tax_other: double (nullable = true)
 |-- withholding_tax_employment: double (nullable = true)
 |-- withholding_tax_capital: double (nullable = true)
 |-- withholding_tax_remittances: double (nullable = true)
 |-- withholding_tax_other: double (nullable = true)
 |-- tax_financial_operations: double (nullable = true)
 |-- rural_land_tax: double (nullable = true)
 |-- provisional_tax_transa

In [15]:
# Preview the first 20 rows, with long cell values truncated (the defaults,
# spelled out here for the reader).
df.show(n=20, truncate=True)

+---+----+-------+-----+----------+-----------+----------+-----------+-------------+-----------+------------------+-----------+---------------------+--------------------+----------------+--------------------------+-----------------------+---------------------------+---------------------+------------------------+--------------+----------------------------+-------------------------------------+------------+----------------+------------+-------------------------------+-----------------------------------------+-------------------------------------+------------------------------+----------------------------------------+------------------------------------+-------------------------------------------------+----------------------------------+------------------------------------------+-------------------------------------------+------------------------------------------------+-----------------------+---------------------------+------------------------------+---------------+---------------------

In [16]:
# Mark `df` for caching so later cells can reuse the loaded data. The call
# returns the DataFrame, which the notebook echoes as this cell's output.
df.cache()

DataFrame[id: int, year: int, month: string, state: string, region: string, import_tax: double, export_tax: double, ipi_tobacco: double, ipi_beverages: double, ipi_auto: double, ipi_linked_imports: double, ipi_others: double, income_tax_individual: double, income_tax_financial: double, income_tax_other: double, withholding_tax_employment: double, withholding_tax_capital: double, withholding_tax_remittances: double, withholding_tax_other: double, tax_financial_operations: double, rural_land_tax: double, provisional_tax_transactions: double, provisional_contribution_transactions: double, cofins: double, cofins_financial: double, cofins_other: double, contribution_social_integration: double, contribution_social_integration_financial: double, contribution_social_integration_other: double, social_contribution_net_profit: double, social_contribution_net_profit_financial: double, social_contribution_net_profit_other: double, intervention_economic_domain_non_deductible_fuels: double, intervent