<center>

# $\textbf{Tax}$

<center>

### $\textbf{Code}$

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col, lit, array, struct, when
import time

In [2]:
# Record the start timestamp ("inicio" = start); used later to compute the total run time
inicio = time.time()

In [3]:
# Create (or reuse) a single-node local Spark session with Hive support enabled.
spark = (
    SparkSession.builder
    .appName('Tax')
    .master("local")
    .enableHiveSupport()
    .getOrCreate()
)
# Leave the session as the last expression so the notebook renders its summary.
spark

24/03/07 21:06:32 WARN Utils: Your hostname, Francisco-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.191.2.158 instead (on interface en0)
24/03/07 21:06:32 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/07 21:06:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Load the raw CSV into a DataFrame: first row is the header, column types are inferred.
csv_options = {"format": "csv", "sep": ",", "inferschema": "true", "header": "true"}
df = spark.read.load("Files/Tax2.csv", **csv_options)

In [5]:
# Year columns 1960-1999 are discarded together with the descriptive metadata
# columns and the "_c67" column.
columns_to_drop = list(map(str, range(1960, 2000)))
metadata_columns = ("Country Code", "Indicator Name", "Indicator Code")

# Rename the country column, then drop everything that is not needed.
df = (
    df.withColumnRenamed("Country Name", "country")
      .drop(*metadata_columns, *columns_to_drop, "_c67")
)

In [6]:
# Reshape from wide (one column per year) to long (one row per country/year):
# wrap every year column in a (year, tax) struct, collect the structs in an
# array, and explode that array into rows.
year_structs = [
    struct(lit(yr).alias("year"), col(str(yr)).alias("tax"))
    for yr in range(2000, 2023)
]
exploded = df.select("country", explode(array(year_structs)).alias("data"))

# Flatten the struct back into plain columns and sort by country.
df = exploded.selectExpr("country", "data.year", "data.tax").orderBy("country")

In [7]:
# Replace null values in the 'tax' column with 0
tax_filled = when(col("tax").isNull(), 0).otherwise(col("tax"))

# Apply the fill, then cast every column to its desired type
df = (
    df.withColumn("tax", tax_filled)
      .withColumn("country", col("country").cast("string"))
      .withColumn("year", col("year").cast("int"))
      .withColumn("tax", col("tax").cast("double"))
)

In [8]:
# Keep only rows whose year is strictly between 2010 and 2024
year_col = col("year")
df = df.filter((year_col > 2010) & (year_col < 2024))

In [9]:
# Sort rows by country first, then chronologically within each country
sort_keys = ["country", "year"]
df = df.orderBy(sort_keys)

### $\textbf{Pre-Processing}$

In [10]:
# Map source-specific country labels to the shorter names wanted in the output.
# NOTE(review): "Congo" -> "Democratic Republic of the Congo" looks suspicious
# (a plain "Congo" label may refer to the Republic of the Congo) — confirm
# against the labels actually present in the source file.
replacements = {
    "China, People's Republic of": "China",
    "Gambia, The": "Gambia",
    "Micronesia, Fed. States of": "Micronesia",
    "South Sudan, Republic of": "South Sudan",
    "Taiwan Province of China": "Taiwan",
    "Türkiye, Republic of": "Türkiye",
    "Korea, Republic of": "South Korea",
    "Congo": "Democratic Republic of the Congo",
    "Caribbean small states": "Caribbean"
}

# Rewrite matching *values* in the 'country' column.
# The previous loop called withColumnRenamed(old, new), which renames columns.
# The frame only has the columns country/year/tax at this point, so every call
# was a silent no-op and no country label was ever changed.
df = df.replace(replacements, subset=["country"])

In [11]:
# Aggregate / regional entries (not individual countries) whose rows are removed.
# NOTE(review): confirm this list covers every aggregate row present in the
# source file; only the names listed here are filtered out.
countries_to_remove = ['Africa Eastern and Southern', 
                       'Africa Western and Central', 
                       'Central Europe and the Baltics', 
                       'Early-demographic dividend', 
                       'East Asia & Pacific', 
                       'East Asia & Pacific (IDA & IBRD countries)', 
                       'East Asia & Pacific (excluding high income)', 
                       'Euro area',
                       'Europe & Central Asia',
                       'Europe & Central Asia (IDA & IBRD countries)',
                       'Europe & Central Asia (excluding high income)',
                       'European Union',
                       'Fragile and conflict affected situations',
                       'Heavily indebted poor countries (HIPC)',
                       'High income',
                       'IBRD only',
                       'IDA & IBRD total',
                       'IDA blend',
                       'IDA only',
                       'IDA total',
                       'Latin America & the Caribbean (IDA & IBRD countries)',
                       'Latin America & Caribbean (excluding high income)',
                       'Least developed countries: UN classification',
                       'Low & middle income',
                       'Low income',
                       'Lower middle income',
                       'Middle East & North Africa',
                       'Middle East & North Africa (IDA & IBRD countries)',
                       'Middle East & North Africa (excluding high income)',
                       'Middle income',
                       'Not classified',
                       'OECD members',
                       'Other small states',
                       # The comma after the next entry was missing, so Python
                       # concatenated it with the following literal and neither
                       # name was ever matched by the filter.
                       'Pacific island small states',
                       'Post-demographic dividend',
                       'Pre-demographic dividend',
                       'Small states',
                       'South Asia',
                       'South Asia (IDA & IBRD)',
                       'Sub-Saharan Africa (IDA & IBRD countries)',
                       'Sub-Saharan Africa (excluding high income)',
                       'Upper middle income',
                       'World'
                       ]

# Drop the rows whose 'country' value is one of the aggregates above
df = df.filter(~df['country'].isin(countries_to_remove))

In [12]:
# Remove rows with missing values in the 'country' column
df = df.dropna(subset=['country'])

In [13]:
# Persist the cleaned table as Parquet, replacing any earlier export.
parquet_path = "FilesParquet/Tax.parquet"
df.write.mode("overwrite").parquet(parquet_path)

# Read the file back and show the first rows, then shut the session down.
spark.read.parquet(parquet_path).show()
spark.stop()

                                                                                

+-----------+----+----------------+
|    country|year|             tax|
+-----------+----+----------------+
|Afghanistan|2011|8.91679399595736|
|Afghanistan|2012|7.70780060287773|
|Afghanistan|2013|7.12277329020889|
|Afghanistan|2014|6.88210294277301|
|Afghanistan|2015|7.58538233129798|
|Afghanistan|2016|9.50265278288094|
|Afghanistan|2017|9.89845089688421|
|Afghanistan|2018|             0.0|
|Afghanistan|2019|             0.0|
|Afghanistan|2020|             0.0|
|Afghanistan|2021|             0.0|
|Afghanistan|2022|             0.0|
|    Albania|2011|18.0220132797796|
|    Albania|2012|17.4832335567459|
|    Albania|2013|16.5032706123389|
|    Albania|2014|18.3019361358269|
|    Albania|2015|18.5157898553099|
|    Albania|2016| 17.590229809729|
|    Albania|2017|18.8874860606848|
|    Albania|2018|18.5459674731621|
+-----------+----+----------------+
only showing top 20 rows



In [14]:
# Report the elapsed wall-clock time in seconds between the `inicio` timestamp
# and now ("fim" = end).
fim = time.time()
final = fim - inicio
print(final)

7.7793309688568115
