<center>

# $\textbf{Unemployment}$

</center>

<center>

### $\textbf{Code}$

</center>

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col, lit, array, struct
import time

In [2]:
# Record the wall-clock start time; the final cell subtracts it to report the total runtime in seconds.
inicio = time.time()

In [3]:
# Build (or reuse) a single-node local Spark session with Hive support enabled.
spark = (
    SparkSession.builder
    .appName('Unemployment')
    .master("local")
    .enableHiveSupport()
    .getOrCreate()
)
# Bare expression so the notebook displays the session summary.
spark

24/03/07 21:10:38 WARN Utils: Your hostname, Francisco-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.191.2.158 instead (on interface en0)
24/03/07 21:10:38 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/07 21:10:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Load the CSV into a DataFrame: first row is the header, column types are inferred.
df = (
    spark.read
    .format("csv")
    .option("sep", ",")
    .option("inferschema", "true")
    .option("header", "true")
    .load("Files/Unemployment2.csv")
)

In [5]:
# Remove the metadata columns Country Code, Indicator Code and Indicator Name.
# DataFrame.drop() returns a new DataFrame instead of modifying in place, so the
# result has to be assigned back for the columns to actually be removed.
df = df.drop("Country Code", "Indicator Code", "Indicator Name")

# Rename the country column to the short name used by the following cells
df = df.withColumnRenamed("Country Name","country")

In [6]:
# Reshape from wide (one column per year, 2000..2022) to long format:
# pair each year with the value of its column, pack the pairs into an array,
# and explode that array so every (country, year) becomes its own row.
year_structs = [
    struct(lit(year).alias("year"), col(str(year)).alias("unemployment"))
    for year in range(2000, 2023)
]
exploded = df.select("country", explode(array(year_structs)).alias("data"))

# Flatten the struct back into plain top-level columns.
df = exploded.selectExpr("country", "data.year", "data.unemployment")

df = df.orderBy("country")

In [7]:
# Enforce the target type of every column (applied in the listed order).
column_types = {
    "country": "string",
    "year": "int",
    "unemployment": "double",
}
for column_name, spark_type in column_types.items():
    df = df.withColumn(column_name, col(column_name).cast(spark_type))

In [8]:
# Keep only rows whose year lies strictly between 2010 and 2024
year_in_range = (col("year") > 2010) & (col("year") < 2024)
df = df.filter(year_in_range)

In [9]:
# Sort rows by country first, then chronologically within each country
df = df.sort("country", "year")

### $\textbf{Pre-Processing}$

In [10]:
# Mapping from the country labels found in the dataset to the names wanted in the output.
# NOTE(review): "Congo" is mapped to "Democratic Republic of the Congo"; confirm the
# source label does not refer to the Republic of the Congo instead.
replacements = {
    "China, People's Republic of": "China",
    "Gambia, The": "Gambia",
    "Micronesia, Fed. States of": "Micronesia",
    "South Sudan, Republic of": "South Sudan",
    "Taiwan Province of China": "Taiwan",
    "Türkiye, Republic of": "Türkiye",
    "Caribbean small states": "Caribbean",
    "Hong Kong SAR": "Hong Kong",
    "Congo": "Democratic Republic of the Congo",
    "Viet Nam": "Vietnam"
}

# Replace the country *values* according to the replacements dictionary.
# The labels are cell values of the "country" column, not column names, so
# withColumnRenamed() (which only renames columns and silently does nothing when
# no column matches) cannot be used here; DataFrame.replace() rewrites the values.
df = df.replace(replacements, subset=["country"])

In [11]:
# Values of the 'country' column that are regional or income-group aggregates
# rather than individual countries; rows carrying these labels are removed.
# Every entry must end with a comma: adjacent string literals without one are
# silently concatenated by Python into a single (never matching) label.
countries_to_remove = ['Africa Eastern and Southern', 
                       'Africa Western and Central', 
                       'Central Europe and the Baltics', 
                       'Early-demographic dividend', 
                       'East Asia & Pacific', 
                       'East Asia & Pacific (IDA & IBRD countries)', 
                       'East Asia & Pacific (excluding high income)', 
                       'Euro area',
                       'Europe & Central Asia',
                       'Europe & Central Asia (IDA & IBRD countries)',
                       'Europe & Central Asia (excluding high income)',
                       'European Union',
                       'Fragile and conflict affected situations',
                       'Heavily indebted poor countries (HIPC)',
                       'High income',
                       'IBRD only',
                       'IDA & IBRD total',
                       'IDA blend',
                       'IDA only',
                       'IDA total',
                       'Latin America & the Caribbean (IDA & IBRD countries)',
                       'Latin America & Caribbean (excluding high income)',
                       'Least developed countries: UN classification',
                       'Low & middle income',
                       'Low income',
                       'Lower middle income',
                       'Middle East & North Africa',
                       'Middle East & North Africa (IDA & IBRD countries)',
                       'Middle East & North Africa (excluding high income)',
                       'Middle income',
                       'Not classified',
                       'OECD members',
                       'Other small states',
                       'Pacific island small states',
                       'Post-demographic dividend',
                       'Pre-demographic dividend',
                       'Small states',
                       'South Asia',
                       'South Asia (IDA & IBRD)',
                       'Sub-Saharan Africa (IDA & IBRD countries)',
                       'Sub-Saharan Africa (excluding high income)',
                       'Upper middle income',
                       'World',
                       # NOTE(review): looks like a truncated duplicate of 'Other small states';
                       # kept as-is so the filter behaviour is unchanged - confirm and remove.
                       'ther small states',
                       ]

# Keep only the rows whose country is not one of the listed labels
is_listed = col("country").isin(countries_to_remove)
df = df.where(~is_listed)

In [12]:
# Discard every row that has no value in the 'country' column
df = df.na.drop(subset=['country'])

In [13]:
# Persist the DataFrame as Parquet, replacing any previous export at that path
parquet_path = "FilesParquet/Unemployment.parquet"
df.write.mode("overwrite").parquet(parquet_path)

# Read the written files back and print the first rows as a check
stored_df = spark.read.parquet(parquet_path)
stored_df.show()

# Shut down the Spark session now that the pipeline is finished
spark.stop()

                                                                                

+-----------+----+------------+
|    country|year|unemployment|
+-----------+----+------------+
|Afghanistan|2011|       7.918|
|Afghanistan|2012|       7.914|
|Afghanistan|2013|       7.914|
|Afghanistan|2014|        7.91|
|Afghanistan|2015|       9.002|
|Afghanistan|2016|      10.092|
|Afghanistan|2017|       11.18|
|Afghanistan|2018|      11.131|
|Afghanistan|2019|      11.082|
|Afghanistan|2020|       11.71|
|Afghanistan|2021|      12.075|
|Afghanistan|2022|        14.1|
|    Albania|2011|       13.48|
|    Albania|2012|       13.38|
|    Albania|2013|       15.87|
|    Albania|2014|       18.05|
|    Albania|2015|       17.19|
|    Albania|2016|       15.42|
|    Albania|2017|       13.62|
|    Albania|2018|        12.3|
+-----------+----+------------+
only showing top 20 rows



In [14]:
# Report the elapsed wall-clock time in seconds: end timestamp minus the
# `inicio` timestamp taken in an earlier cell.
fim = time.time()
final = fim - inicio
print(final)

7.791278123855591
