<center>

# $\textbf{Merge Files}$

</center>

### $\textbf{Code}$

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col
import os
import time

In [2]:
# Start the wall-clock timer used to report the notebook's total run time.
inicio = time.time()

# Options handed to spark-submit when PySpark launches its JVM: reserve a
# 2048m JIT code cache on both the executor and the driver.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--conf "spark.executor.extraJavaOptions=-XX:ReservedCodeCacheSize=2048m" '
    '--conf "spark.driver.extraJavaOptions=-XX:ReservedCodeCacheSize=2048m" '
    'pyspark-shell'
)

In [3]:
# Create (or reuse) the Spark session for this notebook: application name
# 'MergeFiles', "local" master, Hive support enabled.
spark = (
    SparkSession.builder
    .appName('MergeFiles')
    .master("local")
    .enableHiveSupport()
    .getOrCreate()
)
# Bare expression so the notebook renders the session summary.
spark

24/03/14 16:41:38 WARN Utils: Your hostname, Francisco-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.191.2.158 instead (on interface en0)
24/03/14 16:41:38 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/14 16:41:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Parquet inputs, one file per dataset
file_paths = [
    "../FilesParquet/Covid.parquet",
    "../FilesParquet/GDP.parquet",
    "../FilesParquet/Inflation.parquet",
    "../FilesParquet/Migration.parquet",
    "../FilesParquet/Population.parquet",
    "../FilesParquet/Tax.parquet",
    "../FilesParquet/Unemployment.parquet",
]

# Load every input file as its own Spark DataFrame
dfs = [spark.read.parquet(path) for path in file_paths]

# Start from the first DataFrame and fold the others in, left to right, with a
# full outer join on (country, year) so that a key present in any input is kept.
merged_df, *remaining = dfs
for df in remaining:
    merged_df = merged_df.join(df, on=["country", "year"], how="outer")

                                                                                

### $\textbf{Pre-Processing}$

In [5]:
# Mapping of country names as they appear in the data -> the name to use instead.
replacements = {
    "Korea": "North Korea",
    "Pacific island small states": "Pacific Islands"
}

# The keys above are values of the "country" column, not column names, so
# withColumnRenamed() could never match them and left the data unchanged.
# Replace the matching values inside the "country" column instead.
# NOTE(review): "North Korea" also appears in the exclusion list of the next
# cell, so rows renamed from "Korea" will be filtered out there - confirm that
# this mapping is what is intended.
merged_df = merged_df.replace(replacements, subset=["country"])

In [6]:
# Values of the "country" column to exclude from the merged data. Despite the
# variable name these identify rows (country entries), not DataFrame columns.
columns_to_drop = ['t. Lucia',
                   'ali',
                   'alau',
                   'Anguilla',
                   'Bonaire Sint Eustatius and Saba',
                   'Cook Islands',
                   'American Samoa',
                   'Arab World',
                   'European Union',
                   'Faeroe Islands',
                   'Falkland Islands',
                   'Faroe Islands',
                   'French Guiana',
                   'Guadeloupe',
                   'Guernsey',
                   'High income',
                   'Isle of Man',
                   'Jersey',
                   'Kyrgyzstan',
                   'Late-demographic dividend',
                   'Latin America & Caribbean',
                   'Low income',
                   'Macao',
                   'Martinique',
                   'Mayotte',
                   'Montserrat',
                   'Niue',
                   'North America',
                   'North Korea',
                   'Northern Cyprus',
                   'Northern Ireland',
                   'Oceania',
                   'Western Sahara',
                   'Wales',
                   'Wallis and Futuna',
                   'Vatican',
                   'Tokelau',
                   'Sub-Saharan Africa',
                   'Sub-Saharan Africa (Region) ',
                   'St. Lucia',
                   'St. Martin',
                   'Sint Maarten',
                   'Sint Maarten (Dutch part)',
                   'Scotland',
                   'Saint Martin',
                   'Saint Lucia',
                   'Saint Helena',
                   'Reunion',
                   'Post-demographic dividend',
                   'Pitcairn',
                   'Palestine',
                   'England',
                   'Saint Pierre and Miquelon',
                   'Saint Barthelemy',
                   'South America',
                   'St. Martin (French part)'
                   ]

# Keep only the rows whose country is not in the exclusion list. The column is
# resolved with col() against merged_df itself; df['country'] would bind to
# whatever DataFrame the join loop last left in `df`, not to the merged result.
merged_df = merged_df.filter(~col('country').isin(columns_to_drop))

In [7]:
# Replace the "no data" placeholder in the country column with the string "nan".
# col("country") resolves against merged_df; df["country"] would instead refer
# to the DataFrame last assigned to `df` by the join loop, so both the test and
# the fallback value would be taken from the wrong DataFrame.
merged_df = merged_df.withColumn(
    "country",
    when(col("country") == "no data", "nan").otherwise(col("country")),
)

# A missing covid_cases value is stored as 0.
merged_df = merged_df.withColumn(
    "covid_cases",
    when(col("covid_cases").isNull(), 0).otherwise(col("covid_cases")),
)

In [8]:
# Order the rows by country first and by year within each country (ascending)
merged_df = merged_df.sort("country", "year")

In [9]:
# Persist the merged DataFrame as Parquet, overwriting any previous export
output_path = "../FinalParquet"
merged_df.write.parquet(output_path, mode="overwrite")

# Read the export back and display it as a quick check, then shut Spark down
spark.read.parquet(output_path).show()
spark.stop()



CodeCache: size=2097152Kb used=30080Kb max_used=30080Kb free=2067071Kb
 bounds [0x0000000280000000, 0x0000000281d80000, 0x0000000300000000]
 total_blobs=10188 nmethods=9216 adapters=884
 compilation: disabled (not enough contiguous free space left)


                                                                                

+-----------+----+-----------+--------+---------+---------+----------+----------------+------------+
|    country|year|covid_cases|     gdp|inflation|migration|population|             tax|unemployment|
+-----------+----+-----------+--------+---------+---------+----------+----------------+------------+
|Afghanistan|2011|          0|   714.7|     11.8|   418796|  29249157|8.91679399595736|       7.918|
|Afghanistan|2012|          0| 784.611|      6.4|   105905|  30466479|7.70780060287773|       7.914|
|Afghanistan|2013|          0| 754.402|      7.4|    48076|  31541209|7.12277329020889|       7.914|
|Afghanistan|2014|          0| 746.922|      4.7|   255611|  32716210|6.88210294277301|        7.91|
|Afghanistan|2015|          0| 705.597|     -0.7|  -281739|  33753499|7.58538233129798|       9.002|
|Afghanistan|2016|          0| 617.126|      4.4|   -90238|  34636207|9.50265278288094|      10.092|
|Afghanistan|2017|          0| 635.789|      5.0|   -47090|  35643418|9.89845089688421|    

In [10]:
# Stop the timer and print the elapsed wall-clock time, in seconds, since
# `inicio` was recorded at the top of the notebook.
fim = time.time()
final = fim - inicio
print(final)

7.556262016296387
