<center>

# $\textbf{Gross Domestic Product}$

<center>

### $\textbf{Code}$

In [25]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import array, col, explode, lit, struct, trim

In [26]:
# Build (or reuse) a Spark session named 'GDP' on a local master with Hive support.
spark = (
    SparkSession.builder
    .appName('GDP')
    .master("local")
    .enableHiveSupport()
    .getOrCreate()
)
# Bare expression so the notebook renders the session summary.
spark

In [27]:
# Read the GDP CSV into a DataFrame: comma-separated, first row is the header,
# and column types are inferred from the data.
df = (
    spark.read
    .format("csv")
    .options(sep=",", inferschema="true", header="true")
    .load("Files/GDP.csv")
)

In [28]:
# Unpivot the per-year columns "2000".."2024" into long format:
# one (country, year, gdp) row per country and year.
# Requires `df` to have a `country` column plus one column per year in that range.
year_gdp_pairs = [
    struct(lit(yr).alias("year"), col(str(yr)).alias("gdp"))
    for yr in range(2000, 2025)
]
# explode() emits one row per struct in the array; the struct fields are then
# flattened back into top-level columns.
unpivoted = df.select("country", explode(array(year_gdp_pairs)).alias("data"))
df = unpivoted.select("country", "data.year", "data.gdp")

In [29]:
# Pin each column to its target Spark SQL type (country: string, year: int, gdp: double).
for column_name, target_type in (("country", "string"), ("year", "int"), ("gdp", "double")):
    df = df.withColumn(column_name, col(column_name).cast(target_type))

In [30]:
# Keep only rows with 2010 < year < 2024, i.e. the years 2011 through 2023.
df = df.filter((col("year") > 2010) & (col("year") < 2024))

In [31]:
# Sort ascending by country, then by year within each country.
df = df.sort(col("country").asc(), col("year").asc())

In [32]:
# Persist the DataFrame as Parquet (replacing any previous export), preview the
# stored data by reading it back, then shut the Spark session down.
parquet_path = "FilesParquet/GDP.parquet"
df.write.parquet(parquet_path, mode="overwrite")
spark.read.parquet(parquet_path).show()
spark.stop()

+------------------+----+---------+
|           country|year|      gdp|
+------------------+----+---------+
|           ASEAN-5|2011| 4742.031|
|           ASEAN-5|2012| 4934.317|
|           ASEAN-5|2013| 5009.992|
|           ASEAN-5|2014| 4941.123|
|           ASEAN-5|2015| 4715.428|
|           ASEAN-5|2016| 4889.034|
|           ASEAN-5|2017| 5213.511|
|           ASEAN-5|2018| 5513.448|
|           ASEAN-5|2019| 5771.691|
|           ASEAN-5|2020| 5361.577|
|           ASEAN-5|2021| 5882.641|
|           ASEAN-5|2022| 6249.703|
|           ASEAN-5|2023| 6597.204|
|Advanced economies|2011|45011.518|
|Advanced economies|2012|44632.763|
|Advanced economies|2013|44832.244|
|Advanced economies|2014|45647.709|
|Advanced economies|2015|42967.254|
|Advanced economies|2016|43894.075|
|Advanced economies|2017| 45602.87|
+------------------+----+---------+
only showing top 20 rows



### $\textbf{Pre-Processing}$

In [None]:
# Source spelling (after trimming surrounding spaces) -> spelling this notebook
# standardises on. The mapping is applied to the *values* of the `country` column:
# the frame's columns are country/year/gdp, so renaming columns would be a no-op.
COUNTRY_NAME_FIXES = {
    "China, People's Republic of": "China",
    "Gambia, The": "Gambia",
    "Micronesia, Fed. States of": "Micronesia",
    "South Sudan, Republic of": "South Sudan",
    "Taiwan Province of China": "Taiwan",
    "Türkiye, Republic of": "Turkiye",
    "Turkey": "Turkiye",
    "Bahamas, The": "Bahamas",
    "Syria": "Syrian Arab Republic",
    "Czech Republic": "Czechia",
    "Korea, Republic of": "South Korea",
    "Côte d'Ivoire": "Cote d'Ivoire",
    "Hong Kong SAR": "Hong Kong",
    "Lao P.D.R.": "Lao PDR",
    "Laos": "Lao PDR",
    "Congo, Republic of": "Republic of the Congo",
    "Congo, Dem. Rep. of the": "Democratic Republic of the Congo",
    "Pacific Islands": "Pacific island small states",
    "Sint Maarten (Dutch part)": "Sint Maarten",
    "Brunei": "Brunei Darussalam",
    "Cape Verde": "Cabo Verde",
    "United States Virgin Islands": "Virgin Islands (U.S.)",
    "Timor": "Timor-Leste",
    "Russia": "Russian Federation",
}
# NOTE(review): intentionally NOT mapped -- confirm against the source file before adding:
#   "Slovakia" -> "Slovenia"   these are two different countries; mapping one onto
#                              the other would silently merge their GDP rows.
#   "Congo" -> "Democratic Republic of the Congo"
#                              ambiguous; a bare "Congo" may equally mean the
#                              Republic of the Congo.

# The session is stopped right after the Parquet export, so the earlier `df` can no
# longer be evaluated. getOrCreate() reuses a live session if there is one and
# otherwise starts a new one; the exported data is then reloaded from disk.
spark = SparkSession.builder.appName('GDP').master("local").enableHiveSupport().getOrCreate()
df = spark.read.parquet("FilesParquet/GDP.parquet")

# Trim first so spellings that differ only by stray leading/trailing spaces
# (e.g. "North Macedonia ") match, then swap in the standard names.
# replace() matches whole values only and leaves unlisted countries unchanged.
df = (
    df.withColumn("country", trim(col("country")))
      .replace(COUNTRY_NAME_FIXES, subset=["country"])
)