<center>

# $\textbf{Inflation}$

</center>

<center>

### $\textbf{Code}$

</center>

In [1]:
import pandas as pd
import time
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
from pyspark.sql.functions import explode, col, lit, array, struct

In [2]:
# Record the wall-clock start time ("inicio" = start); the last cell subtracts it
# from the end time to report the notebook's total runtime in seconds.
inicio = time.time()

In [3]:
# Build (or reuse) a Spark session running on a local master for this notebook.
# NOTE(review): the spark-excel package is requested here, but the visible cells
# load the workbook through pandas — confirm whether the jar is still needed.
spark = (
    SparkSession.builder
    .appName('Inflation')
    .master("local")
    .config("spark.jars.packages", "com.crealytics:spark-excel_2.11:0.12.2")
    .getOrCreate()
)
spark

24/03/14 16:33:02 WARN Utils: Your hostname, Francisco-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.191.2.158 instead (on interface en0)
24/03/14 16:33:02 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /Users/franciscoclaudino/.ivy2/cache
The jars for the packages stored in: /Users/franciscoclaudino/.ivy2/jars
com.crealytics#spark-excel_2.11 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-55f109fe-88db-4755-b73b-08f95d5c87ab;1.0
	confs: [default]


:: loading settings :: url = jar:file:/Library/anaconda3/envs/DataScience/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found com.crealytics#spark-excel_2.11;0.12.2 in central
	found org.apache.poi#poi;4.1.0 in central
	found commons-codec#commons-codec;1.12 in central
	found org.apache.commons#commons-collections4;4.3 in central
	found org.apache.commons#commons-math3;3.6.1 in central
	found org.apache.poi#poi-ooxml;4.1.0 in central
	found org.apache.poi#poi-ooxml-schemas;4.1.0 in central
	found org.apache.xmlbeans#xmlbeans;3.1.0 in central
	found org.apache.commons#commons-compress;1.18 in central
	found com.github.virtuald#curvesapi;1.06 in central
	found com.norbitltd#spoiwo_2.11;1.6.0 in central
	found org.scala-lang.modules#scala-xml_2.11;1.2.0 in central
	found joda-time#joda-time;2.9.9 in central
	found org.joda#joda-convert;2.0.1 in central
	found com.fasterxml.jackson.core#jackson-core;2.8.8 in central
	found com.monitorjbl#xlsx-streamer;2.1.0 in central
	found com.rackspace.apache#xerces2-xsd11;2.11.1 in central
	found com.rackspace.eclipse.webtools.sourceediting#org.eclipse.wst.xml.xpath2.p

In [4]:
# Load the workbook with pandas and turn the "no data" placeholder into NaN.
pandas_df = pd.read_excel("../Files/Inflation.xlsx").replace("no data", float("nan"))

# The Spark schema needs string field names, so stringify the header labels.
column_names = list(map(str, pandas_df.columns))

# Schema: the first column is a nullable string, every other column a nullable double.
label_field = StructField(column_names[0], StringType(), True)
value_fields = [StructField(name, DoubleType(), True) for name in column_names[1:]]
schema_fields = [label_field, *value_fields]
schema = StructType(schema_fields)

# Hand the pandas frame to Spark using the explicit schema built above.
df = spark.createDataFrame(pandas_df, schema=schema)

In [5]:
# Give the column carrying the long indicator title the short name "country"
df = df.withColumnRenamed("Inflation rate, average consumer prices (Annual percent change)","country")

In [6]:
# Unpivot the wide per-year columns into long (country, year, inflation) rows:
# build one (year, inflation) struct per year column 2000..2024, pack the
# structs into an array, explode it, then flatten the struct fields.
year_entries = [
    struct(lit(yr).alias("year"), col(str(yr)).alias("inflation"))
    for yr in range(2000, 2025)
]
df = (
    df.select("country", explode(array(year_entries)).alias("data"))
      .selectExpr("country", "data.year", "data.inflation")
)

In [7]:
# Enforce the final type of each column (applied in this order).
target_types = {
    "country": "string",
    "year": "int",
    "inflation": "double",
}
for column_name, spark_type in target_types.items():
    df = df.withColumn(column_name, col(column_name).cast(spark_type))

In [8]:
# Keep only rows whose year is strictly after 2010 and strictly before 2024.
df = df.filter((col("year") > 2010) & (col("year") < 2024))

In [9]:
# Sort ascending by country, then by year within each country.
df = df.sort("country", "year")

### $\textbf{Pre-Processing}$

In [10]:
# Map country labels as spelled in the source data to their normalised names.
# Keys are matched exactly; several of them carry a trailing space.
replacements = {
    "China, People's Republic of": 'China',
    'Gambia, The': 'Gambia',
    'Micronesia, Fed. States of': 'Micronesia',
    'South Sudan, Republic of': 'South Sudan',
    'Taiwan Province of China': 'Taiwan',
    'Türkiye, Republic of': 'Turkiye',
    'Bahamas, The': 'Bahamas',
    'Czech Republic': 'Czechia',
    'Syria': 'Syrian Arab Republic',
    "Côte d'Ivoire": "Cote d'Ivoire",
    'Hong Kong SAR': 'Hong Kong',
    'Lao P.D.R.': 'Lao PDR',
    'Korea, Republic of': 'South Korea',
    'Congo, Dem. Rep. of the': 'Democratic Republic of the Congo',
    'Congo, Republic of ': 'Republic of the Congo',
    'Pacific Islands ': 'Pacific island small states',
    'North Macedonia ': 'North Macedonia'
}

# Rewrite matching *values* in the "country" column. At this point the frame
# only has the columns country/year/inflation, so renaming columns by these keys
# (withColumnRenamed) never matched anything and silently left the data as-is.
# NOTE(review): 'North Macedonia ' is now normalised to 'North Macedonia', which
# also appears in countries_to_remove in the next cell — confirm that dropping
# it there is intended.
df = df.replace(replacements, subset=["country"])

In [11]:
# Labels in the "country" column whose rows are discarded: mostly regional and
# aggregate groupings, plus the '©IMF, 2023' footer text and the literal
# string 'nan'. Matching is exact, including any trailing space.
countries_to_remove = [
    'ASEAN-5',
    'Advanced economies',
    'Africa (Region)',
    'Asia and Pacific',
    'Australia and New Zealand',
    'Central America',
    'Central Asia and the Caucasus',
    '©IMF, 2023',
    'East Asia',
    'Eastern Europe ',
    'Emerging and Developing Asia',
    'Emerging and Developing Europe',
    'Emerging market and developing economies',
    'Euro area',
    'Europe',
    'European Union',
    'Latin America and the Caribbean',
    'Macao SAR',
    'Major advanced economies (G7)',
    'Middle East (Region)',
    'Middle East and Central Asia',
    'North Africa',
    'North America',
    'North Macedonia',
    'Other advanced economies',
    'South America',
    'South Asia',
    'Southeast Asia',
    'Sub-Saharan Africa',
    'Sub-Saharan Africa (Region)',
    'Western Europe',
    'Western Hemisphere (Region)',
    'World',
    'nan',
]

# Keep only the rows whose country is not one of the labels above.
is_excluded = col("country").isin(countries_to_remove)
df = df.where(~is_excluded)

In [12]:
# Discard every row that has no value in the 'country' column.
df = df.na.drop(subset=['country'])

In [13]:
# Persist the result as Parquet (overwriting any earlier output), read it back
# to preview the stored rows, then shut the Spark session down.
parquet_path = "../FilesParquet/Inflation.parquet"
df.write.mode("overwrite").parquet(parquet_path)
spark.read.parquet(parquet_path).show()
spark.stop()

24/03/14 16:33:06 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+-----------+----+---------+
|    country|year|inflation|
+-----------+----+---------+
|Afghanistan|2011|     11.8|
|Afghanistan|2012|      6.4|
|Afghanistan|2013|      7.4|
|Afghanistan|2014|      4.7|
|Afghanistan|2015|     -0.7|
|Afghanistan|2016|      4.4|
|Afghanistan|2017|      5.0|
|Afghanistan|2018|      0.6|
|Afghanistan|2019|      2.3|
|Afghanistan|2020|      5.6|
|Afghanistan|2021|      5.1|
|Afghanistan|2022|     13.7|
|Afghanistan|2023|      NaN|
|    Albania|2011|      3.4|
|    Albania|2012|      2.0|
|    Albania|2013|      1.9|
|    Albania|2014|      1.6|
|    Albania|2015|      1.9|
|    Albania|2016|      1.3|
|    Albania|2017|      2.0|
+-----------+----+---------+
only showing top 20 rows



In [14]:
# Report the wall-clock seconds elapsed since `inicio` was recorded
# ("fim" = end, "final" = elapsed seconds).
fim = time.time()
final = fim - inicio
print(final)

10.069801092147827
