<center>

# $\textbf{Inflation}$

<center>

### $\textbf{Code}$

In [19]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
from pyspark.sql.functions import explode, col, lit, array, struct

In [20]:
# Create (or reuse) the Spark session for this notebook, running on a local master.
# NOTE(review): the spark-excel package is requested here, but the visible cells
# load the workbook with pandas instead -- confirm whether it is still needed.
spark = (
    SparkSession.builder
    .appName('Inflation')
    .master("local")
    .config("spark.jars.packages", "com.crealytics:spark-excel_2.11:0.12.2")
    .getOrCreate()
)
spark

In [21]:
# Load the workbook with pandas, mapping the "no data" placeholder to NaN at
# parse time so the value columns are inferred as numeric straight away.
pandas_df = pd.read_excel("Files/Inflation.xlsx", na_values=["no data"])

# Every column after the first is declared as DoubleType in the Spark schema
# below, so cast those columns to float64 explicitly instead of relying on
# pandas' implicit dtype inference (integer-valued cells would otherwise be
# able to reach a DoubleType field as Python ints).
pandas_df = pandas_df.astype(
    {label: "float64" for label in pandas_df.columns[1:]}
)

# Header labels are stringified because they become the Spark field names
# (later cells look the year columns up by their string form, e.g. "2000").
column_names = [str(label) for label in pandas_df.columns]

# Schema: first column is a nullable string, all remaining columns are
# nullable doubles.
schema_fields = [StructField(column_names[0], StringType(), True)] + [
    StructField(name, DoubleType(), True) for name in column_names[1:]
]
schema = StructType(schema_fields)

# Convert the pandas DataFrame to a Spark DataFrame with the explicit schema.
df = spark.createDataFrame(pandas_df, schema=schema)

In [22]:
# Give the indicator-title column a short name; the cells that follow refer
# to it as "country".
df = df.withColumnRenamed(
    "Inflation rate, average consumer prices (Annual percent change)",
    "country",
)

In [23]:
# Reshape from wide (one column per year, 2000 through 2024) to long (one row
# per country/year): wrap each year column in a (year, inflation) struct,
# explode the array of structs into rows, then flatten the struct fields.
df = (
    df.select(
        "country",
        explode(
            array([
                struct(lit(yr).alias("year"), col(str(yr)).alias("inflation"))
                for yr in range(2000, 2025)
            ])
        ).alias("data"),
    )
    .selectExpr("country", "data.year", "data.inflation")
)

In [24]:
# Pin the three output columns to explicit types: string, int and double.
df = (
    df
    .withColumn("country", col("country").cast("string"))
    .withColumn("year", col("year").cast("int"))
    .withColumn("inflation", col("inflation").cast("double"))
)

In [25]:
# Keep only rows with 2010 < year < 2024 (both bounds exclusive).
df = df.filter((col("year") > 2010) & (col("year") < 2024))

In [26]:
# Sort ascending by country first, then by year within each country.
df = df.sort(col("country").asc(), col("year").asc())

In [27]:
# Write the DataFrame as Parquet (overwriting any existing output at that
# path), read it back to preview what was written, then stop the session.
df.write.parquet("FilesParquet/Inflation.parquet", mode="overwrite")
spark.read.parquet("FilesParquet/Inflation.parquet").show()
spark.stop()

+------------------+----+---------+
|           country|year|inflation|
+------------------+----+---------+
|           ASEAN-5|2011|      4.7|
|           ASEAN-5|2012|      3.4|
|           ASEAN-5|2013|      4.2|
|           ASEAN-5|2014|      4.3|
|           ASEAN-5|2015|      3.1|
|           ASEAN-5|2016|      2.0|
|           ASEAN-5|2017|      2.8|
|           ASEAN-5|2018|      2.6|
|           ASEAN-5|2019|      1.9|
|           ASEAN-5|2020|      1.0|
|           ASEAN-5|2021|      2.0|
|           ASEAN-5|2022|      4.8|
|           ASEAN-5|2023|      3.6|
|Advanced economies|2011|      2.7|
|Advanced economies|2012|      2.0|
|Advanced economies|2013|      1.4|
|Advanced economies|2014|      1.4|
|Advanced economies|2015|      0.3|
|Advanced economies|2016|      0.7|
|Advanced economies|2017|      1.7|
+------------------+----+---------+
only showing top 20 rows

