# `This notebook will contain all Spark code for our project`

## Imports

In [None]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col, year, month, to_date, min, max, avg, first, lag, round
from pyspark.sql.window import Window
import happybase

## Prepare data

`COMTRADE`

`NARODOWY BANK POLSKI`

In [None]:
# Session for exploring the currency-rate parquet file.
spark = SparkSession.builder.appName("ReadCurrencyParquet").getOrCreate()

# Load the rates, parse `date` with the yyyy-MM-dd pattern and derive the
# calendar `year` / `month` columns from the parsed value.
df = (
    spark.read.parquet("hdfs:///user/vagrant/data/currency_all.parquet")
    .withColumn("date", to_date(col("date"), "yyyy-MM-dd"))
    .withColumn("year", year(col("date")))
    .withColumn("month", month(col("date")))
)

# The same min / max / mean summary of `rate` is computed at both granularities.
# NOTE: `min` and `max` are the pyspark.sql.functions aggregates imported at the
# top of the notebook, not the Python builtins.
rate_summary = [
    min("rate").alias("min_rate"),
    max("rate").alias("max_rate"),
    avg("rate").alias("avg_rate"),
]

yearly_keys = ["currency", "year"]
monthly_keys = ["currency", "year", "month"]

# Per-currency statistics, sorted by their grouping keys.
yearly_agg = df.groupBy(*yearly_keys).agg(*rate_summary).orderBy(*yearly_keys)
monthly_agg = df.groupBy(*monthly_keys).agg(*rate_summary).orderBy(*monthly_keys)

# Preview the monthly result and its schema.
monthly_agg.show(10)
monthly_agg.printSchema()

spark.stop()

In [None]:
# Session used to build and persist the country -> currency dimension table.
spark = SparkSession.builder.appName("DimCurrencyCorrect").getOrCreate()

# (country_iso3, currency) pairs, listed in the order the rows are created.
country_currency_pairs = [
    ("USA", "USD"),
    ("GBR", "GBP"),
    ("NOR", "NOK"),
    ("SWE", "SEK"),
    ("DNK", "DKK"),
    ("DEU", "EUR"),
    ("FRA", "EUR"),
    ("ESP", "EUR"),
    ("ITA", "EUR"),
    ("CZE", "CZK"),
    ("HUN", "HUF"),
    ("NZL", "NZD"),
    ("AUS", "AUD"),
    ("CAN", "CAD"),
    ("JPN", "JPY"),
    ("CHN", "CNY"),
    ("THA", "THB"),
    ("SGP", "SGD"),
    ("ZAF", "ZAR"),
    ("BRA", "BRL"),
    ("IND", "INR"),
    ("PHL", "PHP"),
    ("MYS", "MYR"),
    ("IDN", "IDR"),
    ("KOR", "KRW"),
    ("ROU", "RON"),
    ("IRL", "EUR"),
    ("BEL", "EUR"),
    ("NLD", "EUR"),
    ("EST", "EUR"),
    ("AUT", "EUR"),
    ("SVN", "EUR"),
    ("SVK", "EUR"),
    ("LVA", "EUR"),
    ("LTU", "EUR"),
    ("CYP", "EUR"),
    ("BGR", "BGN"),
    # Codes that carry no currency (stored as null).
    ("X1", None),
    ("_X", None),
    ("W00", None),
    ("XX", None),
    ("E19", None),
    ("S19", None),
]

data = [
    Row(country_iso3=iso3, currency=currency_code)
    for iso3, currency_code in country_currency_pairs
]

# Materialise the mapping, print it in full, and overwrite the parquet copy on HDFS.
df_dim_currency = spark.createDataFrame(data)
df_dim_currency.show(truncate=False)
df_dim_currency.write.mode("overwrite").parquet(
    "hdfs:///user/vagrant/dim/dim_currency_country.parquet"
)

spark.stop()


`WORLD DEVELOPMENT INDICATORS`

In [4]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col, first, lag, round
from pyspark.sql.window import Window

# Prepare the World Development Indicators data: load the long-format records,
# pivot them to one row per (country, year), rename the indicator columns and
# derive GDP per capita.
spark = (
    SparkSession.builder
    .appName("PrepareWDIData")
    .master("local[*]")
    .getOrCreate()
)

input_path = "hdfs://localhost:8020/user/vagrant/project/WDI/*.parquet"

# Load data and cast the columns to the types used downstream.
df = (
    spark.read.parquet(input_path)
    .withColumn("value", col("value").cast("double"))
    .withColumn("year", col("year").cast("integer"))
)

# Indicator name as it appears in `indicator_name` -> short column name.
rename_map = {
    "External debt stocks, total (DOD, current US$)": "external_debt",
    "GDP (current US$)": "gdp",
    "Imports of goods and services (current US$)": "import",
    "Industry (including construction), value added (% of GDP)": "industry_in_gdp",
    "Inflation, consumer prices (annual %)": "inflation",
    "Population, total": "population",
    "Services, value added (% of GDP)": "services_in_gdp",
    "Trade (% of GDP)": "trade_in_gdp"
}

# Pivot to one column per indicator. The pivot values are passed explicitly so
# Spark does not have to run an extra job to discover the distinct indicator
# names, and so the resulting schema does not depend on the data.
pivoted_df = (
    df.groupBy("country_id", "country_name", "year")
    .pivot("indicator_name", list(rename_map))
    .agg(first("value"))
)

# Rename the pivoted columns to their short names.
for old_name, new_name in rename_map.items():
    pivoted_df = pivoted_df.withColumnRenamed(old_name, new_name)

# Calculate new indicators.
# `round` is the pyspark.sql.functions column function, not the Python builtin.
# No `.drop("indicator_name")` is needed: the pivot already consumes that column.
# The result is cached because two actions below (count and show) reuse it;
# without caching each action would redo the read, pivot and sort.
# TODO: add year-over-year `gdp_growth` (percentage change of `gdp` against the
# previous year's value via `lag` over a window partitioned by `country_id` and
# ordered by `year`).
final_df = (
    pivoted_df
    .withColumn("gdp_per_capita", round(col("gdp") / col("population"), 2))
    .orderBy("country_name", "year")
    .cache()
)

final_df.printSchema()
print("DataFrame count:", final_df.count())

# Spot-check the result on a single country.
poland_df = final_df.filter(col("country_name") == "Poland")
poland_df.show(15)

                                                                                

root
 |-- country_id: string (nullable = true)
 |-- country_name: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- external_debt: double (nullable = true)
 |-- gdp: double (nullable = true)
 |-- import: double (nullable = true)
 |-- industry_in_gdp: double (nullable = true)
 |-- inflation: double (nullable = true)
 |-- population: double (nullable = true)
 |-- services_in_gdp: double (nullable = true)
 |-- trade_in_gdp: double (nullable = true)
 |-- gdp_per_capita: double (nullable = true)



                                                                                

DataFrame count: 5587




+----------+------------+----+-------------+-------------------+-------------------+----------------+------------------+-----------+----------------+----------------+--------------+
|country_id|country_name|year|external_debt|                gdp|             import| industry_in_gdp|         inflation| population| services_in_gdp|    trade_in_gdp|gdp_per_capita|
+----------+------------+----+-------------+-------------------+-------------------+----------------+------------------+-----------+----------------+----------------+--------------+
|        PL|      Poland|2003|         null| 2.1856122599847E11|7.86269228544063E10|27.5111441045363| 0.682701375787681| 3.820457E7|57.9482847667238|69.2134479841788|       5720.81|
|        PL|      Poland|2004|         null|2.56268656145134E11|9.46128680718343E10| 29.185278751871|  3.38264681884691|3.8182222E7|55.9833742114646|70.9815765694162|       6711.73|
|        PL|      Poland|2005|         null|3.06999913150525E11|1.09863040541397E11| 29.05

                                                                                

## Views