# `This notebook contains all spark code for our project`

## Imports

In [3]:
# instalation of pycoutry to easily map country codes from iso2 to iso3 format
# !pip install pycountry

Defaulting to user installation because normal site-packages is not writeable
Collecting pycountry
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Downloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m23.9 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[0mInstalling collected packages: pycountry
Successfully installed pycountry-24.6.1


In [38]:
import findspark
findspark.init()
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import first
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col, year, month, to_date, min, max, avg, first, lag, round, count, when, trim, isnan, lit, substring, col, ceil, concat, sum as spark_sum, udf, length
from pyspark.sql.window import Window
from pyspark.sql.types import DecimalType, StringType, DoubleType, DecimalType, IntegerType
import happybase
import pycountry

## Data Preparation

### `COMTRADE`

`connecting`

In [20]:

spark = SparkSession.builder.appName("data_prep").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
print("Spark session ready:", spark)

parquet_path = "/user/vagrant/project/comtrade/comtrade.parquet"

df = spark.read.parquet(parquet_path)

print("Rows count:", len(df.columns))
print("Sample rows:", df.columns[:10])
print("Schema:")
df.printSchema()
print("Sample rows:")
df.show(5, truncate=False)

Spark session ready: <pyspark.sql.session.SparkSession object at 0x7f9c2d5ca430>
Rows count: 9
Sample rows: ['quantity_code', 'quantity', 'primary_value_usd', 'quantity_desc', 'data_period', 'weight', 'partner_code', 'commodity_desc', 'hs_code']
Schema:
root
 |-- quantity_code: string (nullable = true)
 |-- quantity: string (nullable = true)
 |-- primary_value_usd: string (nullable = true)
 |-- quantity_desc: string (nullable = true)
 |-- data_period: string (nullable = true)
 |-- weight: string (nullable = true)
 |-- partner_code: string (nullable = true)
 |-- commodity_desc: string (nullable = true)
 |-- hs_code: string (nullable = true)

Sample rows:
+-------------+--------+-----------------+-------------+-----------+------+------------+--------------+-------+
|quantity_code|quantity|primary_value_usd|quantity_desc|data_period|weight|partner_code|commodity_desc|hs_code|
+-------------+--------+-----------------+-------------+-----------+------+------------+--------------+-------+
|n

In [24]:
def missing_condition(column_name, data_type):
    c = col(column_name)

    if isinstance(data_type, StringType):
        return c.isNull() | (trim(c) == "")
    
    elif isinstance(data_type, (DoubleType, DecimalType)):
        return c.isNull() | isnan(c)
    
    else:
        return c.isNull()
    
cols_to_cast = ["quantity", "weight", "primary_value_usd"]
for c_name in cols_to_cast:
    df = df.withColumn(c_name, col(c_name).cast("double"))


# -----------------------------------------------------------------------------------------------------------------
print("\nCHECK FOR MISSING VALUES\n")


total_rows = df.count()
print("Total rows:", total_rows)

missing_report = df.select([
    count(
        when(
            missing_condition(c, df.schema[c].dataType),
            c
        )
    ).alias(c)
    for c in df.columns
])

missing_report.show(truncate=False)


#-----------------------------------------------------------------------------------------------------------------
print("\nFILLING MISSING VALUES\n")


df = df.fillna({
    "quantity": 0,
    "weight": 0
})

df.select("quantity", "weight").summary().show()


#-----------------------------------------------------------------------------------------------------------------
print("\nCLEAN partnerISO COLUMN\n")

def get_iso3_code(code):
    try:
        if not code: return "UNIDENTIFIED"
        code_int = int(code)
        country = pycountry.countries.get(numeric=str(code_int).zfill(3))
        return country.alpha_3 if country else "UNIDENTIFIED"
    except:
        return "UNIDENTIFIED"


iso_udf = udf(get_iso3_code, StringType())

df = df.withColumn("partnerISO", iso_udf(col("partner_code")))

df = df.withColumn(
    "partnerISO",
    when(col("partnerISO").isin("X1", "_X", "E19", "F19"), "UNIDENTIFIED")
    .otherwise(col("partnerISO"))
)

df.select("partner_code", "partnerISO").distinct().show(20, truncate=False)


#-----------------------------------------------------------------------------------------------------------------
print("\nVALUE PER UNIT AND VALUE PER KG FEATURES\n")

df = df.withColumn(
    "unit_value_usd",
    when(col("quantity") > 0, col("primary_value_usd") / col("quantity"))
    .otherwise(lit(0))
)

df = df.withColumn(
    "usd_per_kg",
    when(col("weight") > 0, col("primary_value_usd") / col("weight"))
    .otherwise(lit(0))
)

df.select(
    "primary_value_usd", "quantity", "weight",
    "unit_value_usd", "usd_per_kg"
).show(5, truncate=False)


#-----------------------------------------------------------------------------------------------------------------
print("\nDATE FEATURES EXTRACTION\n")

df = df.withColumn("year", substring("data_period", 1, 4).cast("int"))

df = df.withColumn(
    "month", 
    when(length(col("data_period")) == 6, substring("data_period", 5, 2).cast("int"))
    .otherwise(lit(1))
)

df = df.withColumn("quarter", ceil(col("month") / 3))
df = df.withColumn("quarter_label", concat(lit("Q"), col("quarter")))

df.select("data_period", "year", "month", "quarter", "quarter_label") \
  .distinct().show(10, truncate=False)


#-----------------------------------------------------------------------------------------------------------------
print("\nWINDOW FUNCTIONS FOR MONTHLY AGGREGATIONS\n")

w = Window.partitionBy("data_period", "hs_code")

df = df.withColumn(
    "month_world_export_value",
    spark_sum("primary_value_usd").over(w)
)

df = df.withColumn(
    "share_of_month_market",
    when(
        col("month_world_export_value") > 0,
        col("primary_value_usd") / col("month_world_export_value")
    ).otherwise(lit(0))
)

df.select(
    "hs_code",
    "partner_code",
    "primary_value_usd",
    "month_world_export_value",
    "share_of_month_market"
).show(10, truncate=False)


#-----------------------------------------------------------------------------------------------------------------
print("\nOUTPUT OVERVIEW\n")


comtrade_monthly = df
print("Final row count:", comtrade_monthly.count())
print("Final schema:")
comtrade_monthly.printSchema()


CHECK FOR MISSING VALUES

Total rows: 9996


                                                                                

+-------------+--------+-----------------+-------------+-----------+------+------------+--------------+-------+----------+--------------+----------+----+-----+-------+-------------+------------------------+---------------------+
|quantity_code|quantity|primary_value_usd|quantity_desc|data_period|weight|partner_code|commodity_desc|hs_code|partnerISO|unit_value_usd|usd_per_kg|year|month|quarter|quarter_label|month_world_export_value|share_of_month_market|
+-------------+--------+-----------------+-------------+-----------+------+------------+--------------+-------+----------+--------------+----------+----+-----+-------+-------------+------------------------+---------------------+
|9996         |0       |0                |9996         |0          |0     |0           |9996          |0      |0         |0             |0         |0   |0    |0      |0            |0                       |0                    |
+-------------+--------+-----------------+-------------+-----------+------+---------

In [26]:
print("\nYEARLY AGGREGATION\n")

print("Input df row count:", comtrade_monthly.count())
print("Input df columns:", comtrade_monthly.columns)

print("Years available:")
comtrade_monthly.select("year").distinct().orderBy("year").show()


#-----------------------------------------------------------------------------------------------------------------
print("\nGROUP BY YEAR / PARTNER / COMMODITY\n")

df_yearly = (
    comtrade_monthly.groupBy("year", "partnerISO", "commodity_desc", "hs_code")
      .agg(
          spark_sum("primary_value_usd").alias("annual_value_usd"),
          spark_sum("quantity").alias("quantity"),
          spark_sum("weight").alias("weight")
      )
)

print("Yearly row count:", df_yearly.count())
print("Yearly schema:")
df_yearly.printSchema()

print("Sample yearly rows:")
df_yearly.show(10, truncate=False)


#-----------------------------------------------------------------------------------------------------------------
print("\nYEARLY UNIT VALUES\n")

df_yearly = df_yearly.withColumn(
    "unit_value_usd",
    when(col("quantity") > 0, col("annual_value_usd") / col("quantity"))
    .otherwise(lit(0))
)

df_yearly = df_yearly.withColumn(
    "usd_per_kg",
    when(col("weight") > 0, col("annual_value_usd") / col("weight"))
    .otherwise(lit(0))
)

df_yearly.select(
    "partnerISO", "commodity_desc",
    "annual_value_usd", "quantity", "weight",
    "unit_value_usd", "usd_per_kg"
).show(10, truncate=False)


#-----------------------------------------------------------------------------------------------------------------
print("\nCHECK W00 (WORLD)\n")

df_yearly.filter(col("partnerISO") == "W00") \
    .groupBy("year") \
    .agg(
        spark_sum("annual_value_usd").alias("world_export_check")
    ) \
    .orderBy("year") \
    .show(truncate=False)


#-----------------------------------------------------------------------------------------------------------------
print("\nWINDOW: WORLD EXPORT VALUE\n")

w = Window.partitionBy("year", "hs_code")

df_yearly = df_yearly.withColumn(
    "world_export_value",
    spark_sum("annual_value_usd").over(w)
)

df_yearly.select(
    "year",
    "hs_code",
    "partnerISO",
    "annual_value_usd",
    "world_export_value"
).show(10, truncate=False)


#-----------------------------------------------------------------------------------------------------------------
print("\nSHARE OF YEAR MARKET")

df_yearly = df_yearly.withColumn(
    "share_of_year_market",
    when(
        col("world_export_value") > 0,
        col("annual_value_usd") / col("world_export_value")
    ).otherwise(lit(0))
)

df_yearly.select(
    "partnerISO",
    "annual_value_usd",
    "world_export_value",
    "share_of_year_market"
).show(10, truncate=False)


#-----------------------------------------------------------------------------------------------------------------
print("\nOUTPUT OVERVIEW\n")

print("Final yearly row count:", df_yearly.count())
print("Final yearly schema:")
df_yearly.printSchema()


YEARLY AGGREGATION

Input df row count: 9996
Input df columns: ['quantity_code', 'quantity', 'primary_value_usd', 'quantity_desc', 'data_period', 'weight', 'partner_code', 'commodity_desc', 'hs_code', 'partnerISO', 'unit_value_usd', 'usd_per_kg', 'year', 'month', 'quarter', 'quarter_label', 'month_world_export_value', 'share_of_month_market']
Years available:


                                                                                

+----+
|year|
+----+
|2015|
|2016|
|2017|
|2018|
|2019|
|2020|
|2021|
|2022|
|2023|
|2024|
+----+


GROUP BY YEAR / PARTNER / COMMODITY





Yearly row count: 9272
Yearly schema:
root
 |-- year: integer (nullable = true)
 |-- partnerISO: string (nullable = true)
 |-- commodity_desc: string (nullable = true)
 |-- hs_code: string (nullable = true)
 |-- annual_value_usd: double (nullable = true)
 |-- quantity: double (nullable = true)
 |-- weight: double (nullable = true)

Sample yearly rows:


                                                                                

+----+----------+--------------+-------+----------------+-----------+-----------+
|year|partnerISO|commodity_desc|hs_code|annual_value_usd|quantity   |weight     |
+----+----------+--------------+-------+----------------+-----------+-----------+
|2015|UZB       |null          |847130 |71.0            |1.0        |25.0       |
|2015|GBR       |null          |220300 |1.5314237E7     |1.4409926E7|1.5044839E7|
|2015|KWT       |null          |080810 |23596.0         |36288.0    |36288.0    |
|2016|KWT       |null          |080810 |289401.0        |407942.0   |407942.0   |
|2016|RUS       |null          |330499 |9.9623932E7     |1.8922555E7|1.8922555E7|
|2015|ISR       |null          |330499 |390884.0        |187866.0   |187866.0   |
|2019|UGA       |null          |330499 |140.0           |1.262      |1.262      |
|2018|AZE       |null          |040690 |152713.0        |38295.0    |38295.0    |
|2016|BHR       |null          |080810 |13556.0         |35388.0    |35388.0    |
|2016|MKD       



+----+------------------+
|year|world_export_check|
+----+------------------+
+----+------------------+


WINDOW: WORLD EXPORT VALUE



                                                                                

+----+-------+------------+----------------+------------------+
|year|hs_code|partnerISO  |annual_value_usd|world_export_value|
+----+-------+------------+----------------+------------------+
|2015|020321 |UNIDENTIFIED|3754919.0       |7509838.0         |
|2015|020321 |LTU         |204484.0        |7509838.0         |
|2015|020321 |COD         |30147.0         |7509838.0         |
|2015|020321 |LVA         |1019553.0       |7509838.0         |
|2015|020321 |SVK         |143161.0        |7509838.0         |
|2015|020321 |MNG         |429044.0        |7509838.0         |
|2015|020321 |BEL         |12051.0         |7509838.0         |
|2015|020321 |LBR         |117280.0        |7509838.0         |
|2015|020321 |ROU         |22749.0         |7509838.0         |
|2015|020321 |SWE         |2794.0          |7509838.0         |
+----+-------+------------+----------------+------------------+
only showing top 10 rows


SHARE OF YEAR MARKET


                                                                                

+------------+----------------+------------------+---------------------+
|partnerISO  |annual_value_usd|world_export_value|share_of_year_market |
+------------+----------------+------------------+---------------------+
|UNIDENTIFIED|3754919.0       |7509838.0         |0.5                  |
|LTU         |204484.0        |7509838.0         |0.027228816387250963 |
|COD         |30147.0         |7509838.0         |0.0040143342639348545|
|LVA         |1019553.0       |7509838.0         |0.1357623160446337   |
|SVK         |143161.0        |7509838.0         |0.019063127593431442 |
|MNG         |429044.0        |7509838.0         |0.05713092612650233  |
|BEL         |12051.0         |7509838.0         |0.0016046950679894827|
|LBR         |117280.0        |7509838.0         |0.015616848192996973 |
|ROU         |22749.0         |7509838.0         |0.003029226462674694 |
|SWE         |2794.0          |7509838.0         |3.720453090998767E-4 |
+------------+----------------+------------------+-



Final yearly row count: 9272
Final yearly schema:
root
 |-- year: integer (nullable = true)
 |-- partnerISO: string (nullable = true)
 |-- commodity_desc: string (nullable = true)
 |-- hs_code: string (nullable = true)
 |-- annual_value_usd: double (nullable = true)
 |-- quantity: double (nullable = true)
 |-- weight: double (nullable = true)
 |-- unit_value_usd: double (nullable = true)
 |-- usd_per_kg: double (nullable = true)
 |-- world_export_value: double (nullable = true)
 |-- share_of_year_market: double (nullable = true)



                                                                                

## `NBP`

In [27]:
data = [
        # --- Bliski Wschód / Azja ---
    Row(country_iso3="PSE", currency="ILS"),
    Row(country_iso3="IRQ", currency="IQD"),
    Row(country_iso3="IRN", currency="IRR"),
    Row(country_iso3="ISR", currency="ILS"),
    Row(country_iso3="JOR", currency="JOD"),
    Row(country_iso3="KWT", currency="KWD"),
    Row(country_iso3="LBN", currency="LBP"),
    Row(country_iso3="OMN", currency="OMR"),
    Row(country_iso3="QAT", currency="QAR"),
    Row(country_iso3="SAU", currency="SAR"),
    Row(country_iso3="ARE", currency="AED"),
    Row(country_iso3="YEM", currency="YER"),
    Row(country_iso3="SYR", currency="SYP"),
    Row(country_iso3="AFG", currency="AFN"),
    
    Row(country_iso3="IND", currency="INR"),
    Row(country_iso3="CHN", currency="CNY"),
    Row(country_iso3="JPN", currency="JPY"),
    Row(country_iso3="KOR", currency="KRW"),
    Row(country_iso3="THA", currency="THB"),
    Row(country_iso3="VNM", currency="VND"),
    Row(country_iso3="MYS", currency="MYR"),
    Row(country_iso3="IDN", currency="IDR"),
    Row(country_iso3="PHL", currency="PHP"),
    Row(country_iso3="SGP", currency="SGD"),
    Row(country_iso3="LKA", currency="LKR"),
    Row(country_iso3="PAK", currency="PKR"),
    Row(country_iso3="BGD", currency="BDT"),
    Row(country_iso3="NPL", currency="NPR"),
    Row(country_iso3="MMR", currency="MMK"),
    Row(country_iso3="KHM", currency="KHR"),
    Row(country_iso3="LAO", currency="LAK"),
    Row(country_iso3="PRK", currency="KPW"),
    Row(country_iso3="MNG", currency="MNT"),
    Row(country_iso3="KAZ", currency="KZT"),
    Row(country_iso3="UZB", currency="UZS"),
    Row(country_iso3="TJK", currency="TJS"),
    Row(country_iso3="TKM", currency="TMT"),
    Row(country_iso3="KGZ", currency="KGS"),
    Row(country_iso3="AZE", currency="AZN"),
    Row(country_iso3="ARM", currency="AMD"),
    Row(country_iso3="GEO", currency="GEL"),
    Row(country_iso3="HKG", currency="HKD"),
    Row(country_iso3="MAC", currency="MOP"),
    
    # --- Europa ---
    Row(country_iso3="FIN", currency="EUR"),
    Row(country_iso3="FRA", currency="EUR"),
    Row(country_iso3="ITA", currency="EUR"),
    Row(country_iso3="ESP", currency="EUR"),
    Row(country_iso3="PRT", currency="EUR"),
    Row(country_iso3="DEU", currency="EUR"),
    Row(country_iso3="BEL", currency="EUR"),
    Row(country_iso3="NLD", currency="EUR"),
    Row(country_iso3="LUX", currency="EUR"),
    Row(country_iso3="AUT", currency="EUR"),
    Row(country_iso3="IRL", currency="EUR"),
    Row(country_iso3="EST", currency="EUR"),
    Row(country_iso3="LVA", currency="EUR"),
    Row(country_iso3="LTU", currency="EUR"),
    Row(country_iso3="SVK", currency="EUR"),
    Row(country_iso3="SVN", currency="EUR"),
    Row(country_iso3="MLT", currency="EUR"),
    Row(country_iso3="CYP", currency="EUR"),
    Row(country_iso3="GRC", currency="EUR"),
    Row(country_iso3="HRV", currency="EUR"),
    Row(country_iso3="AND", currency="EUR"),
    Row(country_iso3="MNE", currency="EUR"),
    Row(country_iso3="SMR", currency="EUR"),
    Row(country_iso3="VAT", currency="EUR"),
    
    Row(country_iso3="GBR", currency="GBP"),
    Row(country_iso3="NOR", currency="NOK"),
    Row(country_iso3="SWE", currency="SEK"),
    Row(country_iso3="DNK", currency="DKK"),
    Row(country_iso3="ISL", currency="ISK"),
    Row(country_iso3="CHE", currency="CHF"),
    Row(country_iso3="CZE", currency="CZK"),
    Row(country_iso3="HUN", currency="HUF"),
    Row(country_iso3="ROU", currency="RON"),
    Row(country_iso3="BGR", currency="BGN"),
    Row(country_iso3="SRB", currency="RSD"),
    Row(country_iso3="BIH", currency="BAM"),
    Row(country_iso3="MKD", currency="MKD"),
    Row(country_iso3="ALB", currency="ALL"),
    Row(country_iso3="BLR", currency="BYN"),
    Row(country_iso3="MDA", currency="MDL"),
    Row(country_iso3="UKR", currency="UAH"),
    Row(country_iso3="RUS", currency="RUB"),
    
    # --- Afryka ---
    Row(country_iso3="EGY", currency="EGP"),
    Row(country_iso3="MAR", currency="MAD"),
    Row(country_iso3="DZA", currency="DZD"),
    Row(country_iso3="TUN", currency="TND"),
    Row(country_iso3="ZAF", currency="ZAR"),
    Row(country_iso3="NGA", currency="NGN"),
    Row(country_iso3="GHA", currency="GHS"),
    Row(country_iso3="KEN", currency="KES"),
    Row(country_iso3="ETH", currency="ETB"),
    Row(country_iso3="UGA", currency="UGX"),
    Row(country_iso3="TZA", currency="TZS"),
    Row(country_iso3="RWA", currency="RWF"),
    Row(country_iso3="BDI", currency="BIF"),
    Row(country_iso3="SDN", currency="SDG"),
    Row(country_iso3="SSD", currency="SSP"),
    Row(country_iso3="SEN", currency="XOF"),
    Row(country_iso3="MLI", currency="XOF"),
    Row(country_iso3="NER", currency="XOF"),
    Row(country_iso3="BFA", currency="XOF"),
    Row(country_iso3="CIV", currency="XOF"),
    Row(country_iso3="GIN", currency="GNF"),
    Row(country_iso3="SLE", currency="SLL"),
    Row(country_iso3="GMB", currency="GMD"),
    Row(country_iso3="LBR", currency="LRD"),
    Row(country_iso3="COD", currency="CDF"),
    Row(country_iso3="COG", currency="XAF"),
    Row(country_iso3="CAF", currency="XAF"),
    Row(country_iso3="CMR", currency="XAF"),
    Row(country_iso3="GAB", currency="XAF"),
    Row(country_iso3="GNQ", currency="XAF"),
    Row(country_iso3="STP", currency="STN"),
    Row(country_iso3="CPV", currency="CVE"),
    Row(country_iso3="MUS", currency="MUR"),
    Row(country_iso3="SYC", currency="SCR"),
    Row(country_iso3="NAM", currency="NAD"),
    Row(country_iso3="BWA", currency="BWP"),
    Row(country_iso3="ZMB", currency="ZMW"),
    Row(country_iso3="ZWE", currency="ZWL"),
    Row(country_iso3="MWI", currency="MWK"),
    Row(country_iso3="MOZ", currency="MZN"),
    Row(country_iso3="DJI", currency="DJF"),
    Row(country_iso3="ERI", currency="ERN"),
    Row(country_iso3="SOM", currency="SOS"),
    
    # --- Ameryki ---
    Row(country_iso3="USA", currency="USD"),
    Row(country_iso3="CAN", currency="CAD"),
    Row(country_iso3="MEX", currency="MXN"),
    Row(country_iso3="ARG", currency="ARS"),
    Row(country_iso3="BRA", currency="BRL"),
    Row(country_iso3="CHL", currency="CLP"),
    Row(country_iso3="COL", currency="COP"),
    Row(country_iso3="PER", currency="PEN"),
    Row(country_iso3="URY", currency="UYU"),
    Row(country_iso3="PRY", currency="PYG"),
    Row(country_iso3="BOL", currency="BOB"),
    Row(country_iso3="ECU", currency="USD"),
    Row(country_iso3="VEN", currency="VES"),
    Row(country_iso3="CRI", currency="CRC"),
    Row(country_iso3="PAN", currency="PAB"),
    Row(country_iso3="DOM", currency="DOP"),
    Row(country_iso3="HTI", currency="HTG"),
    Row(country_iso3="JAM", currency="JMD"),
    Row(country_iso3="CUB", currency="CUP"),
    Row(country_iso3="BHS", currency="BSD"),
    Row(country_iso3="TTO", currency="TTD"),
    Row(country_iso3="BRB", currency="BBD"),
    Row(country_iso3="GRD", currency="XCD"),
    Row(country_iso3="DMA", currency="XCD"),
    Row(country_iso3="LCA", currency="XCD"),
    Row(country_iso3="VCT", currency="XCD"),
    Row(country_iso3="KNA", currency="XCD"),
    
    # --- Oceania ---
    Row(country_iso3="AUS", currency="AUD"),
    Row(country_iso3="NZL", currency="NZD"),
    Row(country_iso3="FJI", currency="FJD"),
    Row(country_iso3="PNG", currency="PGK"),
    Row(country_iso3="VUT", currency="VUV"),
    Row(country_iso3="WSM", currency="WST"),
    Row(country_iso3="TON", currency="TOP"),
    
    # --- Terytoria / techniczne / nieidentyfikowalne ---
    Row(country_iso3="GIB", currency="GIP"),
    Row(country_iso3="GRL", currency="DKK"),
    Row(country_iso3="CUW", currency="ANG"),
    Row(country_iso3="ABW", currency="AWG"),
    Row(country_iso3="CYM", currency="KYD"),
    Row(country_iso3="BES", currency="USD"),
    Row(country_iso3="VGB", currency="USD"),
    
    Row(country_iso3="UNIDENTIFIED", currency=None),
    Row(country_iso3="S19", currency=None),
    Row(country_iso3="W00", currency=None),
    Row(country_iso3="XX", currency=None),
    Row(country_iso3="SCG", currency=None),
    Row(country_iso3="ANT", currency=None),
    Row(country_iso3="ATF", currency=None),
    Row(country_iso3="ATA", currency=None),
    Row(country_iso3="BVT", currency=None),
    Row(country_iso3="UMI", currency=None),
    Row(country_iso3="IOT", currency=None),
    Row(country_iso3="WLF", currency=None),
    Row(country_iso3="ESH", currency=None)
 
]
df_dim_currency = spark.createDataFrame(data)

df_dim_currency \
    .filter(col("country_iso3").isin("GBR", "USA", "FRA", "NOR", "SWE", "HUN", "CHE")) \
    .show()
print('Stworzono dim currency')

+------------+--------+
|country_iso3|currency|
+------------+--------+
|         FRA|     EUR|
|         GBR|     GBP|
|         NOR|     NOK|
|         SWE|     SEK|
|         CHE|     CHF|
|         HUN|     HUF|
|         USA|     USD|
+------------+--------+

Stworzono dim currency


In [29]:
parquet_path = "/user/vagrant/project/NBP/currency_all.parquet"
currency = spark.read.parquet(parquet_path)

print("Rows count:", len(currency.columns))
print("Sample rows:", currency.columns[:10])
print("Schema:")
currency.printSchema()
print("Sample rows:")
currency.show(5, truncate=False)


# -----------------------------------------------------------------------------------------------------------------
print("\nDATE TRANSFORMATION\n")

currency = (
    currency
    .withColumn("date", to_date(col("date"), "yyyy-MM-dd"))
    .withColumn("year", year(col("date")))
    .withColumn("month", month(col("date")))
)

currency.select("date", "year", "month") \
    .orderBy("date") \
    .show(10, truncate=False)
currency.filter(col("date").isNull()).show(5, truncate=False)


# -----------------------------------------------------------------------------------------------------------------
print("\nMONTHLY AGGREGATION\n")

currency_data = (
    currency
    .groupBy("currency", "year", "month")
    .agg(
        min("rate").alias("min_rate"),
        max("rate").alias("max_rate"),
        avg("rate").alias("avg_rate")
    )
    .orderBy("currency", "year", "month")
)

print("Row count after aggregation:", currency_data.count())

print("Schema:")
currency_data.printSchema()

print("Sample aggregated rows:")
currency_data.show(20, truncate=False)


#-----------------------------------------------------------------------------------------------------------------
print("\nFINAL CHECKPOINT (CURRENCY)\n")

print("Currencies available:")
currency_data.select("currency").distinct().orderBy("currency").show(truncate=False)

print("Years range:")
currency_data.select("year").distinct().orderBy("year").show()

Rows count: 3
Sample rows: ['currency', 'date', 'rate']
Schema:
root
 |-- currency: string (nullable = true)
 |-- date: string (nullable = true)
 |-- rate: double (nullable = true)

Sample rows:
+--------+----------+------+
|currency|date      |rate  |
+--------+----------+------+
|THB     |2009-01-02|0.086 |
|THB     |2009-01-05|0.085 |
|THB     |2009-01-06|0.0839|
|THB     |2009-01-07|0.0827|
|THB     |2009-01-08|0.0846|
+--------+----------+------+
only showing top 5 rows


DATE TRANSFORMATION

+----------+----+-----+
|date      |year|month|
+----------+----+-----+
|2002-01-02|2002|1    |
|2002-01-02|2002|1    |
|2002-01-02|2002|1    |
|2002-01-02|2002|1    |
|2002-01-02|2002|1    |
|2002-01-02|2002|1    |
|2002-01-02|2002|1    |
|2002-01-02|2002|1    |
|2002-01-02|2002|1    |
|2002-01-02|2002|1    |
+----------+----+-----+
only showing top 10 rows

+--------+----+----+----+-----+
|currency|date|rate|year|month|
+--------+----+----+----+-----+
+--------+----+----+----+-----+


MONTH

                                                                                

Row count after aggregation: 7910
Schema:
root
 |-- currency: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- min_rate: double (nullable = true)
 |-- max_rate: double (nullable = true)
 |-- avg_rate: double (nullable = true)

Sample aggregated rows:


                                                                                

+--------+----+-----+--------+--------+------------------+
|currency|year|month|min_rate|max_rate|avg_rate          |
+--------+----+-----+--------+--------+------------------+
|AUD     |2002|1    |2.0227  |2.1608  |2.1002409090909095|
|AUD     |2002|2    |2.1124  |2.1708  |2.14745           |
|AUD     |2002|3    |2.1496  |2.2054  |2.172628571428571 |
|AUD     |2002|4    |2.1425  |2.1938  |2.1731666666666674|
|AUD     |2002|5    |2.1361  |2.29    |2.2294050000000003|
|AUD     |2002|6    |2.2494  |2.3277  |2.291505          |
|AUD     |2002|7    |2.2061  |2.3882  |2.280417391304348 |
|AUD     |2002|8    |2.1959  |2.3079  |2.2618714285714288|
|AUD     |2002|9    |2.2405  |2.3013  |2.2701666666666673|
|AUD     |2002|10   |2.2345  |2.2947  |2.267004347826087 |
|AUD     |2002|11   |2.1852  |2.2451  |2.2180789473684213|
|AUD     |2002|12   |2.1605  |2.246   |2.2059100000000003|
|AUD     |2003|1    |2.1603  |2.3328  |2.234272727272727 |
|AUD     |2003|2    |2.236   |2.37    |2.300614999999999

                                                                                

+--------+
|currency|
+--------+
|AUD     |
|BRL     |
|CAD     |
|CHF     |
|CLP     |
|CNY     |
|CZK     |
|DKK     |
|EUR     |
|GBP     |
|HKD     |
|HUF     |
|IDR     |
|ILS     |
|INR     |
|ISK     |
|JPY     |
|KRW     |
|MXN     |
|MYR     |
+--------+
only showing top 20 rows

Years range:




+----+
|year|
+----+
|2002|
|2003|
|2004|
|2005|
|2006|
|2007|
|2008|
|2009|
|2010|
|2011|
|2012|
|2013|
|2014|
|2015|
|2016|
|2017|
|2018|
|2019|
|2020|
|2021|
+----+
only showing top 20 rows



                                                                                

In [30]:
print("\nSTART CURRENCY YEARLY AGGREGATION\n")

print("Input row count:", currency.count())
print("Currencies available:")
currency.select("currency").distinct().orderBy("currency").show(truncate=False)

print("Years available:")
currency.select("year").distinct().orderBy("year").show(30)

print("\nGROUP BY CURRENCY / YEAR\n")

currency_yearly = (
    currency
    .groupBy("currency", "year")
    .agg(
        min("rate").alias("min_rate"),
        max("rate").alias("max_rate"),
        avg("rate").alias("avg_rate")
    )
    .orderBy("currency", "year")
)

print("Row count after aggregation:", currency_yearly.count())

print("Schema:")
currency_yearly.printSchema()

print("Sample yearly rows:")
currency_yearly.show(20, truncate=False)


#------------------------------------------------------------------------------------------------------------------
print("\nCHECK: MIN / AVG / MAX\n")

currency_yearly.filter(
    (col("avg_rate") < col("min_rate")) |
    (col("avg_rate") > col("max_rate"))
).show(truncate=False)


#-----------------------------------------------------------------------------------------------------------------
print("\nFINAL CHECKPOINT (CURRENCY YEARLY)\n")

print("Final yearly row count:", currency_yearly.count())

print("Years range:")
currency_yearly.select("year").distinct().orderBy("year").show()
currency_yearly.printSchema()


START CURRENCY YEARLY AGGREGATION

Input row count: 166407
Currencies available:
+--------+
|currency|
+--------+
|AUD     |
|BRL     |
|CAD     |
|CHF     |
|CLP     |
|CNY     |
|CZK     |
|DKK     |
|EUR     |
|GBP     |
|HKD     |
|HUF     |
|IDR     |
|ILS     |
|INR     |
|ISK     |
|JPY     |
|KRW     |
|MXN     |
|MYR     |
+--------+
only showing top 20 rows

Years available:


                                                                                

+----+
|year|
+----+
|2002|
|2003|
|2004|
|2005|
|2006|
|2007|
|2008|
|2009|
|2010|
|2011|
|2012|
|2013|
|2014|
|2015|
|2016|
|2017|
|2018|
|2019|
|2020|
|2021|
|2022|
|2023|
|2024|
|2025|
+----+


GROUP BY CURRENCY / YEAR



                                                                                

Row count after aggregation: 661
Schema:
root
 |-- currency: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- min_rate: double (nullable = true)
 |-- max_rate: double (nullable = true)
 |-- avg_rate: double (nullable = true)

Sample yearly rows:


                                                                                

+--------+----+--------+--------+------------------+
|currency|year|min_rate|max_rate|avg_rate          |
+--------+----+--------+--------+------------------+
|AUD     |2002|2.0227  |2.3882  |2.2185633466135437|
|AUD     |2003|2.1603  |2.8604  |2.535251778656127 |
|AUD     |2004|2.3167  |3.0464  |2.6850578124999993|
|AUD     |2005|2.3158  |2.6038  |2.4654158730158717|
|AUD     |2006|2.2524  |2.4162  |2.3366333333333333|
|AUD     |2007|2.1289  |2.4544  |2.315681349206348 |
|AUD     |2008|1.7347  |2.2526  |2.0219452755905523|
|AUD     |2009|2.0758  |2.6299  |2.4465521568627455|
|AUD     |2010|2.5175  |3.0813  |2.7715913725490178|
|AUD     |2011|2.8209  |3.5088  |3.054929365079366 |
|AUD     |2012|3.1977  |3.6206  |3.3727972222222222|
|AUD     |2013|2.6771  |3.4226  |3.0599553784860563|
|AUD     |2014|2.6701  |3.0266  |2.843943650793651 |
|AUD     |2015|2.6214  |3.0638  |2.834748818897638 |
|AUD     |2016|2.7609  |3.1411  |2.9335158730158737|
|AUD     |2017|2.6818  |3.1424  |2.89489561752

                                                                                

+--------+----+--------+--------+--------+
|currency|year|min_rate|max_rate|avg_rate|
+--------+----+--------+--------+--------+
+--------+----+--------+--------+--------+


FINAL CHECKPOINT (CURRENCY YEARLY)



                                                                                

Final yearly row count: 661
Years range:




+----+
|year|
+----+
|2002|
|2003|
|2004|
|2005|
|2006|
|2007|
|2008|
|2009|
|2010|
|2011|
|2012|
|2013|
|2014|
|2015|
|2016|
|2017|
|2018|
|2019|
|2020|
|2021|
+----+
only showing top 20 rows

root
 |-- currency: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- min_rate: double (nullable = true)
 |-- max_rate: double (nullable = true)
 |-- avg_rate: double (nullable = true)



                                                                                

## `WDI`

In [32]:
iso2_to_iso3 = {c.alpha_2: c.alpha_3 for c in pycountry.countries}
def iso2_to_iso3_func(code):
    return iso2_to_iso3.get(code, None)


In [33]:
print("\nLOAD WDI PARQUET\n")

input_path = "hdfs://localhost:8020/user/vagrant/project/WDI/*.parquet"
print("Input path:", input_path)

wdi = (
    spark.read.parquet(input_path)
    .withColumn("value", col("value").cast("double"))
    .withColumn("year", col("year").cast("integer"))
)

print("WDI loaded")
print("Row count:", wdi.count())
print("Columns:", wdi.columns)

print("Schema:")
wdi.printSchema()

print("Sample rows:")
wdi.show(5, truncate=False)


#-------------------------------------------------------------------------------------------------
print("\nYEAR RANGE (INPUT WDI)\n")

wdi.select(
    min("year").alias("min_year"),
    max("year").alias("max_year")
).show()

print("Distinct years:")
wdi.select("year").distinct().orderBy("year").show(25)


#--------------------------------------------------------------------------------------------------
print("\nPIVOT WDI\n")

pivoted_wdi = (
    wdi.groupBy("country_id", "country_name", "year")
       .pivot("indicator_name")
       .agg(first("value"))
)

print("Row count after pivot:", pivoted_wdi.count())

print("Schema after pivot:")
pivoted_wdi.printSchema()

print("Sample pivoted rows:")
pivoted_wdi.show(5, truncate=False)


#-------------------------------------------------------------------------------------------------
print("\nMAPPING ISO CODES\n")

iso2_to_iso3_udf = F.udf(iso2_to_iso3_func, StringType())

# dodajemy kolumnę
pivoted_wdi = pivoted_wdi.withColumn("countryISO3", iso2_to_iso3_udf(F.col("country_id")))

pivoted_wdi.select("country_id", "countryISO3").show(5)


#-------------------------------------------------------------------------------------------------
print("\nRENAME INDICATORS\n")

rename_map = {
    "External debt stocks, total (DOD, current US$)": "external_debt",
    "GDP (current US$)": "gdp",
    "Imports of goods and services (current US$)": "import",
    "Industry (including construction), value added (% of GDP)": "industry_in_gdp",
    "Inflation, consumer prices (annual %)": "inflation",
    "Population, total": "population",
    "Services, value added (% of GDP)": "services_in_gdp",
    "Trade (% of GDP)": "trade_in_gdp"
}

for old_name, new_name in rename_map.items():
    if old_name in pivoted_wdi.columns:
        print(f"Renaming: {old_name} → {new_name}")
        pivoted_wdi = pivoted_wdi.withColumnRenamed(old_name, new_name)
    else:
        print(f"Column not found (skipped): {old_name}")


print("Columns after rename:")
print(pivoted_wdi.columns)


#-------------------------------------------------------------------------------------------------
print("\nCALCULATED INDICATORS\n")

final_wdi = (
    pivoted_wdi
    .withColumn(
        "gdp_per_capita",
        round(col("gdp") / col("population"), 2)
    )
    .drop("indicator_name")
)

final_wdi.select(
    "country_name", "countryISO3", "year", "gdp", "population", "gdp_per_capita"
).show(10, truncate=False)


#-------------------------------------------------------------------------------------------------
print("\nYEAR RANGE (FINAL WDI)\n")

final_wdi.select(
    min("year").alias("min_year"),
    max("year").alias("max_year")
).show()

print("Years available:")
final_wdi.select("year").distinct().orderBy("year").show(25)


#-------------------------------------------------------------------------------------------------
print("\nPOLAND CHECK\n")

poland_df = final_wdi.filter(col("country_name") == "Poland")

print("Poland row count:", poland_df.count())

poland_df.select(
    min("year").alias("min_year"),
    max("year").alias("max_year")
).show()

poland_df.orderBy("year").show(15, truncate=False)


#-------------------------------------------------------------------------------------------------
print("\nFINAL WDI CHECKPOINT\n")

final_wdi = final_wdi.orderBy("countryISO3", "year")

print("Final schema:")
final_wdi.printSchema()

print("Final DataFrame count:", final_wdi.count())


LOAD WDI PARQUET

Input path: hdfs://localhost:8020/user/vagrant/project/WDI/*.parquet
WDI loaded
Row count: 89376
Columns: ['country_id', 'country_name', 'year', 'indicator_id', 'indicator_name', 'value']
Schema:
root
 |-- country_id: string (nullable = true)
 |-- country_name: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- indicator_id: string (nullable = true)
 |-- indicator_name: string (nullable = true)
 |-- value: double (nullable = true)

Sample rows:
+----------+-------------+----+--------------+-------------------------------------+-----+
|country_id|country_name |year|indicator_id  |indicator_name                       |value|
+----------+-------------+----+--------------+-------------------------------------+-----+
|country_id|country_name |null|indicator_id  |indicator_name                       |null |
|LI        |Liechtenstein|2022|FP.CPI.TOTL.ZG|Inflation, consumer prices (annual %)|null |
|country_id|country_name |null|indicator_id  |indicator_name 

                                                                                

+----+
|year|
+----+
|null|
|2003|
|2004|
|2005|
|2006|
|2007|
|2008|
|2009|
|2010|
|2011|
|2012|
|2013|
|2014|
|2015|
|2016|
|2017|
|2018|
|2019|
|2020|
|2021|
|2022|
|2023|
+----+


PIVOT WDI



                                                                                

Row count after pivot: 5587
Schema after pivot:
root
 |-- country_id: string (nullable = true)
 |-- country_name: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- External debt stocks, total (DOD, current US$): double (nullable = true)
 |-- GDP (current US$): double (nullable = true)
 |-- Imports of goods and services (current US$): double (nullable = true)
 |-- Industry (including construction), value added (% of GDP): double (nullable = true)
 |-- Inflation, consumer prices (annual %): double (nullable = true)
 |-- Population, total: double (nullable = true)
 |-- Services, value added (% of GDP): double (nullable = true)
 |-- Trade (% of GDP): double (nullable = true)
 |-- indicator_name: double (nullable = true)

Sample pivoted rows:


                                                                                

+----------+------------+----+----------------------------------------------+-------------------+-------------------------------------------+---------------------------------------------------------+-------------------------------------+-----------------+--------------------------------+----------------+--------------+
|country_id|country_name|year|External debt stocks, total (DOD, current US$)|GDP (current US$)  |Imports of goods and services (current US$)|Industry (including construction), value added (% of GDP)|Inflation, consumer prices (annual %)|Population, total|Services, value added (% of GDP)|Trade (% of GDP)|indicator_name|
+----------+------------+----+----------------------------------------------+-------------------+-------------------------------------------+---------------------------------------------------------+-------------------------------------+-----------------+--------------------------------+----------------+--------------+
|SZ        |Eswatini    |2003|4.53448

                                                                                

+----------+-----------+
|country_id|countryISO3|
+----------+-----------+
|        SZ|        SWZ|
|        TL|        TLS|
|        KI|        KIR|
|        PT|        PRT|
|        ES|        ESP|
+----------+-----------+
only showing top 5 rows


RENAME INDICATORS

Renaming: External debt stocks, total (DOD, current US$) → external_debt
Renaming: GDP (current US$) → gdp
Renaming: Imports of goods and services (current US$) → import
Renaming: Industry (including construction), value added (% of GDP) → industry_in_gdp
Renaming: Inflation, consumer prices (annual %) → inflation
Renaming: Population, total → population
Renaming: Services, value added (% of GDP) → services_in_gdp
Renaming: Trade (% of GDP) → trade_in_gdp
Columns after rename:
['country_id', 'country_name', 'year', 'external_debt', 'gdp', 'import', 'industry_in_gdp', 'inflation', 'population', 'services_in_gdp', 'trade_in_gdp', 'indicator_name', 'countryISO3']

CALCULATED INDICATORS



                                                                                

+-------------------------+-----------+----+-------------------+-------------+--------------+
|country_name             |countryISO3|year|gdp                |population   |gdp_per_capita|
+-------------------------+-----------+----+-------------------+-------------+--------------+
|Eswatini                 |SWZ        |2003|2.14963243327703E9 |1066700.0    |2015.22       |
|Timor-Leste              |TLS        |2018|1.55598861443594E9 |1275959.0    |1219.47       |
|Kiribati                 |KIR        |2012|2.07001545867109E8 |112284.0     |1843.55       |
|Portugal                 |PRT        |2019|2.40115970063019E11|1.0286263E7  |23343.36      |
|Spain                    |ESP        |2007|1.47674627685451E12|4.5226803E7  |32652.02      |
|Croatia                  |HRV        |2008|6.84728546172831E10|4309705.0    |15888.06      |
|Middle income            |null       |2022|3.65287677581778E13|5.846464711E9|6248.01       |
|Sint Maarten (Dutch part)|SXM        |2014|1.36181150837989

                                                                                

+--------+--------+
|min_year|max_year|
+--------+--------+
|    2003|    2023|
+--------+--------+

Years available:


                                                                                

+----+
|year|
+----+
|null|
|2003|
|2004|
|2005|
|2006|
|2007|
|2008|
|2009|
|2010|
|2011|
|2012|
|2013|
|2014|
|2015|
|2016|
|2017|
|2018|
|2019|
|2020|
|2021|
|2022|
|2023|
+----+


POLAND CHECK



                                                                                

Poland row count: 21


                                                                                

+--------+--------+
|min_year|max_year|
+--------+--------+
|    2003|    2023|
+--------+--------+



                                                                                

+----------+------------+----+-------------+-------------------+-------------------+----------------+------------------+-----------+----------------+----------------+-----------+--------------+
|country_id|country_name|year|external_debt|gdp                |import             |industry_in_gdp |inflation         |population |services_in_gdp |trade_in_gdp    |countryISO3|gdp_per_capita|
+----------+------------+----+-------------+-------------------+-------------------+----------------+------------------+-----------+----------------+----------------+-----------+--------------+
|PL        |Poland      |2003|null         |2.1856122599847E11 |7.86269228544063E10|27.5111441045363|0.682701375787681 |3.820457E7 |57.9482847667238|69.2134479841788|POL        |5720.81       |
|PL        |Poland      |2004|null         |2.56268656145134E11|9.46128680718343E10|29.185278751871 |3.38264681884691  |3.8182222E7|55.9833742114646|70.9815765694162|POL        |6711.73       |
|PL        |Poland      |2005|



Final DataFrame count: 5587


                                                                                

## Joining data into monthly and yearly tabels

### `MONTHLY TABLE`

In [34]:
print("\nJOIN COMTRADE WITH CURRENCY INFO\n")

# join z df_dim_currency (country ISO)
comtrade_with_currency = (
    comtrade_monthly
    .join(
        df_dim_currency,
        comtrade_monthly.partnerISO == df_dim_currency.country_iso3,
        how="left"
    )
    .drop(df_dim_currency.country_iso3)
)

print("After join with country info:")
print("Row count:", comtrade_with_currency.count())
print("Columns:", comtrade_with_currency.columns)
comtrade_with_currency.select("partnerISO", "currency").distinct().show(20, truncate=False)


# join z kursami walut (miesięczne)
print("\nJOIN WITH MONTHLY CURRENCY RATES\n")

comtrade_with_rates = (
    comtrade_with_currency
    .join(
        currency_data,
        on=["currency", "year", "month"],
        how="left"
    )
)

print("After join with currency rates:")
print("Row count:", comtrade_with_rates.count())
comtrade_with_rates.select("currency", "year", "month", "commodity_desc", "partnerISO", "max_rate").show(10, truncate=False)



# filtr lat 2004–2023
comtrade_with_rates = comtrade_with_rates.filter((col("year") >= 2004) & (col("year") <= 2023))

print("\nAfter calculating value_local_currency and filtering years 2004-2023:")
print("Row count:", comtrade_with_rates.count())
print("Years range:")
comtrade_with_rates.selectExpr("min(year) as min_year", "max(year) as max_year").show()

print("Sample rows:")
comtrade_with_rates.orderBy("year", "month").show(10, truncate=False)

print("\nSchema after all joins:")
comtrade_with_rates.printSchema()

montly_agg = comtrade_with_rates
print("\nCreated monthly Comtrade with currency rates (2004-2023)")


JOIN COMTRADE WITH CURRENCY INFO

After join with country info:


                                                                                

Row count: 9996
Columns: ['quantity_code', 'quantity', 'primary_value_usd', 'quantity_desc', 'data_period', 'weight', 'partner_code', 'commodity_desc', 'hs_code', 'partnerISO', 'unit_value_usd', 'usd_per_kg', 'year', 'month', 'quarter', 'quarter_label', 'month_world_export_value', 'share_of_month_market', 'currency']
+----------+--------+
|partnerISO|currency|
+----------+--------+
|NIU       |null    |
|HTI       |HTG     |
|PSE       |ILS     |
|BRB       |BBD     |
|LVA       |EUR     |
|JAM       |JMD     |
|ZMB       |ZMW     |
|BRA       |BRL     |
|ARM       |AMD     |
|MOZ       |MZN     |
|CUB       |CUP     |
|JOR       |JOD     |
|ABW       |AWG     |
|SOM       |SOS     |
|BRN       |null    |
|COD       |CDF     |
|BOL       |BOB     |
|URY       |UYU     |
|GIB       |GIP     |
|LBY       |null    |
+----------+--------+
only showing top 20 rows


JOIN WITH MONTHLY CURRENCY RATES

After join with currency rates:


                                                                                

Row count: 9996


                                                                                

+--------+----+-----+--------------+----------+--------+
|currency|year|month|commodity_desc|partnerISO|max_rate|
+--------+----+-----+--------------+----------+--------+
|null    |2021|1    |null          |NIU       |null    |
|HTG     |2016|1    |null          |HTI       |null    |
|HTG     |2018|1    |null          |HTI       |null    |
|HTG     |2020|1    |null          |HTI       |null    |
|ILS     |2015|1    |null          |PSE       |0.9514  |
|ILS     |2015|1    |null          |PSE       |0.9514  |
|ILS     |2016|1    |null          |PSE       |1.0429  |
|ILS     |2015|1    |null          |PSE       |0.9514  |
|ILS     |2016|1    |null          |PSE       |1.0429  |
|ILS     |2018|1    |null          |PSE       |1.0208  |
+--------+----+-----+--------------+----------+--------+
only showing top 10 rows


After calculating value_local_currency and filtering years 2004-2023:


                                                                                

Row count: 8992
Years range:


                                                                                

+--------+--------+
|min_year|max_year|
+--------+--------+
|    2015|    2023|
+--------+--------+

Sample rows:


                                                                                

+--------+----+-----+-------------+-----------+-----------------+-------------+-----------+-----------+------------+--------------+-------+----------+-------------------+-------------------+-------+-------------+------------------------+---------------------+--------+--------+------------------+
|currency|year|month|quantity_code|quantity   |primary_value_usd|quantity_desc|data_period|weight     |partner_code|commodity_desc|hs_code|partnerISO|unit_value_usd     |usd_per_kg         |quarter|quarter_label|month_world_export_value|share_of_month_market|min_rate|max_rate|avg_rate          |
+--------+----+-----+-------------+-----------+-----------------+-------------+-----------+-----------+------------+--------------+-------+----------+-------------------+-------------------+-------+-------------+------------------------+---------------------+--------+--------+------------------+
|EUR     |2015|1    |null         |81880.0    |1.6690039E7      |null         |2015       |86409.0    |428   

### `YEARLY TABLE`

In [35]:
print("\nJOIN COMTRADE YEARLY WITH CURRENCY AND WDI\n")

# join z df_dim_currency (country ISO → currency)
comtrade_with_currency = (
    df_yearly
    .join(
        df_dim_currency,
        df_yearly.partnerISO == df_dim_currency.country_iso3,
        how="left"
    )
    .drop(df_dim_currency.country_iso3)
)

print("After join with country info:")
print("Row count:", comtrade_with_currency.count())
print("Columns:", comtrade_with_currency.columns)
comtrade_with_currency.select("partnerISO", "currency").distinct().show(20, truncate=False)


# join z kursami walut rocznymi
print("\nJOIN WITH YEARLY CURRENCY RATES\n")

comtrade_with_rates = (
    comtrade_with_currency
    .join(
        currency_yearly,
        on=["currency", "year"],
        how="left"
    ).drop(currency_yearly.year)
)

print("After join with currency rates:")
print("Row count:", comtrade_with_rates.count())
currencies = ["EUR", "USD", "HUF", "CHF"]

comtrade_with_rates.filter(
    col("currency").isin(currencies)
).select(
    "currency", "year", "partnerISO", "avg_rate", "min_rate", "max_rate"
).show(10, truncate=False)


# join z danymi WDI
print("\nJOIN WITH WDI DATA\n")

comtrade_with_wdi = (
    comtrade_with_rates
    .join(
        final_wdi,
        (comtrade_with_rates.partnerISO == final_wdi.countryISO3) &
        (comtrade_with_rates.year == final_wdi.year),
        how="left"
    ).drop(final_wdi.year)
    
)

print("After join with WDI:")
print("Row count:", comtrade_with_wdi.count())
print("Columns:", comtrade_with_wdi.columns)

# filtr lat 2004–2023
comtrade_with_wdi = comtrade_with_wdi.filter((col("year") >= 2004) & (col("year") <= 2023))

print("\nAfter filtering years 2004-2023:")
print("Row count:", comtrade_with_wdi.count())
print("Years range:")
comtrade_with_wdi.selectExpr("min(year) as min_year", "max(year) as max_year").show()

# sanity check – kilka przykładowych wierszy
print("Sample rows:")
comtrade_with_wdi.orderBy("year").show(10, truncate=False)

print("\nSchema after all joins:")
comtrade_with_wdi.printSchema()

print("\nCreated yearly Comtrade with currency rates and WDI (2004-2023)")



JOIN COMTRADE YEARLY WITH CURRENCY AND WDI

After join with country info:


                                                                                

Row count: 9272
Columns: ['year', 'partnerISO', 'commodity_desc', 'hs_code', 'annual_value_usd', 'quantity', 'weight', 'unit_value_usd', 'usd_per_kg', 'world_export_value', 'share_of_year_market', 'currency']


                                                                                

+----------+--------+
|partnerISO|currency|
+----------+--------+
|NIU       |null    |
|HTI       |HTG     |
|PSE       |ILS     |
|BRB       |BBD     |
|LVA       |EUR     |
|JAM       |JMD     |
|ZMB       |ZMW     |
|BRA       |BRL     |
|ARM       |AMD     |
|MOZ       |MZN     |
|CUB       |CUP     |
|JOR       |JOD     |
|ABW       |AWG     |
|SOM       |SOS     |
|BRN       |null    |
|COD       |CDF     |
|BOL       |BOB     |
|URY       |UYU     |
|GIB       |GIP     |
|LBY       |null    |
+----------+--------+
only showing top 20 rows


JOIN WITH YEARLY CURRENCY RATES

After join with currency rates:


                                                                                

Row count: 9272


                                                                                

+--------+----+----------+-----------------+--------+--------+
|currency|year|partnerISO|avg_rate         |min_rate|max_rate|
+--------+----+----------+-----------------+--------+--------+
|EUR     |2023|FIN       |4.543663346613546|4.3053  |4.7895  |
|EUR     |2016|FIN       |4.363654365079365|4.2355  |4.5035  |
|EUR     |2018|FIN       |4.261723412698413|4.1423  |4.3978  |
|EUR     |2017|FIN       |4.258272509960161|4.1709  |4.4157  |
|EUR     |2016|FIN       |4.363654365079365|4.2355  |4.5035  |
|EUR     |2023|FIN       |4.543663346613546|4.3053  |4.7895  |
|EUR     |2020|FIN       |4.444947058823527|4.2279  |4.633   |
|EUR     |2017|FIN       |4.258272509960161|4.1709  |4.4157  |
|EUR     |2016|FIN       |4.363654365079365|4.2355  |4.5035  |
|EUR     |2020|FIN       |4.444947058823527|4.2279  |4.633   |
+--------+----+----------+-----------------+--------+--------+
only showing top 10 rows


JOIN WITH WDI DATA

After join with WDI:


                                                                                

Row count: 9272
Columns: ['currency', 'year', 'partnerISO', 'commodity_desc', 'hs_code', 'annual_value_usd', 'quantity', 'weight', 'unit_value_usd', 'usd_per_kg', 'world_export_value', 'share_of_year_market', 'min_rate', 'max_rate', 'avg_rate', 'country_id', 'country_name', 'external_debt', 'gdp', 'import', 'industry_in_gdp', 'inflation', 'population', 'services_in_gdp', 'trade_in_gdp', 'countryISO3', 'gdp_per_capita']

After filtering years 2004-2023:


                                                                                

Row count: 8339
Years range:


                                                                                

+--------+--------+
|min_year|max_year|
+--------+--------+
|    2015|    2023|
+--------+--------+

Sample rows:




+--------+----+----------+--------------+-------+----------------+-----------+-----------+-------------------+-------------------+------------------+---------------------+--------+--------+------------------+----------+------------------+-------------+-------------------+-------------------+----------------+-----------------+----------+----------------+----------------+-----------+--------------+
|currency|year|partnerISO|commodity_desc|hs_code|annual_value_usd|quantity   |weight     |unit_value_usd     |usd_per_kg         |world_export_value|share_of_year_market |min_rate|max_rate|avg_rate          |country_id|country_name      |external_debt|gdp                |import             |industry_in_gdp |inflation        |population|services_in_gdp |trade_in_gdp    |countryISO3|gdp_per_capita|
+--------+----+----------+--------------+-------+----------------+-----------+-----------+-------------------+-------------------+------------------+---------------------+--------+--------+-----------

                                                                                

# `Saving combined data in HDFS`

In [36]:
print("\nSAVING DATA INTO HDFS\n")

final_path = "hdfs:///user/vagrant/project/final_tables/"
tmp_wdi = final_path + "tmp_wdi"
tmp_currency = final_path + "tmp_currency"

comtrade_with_wdi.coalesce(1).write.mode("overwrite").parquet(tmp_wdi)
comtrade_with_currency.coalesce(1).write.mode("overwrite").parquet(tmp_currency)

!hdfs dfs -ls {tmp_wdi}
!hdfs dfs -ls {tmp_currency}

!hdfs dfs -mv {tmp_wdi}/part-*.parquet {final_path}comtrade_with_wdi.parquet
!hdfs dfs -mv {tmp_currency}/part-*.parquet {final_path}comtrade_with_currency.parquet

!hdfs dfs -rm -r -f {tmp_wdi}
!hdfs dfs -rm -r -f {tmp_currency}

!hdfs dfs -ls {final_path}


SAVING DATA INTO HDFS



                                                                                

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/local/hadoop-2.7.6/share/hadoop/common/lib/slf4j-log4j12-1.7.10.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/local/apache-tez-0.9.1-bin/lib/slf4j-log4j12-1.7.10.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
Found 2 items
-rw-r--r--   1 vagrant supergroup          0 2026-01-15 04:44 hdfs:///user/vagrant/project/final_tables/tmp_wdi/_SUCCESS
-rw-r--r--   1 vagrant supergroup     567612 2026-01-15 04:44 hdfs:///user/vagrant/project/final_tables/tmp_wdi/part-00000-ca092e4e-77e8-49a9-8a4b-e3061881976f-c000.snappy.parquet
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/local/hadoop-2.7.6/share/hadoop/common/lib/slf4j-log4j12-1.7.10.jar!/org/slf4j/impl/StaticLoggerBinder.cla

# `Views`

In [39]:
def check_hbase_table(table_name):
    connection = happybase.Connection('localhost', port=9090)
    table = connection.table(table_name)

    for key, data in table.scan(limit=5):
        print(f"Key: {key.decode()}, Data: {data}")

    connection.close()

`Trade Time View`

In [41]:
trade_time_view = (
    comtrade_monthly
        .groupBy("year", "month")
        .agg(
            F.sum("primary_value_usd").alias("total_value_usd"),
            F.sum("quantity").alias("total_quantity"),
            F.avg("unit_value_usd").alias("avg_unit_price")
        )
        .orderBy(F.desc("total_value_usd"))
)

print("Row count:", trade_time_view.count())
trade_time_view.printSchema()
trade_time_view.show(12, truncate=False)

                                                                                

Row count: 10
root
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- total_value_usd: double (nullable = true)
 |-- total_quantity: double (nullable = true)
 |-- avg_unit_price: double (nullable = true)

+----+-----+---------------+--------------------+------------------+
|year|month|total_value_usd|total_quantity      |avg_unit_price    |
+----+-----+---------------+--------------------+------------------+
|2024|1    |2.3661386956E10|3.2883985195430007E9|2840.412758597528 |
|2023|1    |2.280906675E10 |3.3790199434150014E9|2634.9914872083564|
|2021|1    |2.0318853884E10|4.341011380583E9    |1735.7688860003743|
|2022|1    |1.9494085442E10|4.642509606504003E9 |2121.938166275594 |
|2020|1    |1.9032814482E10|3.493180154442998E9 |2139.799598476934 |
|2019|1    |1.7624294132E10|3.9333469215220017E9|1520.6117273497978|
|2018|1    |1.757087192E10 |3.511780822357E9    |1515.3103545469428|
|2017|1    |1.6793880956E10|4.068309684753E9    |1508.5282913738733|
|2015|1

                                                                                

`---> HBase`

In [42]:
rows_to_insert = trade_time_view.collect()
connection = happybase.Connection('localhost', port=9090)
connection.open()

table_name = 'trade_time'
cf_name = 'stats'

tables = [t.decode('utf-8') for t in connection.tables()]

if table_name in tables:
    connection.disable_table(table_name)
    connection.delete_table(table_name)

connection.create_table(table_name, {cf_name: dict()})
table = connection.table(table_name)

with table.batch(batch_size=1000) as b:
    for row in rows_to_insert:
        row_key = f"{row['year']}_{row['month']}"
        data = {
            f"{cf_name}:year": str(row['year']),
            f"{cf_name}:month": str(row['month']),
            f"{cf_name}:total_value_usd": str(row['total_value_usd']),
            f"{cf_name}:total_quantity": str(row['total_quantity']),
            f"{cf_name}:avg_unit_price": str(row['avg_unit_price']),
        }

        b.put(row_key.encode(), {k.encode(): v.encode() for k, v in data.items()})

connection.close()

                                                                                

In [43]:
check_hbase_table(table_name)

Key: 2015_1, Data: {b'stats:avg_unit_price': b'1331.8310130442228', b'stats:month': b'1', b'stats:total_quantity': b'4068857177.0', b'stats:total_value_usd': b'14635101052.0', b'stats:year': b'2015'}
Key: 2016_1, Data: {b'stats:avg_unit_price': b'1493.3583941708293', b'stats:month': b'1', b'stats:total_quantity': b'4750669141.0', b'stats:total_value_usd': b'13645532516.0', b'stats:year': b'2016'}
Key: 2017_1, Data: {b'stats:avg_unit_price': b'1508.5282913738733', b'stats:month': b'1', b'stats:total_quantity': b'4068309684.753', b'stats:total_value_usd': b'16793880956.0', b'stats:year': b'2017'}
Key: 2018_1, Data: {b'stats:avg_unit_price': b'1515.3103545469428', b'stats:month': b'1', b'stats:total_quantity': b'3511780822.357', b'stats:total_value_usd': b'17570871920.0', b'stats:year': b'2018'}
Key: 2019_1, Data: {b'stats:avg_unit_price': b'1520.6117273497978', b'stats:month': b'1', b'stats:total_quantity': b'3933346921.5220017', b'stats:total_value_usd': b'17624294132.0', b'stats:year':

`Commodity Ranking View`

In [None]:
commodity_ranking_view = (
    comtrade_with_currency.withColumn(
        "commodity_short",
        F.split(F.col("commodity_desc"), ";").getItem(0)
    )
    .groupBy("commodity_short", "hs_code")
    .agg(
        F.sum("annual_value_usd").alias("total_value_usd"),
        F.sum("quantity").alias("total_quantity"),
        F.avg("unit_value_usd").alias("avg_unit_price")
    )
    .orderBy(F.desc("total_value_usd"))
)

print("Row count:", commodity_ranking_view.count())
commodity_ranking_view.printSchema()
commodity_ranking_view.show(12, truncate=False)

                                                                                

Row count: 14
root
 |-- commodity_short: string (nullable = true)
 |-- hs_code: string (nullable = true)
 |-- total_value_usd: double (nullable = true)
 |-- total_quantity: double (nullable = true)
 |-- avg_unit_price: double (nullable = true)





+---------------+-------+---------------+--------------------+------------------+
|commodity_short|hs_code|total_value_usd|total_quantity      |avg_unit_price    |
+---------------+-------+---------------+--------------------+------------------+
|null           |240220 |7.052404596E10 |3.4160223545309997E9|37.5371715626737  |
|null           |300490 |4.6884232204E10|1.271692381272E9    |225.8849296125339 |
|null           |330499 |2.6953845756E10|3.035496235526E9    |32.207307951995034|
|null           |040690 |9.44855433E9   |2.162953395587E9    |5.04279917962624  |
|null           |870323 |9.220593968E9  |879876.0279999999   |23457.42244245355 |
|null           |847130 |8.005201414E9  |3.1391904E7         |755.7909847040744 |
|null           |080810 |7.47863374E9   |1.6980829676E10     |0.6116339962682349|
|null           |220300 |4.390513482E9  |7.332230074E9       |1.437052564666526 |
|null           |100119 |1.005015518E9  |3.602127144E9       |0.4137147752570849|
|null           

                                                                                

`---> HBase`

In [46]:
rows_to_insert = commodity_ranking_view.collect()
connection = happybase.Connection('localhost', port=9090)
connection.open()

table_name = 'commodity_ranking'
cf_name = 'info'

tables = [t.decode('utf-8') for t in connection.tables()]

if table_name in tables:
    connection.disable_table(table_name)
    connection.delete_table(table_name)

connection.create_table(table_name, {cf_name: dict()})
table = connection.table(table_name)

with table.batch(batch_size=1000) as b:
    for row in rows_to_insert:
        code = str(row['hs_code']) if row['hs_code'] else "UNKNOWN"
        name = str(row['commodity_short']).strip().replace(" ", "_") if row['commodity_short'] else "NO_NAME"
        row_key = f"{code}_{name[:20]}"
        data = {
            f"{cf_name}:commodity_name": str(row['commodity_short']),
            f"{cf_name}:hs_code": str(row['hs_code']),
            f"{cf_name}:total_value_usd": str(row['total_value_usd']),
            f"{cf_name}:total_quantity": str(row['total_quantity']),
            f"{cf_name}:avg_unit_price": str(row['avg_unit_price'])
        }

        safe_data = {k.encode(): str(v).encode() for k, v in data.items() if v is not None}
        b.put(row_key.encode(), safe_data)

connection.close()

                                                                                

In [47]:
check_hbase_table(table_name)

Key: 020321_NO_NAME, Data: {b'info:avg_unit_price': b'2.1036523960069897', b'info:commodity_name': b'None', b'info:hs_code': b'020321', b'info:total_quantity': b'52554020.0', b'info:total_value_usd': b'113937560.0'}
Key: 040690_NO_NAME, Data: {b'info:avg_unit_price': b'5.04279917962624', b'info:commodity_name': b'None', b'info:hs_code': b'040690', b'info:total_quantity': b'2162953395.587', b'info:total_value_usd': b'9448554330.0'}
Key: 080810_NO_NAME, Data: {b'info:avg_unit_price': b'0.6116339962682349', b'info:commodity_name': b'None', b'info:hs_code': b'080810', b'info:total_quantity': b'16980829676.0', b'info:total_value_usd': b'7478633740.0'}
Key: 100119_NO_NAME, Data: {b'info:avg_unit_price': b'0.4137147752570849', b'info:commodity_name': b'None', b'info:hs_code': b'100119', b'info:total_quantity': b'3602127144.0', b'info:total_value_usd': b'1005015518.0'}
Key: 220290_NO_NAME, Data: {b'info:avg_unit_price': b'0.8457822076732189', b'info:commodity_name': b'None', b'info:hs_code': b

`Partner Market Share View`

In [52]:
partner_market_share_view = (
    comtrade_with_currency
    .filter(F.col("partnerISO") != "W00")
    .groupBy("partnerISO", "year")
    .agg(
        F.sum("annual_value_usd").alias("partner_value_usd"),
        F.avg("share_of_year_market").alias("avg_market_share")
    )
    .orderBy(F.desc("avg_market_share"))
)

print("Row count:", partner_market_share_view.count())
partner_market_share_view.show(truncate=False)

                                                                                

Row count: 1712




+------------+----+-----------------+-------------------+
|partnerISO  |year|partner_value_usd|avg_market_share   |
+------------+----+-----------------+-------------------+
|UNIDENTIFIED|2016|7.499256852E9    |0.5631815428325354 |
|UNIDENTIFIED|2015|8.237812808E9    |0.5626221980523987 |
|UNIDENTIFIED|2017|9.176068007E9    |0.5617020510081819 |
|UNIDENTIFIED|2018|9.468912858E9    |0.5607745501827532 |
|UNIDENTIFIED|2019|9.456312272E9    |0.5598826089952972 |
|UNIDENTIFIED|2022|1.0481918763E10  |0.5465762341454933 |
|UNIDENTIFIED|2021|1.0926510327E10  |0.5460553350535321 |
|UNIDENTIFIED|2020|1.0188893122E10  |0.5423453429990615 |
|UNIDENTIFIED|2023|1.223772614E10   |0.5420907333480662 |
|UNIDENTIFIED|2024|1.2669167199E10  |0.534720405624278  |
|DEU         |2019|1.928398556E9    |0.1219119144845635 |
|DEU         |2020|2.469131645E9    |0.11968815133066231|
|DEU         |2018|1.800536048E9    |0.11504938757944343|
|DEU         |2023|3.330414053E9    |0.10745667370594668|
|DEU         |

                                                                                

`---> HBase`

In [53]:
rows_to_insert = partner_market_share_view.collect()
connection = happybase.Connection('localhost', port=9090)
connection.open()

table_name = 'partner_market_share'
cf_name = 'info'

tables = [t.decode('utf-8') for t in connection.tables()]

if table_name in tables:
    connection.disable_table(table_name)
    connection.delete_table(table_name)

connection.create_table(table_name, {cf_name: dict()})
table = connection.table(table_name)

with table.batch(batch_size=1000) as b:
    for row in rows_to_insert:
        partner = str(row['partnerISO']) if row['partnerISO'] else "UNKNOWN"
        year = str(row['year']) if row['year'] else "0000"
        row_key = f"{partner}_{year}"
        
        data = {
            f"{cf_name}:partner_iso": str(row['partnerISO']),
            f"{cf_name}:year": str(row['year']),
            f"{cf_name}:partner_value_usd": str(row['partner_value_usd']),
            f"{cf_name}:avg_market_share": str(row['avg_market_share'])
        }

        safe_data = {k.encode(): str(v).encode() for k, v in data.items() if v is not None}
        b.put(row_key.encode(), safe_data)

connection.close()

                                                                                

In [54]:
check_hbase_table(table_name)

Key: ABW_2015, Data: {b'info:avg_market_share': b'8.883992663883145e-09', b'info:partner_iso': b'ABW', b'info:partner_value_usd': b'13.0', b'info:year': b'2015'}
Key: ABW_2018, Data: {b'info:avg_market_share': b'9.480075055358534e-08', b'info:partner_iso': b'ABW', b'info:partner_value_usd': b'221.0', b'info:year': b'2018'}
Key: ABW_2019, Data: {b'info:avg_market_share': b'8.048685960503389e-09', b'info:partner_iso': b'ABW', b'info:partner_value_usd': b'20.0', b'info:year': b'2019'}
Key: ABW_2020, Data: {b'info:avg_market_share': b'7.517317341862756e-07', b'info:partner_iso': b'ABW', b'info:partner_value_usd': b'1342.0', b'info:year': b'2020'}
Key: ABW_2021, Data: {b'info:avg_market_share': b'3.377130616481169e-07', b'info:partner_iso': b'ABW', b'info:partner_value_usd': b'1098.0', b'info:year': b'2021'}


`Economic Structure View`

In [49]:
economic_structure_view = (
    comtrade_with_wdi
      .groupBy("partnerISO")
      .agg(
          F.avg("industry_in_gdp").alias("industry_share"),
          F.avg("services_in_gdp").alias("services_share"),
          F.avg("trade_in_gdp").alias("trade_share"),
          F.sum("annual_value_usd").alias("trade_value")
      )
).orderBy(F.desc("trade_value"))
economic_structure_view.show()



+------------+------------------+------------------+------------------+---------------+
|  partnerISO|    industry_share|    services_share|       trade_share|    trade_value|
+------------+------------------+------------------+------------------+---------------+
|UNIDENTIFIED|              null|              null|              null|8.7673411149E10|
|         DEU| 25.79496784322826| 63.07429828552411|  79.1754079506646|1.7387596813E10|
|         ITA| 21.56279894062632| 65.78801705972678|59.563399289174065|  5.430847217E9|
|         GBR| 17.63064339498507|  71.4000227016855|62.393309581279766|  4.434463894E9|
|         NLD|17.539938596010018| 70.04621636993481| 162.2555761017519|   4.30180645E9|
|         ESP| 20.09187117746622|  68.1000696181885| 66.73040024785224|  4.255951194E9|
|         CZE|30.904554676293145|57.755060242498246| 141.6598708144104|  4.142411843E9|
|         RUS|30.944746259007452| 55.45825266685887| 47.13011828487344|  3.065630946E9|
|         BEL|19.165152556002916

                                                                                

`---> HBase`

In [50]:
rows_to_insert = economic_structure_view.collect()
connection = happybase.Connection('localhost', port=9090)
connection.open()

table_name = 'economic_structure'
cf_name = 'info'

tables = [t.decode('utf-8') for t in connection.tables()]

if table_name in tables:
    connection.disable_table(table_name)
    connection.delete_table(table_name)

connection.create_table(table_name, {cf_name: dict()})
table = connection.table(table_name)

with table.batch(batch_size=1000) as b:
    for row in rows_to_insert:
        row_key = str(row['partnerISO']) if row['partnerISO'] else "UNKNOWN"
        
        data = {
            f"{cf_name}:partner_iso": str(row['partnerISO']),
            f"{cf_name}:industry_share": str(row['industry_share']),
            f"{cf_name}:services_share": str(row['services_share']),
            f"{cf_name}:trade_share": str(row['trade_share']),
            f"{cf_name}:trade_value": str(row['trade_value'])
        }

        safe_data = {k.encode(): str(v).encode() for k, v in data.items() if v is not None}
        b.put(row_key.encode(), safe_data)

connection.close()

                                                                                

In [51]:
check_hbase_table(table_name)

Key: ABW, Data: {b'info:industry_share': b'11.947701250438433', b'info:partner_iso': b'ABW', b'info:services_share': b'78.26654964172087', b'info:trade_share': b'137.38053905982585', b'info:trade_value': b'13028.0'}
Key: AFG, Data: {b'info:industry_share': b'13.145582019192071', b'info:partner_iso': b'AFG', b'info:services_share': b'55.27434075205431', b'info:trade_share': b'53.70594176544601', b'info:trade_value': b'327907.0'}
Key: AGO, Data: {b'info:industry_share': b'36.86057495976961', b'info:partner_iso': b'AGO', b'info:services_share': b'44.96817736052077', b'info:trade_share': b'53.427507097543014', b'info:trade_value': b'1092792.0'}
Key: AIA, Data: {b'info:industry_share': b'None', b'info:partner_iso': b'AIA', b'info:services_share': b'None', b'info:trade_share': b'None', b'info:trade_value': b'1704.0'}
Key: ALB, Data: {b'info:industry_share': b'23.300655186997655', b'info:partner_iso': b'ALB', b'info:services_share': b'45.14529545267558', b'info:trade_share': b'74.646510081100

In [57]:
spark.stop()