In [51]:
from dotenv import load_dotenv
import os

# Load environment variables from ../.env; the Snowflake connection cell reads
# its user, password and role from os.environ.
load_dotenv("../.env")



True

In [52]:
from snowflake.snowpark import Session

# Connection settings for the Snowpark session. User and password are required
# environment variables (a KeyError is raised if either is missing); the role
# is optional and is passed as None when SNOWFLAKE_ROLE is not set.
connection_parameters = dict(
    account="YUJMLNP-YOB51920",
    user=os.environ["SNOWFLAKE_USER"],
    password=os.environ["SNOWFLAKE_PASSWORD"],
    role=os.environ.get("SNOWFLAKE_ROLE"),
    warehouse="COMPUTE_WH",
    database="NBU_EXCHANGE",
    schema="SILVER",
)

# Open the session used by every following cell.
session = Session.builder.configs(connection_parameters).create()


In [53]:
# Load the silver-layer table of extracted exchange rates and preview 5 rows.
exchange_rate_extracted_df = session.table(
    "nbu_exchange.silver.exchange_rate_extracted"
)
exchange_rate_extracted_df.show(n=5)


--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"CALCULATION_DATE"  |"CURRENCY_CODE"  |"CURRENCY_NAME"  |"EXCHANGE_DATE"  |"GROUP_NUMBER"  |"R030_CODE"  |"RATE"    |"RATE_PER_UNIT"  |"SPECIAL_CONDITIONS"  |"CURRENCY_NAME_UA"      |"UNITS"  |
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|                    |USD              |NULL             |30.06.1996       |1               |1            |178600.0  |178600.0         |NULL                  |Долар США               |1        |
|                    |LTL              |NULL             |30.06.1996       |1               |954          |44650.0   |44650.0          |NULL                  |Литовський літ          |1        |
|                    |NLG

-- Need to do:
-- 0. deduplicate - after cleaning and mapping!
-- 3. currency_code upper case maybe
-- 4. currency_name has nulls (IS null), map
-- 6. group_number (1,2,3) find what's the meaning, map with names
-- 7. r030_code - 150 r030 count, 109 currency count?
-- 8. r030_code - 3rd group is precious metals, 2nd and 1st the same currency?
-- 9. rate - seems ok
-- 10. rate_per_unit - seems ok
-- 11. special_conditions - seems ok
-- Special=null/Y/N - sign of the conditions for calculating the hryvnia to US dollar 
-- exchange rate: null - for records for days when the sign was not determined and for
-- valcode≠usd, Y - under special conditions, N - under normal conditions;
-- 12. currency_name_ua - seems ok, can add additional column with countries 
-- 13. units - seems ok
-- add a column flagging whether the record is valid

-- Create a new table for cleaned and transformed data

In [97]:
from snowflake.snowpark.functions import col, when, trim, to_date, lit, sql_expr, count 

In [55]:
# Parse the two text date columns (format DD.MM.YYYY) into DATE values.
# - calculation_date: NULL or blank values become the sentinel date 1900-01-01;
#   everything else is trimmed and parsed.
# - exchange_date: trimmed before parsing as well, consistently with
#   calculation_date, so whitespace padding cannot break the conversion.
# NOTE: this cell overwrites exchange_rate_extracted_df with the converted
# frame. Run it once right after loading the table; re-running it would apply
# the string parsing to columns that are already DATE values.

exchange_rate_extracted_df = (
    exchange_rate_extracted_df
    .with_column(
        "calculation_date",
        when(
            col("calculation_date").is_null() | (trim(col("calculation_date")) == lit('')),
            to_date(lit("01.01.1900"), "DD.MM.YYYY"),
        ).otherwise(
            to_date(trim(col("calculation_date")), "DD.MM.YYYY")
        ),
    )
    .with_column(
        "exchange_date",
        to_date(trim(col("exchange_date")), "DD.MM.YYYY"),
    )
)


In [56]:
# List every distinct (code, English name, Ukrainian name) combination, sorted
# by the Ukrainian name, to inspect naming inconsistencies.
(
    exchange_rate_extracted_df
    .select("currency_code", "currency_name", "currency_name_ua")
    .distinct()
    .order_by("currency_name_ua")
    .show(n=250)
)


--------------------------------------------------------------------------------------
|"CURRENCY_CODE"  |"CURRENCY_NAME"              |"CURRENCY_NAME_UA"                  |
--------------------------------------------------------------------------------------
|DEM              |NULL                         |Hімецькі маpки                      |
|EUR              |NULL                         |Євро                                |
|EUR              |Euro                         |Євро                                |
|EGP              |Egyptian Pound               |Єгипетський фунт                    |
|EGP              |NULL                         |Єгипетський фунт                    |
|JPY              |Yen                          |Єна                                 |
|JPY              |NULL                         |Єна                                 |
|INR              |Indian Rupee                 |Індійська рупія                     |
|INR              |NULL                    

In [57]:
# Number of distinct (code, English name, Ukrainian name) combinations.
# No ORDER BY here: sorting cannot change a row count, it only adds work.
(
    exchange_rate_extracted_df
    .select("currency_code", "currency_name", "currency_name_ua")
    .distinct()
    .count()
)

# Result at the time of the last run: 221

221

In [58]:
# Show the distinct combinations that have no English currency name.
(
    exchange_rate_extracted_df
    .select("currency_code", "currency_name", "currency_name_ua")
    .distinct()
    .filter(col("currency_name").is_null())
    .show(n=150)
)

-------------------------------------------------------------------------
|"CURRENCY_CODE"  |"CURRENCY_NAME"  |"CURRENCY_NAME_UA"                 |
-------------------------------------------------------------------------
|NOK              |NULL             |Норвезька крона                    |
|CAD              |NULL             |Канадський долар                   |
|TMM              |NULL             |Туркменський манат                 |
|ROL              |NULL             |Лей (Румунія)                      |
|GRD              |NULL             |Грецька драхма                     |
|SIT              |NULL             |Словенський толар                  |
|XEU              |NULL             |ЕКЮ                                |
|SKK              |NULL             |Словацькі крони                    |
|PTE              |NULL             |Португальські ескудо               |
|IQD              |NULL             |Іракський динар                    |
|LYD              |NULL             |Л

In [59]:
# Count the distinct combinations that have no English currency name.
(
    exchange_rate_extracted_df
    .select("currency_code", "currency_name", "currency_name_ua")
    .distinct()
    .filter(col("currency_name").is_null())
    .count()
)

# Result at the time of the last run: 149

149

In [60]:
# Spot check on one code (AUD): the same code appears both with and without an
# English name. Names are sorted in descending order.
(
    exchange_rate_extracted_df
    .select("currency_code", "currency_name", "currency_name_ua")
    .distinct()
    .filter(col("currency_code") == "AUD")
    .order_by(col("currency_name").desc())
    .show(n=10)
)


--------------------------------------------------------------
|"CURRENCY_CODE"  |"CURRENCY_NAME"    |"CURRENCY_NAME_UA"    |
--------------------------------------------------------------
|AUD              |Australian Dollar  |Австралійський долар  |
|AUD              |NULL               |Австралійський долар  |
--------------------------------------------------------------



In [61]:
from snowflake.snowpark import Window
from snowflake.snowpark.functions import coalesce, first_value, trim, upper

# Fill missing currency_name values from other rows with the same currency code.
# Rows are grouped by the normalised code (upper-cased, trimmed). Within a group
# the names are sorted descending with NULLs explicitly placed last, so
# first_value() yields a non-NULL name whenever the group has one; the fill no
# longer depends on a default NULL ordering.
# Only NULL names are replaced (coalesce keeps existing names). If a code has
# several different names, the one that sorts last is used for the NULL rows.

windows = (
    Window
    .partition_by(trim(upper(col("currency_code"))))
    .order_by(col("currency_name").desc_nulls_last())
)

exchange_rate_extracted_df_m = exchange_rate_extracted_df.with_column(
    "currency_name",
    coalesce(
        col("currency_name"),
        first_value(col("currency_name")).over(windows),
    ),
)

In [62]:
# Number of distinct (code, English name, Ukrainian name) combinations after
# the name fill. No ORDER BY here: sorting cannot change a row count.
(
    exchange_rate_extracted_df_m
    .select("currency_code", "currency_name", "currency_name_ua")
    .distinct()
    .count()
)

# Result at the time of the last run: 165

165

In [63]:
# List the distinct combinations after the name fill, sorted by Ukrainian name,
# to see which codes still have no English name.
(
    exchange_rate_extracted_df_m
    .select("currency_code", "currency_name", "currency_name_ua")
    .distinct()
    .order_by("currency_name_ua")
    .show(n=170)
)

--------------------------------------------------------------------------------------
|"CURRENCY_CODE"  |"CURRENCY_NAME"              |"CURRENCY_NAME_UA"                  |
--------------------------------------------------------------------------------------
|DEM              |NULL                         |Hімецькі маpки                      |
|EUR              |Euro                         |Євро                                |
|EGP              |Egyptian Pound               |Єгипетський фунт                    |
|JPY              |Yen                          |Єна                                 |
|INR              |Indian Rupee                 |Індійська рупія                     |
|IQD              |Iraqi Dinar                  |Іракський динар                     |
|IRR              |Iranian Rial                 |Іранський ріал                      |
|IEP              |NULL                         |Ірландський фунт                    |
|IEP              |NULL                    

In [64]:
# Load the ISO 4217 currency reference table and preview 5 rows.
iso_4217_currency_codes_df = session.table(
    "nbu_exchange.silver.iso_4217_currencies"
)
iso_4217_currency_codes_df.show(n=5)

-------------------------------------------------------------------------------------------------------------------------
|"COUNTRY_NAME"       |"CURRENCY_NAME"        |"CURRENCY_CODE"  |"CURRENCY_NUMBER"  |"MINOR_UNITS"  |"WITHDRAWAL_DATE"  |
-------------------------------------------------------------------------------------------------------------------------
|ANTARCTICA           |No universal currency  |NULL             |NULL               |NULL           |NULL               |
|ANTIGUA AND BARBUDA  |East Caribbean Dollar  |XCD              |951                |2              |NULL               |
|AUSTRIA              |Euro                   |EUR              |978                |2              |NULL               |
|COOK ISLANDS (THE)   |New Zealand Dollar     |NZD              |554                |2              |NULL               |
|MALTA                |Euro                   |EUR              |978                |2              |NULL               |
------------------------

In [65]:
# Keep only the ISO 4217 columns used by the join below. The currency name and
# code get an "iso_" prefix so they do not clash with the exchange-rate columns
# of the same name.
iso_4217_currency_codes_df_m = iso_4217_currency_codes_df.select(
    [
        col("country_name"),
        col("currency_name").alias("iso_currency_name"),
        col("currency_code").alias("iso_currency_code"),
        col("withdrawal_date"),
    ]
)

In [66]:
# Enrich the exchange-rate rows with ISO 4217 attributes (country name and
# withdrawal date) through a left join on the normalised currency code, then
# use the ISO currency name as a fallback where currency_name is still NULL.
# The helper column iso_currency_name is dropped afterwards.
# NOTE(review): the ISO table can hold several rows for one currency code (its
# preview shows EUR for both AUSTRIA and MALTA), so this join presumably
# repeats such exchange-rate rows once per matching country. Confirm whether
# that is intended before deduplicating or persisting the result.
exchange_rate_extracted_df_u = (
    exchange_rate_extracted_df_m
    .join(
        iso_4217_currency_codes_df_m,
        trim(upper(exchange_rate_extracted_df_m["currency_code"]))
        == iso_4217_currency_codes_df_m["iso_currency_code"],
        "left",
    )
    .select(
        exchange_rate_extracted_df_m["*"],
        iso_4217_currency_codes_df_m["country_name"],
        iso_4217_currency_codes_df_m["iso_currency_name"],
        iso_4217_currency_codes_df_m["withdrawal_date"],
    )
    .with_column(
        "currency_name",
        coalesce(col("currency_name"), col("iso_currency_name")),
    )
    .drop("iso_currency_name")
)

In [67]:
# Preview the ISO-enriched frame.
exchange_rate_extracted_df_u.show(n=5)

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"CURRENCY_CODE"  |"GROUP_NUMBER"  |"R030_CODE"  |"RATE"    |"RATE_PER_UNIT"  |"SPECIAL_CONDITIONS"  |"CURRENCY_NAME_UA"  |"UNITS"  |"CALCULATION_DATE"  |"EXCHANGE_DATE"  |"COUNTRY_NAME"             |"WITHDRAWAL_DATE"  |"CURRENCY_NAME"  |
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|KZT              |1               |398          |11.1296   |0.111296         |NULL                  |Теньге              |100      |1900-01-01          |2015-05-22       |KAZAKHSTAN                 |NULL               |Tenge            |
|MDL              |1               |498     

In [68]:
# Which combinations still lack an English name after the ISO fallback?
(
    exchange_rate_extracted_df_u
    .select("currency_code", "currency_name", "currency_name_ua")
    .distinct()
    .filter(col("currency_name").is_null())
    .order_by(col("currency_name_ua").desc())
    .show(n=10)
)

--------------------------------------------------------------------
|"CURRENCY_CODE"  |"CURRENCY_NAME"  |"CURRENCY_NAME_UA"            |
--------------------------------------------------------------------
|USM              |NULL             |Узбецький сум                 |
|SDR              |NULL             |Спецiальнi права запозичення  |
|___              |NULL             |Долари США по розр. з Індією  |
|WDR              |NULL             |Вiрменський драм              |
--------------------------------------------------------------------



In [69]:
# Check whether the codes that are still unnamed (USM, SDR, WDR) exist in the
# ISO 4217 table at all.
(
    iso_4217_currency_codes_df
    .filter(
        (col("currency_code") == "USM")
        | (col("currency_code") == "SDR")
        | (col("currency_code") == "WDR")
    )
    .show(n=10)
)

--------------------------------------------------------------------------------------------------------------
|"COUNTRY_NAME"  |"CURRENCY_NAME"  |"CURRENCY_CODE"  |"CURRENCY_NUMBER"  |"MINOR_UNITS"  |"WITHDRAWAL_DATE"  |
--------------------------------------------------------------------------------------------------------------
|                |                 |                 |                   |               |                   |
--------------------------------------------------------------------------------------------------------------



In [70]:
# Number of exchange-rate rows that carry the USM code.
exchange_rate_extracted_df.filter(col("currency_code") == "USM").count()

664

In [71]:
# Show every code/name combination whose Ukrainian name is the Uzbek sum.
(
    exchange_rate_extracted_df_u
    .select("currency_code", "currency_name", "currency_name_ua")
    .distinct()
    .filter(col("currency_name_ua").like("%Узбецький сум%"))
    .show(n=10)
)

# Finding: USM is not a valid code for the Uzbekistan Som.

----------------------------------------------------------
|"CURRENCY_CODE"  |"CURRENCY_NAME"  |"CURRENCY_NAME_UA"  |
----------------------------------------------------------
|UZS              |Uzbekistan Som   |Узбецький сум       |
|USM              |NULL             |Узбецький сум       |
|UZS              |Uzbekistan Sum   |Узбецький сум       |
----------------------------------------------------------



In [72]:
# Look up the ISO 4217 entry for UZS.
iso_4217_currency_codes_df.filter(col("currency_code") == "UZS").show(n=10)

--------------------------------------------------------------------------------------------------------------
|"COUNTRY_NAME"  |"CURRENCY_NAME"  |"CURRENCY_CODE"  |"CURRENCY_NUMBER"  |"MINOR_UNITS"  |"WITHDRAWAL_DATE"  |
--------------------------------------------------------------------------------------------------------------
|UZBEKISTAN      |Uzbekistan Sum   |UZS              |860                |2              |NULL               |
--------------------------------------------------------------------------------------------------------------



In [None]:
# Normalise the Uzbek currency (before: USM | NULL | Узбецький сум):
#   1. the code USM is rewritten to UZS;
#   2. every UZS row, including the rewritten ones, gets the single name
#      "Uzbekistan Sum", whether its name was NULL, different or already equal.
# NOTE(review): country_name / withdrawal_date were joined in before this
# remap, so former USM rows presumably still have them NULL. Confirm.
exchange_rate_extracted_df_uzs = (
    exchange_rate_extracted_df_u
    .with_column(
        "currency_code",
        when(col("currency_code") == "USM", "UZS").otherwise(col("currency_code")),
    )
    .with_column(
        "currency_name",
        when(col("currency_code") == "UZS", "Uzbekistan Sum").otherwise(col("currency_name")),
    )
)

In [87]:
# Verify the remap: UZS should now appear with a single English name.
(
    exchange_rate_extracted_df_uzs
    .select("currency_code", "currency_name", "currency_name_ua")
    .distinct()
    .filter(col("currency_code") == "UZS")
    .show(n=5)
)

----------------------------------------------------------
|"CURRENCY_CODE"  |"CURRENCY_NAME"  |"CURRENCY_NAME_UA"  |
----------------------------------------------------------
|UZS              |Uzbekistan Sum   |Узбецький сум       |
----------------------------------------------------------



In [90]:
# Armenian dram (before: WDR | NULL | Вiрменський драм).
# 1. Show which codes and names occur for that Ukrainian name.
(
    exchange_rate_extracted_df_uzs
    .select("currency_code", "currency_name", "currency_name_ua")
    .distinct()
    .filter(col("currency_name_ua").like("%Вiрменський драм%"))
    .show(n=5)
)

# 2. Look up the ISO 4217 entry for AMD.
iso_4217_currency_codes_df.filter(col("currency_code") == "AMD").show(n=10)

# 3. Rewrite WDR to AMD, then name any AMD row that still has no English name.
exchange_rate_extracted_df_amd = (
    exchange_rate_extracted_df_uzs
    .with_column(
        "currency_code",
        when(col("currency_code") == "WDR", "AMD").otherwise(col("currency_code")),
    )
    .with_column(
        "currency_name",
        when(
            (col("currency_code") == "AMD") & col("currency_name").is_null(),
            "Armenian Dram",
        ).otherwise(col("currency_name")),
    )
)

# 4. Verify the result, including the r030 codes now attached to AMD.
(
    exchange_rate_extracted_df_amd
    .select("currency_code", "currency_name", "currency_name_ua", "r030_code")
    .distinct()
    .filter(col("currency_code") == "AMD")
    .show(n=5)
)

----------------------------------------------------------
|"CURRENCY_CODE"  |"CURRENCY_NAME"  |"CURRENCY_NAME_UA"  |
----------------------------------------------------------
|WDR              |NULL             |Вiрменський драм    |
|AMD              |Armenian Dram    |Вiрменський драм    |
----------------------------------------------------------

--------------------------------------------------------------------------------------------------------------
|"COUNTRY_NAME"  |"CURRENCY_NAME"  |"CURRENCY_CODE"  |"CURRENCY_NUMBER"  |"MINOR_UNITS"  |"WITHDRAWAL_DATE"  |
--------------------------------------------------------------------------------------------------------------
|ARMENIA         |Armenian Dram    |AMD              |51                 |2              |NULL               |
--------------------------------------------------------------------------------------------------------------

------------------------------------------------------------------------
|"CURRENCY_CODE"

In [93]:
# Special Drawing Rights (before: SDR | NULL | Спецiальнi права запозичення).
# 1. Show every row variant with code SDR, code XDR or r030_code 960.
(
    exchange_rate_extracted_df_amd
    .select("currency_code", "currency_name", "currency_name_ua", "r030_code")
    .distinct()
    .filter(
        (col("currency_code") == "SDR")
        | (col("currency_code") == "XDR")
        | (col("r030_code") == 960)
    )
    .show(n=20)
)

# 2. Look up SDR / XDR in the ISO 4217 table.
(
    iso_4217_currency_codes_df
    .filter((col("currency_code") == "SDR") | (col("currency_code") == "XDR"))
    .show(n=10)
)

# 3. Rewrite SDR to XDR, name XDR rows that have no English name, and give all
#    XDR rows one Ukrainian name instead of the several spellings in the data.
exchange_rate_extracted_df_xdr = (
    exchange_rate_extracted_df_amd
    .with_column(
        "currency_code",
        when(col("currency_code") == "SDR", "XDR").otherwise(col("currency_code")),
    )
    .with_column(
        "currency_name",
        when(
            (col("currency_code") == "XDR") & col("currency_name").is_null(),
            "SDR (Special Drawing Right)",
        ).otherwise(col("currency_name")),
    )
    .with_column(
        "currency_name_ua",
        when(
            col("currency_code") == "XDR",
            "СПЗ (спеціальні права запозичення)",
        ).otherwise(col("currency_name_ua")),
    )
)

# 4. Verify the result for XDR.
(
    exchange_rate_extracted_df_xdr
    .select("currency_code", "currency_name", "currency_name_ua", "r030_code")
    .distinct()
    .filter(col("currency_code") == "XDR")
    .show(n=5)
)


----------------------------------------------------------------------------------------------------
|"CURRENCY_CODE"  |"CURRENCY_NAME"              |"CURRENCY_NAME_UA"                  |"R030_CODE"  |
----------------------------------------------------------------------------------------------------
|SDR              |NULL                         |Спецiальнi права запозичення        |12           |
|XDR              |SDR (Special Drawing Right)  |Спецiальнi права запозичення        |960          |
|XDR              |SDR (Special Drawing Right)  |СПЗ (спеціальні права запозичення)  |960          |
|XDR              |SDR (Special Drawing Right)  |СПЗ(спеціальні права запозичення)   |960          |
|XDR              |SDR (Special Drawing Right)  |СПЗ(спец.права запозичення)         |960          |
----------------------------------------------------------------------------------------------------

------------------------------------------------------------------------------------------

In [94]:
# Which combinations still lack an English name after the manual remaps?
(
    exchange_rate_extracted_df_xdr
    .select("currency_code", "currency_name", "currency_name_ua")
    .distinct()
    .filter(col("currency_name").is_null())
    .order_by(col("currency_name_ua").desc())
    .show(n=10)
)

--------------------------------------------------------------------
|"CURRENCY_CODE"  |"CURRENCY_NAME"  |"CURRENCY_NAME_UA"            |
--------------------------------------------------------------------
|___              |NULL             |Долари США по розр. з Індією  |
--------------------------------------------------------------------



In [None]:
# Reference table name kept from the original notes: nbu_exchange.bronze.r030_csv_raw

# Find (code, English name, Ukrainian name) combinations that map to more than
# one distinct r030_code. TODO: create a separate DataFrame for the currency
# codes that have such duplications.
# NOTE(review): this cell was previously annotated as returning 0 rows, but the
# saved output lists several codes with a count of 2. Confirm which is expected.
(
    exchange_rate_extracted_df_xdr
    .select("currency_code", "currency_name", "currency_name_ua", "r030_code")
    .distinct()
    .group_by("currency_code", "currency_name", "currency_name_ua")
    .count()
    .filter(col("count") > 1)
    .order_by(col("currency_code"))
    .show(n=100)
)


---------------------------------------------------------------------------------------------------
|"CURRENCY_CODE"  |"CURRENCY_NAME"                 |"CURRENCY_NAME_UA"                  |"COUNT"  |
---------------------------------------------------------------------------------------------------
|AMD              |Armenian Dram                   |Вiрменський драм                    |2        |
|AUD              |Australian Dollar               |Австралійський долар                |2        |
|AZM              |Azerbaijanian Manat             |Азербайджанський манат              |2        |
|BEF              |Belgian Franc                   |Бельгійський франк                  |2        |
|BYR              |Belarusian Ruble                |Бiлоруський рубль                   |2        |
|CAD              |Canadian Dollar                 |Канадський долар                    |2        |
|CHF              |Swiss Franc                     |Швейцарський франк                  |2        |


"select distinct currency_code, currency_name, currency_name_ua, r030_code\nfrom nbu_exchange.silver.exchange_rate_extracted\nwhere currency_code = 'SDR' or currency_code = 'XDR' or r030_code = 960\n;\n"

In [None]:
# Release the Snowflake session opened at the top of the notebook so a full
# "Run All" does not leave a connection open. Re-run the connection cell to
# continue working interactively after this point.
session.close()
