# DKD kode eksempel notebook
Denne notebooken brukes til å teste koden for DKD når man gjør eventuelle endringer lokalt.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from datetime import date

from src.skatt_dqv import dataQuality
from src.skatt_dqv.expectations import *

Lager testdata.

In [2]:
# Reuse the active Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Sample sales rows in the order (id, product, date, amount, sale_price).
# Many rows are deliberately flawed so the data quality checks have something to report.
data = [
    (1, "Widget", date(2023, 5, 1), 10, 19.99),
    (2, "Gadget", date(2023, 5, 2), 5, 29.99),
    (3, "Widget", date(2023, 5, 3), 8, None),  # no sale_price
    (4, "Widget", None, 10, 19.99),  # no date
    (5, "Gadget", date(2023, 5, 5), None, 29.99),  # no amount
    (6, "Widget", date(2023, 5, 6), 10000, 19.99),  # unusually large amount
    (7, "Thingy", date(2023, 5, 7), 2, 9999.99),  # unusually large sale_price
    (8, None, date(2023, 5, 8), 3, 15.50),  # no product
    (9, "Gadget", date(2023, 5, 9), 7, 29.99),
    (10, "Widget", date(2023, 5, 10), 5, 0.0),  # sale_price of zero
    (11, "Thingy", date(2023, 5, 11), 0, 10.99),  # amount of zero
    (12, "Gadget", date(2023, 5, 12), 3, -5.00),  # negative sale_price
    (13, "Widget", date(2023, 5, 13), -2, 19.99),  # negative amount
    (14, "Gadget", date(2023, 5, 14), 4, 29.99),
    (15, "Thingy", date(2023, 5, 15), 3, 15.50),
    (16, "Widget", date(2023, 5, 16), 5, 0.0),  # sale_price of zero
    (17, "Widget", date(2023, 5, 17), 0, 19.99),  # amount of zero
    (18, "Gadget", date(2023, 5, 18), 2, 29.99),
    (19, "Widget", date(2023, 5, 1), 10, 19.99),  # same values as row 1 apart from id
    (20, "Widget", date(2023, 5, 19), 1, 100000.00),  # unusually large sale_price
]

# Explicit schema for the sample rows. Every column is declared nullable (third
# StructField argument is True) so the rows containing None can be loaded.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("id", IntegerType()),
        ("product", StringType()),
        ("date", DateType()),
        ("amount", IntegerType()),
        ("sale_price", FloatType()),
    )
])

df = spark.createDataFrame(data, schema)
# truncate=False prints full cell values instead of shortening long ones.
df.show(truncate=False)


25/08/06 13:46:26 WARN Utils: Your hostname, U2204i-0352 resolves to a loopback address: 127.0.1.1; using 10.207.73.55 instead (on interface ens160)
25/08/06 13:46:26 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/06 13:46:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

+---+-------+----------+------+----------+
|id |product|date      |amount|sale_price|
+---+-------+----------+------+----------+
|1  |Widget |2023-05-01|10    |19.99     |
|2  |Gadget |2023-05-02|5     |29.99     |
|3  |Widget |2023-05-03|8     |NULL      |
|4  |Widget |NULL      |10    |19.99     |
|5  |Gadget |2023-05-05|NULL  |29.99     |
|6  |Widget |2023-05-06|10000 |19.99     |
|7  |Thingy |2023-05-07|2     |9999.99   |
|8  |NULL   |2023-05-08|3     |15.5      |
|9  |Gadget |2023-05-09|7     |29.99     |
|10 |Widget |2023-05-10|5     |0.0       |
|11 |Thingy |2023-05-11|0     |10.99     |
|12 |Gadget |2023-05-12|3     |-5.0      |
|13 |Widget |2023-05-13|-2    |19.99     |
|14 |Gadget |2023-05-14|4     |29.99     |
|15 |Thingy |2023-05-15|3     |15.5      |
|16 |Widget |2023-05-16|5     |0.0       |
|17 |Widget |2023-05-17|0     |19.99     |
|18 |Gadget |2023-05-18|2     |29.99     |
|19 |Widget |2023-05-01|10    |19.99     |
|20 |Widget |2023-05-19|1     |100000.0  |
+---+-------+----------+------+----------+

Lager forventninger til datasettet.

In [3]:
# Expectation F1: built on the value_not_null test and filed under the
# "Fullstendighet" quality dimension.
product_not_null = dataQuality.expectation(
    expectation_name="Product not null",
    expectation_test=value_not_null,
    expectation_id="F1",
    expectation_description="Product need to be id to get correct info from reference table.",
    quality_dimension="Fullstendighet",
    # The remaining parameters are not needed for this test. They are listed
    # only to show everything that can go into an expectation.
    regex=None,
    sql_filter=None,
    args=None,
    path=None,
    value_set=None,
)

# Expectation F2: built on the value_is_less_then test with args=1 and filed
# under the "Gyldighet" quality dimension.
# NOTE(review): the name "Sales less zero" and the description
# "Product sales are above zero." read as opposites - confirm which wording is intended.
sales_above_zero = dataQuality.expectation(
    expectation_name="Sales less zero",
    expectation_test=value_is_less_then,
    expectation_id="F2",
    expectation_description="Product sales are above zero.",
    quality_dimension="Gyldighet",
    # The remaining parameters are not needed for this test. They are listed
    # only to show everything that can go into an expectation.
    regex=None,
    sql_filter=None,
    # presumably the threshold handed to value_is_less_then - confirm against its signature
    args=1,
    path=None,
    value_set=None,
)

Kjører datakvalitetstest

In [4]:
# Each column name maps to the list of expectations to evaluate for that column.
expectations_by_column = {
    "product": [product_not_null],
    "amount": [sales_above_zero],
}

test_dkd = dataQuality.data_quality_test(
    spark_df=df,
    column_expectations=expectations_by_column,
    # In the recorded run these columns appear in the row-level result next to
    # each reported expectation id.
    uuid_columns=["id", "date"],
)

# run_validation() returns three results, unpacked in the order they are inspected below.
df_rows, df_agg, df_ref = test_dkd.run_validation()

In [5]:
# First result from run_validation(), printed as a text table. In the recorded
# run it has the columns id, date, forventning_id and kolonne.
df_rows.show()

+---+----------+--------------+-------+
| id|      date|forventning_id|kolonne|
+---+----------+--------------+-------+
|  8|2023-05-08|            F1|product|
| 11|2023-05-11|            F2| amount|
| 13|2023-05-13|            F2| amount|
| 17|2023-05-17|            F2| amount|
+---+----------+--------------+-------+



In [6]:
# Second result from run_validation(), shown as the cell output. In the recorded
# run it has one row per expectation with a `resultat` value and a timestamp.
df_agg

Unnamed: 0,forventning_id,kolonne,resultat,dimensjon,testkjoering_tidspunkt
0,F1,product,0.95,Fullstendighet,2025-08-06 13:46:37
1,F2,amount,0.85,Gyldighet,2025-08-06 13:46:37


In [7]:
# Third result from run_validation(), shown as the cell output. In the recorded
# run it lists each expectation's id, column, dimension and description.
df_ref

Unnamed: 0,forventning_id,kolonne,dimensjon,forventning
0,F1,product,Fullstendighet,Product need to be id to get correct info from...
0,F2,amount,Gyldighet,Product sales are above zero.
