In [None]:
# Ad-hoc profiling of the raw "assets" CSV files with a Cerberus validator built
# from the project's asset schema; prints two counts per file.
# NOTE(review): RAW_BUCKET is not defined in this cell. The notebook assigns it in
# the parameters cell further down ("Impostazione Parametri di Test"), so on a
# fresh kernel this cell raises NameError unless that cell has been run first.
from src.aut_etl_pipeline.utils.bronze_profile_funcs import get_csv_files, profile_data
from src.aut_etl_pipeline.utils.validation_rules import asset_schema
from cerberus import Validator

# A single validator instance is reused for every file in the loop below.
validator = Validator(asset_schema())
# NOTE(review): "your_dl_code" in the prefix is a placeholder — replace it with a real deal code.
files = get_csv_files(RAW_BUCKET, "dl_data/downloaded-data/AUT/your_dl_code", "Loan_Data", "assets")
for f in files:
    # profile_data returns two sized collections; presumably the records that
    # passed and failed validation — confirm in bronze_profile_funcs.
    clean, dirty = profile_data(RAW_BUCKET, f, "assets", validator)
    print(f"Clean: {len(clean)}, Dirty: {len(dirty)}")

# Guida Step-by-Step: Test della Pipeline ESMA Auto Loans in un Notebook

Questa guida ti permette di testare i **moduli Spark** della pipeline ESMA Auto Loans direttamente da un notebook (Colab, Dataproc, Jupyter).  
L’obiettivo è simulare una run di pipeline su dati reali leggendo e scrivendo da GCS, senza passare da Airflow.

---

## **1. Prerequisiti**

- Python ≥ 3.8
- Librerie: `pyspark`, `google-cloud-storage`, `pandas`, `cerberus`, `delta-spark`
- Service account GCP con permessi su bucket GCS
- File di input caricati su GCS (ad esempio, CSV in `dl_data/downloaded-data/AUT/<dl_code>/`)

---

## **2. Setup Ambiente**

In [None]:
# Installazione delle dipendenze (Colab/Dataproc)
!pip install pyspark==3.3.1 delta-spark==2.1.0 google-cloud-storage cerberus pandas

import os
from google.colab import auth
auth.authenticate_user()

**Oppure:**  
Se non usi Colab, assicurati che la variabile di ambiente `GOOGLE_APPLICATION_CREDENTIALS` punti al file JSON del service account.

In [None]:
import os

# Point the Google client libraries at a service-account key file.
# The path below is a placeholder and must be replaced with a real one.
os.environ.update({"GOOGLE_APPLICATION_CREDENTIALS": "/path/to/your/service-account.json"})

## **3. Setup SparkSession con Delta e GCS**

In [None]:
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession

# Spark session configured for Delta Lake tables stored on GCS.
builder = (
    SparkSession.builder.appName("esma_test")
    # Enable Delta SQL commands and the Delta-aware catalog.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # Maven coordinates have the form group:artifact_<scala>:<version>; the previous
    # value ("io.delta:delta-core_2.1.0") had neither a Scala suffix nor a version.
    # NOTE(review): configure_spark_with_delta_pip is expected to set this property
    # again from the installed delta-spark version — confirm, then consider dropping it.
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.1.0")
    # Log store implementation used for gs:// paths.
    .config("spark.delta.logStore.gs.impl", "io.delta.storage.GCSLogStore")
    # Read and write Parquet dates/timestamps as-is, without calendar rebasing.
    .config("spark.sql.parquet.datetimeRebaseModeInRead", "CORRECTED")
    .config("spark.sql.parquet.datetimeRebaseModeInWrite", "CORRECTED")
)

spark = configure_spark_with_delta_pip(builder).getOrCreate()

## **4. Impostazione Parametri di Test**

In [None]:
# Replace the placeholders below with your real test values.

# GCP project and buckets
PROJECT_ID = "your_project_id"
RAW_BUCKET, DATA_BUCKET = "your_raw_bucket", "your_data_bucket"

# Deal under test and ingestion date used for this run
DL_CODE = "test_deal_code"  # e.g. 'AUT1234'
INGESTION_DATE = "2025-05-31"

# Input location (per deal) and key used to select the loan-level files
FILE_KEY = "Loan_Data"
SOURCE_PREFIX = "dl_data/downloaded-data/AUT/" + DL_CODE

# Output prefixes for the bronze and silver layers
TARGET_BRONZE_PREFIX = "AUT/bronze/assets"
TARGET_SILVER_PREFIX = "AUT/silver/assets"

## **5. Test Profilazione Bronze**

In [None]:
from src.aut_etl_pipeline.profile_bronze_data import profile_bronze_data

# Run the bronze profiling step for the "assets" data type: it generates the
# clean_dump/ and dirty_dump/ files on GCS.
# All arguments come from the parameters cell (RAW_BUCKET, DATA_BUCKET,
# SOURCE_PREFIX, FILE_KEY, INGESTION_DATE), which must have been run first.
profile_bronze_data(
    raw_bucketname=RAW_BUCKET,
    data_bucketname=DATA_BUCKET,
    source_prefix=SOURCE_PREFIX,
    file_key=FILE_KEY,
    data_type="assets",
    ingestion_date=INGESTION_DATE
)

**Controlla su GCS:**  
Dovresti trovare i file CSV in `gs://<DATA_BUCKET>/clean_dump/assets/` e in `gs://<DATA_BUCKET>/dirty_dump/assets/`.

## **6. Test Bronze Table**

In [None]:
from src.aut_etl_pipeline.generate_bronze_tables import generate_bronze_tables

# Build the bronze tables for the "assets" data type using the shared Spark session.
# The output goes under TARGET_BRONZE_PREFIX in DATA_BUCKET; according to the
# note following this cell, the result is expected to be Delta tables.
# Requires `spark` and the constants from the parameters cell.
generate_bronze_tables(
    spark=spark,
    data_bucketname=DATA_BUCKET,
    source_prefix=SOURCE_PREFIX,
    target_prefix=TARGET_BRONZE_PREFIX,
    data_type="assets",
    ingestion_date=INGESTION_DATE
)

**Controlla su GCS:**  
Dovresti trovare le tabelle Delta in `gs://<DATA_BUCKET>/AUT/bronze/assets/`.

## **7. Test Silver Table**

In [None]:
from src.aut_etl_pipeline.generate_asset_silver import generate_asset_silver

# Build the silver asset tables for one deal (DL_CODE).
# The bronze target prefix is passed as the source here, so the bronze step
# above must have been run first; output goes under TARGET_SILVER_PREFIX.
generate_asset_silver(
    spark=spark,
    bucket_name=DATA_BUCKET,
    source_prefix=TARGET_BRONZE_PREFIX,
    target_prefix=TARGET_SILVER_PREFIX,
    dl_code=DL_CODE,
    ingestion_date=INGESTION_DATE
)

**Controlla su GCS:**  
Dovresti trovare i file Parquet in `gs://<DATA_BUCKET>/AUT/silver/assets/lease_info_table/`.

## **8. Lettura e Verifica Output**

In [None]:
# Read the silver table back to verify the result.
# The path is built from TARGET_SILVER_PREFIX (parameters cell) rather than a
# hard-coded "AUT/silver/assets", so it always matches where the silver step wrote.
silver_table_path = f"gs://{DATA_BUCKET}/{TARGET_SILVER_PREFIX}/lease_info_table/"
df = spark.read.parquet(silver_table_path)
df.show(5)
df.printSchema()

## **9. (Opzionale) Test Deal Details**

Ripeti gli stessi passaggi (profilazione, bronze, silver) cambiando `data_type`, prefissi e funzioni (usa `generate_deal_details_bronze` e `generate_deal_details_silver`).

## **10. Debug**

- Se ricevi errori, stampa i log, controlla i permessi GCS e la presenza dei file di input.
- Se hai errori di dipendenze, assicurati che tutte le versioni siano compatibili.
- Puoi modificare i parametri o lavorare su un solo DL_CODE per debug più veloce.

## **11. Pulizia (Cleanup)**

Se vuoi rimuovere i file di test dai bucket:

In [None]:
from google.cloud import storage

client = storage.Client(project=PROJECT_ID)
bucket = client.get_bucket(DATA_BUCKET)

# Delete only the objects under the prefixes this notebook wrote to. Listing the
# whole "AUT/" prefix would also remove every other table stored in the bucket.
# The trailing "/" keeps sibling prefixes (e.g. "AUT/bronze/assets_v2") out of scope.
# NOTE(review): if you also ran the optional deal-details step, add its prefixes here.
for test_prefix in (TARGET_BRONZE_PREFIX, TARGET_SILVER_PREFIX):
    for blob in bucket.list_blobs(prefix=f"{test_prefix.rstrip('/')}/"):
        print("Deleting", blob.name)
        blob.delete()

## **Simulation**

In [None]:
import os

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# ---- 1. Spark environment ----
spark = SparkSession.builder.appName("esma_local_test").getOrCreate()

# ---- 2. Read a local CSV (or Excel) file ----

# CSV example
# NOTE(review): absolute, machine-specific path — adjust it to your checkout.
csv_path = "/Users/hp2/Deeploans/deeploans/ETL-Pipelines/ESMA-Loan-level-data-templates/Auto_Loans_ESMA/Auto Loans Synthetic Dataset.csv"
df = spark.read.csv(csv_path, header=True, inferSchema=True)

# Excel example (requires pandas + openpyxl)
# excel_path = "./dati/Loan_Data.xlsx"
# pdf = pd.read_excel(excel_path)
# df = spark.createDataFrame(pdf)

print("Prime righe del dataframe letto:")
df.show(5)
df.printSchema()

# ---- 3. Simple data profiling ----

print("Numero totale record:", df.count())
print("Conteggio valori nulli per colonna:")
if df.columns:
    # Count the nulls of every column in ONE aggregation instead of launching a
    # separate Spark job per column. count() ignores the NULLs produced by
    # when() for non-matching rows, so each aggregate is that column's null count.
    # Positional aliases avoid any clash between unusual column names.
    null_counts = df.agg(
        *[
            F.count(F.when(df[name].isNull(), 1)).alias(f"nulls_{idx}")
            for idx, name in enumerate(df.columns)
        ]
    ).first()
    for name, null_count in zip(df.columns, null_counts):
        print(f"{name}: {null_count}")

# ---- 4. (Optional) Write the resulting data to disk ----

output_path = "./output/bronze_table"
os.makedirs(output_path, exist_ok=True)
df.write.mode("overwrite").parquet(output_path)
print(f"Dati scritti in formato parquet su {output_path}")

# ---- 5. Read the parquet back to verify ----

df2 = spark.read.parquet(output_path)
print("Prime righe dal parquet scritto:")
df2.show(5)

Prime righe del dataframe letto:
+----------+--------------------+--------------------+--------------+--------------+--------------+--------------+--------------+
|FIELD CODE|          FIELD NAME|   CONTENT TO REPORT|        LOAN 1|        LOAN 2|        LOAN 3|        LOAN 4|        LOAN 5|
+----------+--------------------+--------------------+--------------+--------------+--------------+--------------+--------------+
|     AUTL1|   Unique Identifier|The unique identi...|AUT-2024-001-A|AUT-2024-002-B|AUT-2024-003-C|AUT-2024-004-D|AUT-2024-005-E|
|     AUTL2|Original Underlyi...|Unique underlying...|  ORIG-EXP-001|  ORIG-EXP-002|  ORIG-EXP-003|  ORIG-EXP-004|  ORIG-EXP-005|
|     AUTL3|New Underlying Ex...|New identifier if...|  ORIG-EXP-001|  ORIG-EXP-002|  ORIG-EXP-003|  ORIG-EXP-004|  ORIG-EXP-005|
|     AUTL4|Original Obligor ...|Original unique o...| OBLG-2024-001| OBLG-2024-002| OBLG-2024-003| OBLG-2024-004| OBLG-2024-005|
|     AUTL5|New Obligor Ident...|New identifier if...| OB

In [13]:
import os

import pandas as pd
from pyspark.sql import SparkSession

# 1. Local Spark session
spark = SparkSession.builder.appName("esma_local_test").getOrCreate()

# 2. Candidate input files: the CSV is tried first, the Excel workbook is the fallback
csv_path = "/Users/hp2/Deeploans/deeploans/ETL-Pipelines/ESMA-Loan-level-data-templates/Auto_Loans_ESMA/Auto Loans Synthetic Dataset.csv"
excel_path = "/Users/hp2/Deeploans/deeploans/ETL-Pipelines/ESMA-Loan-level-data-templates/annex5_underlying_exposures-automobile.xlsx"

# 3. Load whichever file exists; fail early when neither does
csv_found = os.path.exists(csv_path)
if not csv_found and not os.path.exists(excel_path):
    raise FileNotFoundError("Nessun file CSV o Excel trovato!")

if csv_found:
    df = spark.read.options(header=True, inferSchema=True).csv(csv_path)
else:
    # Excel is read through pandas and then converted to a Spark DataFrame
    pdf = pd.read_excel(excel_path)
    df = spark.createDataFrame(pdf)

print("Prime righe del dataframe letto:")
df.show(5)
df.printSchema()

# 4. Minimal cleaning: drop the rows in which every value is null
df_clean = df.dropna(how="all")
print("Dopo pulizia righe totalmente nulle:")
df_clean.show(5)

# 5. Persist the cleaned rows as parquet
output_path = "./output/bronze_table"
os.makedirs(output_path, exist_ok=True)
df_clean.write.mode("overwrite").parquet(output_path)
print(f"Dati puliti scritti in {output_path}")

# 6. Read the parquet back and show it
df_verifica = spark.read.parquet(output_path)
print("Verifica parquet:")
df_verifica.show(5)

Prime righe del dataframe letto:
+----------+--------------------+--------------------+--------------+--------------+--------------+--------------+--------------+
|FIELD CODE|          FIELD NAME|   CONTENT TO REPORT|        LOAN 1|        LOAN 2|        LOAN 3|        LOAN 4|        LOAN 5|
+----------+--------------------+--------------------+--------------+--------------+--------------+--------------+--------------+
|     AUTL1|   Unique Identifier|The unique identi...|AUT-2024-001-A|AUT-2024-002-B|AUT-2024-003-C|AUT-2024-004-D|AUT-2024-005-E|
|     AUTL2|Original Underlyi...|Unique underlying...|  ORIG-EXP-001|  ORIG-EXP-002|  ORIG-EXP-003|  ORIG-EXP-004|  ORIG-EXP-005|
|     AUTL3|New Underlying Ex...|New identifier if...|  ORIG-EXP-001|  ORIG-EXP-002|  ORIG-EXP-003|  ORIG-EXP-004|  ORIG-EXP-005|
|     AUTL4|Original Obligor ...|Original unique o...| OBLG-2024-001| OBLG-2024-002| OBLG-2024-003| OBLG-2024-004| OBLG-2024-005|
|     AUTL5|New Obligor Ident...|New identifier if...| OB

In [14]:
import os
from functools import reduce

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import length, lit

# --------- 1. Local Spark session ---------
spark = SparkSession.builder.appName("esma_local_test").getOrCreate()

# --------- 2. Choose the file to test ---------
# The CSV is tried first; the Excel workbook is the fallback.
csv_path = "/Users/hp2/Deeploans/deeploans/ETL-Pipelines/ESMA-Loan-level-data-templates/Auto_Loans_ESMA/Auto Loans Synthetic Dataset.csv"
excel_path = "/Users/hp2/Deeploans/deeploans/ETL-Pipelines/ESMA-Loan-level-data-templates/annex5_underlying_exposures-automobile.xlsx"

# --------- 3. Read the file ---------
if os.path.exists(csv_path):
    df = spark.read.csv(csv_path, header=True, inferSchema=True)
elif os.path.exists(excel_path):
    pdf = pd.read_excel(excel_path)
    df = spark.createDataFrame(pdf)
else:
    raise FileNotFoundError("Nessun file CSV o Excel trovato!")

print("Prime righe del dataframe letto:")
df.show(5)
df.printSchema()

# --------- 4. Simulate the clean/dirty dump ---------
# A row is "clean" when none of the mandatory columns is null.
# The mandatory columns are picked by name prefix here (adapt to the real schema).
obbligatorie = [col for col in df.columns if col.lower().startswith("loan") or col.lower().startswith("asset")]

if not obbligatorie:
    # No column matches the filter: fall back to the first two (test only!)
    obbligatorie = df.columns[:2]

print(f"Colonne obbligatorie per la validazione: {obbligatorie}")

# Single predicate: every mandatory column is non-null. isNotNull() never yields
# NULL, so the negation below selects exactly the complementary rows.
is_clean = reduce(
    lambda acc, name: acc & df[name].isNotNull(),
    obbligatorie,
    lit(True),
)
df_clean = df.filter(is_clean)
# Negating the predicate (instead of df.subtract(df_clean), which is EXCEPT
# DISTINCT) keeps duplicate dirty rows and avoids an extra shuffle.
df_dirty = df.filter(~is_clean)

print("Righe CLEAN:")
df_clean.show(5)
print("Righe DIRTY:")
df_dirty.show(5)

# --------- 5. Write the clean/dirty dumps as CSV ---------
output_clean = "./output/clean_dump/assets"
output_dirty = "./output/dirty_dump/assets"
os.makedirs(output_clean, exist_ok=True)
os.makedirs(output_dirty, exist_ok=True)
df_clean.write.mode("overwrite").csv(output_clean, header=True)
df_dirty.write.mode("overwrite").csv(output_dirty, header=True)
print(f"Dump clean: {output_clean}\nDump dirty: {output_dirty}")

# --------- 6. Bronze: persist the clean rows as Parquet ---------
bronze_path = "./output/bronze_table"
os.makedirs(bronze_path, exist_ok=True)
df_clean.write.mode("overwrite").parquet(bronze_path)
print(f"Bronze table scritta in formato parquet su {bronze_path}")

# --------- 7. Silver: minimal enrichment example ---------
# For testing only: add a computed column holding the length of the first column's value.
if df_clean.columns:
    col_da_arricchire = df_clean.columns[0]  # any column will do for the example
    df_silver = df_clean.withColumn("field_length", length(df_clean[col_da_arricchire]))
else:
    df_silver = df_clean

silver_path = "./output/silver_table"
os.makedirs(silver_path, exist_ok=True)
df_silver.write.mode("overwrite").parquet(silver_path)
print(f"Silver table scritta in formato parquet su {silver_path}")

# --------- 8. Final verification read ---------
print("Verifica parquet silver:")
df_verifica = spark.read.parquet(silver_path)
df_verifica.show(5)

Prime righe del dataframe letto:
+----------+--------------------+--------------------+--------------+--------------+--------------+--------------+--------------+
|FIELD CODE|          FIELD NAME|   CONTENT TO REPORT|        LOAN 1|        LOAN 2|        LOAN 3|        LOAN 4|        LOAN 5|
+----------+--------------------+--------------------+--------------+--------------+--------------+--------------+--------------+
|     AUTL1|   Unique Identifier|The unique identi...|AUT-2024-001-A|AUT-2024-002-B|AUT-2024-003-C|AUT-2024-004-D|AUT-2024-005-E|
|     AUTL2|Original Underlyi...|Unique underlying...|  ORIG-EXP-001|  ORIG-EXP-002|  ORIG-EXP-003|  ORIG-EXP-004|  ORIG-EXP-005|
|     AUTL3|New Underlying Ex...|New identifier if...|  ORIG-EXP-001|  ORIG-EXP-002|  ORIG-EXP-003|  ORIG-EXP-004|  ORIG-EXP-005|
|     AUTL4|Original Obligor ...|Original unique o...| OBLG-2024-001| OBLG-2024-002| OBLG-2024-003| OBLG-2024-004| OBLG-2024-005|
|     AUTL5|New Obligor Ident...|New identifier if...| OB

In [4]:
import os
import sys

from pyspark.sql import SparkSession

sys.path.append("src")

# 1. Local Spark session
spark = SparkSession.builder.appName("esma_local_test").getOrCreate()

# 2. Test parameters (edit as needed). Every call below uses these constants, so
#    changing a value here is enough to re-target the whole run.
LOCAL_RAW_PATH = "./dati"  # folder holding the input CSV/Excel files
LOCAL_OUTPUT_PATH = "./output"
FILE_KEY = "Loan_Data"
DL_CODE = "test_deal_code"
INGESTION_DATE = "2025-06-01"

# Locations derived from the parameters above
RAW_FILE_PATH = f"{LOCAL_RAW_PATH}/{FILE_KEY}.csv"
LOCAL_BRONZE_PATH = f"{LOCAL_OUTPUT_PATH}/bronze_table"
LOCAL_SILVER_PATH = f"{LOCAL_OUTPUT_PATH}/silver_table"

# 3. Import the real pipeline functions from the project
from src.aut_etl_pipeline.profile_bronze_data import profile_bronze_data
from src.aut_etl_pipeline.generate_bronze_tables import generate_bronze_tables
from src.aut_etl_pipeline.generate_asset_silver import generate_asset_silver

# 4. The functions must accept local paths.
# Example: give them a "local_mode" flag; or, if they talk to GCS directly,
# temporarily copy their code into this script and swap GCS for local I/O.
# NOTE(review): how local_mode / local_output_path are handled is not visible here.
# NOTE(review): the recorded run of this cell stopped inside profile_bronze_data with a
# Cerberus SchemaError rejecting string `coerce` values such as 'to_number'/'to_date'.

# 5. Run the real pipeline on local files

# Bronze profiling
profile_bronze_data(
    raw_bucketname="",
    data_bucketname="",
    source_prefix=RAW_FILE_PATH,
    file_key=FILE_KEY,
    data_type="assets",
    ingestion_date=INGESTION_DATE,
    local_mode=True,
    local_output_path=LOCAL_OUTPUT_PATH
)

# Bronze tables
generate_bronze_tables(
    spark=spark,
    data_bucketname="",
    source_prefix=LOCAL_OUTPUT_PATH,
    target_prefix=LOCAL_BRONZE_PATH,
    data_type="assets",
    ingestion_date=INGESTION_DATE,
    local_mode=True
)

# Silver tables. dl_code now comes from DL_CODE; the previous literal
# "DL_CODE_TEST" did not match the constant declared above.
generate_asset_silver(
    spark=spark,
    bucket_name="",
    source_prefix=LOCAL_BRONZE_PATH,
    target_prefix=LOCAL_SILVER_PATH,
    dl_code=DL_CODE,
    ingestion_date=INGESTION_DATE,
    local_mode=True
)

print("Pipeline completata! Controlla la cartella ./output per i risultati.")

2025-06-03 10:24:27,833 - src.aut_etl_pipeline.profile_bronze_data - INFO - Start ASSETS BRONZE PROFILING job.


SchemaError: {'AUTL16': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_number']}]}], 'AUTL20': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_number']}]}], 'AUTL24': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_date']}]}], 'AUTL25': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_date']}]}], 'AUTL26': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_int']}]}], 'AUTL29': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_number']}]}], 'AUTL30': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_number']}]}], 'AUTL31': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_number']}]}], 'AUTL33': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_date']}]}], 'AUTL37': 
[{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_number']}]}], 'AUTL38': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_number']}]}], 'AUTL39': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_number']}]}], 'AUTL40': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_number']}]}], 'AUTL43': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_number']}]}], 'AUTL44': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_int']}]}], 'AUTL45': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_number']}]}], 'AUTL46': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_number']}]}], 'AUTL47': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_int']}]}], 'AUTL48': [{'coerce': ['none or 
more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_number']}]}], 'AUTL49': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_number']}]}], 'AUTL50': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_date']}]}], 'AUTL51': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_date']}]}], 'AUTL52': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_number']}]}], 'AUTL59': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_number']}]}], 'AUTL6': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_date']}]}], 'AUTL60': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_number']}]}], 'AUTL61': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_number']}]}], 'AUTL62': [{'coerce': ['none or more than one rule 
validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_number']}]}], 'AUTL63': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_number']}]}], 'AUTL64': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_number']}]}], 'AUTL65': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_date']}]}], 'AUTL66': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_date']}]}], 'AUTL67': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_date']}]}], 'AUTL68': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_number']}]}], 'AUTL69': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_int']}]}], 'AUTL7': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_date']}]}], 'AUTL72': [{'coerce': ['none or more than one rule validate', {'oneof definition 
0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_number']}]}], 'AUTL73': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_date']}]}], 'AUTL74': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_number']}]}], 'AUTL75': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_number']}]}], 'AUTL76': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_number']}]}], 'AUTL77': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_number']}]}], 'AUTL78': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_number']}]}], 'AUTL8': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_date']}]}], 'AUTL9': [{'coerce': ['none or more than one rule validate', {'oneof definition 0': ['must be of callable type'], 'oneof definition 1': ['must be of list type'], 'oneof definition 2': ['unallowed value to_date']}]}]}

In [3]:
from cerberus import Validator
from src.aut_etl_pipeline.utils.validation_rules import asset_schema, TO_DATE, TO_NUMBER

# Minimal check that a schema whose `coerce` rule is the TO_DATE object imported
# above (rather than a string name) is accepted and converts an ISO date string.
schema = {"date_col": {"type": "datetime", "coerce": TO_DATE}}
v = Validator(schema)

is_valid = v.validate({"date_col": "2025-06-01"})
print(is_valid)  # True
# Normalised document after coercion
print(v.document)  # {'date_col': datetime.datetime(2025, 6, 1, 0, 0)}


True
{'date_col': datetime.datetime(2025, 6, 1, 0, 0)}
