# Synthetic Data Validation & Snowflake Load  
This notebook validates the synthetic TPC-DS–like dataset generated by our project.  It performs the following checks:  
- Ensures all CSVs defined in the lineage config are present and loaded  
- Verifies primary key uniqueness for each table  
- Checks foreign key consistency (normalizing any `_date` columns)  
- Stages the files to Snowflake  
- Loads each table into Snowflake using `schema_metadata.json`

## 1. Load environment variables, configurations, and build dynamic maps  
In this cell we read Snowflake credentials from `.env`, load our table/lineage YAML, and derive the list of CSVs, date columns, PKs, and FK rules entirely from config.


In [29]:
# %% 
import os, json, yaml
from pathlib import Path
import pandas as pd
import numpy as np
from dotenv import load_dotenv

# ——————————————————————————————
# Setup paths & env
# ——————————————————————————————
ROOT_DIR = Path.cwd().parent             # one level up from notebooks/
DATA_DIR = ROOT_DIR / "data"
META_DIR = ROOT_DIR / "metadata"
CFG_DIR  = ROOT_DIR / "data_generation_config"
load_dotenv(dotenv_path=ROOT_DIR / ".env")

# ——————————————————————————————
# Load YAML configs
# ——————————————————————————————
with open(CFG_DIR / "tables.yml")  as f: tbl_cfg = yaml.safe_load(f)
with open(CFG_DIR / "lineage.yml") as f: lin_cfg = yaml.safe_load(f)

# ——————————————————————————————
# 1.a) Build list of CSVs from lineage steps
# ——————————————————————————————
file_set = set()
for step in lin_cfg["steps"]:
    file_set.update(step.get("inputs", []))
    file_set.update(step.get("outputs", []))
# include any backups if present on disk
for p in DATA_DIR.glob("*_backup.csv"):
    file_set.add(p.name)
file_list = sorted(file_set)

# ——————————————————————————————
# 1.b) Derive date columns from tables.yml
# ——————————————————————————————
date_map = {}
from datetime import timedelta, datetime
for tname, tdef in tbl_cfg["tables"].items():
    dates = []
    for col, cdef in tdef["columns"].items():
        # faker.date_between or derived expr containing 'date'
        if (cdef["type"] == "faker"       and cdef.get("method")=="date_between") \
        or (cdef["type"] == "derived"     and "date" in cdef.get("expr","")):
            dates.append(col)
    if dates:
        date_map[tname] = dates

# 1.c) Derive PKs by convention: first int _id
PKS = {}
for tname, tdef in tbl_cfg["tables"].items():
    for col, cdef in tdef["columns"].items():
        if col.endswith("_id") and cdef["type"]=="int":
            PKS[tname] = col
            break

# 1.d) Load schema_metadata for FK definitions
schema_meta   = json.load(open(META_DIR/"schema_metadata.json"))
relationships = schema_meta.get("relationships", [])

print("→ Will load CSVs:", file_list)
print("→ Date parsing map:", date_map)
print("→ Primary keys:", PKS)
print("→ Foreign key rules:", relationships)


→ Will load CSVs: ['customers.csv', 'dates.csv', 'employees.csv', 'ghost.csv', 'inventory.csv', 'legacy_customers.csv', 'orders.csv', 'orders_backup.csv', 'products.csv', 'promotions.csv', 'returns.csv', 'sales_targets.csv', 'stores.csv', 'suppliers.csv', 'transactions.csv', 'transactions_backup.csv']
→ Date parsing map: {'promotions': ['start_date', 'end_date'], 'dates': ['full_date', 'day', 'month', 'year', 'weekday'], 'employees': ['hire_date'], 'orders': ['order_date'], 'inventory': ['inventory_date'], 'returns': ['return_date']}
→ Primary keys: {'customers': 'customer_id', 'products': 'product_id', 'stores': 'store_id', 'promotions': 'promo_id', 'dates': 'date_id', 'suppliers': 'supplier_id', 'employees': 'employee_id', 'orders': 'order_id', 'transactions': 'transaction_id', 'returns': 'return_id'}
→ Foreign key rules: [{'table': 'orders', 'column': 'customer_id', 'references': 'customers.customer_id'}, {'table': 'orders', 'column': 'store_id', 'references': 'stores.store_id'}, {'

## 2. Load CSVs with proper date parsing  
Here we load each CSV present on disk, automatically parsing only those columns whose names end in `_date`, and skip any files declared in lineage but not generated.

In [30]:
# %% 
# 2) Load CSVs with _only_ real date columns
from pathlib import Path
import pandas as pd

dfs = {}
print("Loading CSVs…")
for fname in file_list:
    path  = DATA_DIR / fname
    table = path.stem

    if not path.exists():
        print(f" • {table:20s} MISSING on disk, skipping")
        continue

    # Peek at columns
    sample = pd.read_csv(path, nrows=0)
    # Only parse those ending in '_date'
    parse_dt = [c for c in sample.columns if c.lower().endswith("_date")]

    df = pd.read_csv(
        path,
        parse_dates=parse_dt,
        infer_datetime_format=True
    )
    dfs[table] = df
    print(f" • {table:20s} → {df.shape[0]:5d} rows × {df.shape[1]:2d} cols"
          + (f"  (dates: {parse_dt})" if parse_dt else ""))


Loading CSVs…
 • customers            → 10000 rows ×  8 cols
 • dates                →   365 rows ×  6 cols  (dates: ['full_date'])
 • employees            →   200 rows ×  5 cols  (dates: ['hire_date'])
 • ghost                MISSING on disk, skipping
 • inventory            → 10000 rows ×  4 cols  (dates: ['inventory_date'])
 • legacy_customers     →   500 rows ×  9 cols
 • orders               → 10000 rows ×  6 cols  (dates: ['order_date'])
 • orders_backup        → 10000 rows ×  6 cols  (dates: ['order_date'])
 • products             →  1000 rows ×  6 cols
 • promotions           →   100 rows ×  5 cols  (dates: ['start_date', 'end_date'])
 • returns              →  1000 rows ×  5 cols  (dates: ['return_date'])


  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(


 • sales_targets        →  1200 rows ×  4 cols
 • stores               →   100 rows ×  5 cols
 • suppliers            →    50 rows ×  3 cols
 • transactions         → 30000 rows ×  7 cols
 • transactions_backup  → 30095 rows ×  7 cols


  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(


## 3. Verify primary key uniqueness for each table  
We infer the primary key for each table by convention (`*_id`) and check that every value is unique, reporting PASS/FAIL.


In [31]:
# %% 
print("\nPrimary Key Checks:")
for tbl, pk in PKS.items():
    df = dfs.get(tbl)
    if df is None: 
        print(f" • {tbl:20s} MISSING") 
        continue
    unique = df[pk].nunique()
    total  = len(df)
    print(f" • {tbl:20s} {unique:5d}/{total:<5d} →"
          + (" PASS" if unique==total else " FAIL"))



Primary Key Checks:
 • customers            10000/10000 → PASS
 • products              1000/1000  → PASS
 • stores                 100/100   → PASS
 • promotions             100/100   → PASS
 • dates                  365/365   → PASS
 • suppliers               50/50    → PASS
 • employees              200/200   → PASS
 • orders               10000/10000 → PASS
 • transactions         30000/30000 → PASS
 • returns               1000/1000  → PASS


## 4. Check foreign key consistency (with normalized dates)  
All `_date` columns are normalized to midnight, then we join each child→parent FK and count any mismatches, reporting PASS or the number of failures.

In [32]:
# %% 
# 3) Foreign Key Consistency with normalized dates
import numpy as np

# Normalize any datetime columns ending in '_date'
for tbl, df in dfs.items():
    for col in df.columns:
        if col.lower().endswith("_date") and np.issubdtype(df[col].dtype, np.datetime64):
            df[col] = df[col].dt.normalize()

def fk_issues(child_df, ckey, parent_df, pkey):
    return (~child_df[ckey].dropna().isin(parent_df[pkey].dropna())).sum()

print("\nForeign Key Checks:")
for rel in relationships:
    child, ckey = rel["table"], rel["column"]
    parent, pkey = rel["references"].split(".",1)
    df_c = dfs.get(child)
    df_p = dfs.get(parent)
    if df_c is None or df_p is None:
        print(f" • {child}.{ckey} → {parent}.{pkey}: TABLE_MISSING")
        continue
    issues = fk_issues(df_c, ckey, df_p, pkey)
    status = "PASS" if issues == 0 else f"FAIL ({issues})"
    print(f" • {child:15s}.{ckey:15s} → {parent:15s}.{pkey:15s} : {status}")



Foreign Key Checks:
 • orders         .customer_id     → customers      .customer_id     : PASS
 • orders         .store_id        → stores         .store_id        : PASS
 • transactions   .order_id        → orders         .order_id        : PASS
 • transactions   .product_id      → products       .product_id      : PASS
 • transactions   .supplier_id     → suppliers      .supplier_id     : PASS
 • inventory      .store_id        → stores         .store_id        : PASS
 • inventory      .product_id      → products       .product_id      : PASS
 • returns        .order_id        → orders         .order_id        : PASS
 • returns        .product_id      → products       .product_id      : PASS
 • sales_targets  .store_id        → stores         .store_id        : PASS
 • employees      .store_id        → stores         .store_id        : PASS
 • orders         .order_date      → dates          .full_date       : FAIL (29)
 • inventory      .inventory_date  → dates          .full_date

## 5. Stage CSVs and load tables into Snowflake  
We PUT each CSV into the user stage, then use Snowpark (with `parse_header` option) and our `schema_metadata.json` to CREATE/OVERWRITE each table in Snowflake.

In [33]:
# %% 
# 5) Staging & Loading to Snowflake

import snowflake.connector
from snowflake.snowpark import Session
from snowflake.snowpark.types import StructType, StructField, IntegerType, FloatType, StringType, DateType

# Stage CSVs to user stage (~)
print("\nStaging CSVs to Snowflake…")
conn = snowflake.connector.connect(**SF_CONN)
cur = conn.cursor()
for path in DATA_DIR.glob("*.csv"):
    uri = f"file:///{path.resolve().as_posix()}"
    print(" PUT", uri)
    cur.execute(f"PUT '{uri}' @~/ OVERWRITE=TRUE")
cur.close()
conn.close()

# Load into Snowflake tables
print("\nLoading into Snowflake tables…")
sess = Session.builder.configs(SF_CONN).create()
sess.use_warehouse(SF_CONN["warehouse"])
sess.use_database(SF_CONN["database"])
sess.use_schema(SF_CONN["schema"])

def to_snow_type(tstr):
    t = tstr.upper()
    if t.startswith("INT"):    return IntegerType()
    if t.startswith(("FLOAT","DECIMAL","NUMERIC","DOUBLE")): return FloatType()
    if t.startswith("DATE"):   return DateType()
    return StringType()

# Use the proper 'parse_header' option
reader = sess.read.option("parse_header", True).option("field_delimiter", ",")

for tbl_name, tbl_info in schema_meta["tables"].items():
    # Build schema from metadata
    fields = []
    for col_name, col_meta in tbl_info["columns"].items():
        raw_t = col_meta["type"] if isinstance(col_meta, dict) else str(col_meta)
        fields.append(StructField(col_name, to_snow_type(raw_t)))
    schema = StructType(fields)

    df_snow = reader.schema(schema).csv(f"@~/{tbl_name}.csv")
    table_ident = f'"{tbl_name.upper()}"'
    print(f" • {table_ident:30s}", end=" ")
    df_snow.write.mode("overwrite").save_as_table(table_ident)
    print("✔")

sess.close()
print("✅ Snowflake load complete.")



Staging CSVs to Snowflake…
 PUT file:///C:/Users/user/Desktop/job_assignments/Thesis work/Thesis_work_old/synthetic_data_generator/data/customers.csv
 PUT file:///C:/Users/user/Desktop/job_assignments/Thesis work/Thesis_work_old/synthetic_data_generator/data/dates.csv
 PUT file:///C:/Users/user/Desktop/job_assignments/Thesis work/Thesis_work_old/synthetic_data_generator/data/employees.csv
 PUT file:///C:/Users/user/Desktop/job_assignments/Thesis work/Thesis_work_old/synthetic_data_generator/data/inventory.csv
 PUT file:///C:/Users/user/Desktop/job_assignments/Thesis work/Thesis_work_old/synthetic_data_generator/data/legacy_customers.csv
 PUT file:///C:/Users/user/Desktop/job_assignments/Thesis work/Thesis_work_old/synthetic_data_generator/data/orders.csv
 PUT file:///C:/Users/user/Desktop/job_assignments/Thesis work/Thesis_work_old/synthetic_data_generator/data/orders_backup.csv
 PUT file:///C:/Users/user/Desktop/job_assignments/Thesis work/Thesis_work_old/synthetic_data_generator/dat