In [18]:
import pandas as pd
import json
import os
import pyarrow as pa
import pyarrow.parquet as pq

In [19]:
# Create the staging directory at the same absolute location the extract_*
# functions write to. The previous relative "../staging" only resolved to that
# location when the kernel's working directory happened to sit one level below
# /home/jovyan/work.
os.makedirs("/home/jovyan/work/staging", exist_ok=True)

In [20]:
def extract_tickets(data_dir="/home/jovyan/work/data", staging_dir="/home/jovyan/work/staging"):
    """Stage the support-ticket JSON export as a Parquet file.

    Loads ``support_tickets.json`` from ``data_dir`` into a DataFrame and
    writes it, without the index, to ``tickets.parquet`` in ``staging_dir``.
    A confirmation line is printed once the write has returned.

    Args:
        data_dir: Directory that contains ``support_tickets.json``. The
            default is the location this notebook has always read from.
        staging_dir: Directory that receives ``tickets.parquet``. This
            function does not create it.
    """
    source_path = os.path.join(data_dir, "support_tickets.json")
    target_path = os.path.join(staging_dir, "tickets.parquet")

    # Read the file as bytes so the json module detects the encoding itself
    # (UTF-8 with or without BOM, UTF-16, UTF-32). Text mode without an
    # explicit encoding would decode with the platform locale instead.
    with open(source_path, "rb") as f:
        tickets = json.load(f)

    pd.DataFrame(tickets).to_parquet(target_path, index=False)
    print("✅ Ticket data extracted to staging/tickets.parquet")

In [21]:
def extract_logs(data_dir="/home/jovyan/work/data", staging_dir="/home/jovyan/work/staging"):
    """Stage the web-log CSV export as a Parquet file.

    Reads ``web_logs.csv`` from ``data_dir`` with ``pandas.read_csv`` and
    writes the resulting frame, without the index, to ``logs.parquet`` in
    ``staging_dir``. A confirmation line is printed once the write has
    returned.

    Args:
        data_dir: Directory that contains ``web_logs.csv``. The default is
            the location this notebook has always read from.
        staging_dir: Directory that receives ``logs.parquet``. This function
            does not create it.
    """
    source_path = os.path.join(data_dir, "web_logs.csv")
    target_path = os.path.join(staging_dir, "logs.parquet")

    logs = pd.read_csv(source_path)
    logs.to_parquet(target_path, index=False)
    print("✅ Log data extracted to staging/logs.parquet")

In [22]:
def extract_crm(data_dir="/home/jovyan/work/data", staging_dir="/home/jovyan/work/staging"):
    """Stage the CRM customer CSV export as a Parquet file.

    Reads ``customers.csv`` from ``data_dir`` with ``pandas.read_csv`` and
    writes the resulting frame, without the index, to ``crm.parquet`` in
    ``staging_dir``. A confirmation line is printed once the write has
    returned.

    Args:
        data_dir: Directory that contains ``customers.csv``. The default is
            the location this notebook has always read from.
        staging_dir: Directory that receives ``crm.parquet``. This function
            does not create it.
    """
    source_path = os.path.join(data_dir, "customers.csv")
    target_path = os.path.join(staging_dir, "crm.parquet")

    customers = pd.read_csv(source_path)
    customers.to_parquet(target_path, index=False)
    print("✅ CRM data extracted to staging/crm.parquet")

In [23]:
def extract_txns(data_dir="/home/jovyan/work/data", staging_dir="/home/jovyan/work/staging"):
    """Stage the transactions workbook as Parquet with millisecond timestamps.

    Reads ``transactions.xlsx`` from ``data_dir`` with ``pandas.read_excel``,
    floors every datetime column to millisecond precision, converts the frame
    to a pyarrow table (index dropped) and writes it to ``txns.parquet`` in
    ``staging_dir`` with ``coerce_timestamps="ms"``. A confirmation line is
    printed once the write has returned.

    Args:
        data_dir: Directory that contains ``transactions.xlsx``. The default
            is the location this notebook has always read from.
        staging_dir: Directory that receives ``txns.parquet``. This function
            does not create it.
    """
    source_path = os.path.join(data_dir, "transactions.xlsx")
    target_path = os.path.join(staging_dir, "txns.parquet")

    df = pd.read_excel(source_path)

    # Drop sub-millisecond digits up front so the "ms" coercion below has
    # nothing left to truncate (write_table is called without
    # allow_truncated_timestamps). "datetimetz" is selected alongside
    # "datetime" so timezone-aware columns are floored too, not only naive ones.
    for col in df.select_dtypes(include=["datetime", "datetimetz"]).columns:
        df[col] = df[col].dt.floor("ms")

    # Write through pyarrow directly so the timestamp unit stored in the file
    # is set explicitly.
    table = pa.Table.from_pandas(df, preserve_index=False)
    pq.write_table(
        table,
        target_path,
        coerce_timestamps="ms",
        use_deprecated_int96_timestamps=False,
    )

    print("✅ Transaction data saved to staging/txns.parquet with ms timestamps")

In [24]:
# Run every extractor once, in the same order as before:
# CRM, tickets, logs, then transactions.
for run_extract in (extract_crm, extract_tickets, extract_logs, extract_txns):
    run_extract()

✅ CRM data extracted to staging/crm.parquet
✅ Ticket data extracted to staging/tickets.parquet
✅ Log data extracted to staging/logs.parquet
✅ Transaction data saved to staging/txns.parquet with ms timestamps
