In [1]:
import os
import time
from dotenv import load_dotenv
import tensorflow as tf
from tfx import v1 as tfx
from tfx.v1.components import ImportExampleGen, StatisticsGen, SchemaGen
from tfx.orchestration.experimental.interactive.interactive_context import (
    InteractiveContext,
)
from tfx.proto import example_gen_pb2
from tfx.v1.proto import Output, SplitConfig

2024-10-17 14:37:17.497940: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-17 14:37:18.315628: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-17 14:37:19.560260: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-17 14:37:19.560378: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-17 14:37:19.674130: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to

In [2]:
# Load key/value pairs from a .env file into the process environment so the
# os.getenv lookups in the following cell can see them.
load_dotenv()

True

In [3]:
# Deployment settings come from the environment; os.getenv yields None for
# any variable that is not set.
BUCKET, PROJECT_ID, REGION = (
    os.getenv(env_key) for env_key in ("BUCKET", "PROJECT_ID", "REGION")
)

In [4]:
# Name of this interactive pipeline; also used as a directory name below.
PIPELINE_NAME = "neurips-interactive-test"

# Local directory under which pipeline artifacts are written.
ARTIFACT_STORE = os.path.join(os.sep, "home", "jupyter", "artifact-store")

# A timestamped sub-directory gives each notebook session its own root.
PIPELINE_ROOT = os.path.join(
    ARTIFACT_STORE,
    PIPELINE_NAME,
    time.strftime("%Y%m%d_%H%M%S"),
)

# GCS prefix derived from the BUCKET environment setting.
INPUT_BASE = f"gs://{BUCKET}/primary"

In [5]:
# Interactive TFX context used to run components one cell at a time.
# No ML Metadata connection is supplied (None) -- presumably TFX then falls
# back to its default store under the pipeline root; confirm in the TFX docs.
context = InteractiveContext(
    metadata_connection_config=None,
    pipeline_root=PIPELINE_ROOT,
    pipeline_name=PIPELINE_NAME,
)



In [6]:
# Output split configuration for ExampleGen: examples are hashed into 4 + 1
# buckets, named "train" and "eval" respectively.
# SplitConfig is taken from tfx.v1.proto for both the outer message and the
# nested Split entries, instead of mixing it with the example_gen_pb2 alias
# of the same proto class.
output = Output(
    split_config=SplitConfig(
        splits=[
            SplitConfig.Split(name="train", hash_buckets=4),
            SplitConfig.Split(name="eval", hash_buckets=1),
        ]
    )
)

In [7]:
# Display the resolved artifact-store path as a quick sanity check.
ARTIFACT_STORE

'/home/jupyter/artifact-store'

In [8]:
# Import pre-built examples and split them according to `output`.
# input_base points at the GCS input prefix (INPUT_BASE) rather than the
# local artifact-store directory: the artifact store is where this pipeline
# writes its own outputs, and a local path is not reachable from Dataflow
# workers.
example_gen = ImportExampleGen(
    input_base=INPUT_BASE, output_config=output
).with_beam_pipeline_args(
    beam_pipeline_args=[
        # Run the Beam job on Dataflow (Runner v2) in the configured project.
        "--runner",
        "DataflowRunner",
        "--experiments",
        "use_runner_v2",
        "--project",
        PROJECT_ID,
        "--region",
        REGION,
        "--temp_location",
        f"gs://{BUCKET}/pipeline_root/",
        # Worker sizing.
        "--machine_type",
        "n2-custom-8-131072-ext",
        "--disk_size_gb",
        "500",
        # NOTE(review): this resource URL has a `zones/` segment but is filled
        # with REGION -- confirm REGION holds a value valid in that position.
        "--worker_disk_type",
        f"compute.googleapis.com/projects/{PROJECT_ID}/zones/{REGION}/diskTypes/pd-standard",
    ]
)

In [None]:
# Execute ExampleGen through the interactive context. The component is
# configured with "--runner DataflowRunner", so this is expected to submit a
# Dataflow job rather than run locally.
context.run(example_gen)





In [None]:
# Statistics over the examples produced by ExampleGen; the "train" split is
# excluded from this component.
statistics_gen = StatisticsGen(
    exclude_splits=["train"],
    examples=example_gen.outputs["examples"],
)

In [None]:
# Schema generation from the StatisticsGen output, again excluding "train".
# NOTE(review): with only "train" and "eval" splits configured, this leaves
# "eval" as the sole split considered -- confirm that is intended.
schema_gen = SchemaGen(
    exclude_splits=["train"],
    statistics=statistics_gen.outputs["statistics"],
)