In [1]:
# install SDKs if the environment is fresh
!pip install -q --upgrade google-cloud-aiplatform huggingface_hub
!pip install -q --upgrade google-cloud-secret-manager

In [19]:
# 1 Pull the token from Secret Manager (first cell in the CPU notebook)

from google.cloud import secretmanager, aiplatform
import datetime, os

# GCP project ID — used for the Secret Manager lookup and for aiplatform.init().
PROJECT_ID = "sentiment-analysis-steam"          # ← your GCP project
# Vertex AI region — passed to aiplatform.init() and Endpoint.list() below.
REGION     = "us-central1"

def get_secret(secret_id: str, project_id: str, version: str = "latest", client=None) -> str:
    """Fetch a secret's payload from Secret Manager and return it as text.

    Args:
        secret_id: Name of the secret within the project.
        project_id: GCP project that owns the secret.
        version: Secret version to read. Defaults to ``"latest"``; pass a
            version number (e.g. ``"3"``) to pin a specific version.
        client: Optional object exposing ``access_secret_version(name=...)``.
            When omitted, a new ``SecretManagerServiceClient`` is created.

    Returns:
        The secret payload decoded as UTF-8.
    """
    if client is None:
        client = secretmanager.SecretManagerServiceClient()
    name = f"projects/{project_id}/secrets/{secret_id}/versions/{version}"
    response = client.access_secret_version(name=name)
    return response.payload.data.decode("utf-8")

# Hugging Face access token, read from the "HF_TOKEN" secret in PROJECT_ID.
# Never print or echo this value: notebook outputs are saved with the file.
HF_TOKEN = get_secret("HF_TOKEN", PROJECT_ID)     # 🔒 fetched securely

In [20]:
# 2 Upload the CPU model artifact

aiplatform.init(project=PROJECT_ID, location=REGION)

# Serving image URI, assembled from adjacent string literals.
CONTAINER = (
    "us-docker.pkg.dev/deeplearning-platform-release/"
    "gcr.io/huggingface-pytorch-inference-"
    "cpu.2-3.transformers.4-46.ubuntu2204.py311"    # CPU image (no “cu121”)
)

# datetime.utcnow() is deprecated since Python 3.12; a timezone-aware UTC "now"
# produces exactly the same string with this format.
TIMESTAMP   = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d-%H%M%S")
MODEL_NAME  = f"steam-distilbert-cpu-{TIMESTAMP}"   # unique display name per upload
REPO_ID     = "andrewting89/steam-distilbert"       # HF repo you pushed earlier

# Register the model in Vertex AI. No weights are uploaded here: the container is
# configured through environment variables (presumably it pulls REPO_ID from the
# Hugging Face Hub at start-up — confirm against the container documentation).
cpu_model = aiplatform.Model.upload(
    display_name  = MODEL_NAME,
    serving_container_image_uri = CONTAINER,
    serving_container_environment_variables = {
        "HF_MODEL_ID": REPO_ID,
        "HF_TASK":     "text-classification",
        # NOTE(review): environment variables are stored as plain text in the
        # Model's container spec, so anyone who can view the model can read this
        # token. Use a read-only, narrowly scoped token — confirm this is acceptable.
        "HUGGING_FACE_HUB_TOKEN": HF_TOKEN,
    },
    sync=True,                                      # block until the upload finishes
)
print("🆔 Vertex Model:", cpu_model.resource_name)

Creating Model
Create Model backing LRO: projects/1063155306158/locations/us-central1/models/3829769423846113280/operations/7084012508473720832
Model created. Resource name: projects/1063155306158/locations/us-central1/models/3829769423846113280@1
To use this Model in another session:
model = aiplatform.Model('projects/1063155306158/locations/us-central1/models/3829769423846113280@1')
🆔 Vertex Model: projects/1063155306158/locations/us-central1/models/3829769423846113280


In [23]:
# 3 Deploy on a CPU endpoint
ENDPOINT_NAME = "steam-sentiment-endpoint"

# Look up endpoints by display name; create a fresh one only when none exists.
eps = aiplatform.Endpoint.list(
    filter=f'display_name="{ENDPOINT_NAME}"',
    location=REGION,
)
if eps:
    endpoint = eps[0]
else:
    endpoint = aiplatform.Endpoint.create(display_name=ENDPOINT_NAME, sync=True)

# Clear the endpoint first: this undeploys everything currently deployed on it.
endpoint.undeploy_all()

# One fixed replica (min == max == 1) on a 4-vCPU machine, taking all traffic.
# sync=True blocks until the deployment has finished.
cpu_model.deploy(
    endpoint=endpoint,
    machine_type="n1-standard-4",
    min_replica_count=1,
    max_replica_count=1,
    traffic_percentage=100,
    sync=True,
)
print("✅ Endpoint live on CPU:", endpoint.resource_name)

Deploying model to Endpoint : projects/1063155306158/locations/us-central1/endpoints/265181313898643456
Deploy Endpoint model backing LRO: projects/1063155306158/locations/us-central1/endpoints/265181313898643456/operations/1733736151157571584
Endpoint model deployed. Resource name: projects/1063155306158/locations/us-central1/endpoints/265181313898643456
✅ Endpoint live on CPU: projects/1063155306158/locations/us-central1/endpoints/265181313898643456


In [9]:
# Confirm correct endpoint_id

# The full resource name has the shape
#   projects/<project-number>/locations/<region>/endpoints/<endpoint-id>
print(endpoint.resource_name)

# Everything after the last "/" is the endpoint ID.
ENDPOINT_ID = endpoint.resource_name.rpartition("/")[2]
print(ENDPOINT_ID)

projects/1063155306158/locations/us-central1/endpoints/265181313898643456
265181313898643456


In [24]:
# 4 Smoke-test
TEST_TEXT = "This game is so bug-free and the story is amazing!"

# Send a single instance to the deployed model.
prediction = endpoint.predict(instances=[{"text": TEST_TEXT}])

print("↳ raw response:", prediction)

# Pull the label and score out of the first (and only) prediction.
label, score = (prediction.predictions[0][key] for key in ("label", "score"))
print(f"\n✅ Model says: {label}  (confidence ≈ {score:.2%})")

↳ raw response: Prediction(predictions=[{'score': 0.9148386120796204, 'label': 'POSITIVE'}], deployed_model_id='6228560748025479168', metadata=None, model_version_id='1', model_resource_name='projects/1063155306158/locations/us-central1/models/3829769423846113280', explanations=None)

✅ Model says: POSITIVE  (confidence ≈ 91.48%)


In [26]:
# Tear down API Endpoint and Cluster/VMs

# ── FULL TEAR-DOWN CELL ───────────────────────────────────────────────
# Carries its own imports and constants, so it does not depend on values
# defined by the earlier cells.
from google.cloud import aiplatform, compute_v1
from google.api_core.exceptions import NotFound
import re

PROJECT   = "sentiment-analysis-steam"
REGION    = "us-central1"
ENDPOINT  = "projects/1063155306158/locations/us-central1/endpoints/265181313898643456"
CLUSTER_PREFIX = r"steam-sentiment-cluster"     # regex, matched at the start of each VM name

# 1) Undeploy everything from the Vertex endpoint ----------------------
try:
    endpoint = aiplatform.Endpoint(ENDPOINT)
    if not endpoint.gca_resource.deployed_models:
        print("Endpoint already at 0 replicas.")
    else:
        print("Undeploying all replicas …")
        endpoint.undeploy_all(sync=True)
        print("✅ Endpoint replicas = 0; node-hour billing stopped.")
except NotFound:
    print("Endpoint not found (already deleted).")

# 2) Stop RUNNING Compute-Engine VMs whose name matches the prefix -----
compute = compute_v1.InstancesClient()
request = compute_v1.AggregatedListInstancesRequest(
    project=PROJECT,
    filter="status = RUNNING",
)

stopped = []
name_re = re.compile(CLUSTER_PREFIX)
for scope, scoped_list in compute.aggregated_list(request=request):
    # Keep only the last path segment of the scope key as the zone name.
    zone = scope.rsplit("/", 1)[-1]
    for vm in getattr(scoped_list, "instances", []):
        if not name_re.match(vm.name):
            continue
        print(f"Stopping VM {vm.name} in {zone} …")
        # The stop request is issued without waiting for it to complete.
        compute.stop(project=PROJECT, zone=zone, instance=vm.name)
        stopped.append(vm.name)

print(
    "✅ All cluster VMs are stopping; billing ends when they reach TERMINATED."
    if stopped
    else "No running VMs matched the prefix; nothing to stop."
)

Undeploying all replicas …
Undeploying Endpoint model: projects/1063155306158/locations/us-central1/endpoints/265181313898643456
Undeploy Endpoint model backing LRO: projects/1063155306158/locations/us-central1/endpoints/265181313898643456/operations/7102449119448268800
Endpoint model undeployed. Resource name: projects/1063155306158/locations/us-central1/endpoints/265181313898643456
✅ Endpoint replicas = 0; node-hour billing stopped.
No running VMs matched the prefix; nothing to stop.


In [None]:
# 4b Smoke-test (repeat, placed after the tear-down cell)
# Once the tear-down cell has run, the endpoint has no deployed model and
# predict() is expected to fail. Only send the request when something is
# actually deployed, so Restart Kernel → Run All can complete.
TEST_TEXT = "This game is so bug-free and the story is amazing!"

# list_models() returns the models currently deployed on the endpoint.
if endpoint.list_models():
    prediction = endpoint.predict(
        instances=[{"text": TEST_TEXT}]
    )

    print("↳ raw response:", prediction)

    # nice-format
    label = prediction.predictions[0]["label"]
    score = prediction.predictions[0]["score"]
    print(f"\n✅ Model says: {label}  (confidence ≈ {score:.2%})")
else:
    print("Endpoint has no deployed models; skipping the smoke-test.")

## Make sure all clusters/API endpoints are shut down

In [11]:
!pip install -q --upgrade google-cloud-compute tabulate

In [27]:
# ── Billing-sanity cell ─────────────────────────────────────────────
# Reports anything that may still be billing: models deployed on the Vertex
# endpoint, and Compute-Engine VMs in the RUNNING state (all names, no filter).
from google.cloud import aiplatform, compute_v1
from tabulate import tabulate
import os

PROJECT  = "sentiment-analysis-steam"      # adjust if needed
REGION   = "us-central1"
# Numeric endpoint ID or full resource name:
ENDPOINT_ID = "265181313898643456"

# 1) Endpoint status -------------------------------------------------
aiplatform.init(project=PROJECT, location=REGION)
# Pass ENDPOINT_ID through unchanged: aiplatform.Endpoint accepts either a bare
# ID (resolved against the project/location set by init() above) or a full
# resource name. Wrapping it in "projects/.../endpoints/{ENDPOINT_ID}" would
# produce a malformed name whenever a full resource name is supplied.
endpoint = aiplatform.Endpoint(ENDPOINT_ID)

deployed = endpoint.gca_resource.deployed_models
if not deployed:
    print("✅ Vertex endpoint has **0** deployed replicas → $0 node-hour right now.\n")
else:
    rows = []
    for dm in deployed:
        res = dm.dedicated_resources
        # Keep only the last path segment in case the machine type is a full path.
        machine = res.machine_spec.machine_type.split("/")[-1]
        rows.append([dm.id, machine,
                     res.min_replica_count, res.max_replica_count])
    print("⚠️  Endpoint is billing for the following replicas:")
    print(tabulate(rows, headers=["deployed_model_id", "machine_type", "min_repl", "max_repl"]))
    print()

# 2) Running Compute-Engine VMs -------------------------------------
compute = compute_v1.InstancesClient()
request = compute_v1.AggregatedListInstancesRequest(
    project=PROJECT,
    filter="status = RUNNING"
)

running = []
for zone, resp in compute.aggregated_list(request=request):
    # The zone name is the same for every instance in this scope, so derive it once.
    z = zone.split("/")[-1]
    for inst in getattr(resp, "instances", []):
        running.append([inst.name, z, inst.machine_type.split('/')[-1]])

if not running:
    print("✅ No Compute-Engine instances are RUNNING → no VM charges.")
else:
    print("⚠️  These VMs are still RUNNING (incurring charges):")
    print(tabulate(running, headers=["instance", "zone", "machine_type"]))

✅ Vertex endpoint has **0** deployed replicas → $0 node-hour right now.

⚠️  These VMs are still RUNNING (incurring charges):
instance                  zone           machine_type
------------------------  -------------  --------------
instance-20250629-185113  us-central1-a  n2-highmem-8
