In [68]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
from collections import defaultdict
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import json
from google.cloud import storage
from google.cloud import aiplatform
import os 

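- `project_id`: The ID of your Google Cloud project (in this notebook, read from the `PROJECT_ID` environment variable).
- `location`: The region where your Vertex AI resources are located (in this notebook, read from the `REGION` environment variable).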
- `model_resource_name`: The resource name of the model you want to use for batch prediction. This should be a fully qualified resource name that includes the project, location, and model ID.
- `job_display_name`: The display name of the batch prediction job. This can be any string you choose.
- `gcs_source`: The Google Cloud Storage (GCS) URI or URIs where your input data is stored. This can be a string (for a single URI) or a list of strings (for multiple URIs).
- `gcs_destination`: The GCS URI prefix under which the output data is written. It is passed to `batch_predict` as `gcs_destination_prefix`.
- `instances_format`: The format of the input data. This can be "jsonl", "csv", "tf-record", "tf-record-gzip", or "file-list".
- `machine_type`: The type of machine to use for the batch prediction job. This should be a string that specifies a Compute Engine machine type, such as "n1-standard-2".
- `accelerator_count`: The number of accelerators to attach to each machine.
- `accelerator_type`: The type of accelerator to attach to each machine. This can be a string that specifies a Compute Engine accelerator type, such as "NVIDIA_TESLA_K80", or an `AcceleratorType` enum value.
- `starting_replica_count`: The initial number of replicas to use for the batch prediction job.
- `max_replica_count`: The maximum number of replicas to use for the batch prediction job.
- `sync`: Whether to block until the batch prediction job completes. If `True`, the call returns only after the job has finished; if `False`, it returns immediately after the job is created.

In [69]:
# Deployment-specific settings are read from the environment; each one is
# None when the corresponding variable is not set.
PROJECT_ID = os.getenv("PROJECT_ID")
REGION = os.getenv("REGION")
BUCKET_NAME = os.getenv("BUCKET_NAME")

# Fixed settings for this notebook.
MODEL_NAME = "berkamodel"              # display name used to look up the model
PIPELINE_NAME = "production"           # top-level folder inside the bucket
MACHINE_TYPE = "n1-standard-2"         # machine type for the prediction job
FILE_SCORING_NAME = "training_drivers" # input file name, without the .csv suffix


In [70]:
# Parameters for the Vertex AI batch prediction job.

# NOTE(review): this path is built from MODEL_NAME, which the notebook uses as
# a *display name* when looking the model up. It is not passed to
# batch_predict below; confirm whether it is still needed.
model_resource_name = f"projects/{PROJECT_ID}/locations/{REGION}/models/{MODEL_NAME}"
job_display_name = f"{MODEL_NAME}_batch_prediction_job"

# Input file to score and the GCS prefix that receives the predictions.
gcs_source = f"gs://{BUCKET_NAME}/{PIPELINE_NAME}/data/05_features/{FILE_SCORING_NAME}.csv"
gcs_destination = f"gs://{BUCKET_NAME}/{PIPELINE_NAME}/data/07_output/"
instances_format = "csv"

# Reuse the config-cell constant instead of repeating the literal, so the
# machine type is defined in exactly one place.
machine_type = MACHINE_TYPE
accelerator_count = 0
accelerator_type = None  # no accelerator attached

# A single replica, no autoscaling.
starting_replica_count = 1
max_replica_count = 1

# Block until the job has finished.
sync = True


from google.cloud import aiplatform

def get_model_by_display_name(display_name, verbose=False):
    """Return the first listed model whose display name matches, or None.

    Models are listed through the regional ModelService endpoint for the
    module-level PROJECT_ID / REGION and compared against `display_name`
    one by one.

    Args:
        display_name: Display name to look for (exact match).
        verbose: When True, print a confirmation and the matched model.

    Returns:
        The first matching model object as yielded by `list_models`, or
        None when no listed model has that display name.
    """
    endpoint = f"{REGION}-aiplatform.googleapis.com"
    model_client = aiplatform.gapic.ModelServiceClient(
        client_options={"api_endpoint": endpoint}
    )
    location_path = f"projects/{PROJECT_ID}/locations/{REGION}"
    listed_models = model_client.list_models(parent=location_path)

    # Stop at the first hit; later models with the same name are ignored.
    match = next(
        (candidate for candidate in listed_models if candidate.display_name == display_name),
        None,
    )
    if match is None:
        return None

    if verbose:
        print(f"Model {display_name} found.")
        print(f"Model details:\n {match}")
    return match


# Launch the batch prediction job with the parameters defined above.
aiplatform.init(project=PROJECT_ID, location=REGION)

# Resolve the display name to a registered model. The lookup helper returns
# None when nothing matches, so fail here with a clear message instead of an
# AttributeError on `.name`.
matched_model = get_model_by_display_name(MODEL_NAME)
if matched_model is None:
    raise ValueError(
        f"No Vertex AI model with display name {MODEL_NAME!r} found in "
        f"project {PROJECT_ID!r}, region {REGION!r}."
    )

# `name` on the listed model is what gets handed to aiplatform.Model.
model_id = matched_model.name
model_container = aiplatform.Model(model_id)

batch_prediction_job = model_container.batch_predict(
    job_display_name=job_display_name,
    gcs_source=gcs_source,
    gcs_destination_prefix=gcs_destination,
    instances_format=instances_format,
    machine_type=machine_type,
    accelerator_count=accelerator_count,
    accelerator_type=accelerator_type,
    starting_replica_count=starting_replica_count,
    max_replica_count=max_replica_count,
    sync=sync,
)

# Redundant when sync=True (batch_predict has already blocked); kept so the
# cell still waits for completion if `sync` is switched to False.
batch_prediction_job.wait()

Creating BatchPredictionJob
BatchPredictionJob created. Resource name: projects/1036389498447/locations/europe-west6/batchPredictionJobs/5847697616745267200
To use this BatchPredictionJob in another session:
bpj = aiplatform.BatchPredictionJob('projects/1036389498447/locations/europe-west6/batchPredictionJobs/5847697616745267200')
View Batch Prediction Job:
https://console.cloud.google.com/ai/platform/locations/europe-west6/batch-predictions/5847697616745267200?project=1036389498447
BatchPredictionJob projects/1036389498447/locations/europe-west6/batchPredictionJobs/5847697616745267200 current state:
JobState.JOB_STATE_RUNNING
BatchPredictionJob projects/1036389498447/locations/europe-west6/batchPredictionJobs/5847697616745267200 current state:
JobState.JOB_STATE_RUNNING
BatchPredictionJob projects/1036389498447/locations/europe-west6/batchPredictionJobs/5847697616745267200 current state:
JobState.JOB_STATE_RUNNING
BatchPredictionJob projects/1036389498447/locations/europe-west6/batchP