In [None]:
# @title ###### Licensed to the Apache Software Foundation (ASF), Version 2.0 (the "License")

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License

# Generate text embeddings with `MLTransform` using Vertex AI API


Vertex AI provides an [API](https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings) to generate text embeddings using Google’s large generative AI models. To generate text embeddings with the Vertex AI API, use the `VertexAITextEmbeddings` class to specify the configuration of the model.


To use the Vertex AI API, install the `google-cloud-aiplatform` Python package and make sure that one of the following requirements is met:
* Credentials are configured for your Google Cloud project, for example by using the gcloud CLI. For more information, see https://googleapis.dev/python/google-auth/latest/reference/google.auth.html#module-google.auth
* The path to a service account JSON file is stored in the [GOOGLE_APPLICATION_CREDENTIALS](https://cloud.google.com/docs/authentication/application-default-credentials#GAC) environment variable.

To use your Google Cloud account, authenticate this notebook.

In [None]:
# Authenticate this Colab runtime with your Google account so that later
# Google Cloud API calls run with your credentials.
from google.colab import auth
auth.authenticate_user()

# Google Cloud project ID that is passed to `VertexAITextEmbeddings` below.
# Replace the <PROJECT_ID> placeholder with a valid project ID before you run
# the pipeline cells.
project = '<PROJECT_ID>'

# Install dependencies
Install Apache Beam and the dependencies required for the Vertex AI API.

In [None]:
! git clone https://github.com/apache/beam.git
! cd beam/sdks/python
! pip install beam/sdks/python[gcp]


## Import the required modules



In [None]:
import tempfile

import apache_beam as beam
from apache_beam.ml.transforms.base import MLTransform
from apache_beam.ml.transforms.embeddings.vertex_ai import VertexAITextEmbeddings

#### MLTransform in write mode

In `write` mode, `MLTransform` saves the transforms and their attributes to an artifact location. These transforms are reused when `MLTransform` runs in `read` mode.


For more information about using `MLTransform`, see [Preprocess data with MLTransform](https://beam.apache.org/documentation/ml/preprocess-data/) in the Apache Beam documentation.

In [None]:
# Temporary directory that is passed to MLTransform as its artifact location.
artifact_location = tempfile.mkdtemp(prefix='vertex_ai')

# Name of the Vertex AI text embedding model to call. The available models are
# listed in the Vertex AI API documentation:
# https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/text-embeddings
text_embedding_model_name = 'textembedding-gecko@latest'

# Sentences to embed: one dictionary per element, keyed by the column name 'x'.
content = [
    {'x': 'I would like embeddings for this text'},
    {'x': 'Hello world'},
    {'x': 'The Dog is running in the park.'},
]

`MLTransform` processes dictionaries that contain column names and their corresponding text data. For each sentence, it generates an embedding, returned as a list of floating-point values. The pipeline calls the Vertex AI API for online prediction on the specified model to produce text embeddings for the input sentences.

In [None]:
# Embedding configuration: which model to call, which column to embed, and the
# project to use.
embedding_transform = VertexAITextEmbeddings(
    model_name=text_embedding_model_name,
    columns=['x'],
    project=project,
)

# MLTransform in write mode, with the embedding transform attached.
ml_transform = MLTransform(
    write_artifact_location=artifact_location
).with_transform(embedding_transform)

with beam.Pipeline() as pipeline:
  data_pcoll = pipeline | "CreateData" >> beam.Create(content)
  transformed_pcoll = data_pcoll | "MLTransform" >> ml_transform

  # Print every transformed element.
  transformed_pcoll | 'LogOutput' >> beam.Map(print)

  # Print the length of the value stored under column 'x' of each element.
  transformed_pcoll | "PrintEmbeddingShape" >> beam.Map(
      lambda x: print(f"Embedding shape: {len(x['x'])}"))

{'x': [0.041293490678071976, -0.010302993468940258, -0.048611514270305634, -0.01360565796494484, 0.06441926211118698, 0.022573700174689293, 0.016446372494101524, -0.033894773572683334, 0.004581860266625881, 0.060710687190294266, -0.021728642284870148, 0.021351153030991554, -0.029735974967479706, 0.02554303966462612, -0.003689623437821865, -0.054144348949193954, 0.045556843280792236, 0.024512041360139847, 0.033651020377874374, -0.007227035705000162, 0.0034407798666507006, 0.01046749297529459, -0.0003862503217533231, -0.017267994582653046, 0.013953671790659428, -0.02976437471807003, 0.023665405809879303, -0.04075342044234276, -0.03480035066604614, -0.0114308912307024, -0.0239212978631258, 0.04272296652197838, -0.028070665895938873, 0.016720645129680634, 0.01396490354090929, -0.03568996116518974, -0.012728322297334671, -0.01839173398911953, -0.00044931433512829244, -0.01082014013081789, 0.007709820754826069, -0.03283832222223282, -0.022093195468187332, 0.008698013611137867, -0.02187746576

#### MLTransform in read mode

In `read` mode, `MLTransform` uses the artifacts that were saved in `write` mode. The transform and its attributes are loaded from the saved artifacts, so you don't need to specify them again in `read` mode.

In this way, `MLTransform` provides consistent preprocessing steps for training and inference workloads.

For more information about using `MLTransform`, see [Preprocess data with MLTransform](https://beam.apache.org/documentation/ml/preprocess-data/) in the Apache Beam documentation.

In [None]:
# New sentences to run through MLTransform in read mode.
test_content = [
    {'x': 'This is a test sentence'},
    {'x': 'The park is full of dogs'},
]

# Read mode: only the artifact location is given; no transform is attached here.
read_transform = MLTransform(read_artifact_location=artifact_location)

with beam.Pipeline() as pipeline:
  data_pcoll = pipeline | "CreateData" >> beam.Create(test_content)
  transformed_pcoll = data_pcoll | "MLTransform" >> read_transform

  # Print every transformed element.
  transformed_pcoll | 'LogOutput' >> beam.Map(print)
