In [None]:
# Uncomment and run once if these packages are not installed yet:
# !pip install --upgrade google-cloud-aiplatform
# !pip install pypdf

## Deploying Gemma to Vertex AI

In [None]:
# Google Cloud project settings used by the cells below.
GCP_PROJECT_ID = 'mbd-2024-gemma'  # project ID, passed to the Colab authentication call
PROJECT_NUMBER = '743869327957'  # same value that is sent as `project` in the prediction request

In [None]:
from google.colab import auth
# Authenticate the current Colab user for the project configured in GCP_PROJECT_ID.
auth.authenticate_user(project_id=GCP_PROJECT_ID)

In [None]:
from typing import Dict, List, Union
from google.cloud import aiplatform
from google.protobuf import json_format
from google.protobuf.struct_pb2 import Value

def predict_custom_trained_model_sample(
    project: str,
    endpoint_id: str,
    instances: Union[Dict, List[Dict]],
    location: str = "us-central1",
    api_endpoint: str = "us-central1-aiplatform.googleapis.com",
) -> str:
    """
    Sends instances to a Vertex AI endpoint and returns the first prediction.

    Args:
      - project (str): project the endpoint belongs to.
      - endpoint_id (str): ID of the endpoint to query.
      - instances (Dict or List[Dict]): a single instance or a list of
        instances; a single dict is wrapped into a one-element list.
      - location (str): region used to build the endpoint resource path.
      - api_endpoint (str): API host the client connects to. It is NOT derived
        from `location`; pass both when using a region other than the default.

    Returns:
      - (str) the first prediction only, reduced to the text that follows the
        last "Output:" marker line. If the marker is absent the whole
        prediction text is returned. Predictions for any additional instances
        are discarded.

    Raises:
      - IndexError: if the endpoint returns no predictions.
    """
    # Marker that separates the generated answer from the text preceding it.
    # NOTE(review): presumably the served model echoes the prompt before this
    # marker -- confirm against the deployed container's response format.
    output_marker = '\nOutput:\n'

    client = aiplatform.gapic.PredictionServiceClient(
        client_options={"api_endpoint": api_endpoint}
    )

    if not isinstance(instances, list):
        instances = [instances]

    # The GAPIC client expects protobuf Values rather than plain dicts.
    instance_values = [
        json_format.ParseDict(instance_dict, Value()) for instance_dict in instances
    ]
    # No extra request-level parameters are sent; settings travel per instance.
    parameters = json_format.ParseDict({}, Value())

    endpoint = client.endpoint_path(
        project=project, location=location, endpoint=endpoint_id
    )

    response = client.predict(
        endpoint=endpoint, instances=instance_values, parameters=parameters
    )

    predictions = list(response.predictions)
    if not predictions:
        # Same exception type as the bare index lookup, but with a usable message.
        raise IndexError(
            f"Endpoint {endpoint_id!r} returned no predictions for the given instances."
        )

    return predictions[0].split(output_marker)[-1]


In [None]:
# Request payload for the prediction call below: a list holding one instance.
# Each instance carries the prompt together with its generation settings.
instances = [
    {
        "prompt": "What is a car?",
        "max_tokens": 50,
        "temperature": 1.0,
        "top_p": 1.0,
        "top_k": 10,
    },
]

In [None]:
# ID of the Vertex AI endpoint queried in this notebook.
ENDPOINT_ID = "3561003702040920064"

# Reuse PROJECT_NUMBER from the configuration cell instead of repeating the literal.
res = predict_custom_trained_model_sample(
    project=PROJECT_NUMBER,
    endpoint_id=ENDPOINT_ID,
    location="us-central1",
    instances=instances,
)
# Last expression of the cell, so the answer is displayed.
res

## Vector Embeddings

In [None]:
from vertexai.language_models import TextEmbeddingModel

def text_embedding(text_input: str, model_name: str = "textembedding-gecko@001") -> list:
    """
    Text embedding with a Large Language Model.

    Args:
      - text_input (str): the text to embed.
      - model_name (str): name of the pretrained embedding model to load.
        Defaults to the model this notebook has always used.

    Returns
      - vector (List[floats]): the embedding values for `text_input`.

    Raises
      - ValueError: if the model returns no embedding.

    """
    # NOTE: the model is loaded on every call; when embedding many chunks,
    # consider loading it once and reusing it.
    model = TextEmbeddingModel.from_pretrained(model_name)
    embeddings = model.get_embeddings([text_input])
    if not embeddings:
        raise ValueError("The embedding model returned no embedding for the input text.")
    # Exactly one text was submitted, so the first result is its embedding.
    return embeddings[0].values


# Storing embeddings

### Task 1. Choose a data structure that can store text chunks together with their embeddings, and write a function that adds a new text chunk to it.

In [None]:
# In-memory store for text chunks and their embeddings.
# NOTE: the functions below bind this dict as a default argument when they are
# defined. Re-running only this cell creates a new dict that those defaults do
# not see; re-run the function cells afterwards.
vector_memory = {}

In [None]:
def add_chunk(text_string: str, vector_memory: dict = vector_memory):
  """
  Adds new text to vector memory.

  Not implemented yet: the body is a TODO placeholder, so calling this
  function currently does nothing and returns None.

  Args:
    - text_string (str). The text we want to embed and store.
    - vector_memory (Dict) Dictionary mapping from text strings to embedding
      vectors. Defaults to the `vector_memory` dict that existed when this
      function was defined.
  """

  # TODO: embed text_string and store the result in vector_memory.

# Retrieving embeddings

### Task 2. Write a function to retrieve the k most similar embeddings from that data structure.

In [None]:
def find_k_nearest_neighbors(query: str, k: int, vector_memory: dict = vector_memory):
  """
  Given a query string, retrieves the k most similar vectors.

  Not implemented yet: the body is a TODO placeholder, so calling this
  function currently returns None.

  Args:
    - query (string). The text to search for.
    - k (int). Number of results to return.
    - vector_memory (Dict). Maps text strings to their embeddings. Defaults to
      the `vector_memory` dict that existed when this function was defined.

  Returns:
    - k most similar text strings (once implemented).
  """

  # TODO: embed the query, score it against every stored embedding and
  # return the k best-matching text strings.

# Loading PDFs

### Task 3. Write a function that takes a filename, extracts the text of a PDF document, and splits it into chunks. You can use pypdf for this.

In [None]:
import pypdf

def extract_text_from_pdf(filename: str):
  """
  Loads a PDF file and chunks it.

  Not implemented yet: `chunks` is never assigned inside this function, so
  until the TODO below is completed the final `return` raises NameError
  (unless a global named `chunks` happens to exist).

  Args:
    - filename (str): path to PDF file.

  Returns
    - chunks: the text chunks extracted from the PDF. Their exact type is
      determined by the TODO step.
  """
  # Binary mode is required for reading PDF content.
  with open(filename, 'rb') as pdf_file:

    pdf_reader = pypdf.PdfReader(pdf_file)

    # TODO: extract the text through pdf_reader and split it into `chunks`.

    return chunks


### Task 4. Load text chunks, create embeddings and save them in your vector data structure.

In [None]:
#TODO

### Task 5. Get a question from the user and find the most relevant context from the PDF you loaded.

In [None]:
#TODO

### Task 6. Use prompt engineering and the Gemma LLM to answer the question based on the context you retrieved.

In [None]:
#TODO