In [20]:
from google.cloud import storage
# %pip install PyMuPDF  # run once if PyMuPDF (imported below as `fitz`) is not installed


In [29]:
def list_pdfs(bucket_name = 'lossless-learning', prefix="books", client=None):
    """Return the ``gs://`` URI of every PDF stored under ``prefix`` in a bucket.

    Args:
        bucket_name: Name of the Cloud Storage bucket to scan.
        prefix: Only objects whose name starts with this prefix are listed.
        client: Optional pre-built storage client exposing ``bucket(name)``;
            a new ``storage.Client()`` is created when omitted.

    Returns:
        A list of ``gs://<bucket_name>/<object name>`` strings, one per listed
        object whose name ends in ``.pdf`` (case-insensitive).
    """
    if client is None:
        client = storage.Client()
    bucket = client.bucket(bucket_name)
    # Derive the URI from bucket_name so that non-default buckets get
    # correct URIs instead of always pointing at 'lossless-learning'.
    return [
        f"gs://{bucket_name}/{blob.name}"
        for blob in bucket.list_blobs(prefix=prefix)
        if blob.name.lower().endswith(".pdf")
    ]


In [30]:
# Collect the gs:// URIs of the PDFs found with list_pdfs' default bucket and prefix.
pdf_list = list_pdfs()

In [31]:
pdf_list

['gs://lossless-learning/books/A Modern Introduction to Probability and Statistics by FM Dekking.pdf',
 'gs://lossless-learning/books/Advanced Calculus by Lynn H Loomis.pdf',
 'gs://lossless-learning/books/An Introduction to Statistical Learning by Gareth James.pdf',
 'gs://lossless-learning/books/Calculus by Gilbert Strang.pdf',
 'gs://lossless-learning/books/Deep Learning by Ian Goodfellow.pdf',
 'gs://lossless-learning/books/Deep Learning with Python by François Chollet.pdf',
 'gs://lossless-learning/books/Introducing MLOps by Mark Treveil.pdf',
 'gs://lossless-learning/books/Introduction to Applied Linear Algebra by Stephen Boyd.pdf',
 'gs://lossless-learning/books/Linear Algebra Done Right by Sheldon Axler.pdf',
 'gs://lossless-learning/books/Mathematics for Machine Learning by Marc Peter Deisenroth.pdf',
 'gs://lossless-learning/books/Probability and Statistics The Science of Uncertainity by Michael J. Evans.pdf',
 'gs://lossless-learning/books/The Big Book of MLOps by Databricks

In [68]:
import fitz  # PyMuPDF
from google.cloud import storage
import io

def read_pdf_from_gcs(bucket_name, blob_name):
    """Download a PDF from Cloud Storage and return its text page by page.

    Args:
        bucket_name: Name of the bucket that holds the PDF.
        blob_name: Object name of the PDF; it is passed unchanged to
            ``bucket.blob``.

    Returns:
        A list with one lower-cased text string per page, in page order.
    """
    client = storage.Client()
    bucket = client.get_bucket(bucket_name)
    blob = bucket.blob(blob_name)
    pdf_bytes = blob.download_as_bytes()

    # The context manager closes the document even when text extraction
    # raises part-way through, so the handle is never leaked.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        return [page.get_text().lower() for page in doc]


In [69]:
# pdf_list holds full "gs://<bucket>/<object>" URIs, while read_pdf_from_gcs
# hands its second argument straight to bucket.blob(), which needs the object
# name only. Strip the URI prefix before downloading.
BOOKS_BUCKET = 'lossless-learning'
gcs_prefix = f'gs://{BOOKS_BUCKET}/'

books_pages = []
for pdf in pdf_list:
    blob_name = pdf[len(gcs_prefix):] if pdf.startswith(gcs_prefix) else pdf
    pages = read_pdf_from_gcs(BOOKS_BUCKET, blob_name)
    books_pages.append(pages)

In [70]:
import re

def slice_books_from_chapter_1(books_pages):
    """Drop the front matter of each book, keeping pages from chapter 1 on.

    Args:
        books_pages: Iterable of books, each a list of page-text strings.

    Returns:
        A list with one list of pages per book, starting at the first page
        that mentions "chapter 1" / "chapter one" (any letter case). Books
        with no such page are returned in full.

    Note:
        The first matching page wins, so a table-of-contents page that
        names chapter 1 is treated as the start of the chapter.
    """
    # IGNORECASE lets the spelled-out "one" match lower-cased page text (the
    # upper-case literal never could); the trailing \b stops "chapter 10",
    # "chapter 12", ... from being mistaken for chapter 1.
    chapter_one = re.compile(r"chapter\s*[\/\-:]?\s*(?:one|1)\b", re.IGNORECASE)

    sliced_books = []
    for book_pages in books_pages:
        # Index of the first matching page, or 0 when nothing matches.
        start = next(
            (i for i, page in enumerate(book_pages) if chapter_one.search(page)),
            0,
        )
        sliced_books.append(book_pages[start:])

    return sliced_books


In [71]:
# Slice every book so it starts at the first page matching the chapter-1 pattern.
req_book_pages =  slice_books_from_chapter_1(books_pages)

In [100]:
# Spot-check the sliced output: page index 14 of the book at index 12.
req_book_pages[12][14]

'xxii\ncontents\n18.2\ndiagonal linear discriminant analysis\nand nearest shrunken centroids . . . . . . . . . . . . . .\n651\n18.3\nlinear classiﬁers with quadratic regularization . . . . .\n654\n18.3.1\nregularized discriminant analysis . . . . . . . .\n656\n18.3.2\nlogistic regression\nwith quadratic regularization . . . . . . . . . .\n657\n18.3.3\nthe support vector classiﬁer\n. . . . . . . . . .\n657\n18.3.4\nfeature selection . . . . . . . . . . . . . . . . . .\n658\n18.3.5\ncomputational shortcuts when p ≫n . . . . .\n659\n18.4\nlinear classiﬁers with l1 regularization\n. . . . . . . . .\n661\n18.4.1\napplication of lasso\nto protein mass spectroscopy\n. . . . . . . . . .\n664\n18.4.2\nthe fused lasso for functional data\n. . . . . .\n666\n18.5\nclassiﬁcation when features are unavailable . . . . . . .\n668\n18.5.1\nexample: string kernels\nand protein classiﬁcation . . . . . . . . . . . . .\n668\n18.5.2\nclassiﬁcation and other models using\ninner-product kernels and pairwise d

In [12]:
from google.api_core.client_options import ClientOptions
from google.cloud import discoveryengine

# Project and data store settings passed to create_data_store_sample and
# reused by the document-import cell.
project_id = "ardent-sun-453501-d5"
location = "global" # Values: "global"
data_store_id = "book_store"


def create_data_store_sample(
    project_id: str,
    location: str,
    data_store_id: str,
) -> str:
    """Create a generic search data store and wait for the operation to finish.

    Args:
        project_id: Project ID used to build the parent collection path.
        location: Location of the collection; any value other than "global"
            selects the matching ``{location}-discoveryengine`` endpoint.
        data_store_id: ID given to the new data store.

    Returns:
        The name of the long-running create operation.
    """
    # Only non-global locations need an explicit API endpoint; see
    # https://cloud.google.com/generative-ai-app-builder/docs/locations#specify_a_multi-region_for_your_data_store
    if location == "global":
        endpoint_options = None
    else:
        endpoint_options = ClientOptions(
            api_endpoint=f"{location}-discoveryengine.googleapis.com"
        )

    service = discoveryengine.DataStoreServiceClient(client_options=endpoint_options)

    # Parent collection, e.g.
    # projects/{project}/locations/{location}/collections/default_collection
    collection = service.collection_path(
        project=project_id, location=location, collection="default_collection"
    )

    # Data store definition.
    #   industry_vertical options: GENERIC, MEDIA, HEALTHCARE_FHIR
    #   solution_types options: SOLUTION_TYPE_RECOMMENDATION, SOLUTION_TYPE_SEARCH,
    #       SOLUTION_TYPE_CHAT, SOLUTION_TYPE_GENERATIVE_CHAT
    #   content_config options: NO_CONTENT, CONTENT_REQUIRED, PUBLIC_WEBSITE
    store_spec = discoveryengine.DataStore(
        display_name="datastore books",
        industry_vertical=discoveryengine.IndustryVertical.GENERIC,
        solution_types=[discoveryengine.SolutionType.SOLUTION_TYPE_SEARCH],
        content_config=discoveryengine.DataStore.ContentConfig.CONTENT_REQUIRED,
    )

    # `create_advanced_site_search=True` may be added to the request for
    # Advanced Site Search data stores; it is not used here.
    operation = service.create_data_store(
        request=discoveryengine.CreateDataStoreRequest(
            parent=collection,
            data_store_id=data_store_id,
            data_store=store_spec,
        )
    )

    print(f"Waiting for operation to complete: {operation.operation.name}")
    created_store = operation.result()

    # Once the operation has finished, wrap its metadata for display.
    operation_metadata = discoveryengine.CreateDataStoreMetadata(operation.metadata)

    print(created_store)
    print(operation_metadata)

    return operation.operation.name


In [13]:
# Create the data store; the call waits on operation.result() before returning.
create_data_store_sample(  project_id, location, data_store_id)

Waiting for operation to complete: projects/203101603788/locations/global/collections/default_collection/operations/create-data-store-5470281226982137829
name: "projects/203101603788/locations/global/collections/default_collection/dataStores/book_store"
display_name: "datastore books"
industry_vertical: GENERIC
solution_types: SOLUTION_TYPE_SEARCH
content_config: CONTENT_REQUIRED
default_schema_id: "default_schema"
document_processing_config {
  name: "projects/203101603788/locations/global/collections/default_collection/dataStores/book_store/documentProcessingConfig"
  default_parsing_config {
    digital_parsing_config {
    }
  }
}
serving_config_data_store {
}




'projects/203101603788/locations/global/collections/default_collection/operations/create-data-store-5470281226982137829'

In [33]:
from google.api_core.client_options import ClientOptions
from google.cloud import discoveryengine

# This cell reads `project_id`, `location` and `data_store_id` from the
# notebook namespace, and imports every URI in `pdf_list`.

# Accepted source URI forms:
# - Unstructured documents
#   - `gs://bucket/directory/file.pdf`
#   - `gs://bucket/directory/*.pdf`
# - Unstructured documents with JSONL Metadata
#   - `gs://bucket/directory/file.json`
# - Unstructured documents with CSV Metadata
#   - `gs://bucket/directory/file.csv`
gcs_uri = pdf_list

# Only non-global locations need an explicit API endpoint; see
# https://cloud.google.com/generative-ai-app-builder/docs/locations#specify_a_multi-region_for_your_data_store
if location == "global":
    client_options = None
else:
    client_options = ClientOptions(
        api_endpoint=f"{location}-discoveryengine.googleapis.com"
    )

client = discoveryengine.DocumentServiceClient(client_options=client_options)

# Branch that receives the documents, e.g.
# projects/{project}/locations/{location}/dataStores/{data_store_id}/branches/{branch}
parent = client.branch_path(
    project=project_id, location=location, data_store=data_store_id, branch="default_branch"
)

# data_schema options:
# - `content` - Unstructured documents (PDF, HTML, DOC, TXT, PPTX)
# - `custom` - Unstructured documents with custom JSONL metadata
# - `document` - Structured documents in the discoveryengine.Document format.
# - `csv` - Unstructured documents with CSV metadata
# reconciliation_mode options: `FULL`, `INCREMENTAL`
request = discoveryengine.ImportDocumentsRequest(
    parent=parent,
    gcs_source=discoveryengine.GcsSource(input_uris=gcs_uri, data_schema="content"),
    reconciliation_mode=discoveryengine.ImportDocumentsRequest.ReconciliationMode.INCREMENTAL,
)

operation = client.import_documents(request=request)
print(f"Waiting for operation to complete: {operation.operation.name}")

# Block until the import finishes, then show the response and its metadata.
response = operation.result()
metadata = discoveryengine.ImportDocumentsMetadata(operation.metadata)

print(response)
print(metadata)

Waiting for operation to complete: projects/203101603788/locations/global/collections/default_collection/dataStores/book_store/branches/0/operations/import-documents-13123355099857093690
error_config {
  gcs_prefix: "gs://203101603788_us_central1_import_content/errors13123355099857096525"
}

create_time {
  seconds: 1744483317
  nanos: 62505000
}
update_time {
  seconds: 1744483998
  nanos: 953814000
}
success_count: 13
total_count: 13



In [71]:
# Project and region used to initialise the Vertex AI SDK.
PROJECT_ID = "ardent-sun-453501-d5"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

import vertexai

vertexai.init(project=PROJECT_ID, location=LOCATION)

# Data store (ID and location) handed to the search retriever.
DATA_STORE_ID = "book_store"  # @param {type:"string"}
DATA_STORE_LOCATION = "global"  # @param {type:"string"}

# Model name passed to the VertexAI LLM wrapper.
MODEL = "gemini-2.0-flash"  # @param {type:"string"}

# if PROJECT_ID == "ardent-sun-453501-d5" or DATA_STORE_ID == "book_store":
#     raise ValueError(
#         "Please set the PROJECT_ID, DATA_STORE_ID constants to reflect your environment."
#     )
     
    
from langchain.chains import (
    ConversationalRetrievalChain,
    RetrievalQA,
    RetrievalQAWithSourcesChain,
)
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain_google_community import (
    VertexAIMultiTurnSearchRetriever,
    VertexAISearchRetriever,
)
from langchain_google_vertexai import VertexAI




# LLM wrapper for the model named in MODEL.
llm = VertexAI(model_name=MODEL)

# Retriever pointed at the data store configured via DATA_STORE_ID /
# DATA_STORE_LOCATION, with extractive answers switched on.
# NOTE(review): max_documents presumably caps the documents returned per query - confirm.
retriever = VertexAISearchRetriever(
    project_id=PROJECT_ID,
    location_id=DATA_STORE_LOCATION,
    data_store_id=DATA_STORE_ID,
    get_extractive_answers=True,
    max_documents=4,
    max_extractive_segment_count=1,
    max_extractive_answer_count=5,
    beta=True
    
)


search_query = "What is MLOps"  # @param {type:"string"}

# RetrievalQA chain over the retriever; return_source_documents=True makes the
# result dict carry a 'source_documents' entry next to 'query' and 'result'.
retrieval_qa = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True
)

# Run the query through the chain; the answer dict is inspected in the next cell.
ans  = retrieval_qa.invoke(search_query)



In [72]:
ans

{'query': 'What is MLOps',
 'result': "MLOps (Machine Learning Operations) is a rapidly evolving field that combines DataOps, DevOps, and ModelOps to build and maintain robust, flexible, and efficient workflows for developing, deploying, and maintaining AI models at scale. It's becoming critical for successful data science project deployment in the enterprise.\n",
 'source_documents': [Document(metadata={'id': 'c95500c41a9d900b798c436f0368e892', 'source': 'gs://lossless-learning/books/Introducing MLOps by Mark Treveil.pdf17'}, page_content='CHAPTER 1 Why Now and Challenges Machine learning operations (MLOps) is quickly becoming a critical component of successful data science project deployment in the enterprise (Figure 1-1).'),
  Document(metadata={'id': 'c95500c41a9d900b798c436f0368e892', 'source': 'gs://lossless-learning/books/Introducing MLOps by Mark Treveil.pdf21'}, page_content='The complexity of this environment, including the fact that machine learning models are made up of bot