# Read in all the required documents

## If your docs are all chunked and stored in the Matching Enginee, this document-loading part does not need to be run.

In [1]:
# Load in your documents to local environment

# Documentation Optimization


## Try the method of "question_ansering_document“ from Google Generative AI



### Some preparation steps, there is a "utils" folder in one of the Google's Github repo, it is an important one that contains some key functions to make the rest work. At the end of this section, there are some variable for google cloud project. I was using the one Jon created. Feel free to adjust it if there is need

In [None]:
# Copy a required utils folder from github

%%capture
!git clone https://github.com/GoogleCloudPlatform/generative-ai.git
!cp -r /content/generative-ai/language/examples/document-qa/utils /content

# Install Vertex AI LLM SDK
! pip install -U google-cloud-aiplatform==1.27.0 langchain==0.0.201

# Dependencies required by Unstructured PDF loader
! sudo apt -y -qq install tesseract-ocr libtesseract-dev
! sudo apt-get -y -qq install poppler-utils
! pip install -U unstructured==0.7.5 pdf2image==1.16.3 pytesseract==0.3.10 pdfminer.six

# For Matching Engine integration dependencies (default embeddings)
! pip install -U tensorflow_hub==0.13.0 tensorflow_text==2.12.1

# Automatically restart kernel after installs so that your environment can access the new packages
# The environment will restart and just run the following cells
import IPython

app = IPython.Application.instance()
app.kernel.do_shutdown(True)

In [None]:
from google.colab import auth

auth.authenticate_user()

import json
import textwrap
# Utils
import time
import uuid
from typing import List

import numpy as np
import vertexai
# Vertex AI
from google.cloud import aiplatform

print(f"Vertex AI SDK version: {aiplatform.__version__}")

# Langchain
import langchain

print(f"LangChain version: {langchain.__version__}")

from langchain.chains import RetrievalQA
from langchain.document_loaders import GCSDirectoryLoader
from langchain.embeddings import VertexAIEmbeddings
from langchain.llms import VertexAI
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pydantic import BaseModel

# Import custom Matching Engine packages
from utils.matching_engine import MatchingEngine
from utils.matching_engine_utils import MatchingEngineUtils


PROJECT_ID = "mdai-interns-summer-2023"  # @param {type:"string"}
REGION = "us-central1"  # @param {type:"string"}

# Initialize Vertex AI SDK
vertexai.init(project=PROJECT_ID, location=REGION)

# Utility functions for Embeddings API with rate limiting
def rate_limit(max_per_minute):
    period = 60 / max_per_minute
    print("Waiting")
    while True:
        before = time.time()
        yield
        after = time.time()
        elapsed = after - before
        sleep_time = max(0, period - elapsed)
        if sleep_time > 0:
            print(".", end="")
            time.sleep(sleep_time)


class CustomVertexAIEmbeddings(VertexAIEmbeddings, BaseModel):
    requests_per_minute: int
    num_instances_per_batch: int

    # Overriding embed_documents method
    def embed_documents(self, texts: List[str]):
        limiter = rate_limit(self.requests_per_minute)
        results = []
        docs = list(texts)

        while docs:
            # Working in batches because the API accepts maximum 5
            # documents per request to get embeddings
            head, docs = (
                docs[: self.num_instances_per_batch],
                docs[self.num_instances_per_batch :],
            )
            chunk = self.client.get_embeddings(head)
            results.extend(chunk)
            next(limiter)

        return [r.values for r in results]


# Text model instance integrated with langChain
llm = VertexAI(
    model_name="text-bison@001",
    max_output_tokens=1024,
    temperature=0.2,
    top_p=0.8,
    # top_k=40,
    top_k=20,
    verbose=True,
)

# Embeddings API integrated with langChain
EMBEDDING_QPM = 100
EMBEDDING_NUM_BATCH = 5
embeddings = CustomVertexAIEmbeddings(
    requests_per_minute=EMBEDDING_QPM,
    num_instances_per_batch=EMBEDDING_NUM_BATCH,
)


ME_REGION = # your region
ME_INDEX_NAME = # your index name @param {type:"string"}
ME_EMBEDDING_DIR =  # a google cloud storage bucket @param {type:"string"}
ME_DIMENSIONS = 768  # when using Vertex PaLM Embedding
mengine = MatchingEngineUtils(PROJECT_ID, ME_REGION, ME_INDEX_NAME)

Vertex AI SDK version: 1.27.0
LangChain version: 0.0.201


### Run this section of codes if the matching engine has not been deployed. (I have deployed, so this part could be skipped)

In [None]:
# Make a Google Cloud Storage bucket for your Matching Engine index
! gsutil mb -l us-central1 $ME_EMBEDDING_DIR

Creating gs://mdai_matching_engine_test_1/...


In [None]:
# dummy embedding
init_embedding = {"id": str(uuid.uuid4()), "embedding": list(np.zeros(ME_DIMENSIONS))}

# dump embedding to a local file
with open("embeddings_0.json", "w") as f:
    json.dump(init_embedding, f)

# write embedding to Cloud Storage
! gsutil cp embeddings_0.json {ME_EMBEDDING_DIR}/init_index/embeddings_0.json

Copying file://embeddings_0.json [Content-Type=application/json]...
/ [1 files][  3.8 KiB/  3.8 KiB]                                                
Operation completed over 1 objects/3.8 KiB.                                      


In [None]:
# create index
index = mengine.create_index(
    embedding_gcs_uri=f"{ME_EMBEDDING_DIR}/init_index",
    dimensions=ME_DIMENSIONS,
    index_update_method="streaming",
    index_algorithm="tree-ah",
)
if index:
    print(index.name)

......................................................projects/837432346917/locations/us-central1/indexes/4678738635535679488


In [None]:
index_endpoint = mengine.deploy_index()
if index_endpoint:
    print(f"Index endpoint resource name: {index_endpoint.name}")
    print(
        f"Index endpoint public domain name: {index_endpoint.public_endpoint_domain_name}"
    )
    print("Deployed indexes on the index endpoint:")
    for d in index_endpoint.deployed_indexes:
        print(f"    {d.id}")

.................Index endpoint resource name: projects/837432346917/locations/us-central1/indexEndpoints/3170032760366563328
Index endpoint public domain name: 
Deployed indexes on the index endpoint:


### This section of code load documents into the matching enginee. If you already loaded documents before, the docs will be added to a google bucket. So be careful that repeated running will cause repeated files inserted (I have loaded into this as my project, so this part could be skipped)

In [None]:
# """
# The commented area loads documents from GCS
# """
# # cloud bucket path
# GCS_URL_KB = "gs://mdai-summer-interns-2023-files/docs/content/"

# # Ingest files
# bucket_name = GCS_URL_KB.split("/")[2]
# print(bucket_name)
# prefix = "/".join(GCS_URL_KB.split("/")[3:])
# print(prefix)

# print(f"Processing documents from {GCS_URL_KB}")
# loader = GCSDirectoryLoader(project_name=PROJECT_ID, bucket=bucket_name, prefix=prefix, glob="**/*.md")

"""
This part tries to directly read from directory within the environment
"""
from langchain.document_loaders import DirectoryLoader

path = "/content/gcs-store/docs/content"
loader = DirectoryLoader(path, glob="**/*.md")

documents = loader.load()

# Add document name and source to the metadata
for document in documents:
    doc_md = document.metadata
    document_name = doc_md["source"].split("/")[-1]

    split_list = doc_md["source"].split("/")
    source = "/".join(split_list[:len(split_list)-1])

    # This commented part works for Google cloud storage path

    # doc_source_suffix = "/".join(doc_md["source"].split("/")[4:-1])
    # source = f"{doc_source_prefix}/{doc_source_suffix}"
    document.metadata = {"source": source, "document_name": document_name}

print(f"# of documents loaded (pre-chunking) = {len(documents)}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


# of documents loaded (pre-chunking) = 71


In [None]:
# Do an extra step of filter (filter out those files with keyword "update" in title), we don't want unnecessary changelog files
documents = [document for document in documents if 'updates' not in document.metadata['document_name']]
print(len(documents))

53


In [None]:
# chunk documents

# split the documents into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=50,
    separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
)
doc_splits = text_splitter.split_documents(documents)
# doc_splits = text_splitter.split_documents(documents_dyllan)

# Add chunk number to metadata
for idx, split in enumerate(doc_splits):
    split.metadata["chunk"] = idx

print(f"# of documents = {len(doc_splits)}")


# of documents = 227


### This part is created to insert an extra file outside the documentation page, (If this is already added, can skip) All these could be adjusted, code is quite similar

In [None]:
# This code is wrote to specifically insert Dyllan's code which is out of the current
# documenation folder

from langchain.document_loaders import TextLoader

path = "/content/DICOM_SR_SEG.md"
loader_dyllan = TextLoader(path)

documents_dyllan = loader_dyllan.load()

# Add document name and source to the metadata
for document in documents_dyllan:
    doc_md = document.metadata
    document_name = doc_md["source"].split("/")[-1]


    # This commented part works for Google cloud storage path

    # doc_source_suffix = "/".join(doc_md["source"].split("/")[4:-1])
    # source = f"{doc_source_prefix}/{doc_source_suffix}"
    document.metadata = {"source": "DICOM_SR_EXPORT", "document_name": document_name}

print(f"# of documents loaded (pre-chunking) = {len(documents_dyllan)}")

# of documents loaded (pre-chunking) = 1


In [None]:
# chunk documents

# split the documents into chunks
text_splitter_dyllan = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=50,
    separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
)
# doc_splits = text_splitter.split_documents(documents)
doc_splits_dyllan = text_splitter_dyllan.split_documents(documents_dyllan)

# Add chunk number to metadata
for idx, split in enumerate(doc_splits_dyllan):
    split.metadata["chunk"] = idx

print(f"# of documents = {len(doc_splits_dyllan)}")



# of documents = 28


In [None]:
doc_splits_dyllan[0]

Document(page_content='## MD.AI Annotation to SR Export Tool\n### Export Annotations from MD.AI\'s platform into DICOM SR Format\n\nSome MD.ai packeges and some other Python packages need to be inputted. And then there is the of class "SR_SEG_Export". You need to have this class to be in a python file that can be accessed from your envionrment. You could then use this class to finish exporting annotation to DICOM SR format. The SR_SEG_Export class could also be found in this notebook: https://colab.research.google.com/drive/1d4I7wZhlRIS2T0nFo6dhmefi5qDHBSkW?usp=sharing\n\n```python\nimport mdai.SR_SEG_Export\n\nmdai.SR_SEG_EXPORT.SR_SEG_EXPORT.SR_Export(\n  annotation_json = \'mdai_staging_project_L1NpnQBv_annotations_2023-06-28-135404.json\',\n  metadata_json = \'mdai_staging_project_L1NpnQBv_annotations_2023-06-28-165932.json\'\n)\n\n```\n\n\n```python\nimport mdai\nimport pandas as pd\nfrom datetime import datetime\nimport os\nimport requests', metadata={'source': 'DICOM_SR_EXPORT',

In [None]:
# doc_splits = doc_splits+ doc_splits_dyllan
# len(doc_splits)

In [None]:
# Specifically for Dyllan's extra Dicom document
texts = [doc.page_content for doc in doc_splits_dyllan]
metadatas = [
    [
        {"namespace": "source", "allow_list": [doc.metadata["source"]]},
        {"namespace": "document_name", "allow_list": [doc.metadata["document_name"]]},
        {"namespace": "chunk", "allow_list": [str(doc.metadata["chunk"])]},
    ]
    for doc in doc_splits_dyllan
]

In [None]:
first = [doc_splits_dyllan[0]]
texts = [doc.page_content for doc in first]
metadatas = [
    [
        {"namespace": "source", "allow_list": [doc.metadata["source"]]},
        {"namespace": "document_name", "allow_list": [doc.metadata["document_name"]]},
        {"namespace": "chunk", "allow_list": [str(doc.metadata["chunk"])]},
    ]
    for doc in first
]

### Start to feed in document chunks (If the documents are already added, can skip)

In [None]:
# mengine = MatchingEngineUtils(PROJECT_ID, ME_REGION, ME_INDEX_NAME)
ME_INDEX_ID, ME_INDEX_ENDPOINT_ID = mengine.get_index_and_endpoint()
print(f"ME_INDEX_ID={ME_INDEX_ID}")
print(f"ME_INDEX_ENDPOINT_ID={ME_INDEX_ENDPOINT_ID}")

ME_INDEX_ID=projects/837432346917/locations/us-central1/indexes/4678738635535679488
ME_INDEX_ENDPOINT_ID=projects/837432346917/locations/us-central1/indexEndpoints/3170032760366563328


In [None]:
# initialize vector store
me = MatchingEngine.from_components(
    project_id=PROJECT_ID,
    region=ME_REGION,
    gcs_bucket_name=f'gs://{ME_EMBEDDING_DIR.split("/")[2]}',
    embedding=embeddings,
    index_id=ME_INDEX_ID,
    endpoint_id=ME_INDEX_ENDPOINT_ID,
)

In [None]:
# Store docs as embeddings in Matching Engine index
# It may take a while since API is rate limited
texts = [doc.page_content for doc in doc_splits]
metadatas = [
    [
        {"namespace": "source", "allow_list": [doc.metadata["source"]]},
        {"namespace": "document_name", "allow_list": [doc.metadata["document_name"]]},
        {"namespace": "chunk", "allow_list": [str(doc.metadata["chunk"])]},
    ]
    for doc in doc_splits
]

In [None]:
# Insert, 100 at a time, otherwise weird quota limit error might occur
for i in range(int(len(metadatas)/100)+1):
  print(f"{i} start")
  start = i * 100

  end = (i+1) * 100
  if end <= len(metadatas):
    doc_ids = me.add_texts(texts=texts[start:end], metadatas=metadatas[start:end])
  else:
    doc_ids = me.add_texts(texts=texts[start:], metadatas=metadatas[start:])


### Testing section

In [None]:
# mengine = MatchingEngineUtils(PROJECT_ID, ME_REGION, ME_INDEX_NAME)
ME_INDEX_ID, ME_INDEX_ENDPOINT_ID = mengine.get_index_and_endpoint()
print(f"ME_INDEX_ID={ME_INDEX_ID}")
print(f"ME_INDEX_ENDPOINT_ID={ME_INDEX_ENDPOINT_ID}")

# initialize vector store
me = MatchingEngine.from_components(
    project_id=PROJECT_ID,
    region=ME_REGION,
    gcs_bucket_name=f'gs://{ME_EMBEDDING_DIR.split("/")[2]}',
    embedding=embeddings,
    index_id=ME_INDEX_ID,
    endpoint_id=ME_INDEX_ENDPOINT_ID,
)

ME_INDEX_ID=projects/837432346917/locations/us-central1/indexes/4678738635535679488
ME_INDEX_ENDPOINT_ID=projects/837432346917/locations/us-central1/indexEndpoints/3170032760366563328


### This is the part that uses GPT model, the ultimate function used to generate response is called "gpt_response_generation"

In [None]:
# This is the code provided by Jon to deal with some really weird situtions, when some input document text cause LLM not able to generate anything
import requests
import logging
from http.client import HTTPConnection  # py3

log = logging.getLogger('urllib3')
log.setLevel(logging.DEBUG)

# logging from urllib3 to console
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
log.addHandler(ch)

# print statements from `http.client.HTTPConnection` to console/stdout
HTTPConnection.debuglevel = 1

In [None]:
question = "Write me a script that works for the whole pipeline of exporting MD.ai annotations into DICOM SR format and store to my default google bucket."
# question = "write me a script that exports annotations into DICOM SR format"

#### Start using gpt to generate response

In [None]:
%%capture
! pip install --upgrade mdai

import mdai
import json
domain = # your domain
access_token = # your personal token
mdai_client = mdai.Client(domain= domain, access_token= access_token)

In [None]:
def gpt_response_generation(me, question):
  system_prompt1 = " The detailed pipeline script should cover these steps 1. download the annotation and metadata json files using the mdai_client.project and mdaiclient.download_dicom_metadata functions, do not download anything else such as model output, do not convert json to dataframe 2. after downloading, find these 2 json files by searching keyword 'dicom_meta_data' and 'annotations' in it, pass them as input to do the DICOM SR export 3. then directly upload the output folder to google bucket"
  question_with_guideline = question + system_prompt1

  res = me.similarity_search(question_with_guideline, k=7, search_distance = 0.6)
  content = [doc.page_content for doc in res]
  # Retrieval Prompt Template
  template = """SYSTEM: You are an intelligent assistant helping the users with their questions on techniques related to md.ai cloud platform products.

  Question: {question}

  There is a set of rules:

  1. Use the following pieces of context to answer the question at the end. Think step-by-step and then answer.

  2. If the answer is about coding, please make sure it is easy to separate text and code.

  3. If there are parts that the context does not answer, and it is about coding, you can use your coding knowledge.

  4. For code that interacts with MD.ai, please strictly follow the structure in context, do not create function name by yourself

  5. But for conceptual questions, Do not try to make up an answer and strictly answer based on the context.

  6. If the context is empty, just say "I do not know the answer to that."

  ============
  {explanation}
  ============

  =============
  {context}
  =============

  Personal Information that should be filled into code
  =============
  domain: staging.md.ai
  personal token: 1234567mdai
  project id: 1234567mdaiProject
  dataset id: 1234567Dataset
  google bucket name: ClarenceTest1
  output folder: ./SR_Output



  =============



  Question: {question}
  Helpful Answer:"""

  """
  - If the answer to the question cannot be determined from the context alone, say "I cannot determine the answer to that."
  """
  formatted_template = template.format(explanation = system_prompt1 , context='\n'.join(content), question = question)
  return formatted_template
  # messages_p2 = [
  #           {"role": "user", "content": formatted_template}]

  # result = mdai_client.chat_completion.create(messages_p2, model='gpt-4', temperature=0)
  # return result["choices"][0]["message"]["content"]

In [None]:
gpt_response_generation(me, question)

Waiting




#### The generated response:

Sure, here is a Python script that should accomplish this task. Please replace the placeholders with your actual values.

```python
import os
import mdai
from google.cloud import storage

# Define your personal information
DOMAIN = 'staging.md.ai'
YOUR_PERSONAL_TOKEN = '1234567mdai'
PROJECT_ID = '1234567mdaiProject'
DATASET_ID = '1234567Dataset'
GOOGLE_BUCKET_NAME = 'ClarenceTest1'
OUTPUT_FOLDER = './SR_Output'

# Create mdai client
mdai_client = mdai.Client(domain=DOMAIN, access_token=YOUR_PERSONAL_TOKEN)

# Download only the annotation data
p = mdai_client.project(PROJECT_ID, path='.',  annotations_only=True)

# Download only the DICOM metadata
p = mdai_client.download_dicom_metadata(PROJECT_ID, DATASET_ID, format ="json", path='.')

# Find the downloaded json files
annotation_json = [f for f in os.listdir('.') if 'annotations' in f][0]
metadata_json = [f for f in os.listdir('.') if 'dicom_meta_data' in f][0]

# Export annotations to DICOM SR format
import mdai.SR_SEG_Export
mdai.SR_SEG_EXPORT.SR_SEG_EXPORT.SR_Export(
  annotation_json = annotation_json,
  metadata_json = metadata_json
)

# Upload the output folder to Google Cloud Storage
storage_client = storage.Client()
bucket = storage_client.get_bucket(GOOGLE_BUCKET_NAME)
blobs = bucket.list_blobs()

for blob in blobs:
    file_path = os.path.join(OUTPUT_FOLDER, blob.name)
    blob.upload_from_filename(file_path)
```

This script first downloads the annotation and metadata json files using the mdai_client.project and mdai_client.download_dicom_metadata functions. It then finds these json files and passes them as input to the DICOM SR export function. Finally, it uploads the output folder to your specified Google Cloud Storage bucket.

#### Best Version of the response



Based on the data obtained from the link [https://link](https://link), the content suggests a process to export MD.ai annotations into DICOM SR format and store them in a Google bucket. Here is a step-by-step guide:

1. First, we need to download the annotation and metadata JSON files. This can be done using the `mdai_client.project` and `mdai_client.download_dicom_metadata` functions.

```python
import mdai

DOMAIN = 'http://md.ai'
YOUR_PERSONAL_TOKEN = '1234567mdai'
PROJECT_ID = '1234567mdaiProject'
DATASET_ID = '1234567Dataset'
mdai_client = mdai.Client(domain=DOMAIN, access_token=YOUR_PERSONAL_TOKEN)

# Download the annotation data
p = mdai_client.project(PROJECT_ID, path='.',  annotations_only=True)

# Download the DICOM metadata
p = mdai_client.download_dicom_metadata(PROJECT_ID, DATASET_ID, format ="json", path='.')
```

2. After downloading, we need to read in the path of these two files as input to do the DICOM SR export.

```python
import mdai.SR_SEG_Export

mdai.SR_SEG_EXPORT.SR_SEG_EXPORT.SR_Export(
  annotation_json = 'mdai_public_project_MwBe19Br_annotations_labelgroup_all_2020-09-23-214038.json',
  metadata_json = 'mdai_public_project_MwBe19Br_dicom_metadata_2020-09-23-214038.json'
)
```

3. Finally, we need to upload the output folder to the Google bucket.

```python
from google.cloud import storage

def upload_to_bucket(bucket_name, source_file_name, destination_blob_name):
    """Uploads a file to the bucket."""
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    blob.upload_from_filename(source_file_name)

    print(
        "File {} uploaded to {}.".format(
            source_file_name, destination_blob_name
        )
    )

upload_to_bucket('ClarenceTest1', './SR_Output', 'SR_Output')
```

Please replace the placeholders with your actual data. This script should help you achieve your goal.

### This is the part that uses Google PaLM model, the utlimate function used to generate is called "ask"

In [None]:
# Configure Question/Answering Chain with Vector Store using Text

# Create chain to answer questions
NUMBER_OF_RESULTS = 5
SEARCH_DISTANCE_THRESHOLD = 0.6

# Expose index to the retriever
retriever = me.as_retriever(
    search_type="similarity",
    search_kwargs={
        "k": NUMBER_OF_RESULTS,
        "search_distance": SEARCH_DISTANCE_THRESHOLD,
    },
)


In [None]:
# Retrieval Prompt Template
template = """SYSTEM: You are an intelligent assistant helping the users with their questions on techniques related to md.ai cloud platform products.

Question: {question}

There is a set of rules:

1. Use the following pieces of context to answer the question at the end. Think step-by-step and then answer.

2. If the answer is about coding, please make sure it is easy to separate text and code.

3. If there are parts that the context does not answer, and it is about coding, you can use your coding knowledge.

4. If the context has a link, always directly include the link in the response

5. But for conceptual questions, Do not try to make up an answer and strictly answer based on the context.

6. If the context is empty, just say "I do not know the answer to that."


=============
{context}
=============

Question: {question}
Helpful Answer:"""

"""
 - If the answer to the question cannot be determined from the context alone, say "I cannot determine the answer to that."
"""

'\n - If the answer to the question cannot be determined from the context alone, say "I cannot determine the answer to that."\n'

In [None]:
# Configure RetrievalQA chain

# Uses LLM to synthesize results from the search index.
# Use Vertex PaLM Text API for LLM
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    verbose=True,
    chain_type_kwargs={
        "prompt": PromptTemplate(
            template=template,
            input_variables=["context", "question"],
        ),
    },
)

In [None]:
# Enable for troubleshooting
qa.combine_documents_chain.verbose = True
qa.combine_documents_chain.llm_chain.verbose = True
qa.combine_documents_chain.llm_chain.llm.verbose = True

In [None]:
# Main functions

def formatter(result):
    print(f"Query: {result['query']}")
    print("." * 80)
    if "source_documents" in result.keys():
        for idx, ref in enumerate(result["source_documents"]):
            print("-" * 80)
            print(f"REFERENCE #{idx}")
            print("-" * 80)
            if "score" in ref.metadata:
                print(f"Matching Score: {ref.metadata['score']}")
            if "source" in ref.metadata:
                print(f"Document Source: {ref.metadata['source']}")
            if "document_name" in ref.metadata:
                print(f"Document Name: {ref.metadata['document_name']}")
            print("." * 80)
            print(f"Content: \n{wrap(ref.page_content)}")
    print("." * 80)
    response = wrap(result['result'])
    print(f"Response: {response}")
    print("." * 80)
    return response


def wrap(s):
    return "\n".join(textwrap.wrap(s, width=120, break_long_words=False))


def ask(query, qa=qa, k=NUMBER_OF_RESULTS, search_distance=SEARCH_DISTANCE_THRESHOLD):
    qa.retriever.search_kwargs["search_distance"] = search_distance
    qa.retriever.search_kwargs["k"] = k
    result = qa({"query": query})
    print(result)
    return formatter(result)

#### This is the part that I try to answer a list of questions in a file called "questions.txt“ and format the response in a csv form

In [None]:
%%capture
import pandas as pd
response_list = []
with open('/content/questions.txt', 'r') as file:
    content = file.read()

# Split the content by newline
question_list = content.split('\n')
for question in question_list:
  response_list.append(ask(question))


In [None]:
df.to_csv("PaLM_systemPrompt1_neighbor3_threshold0.6.csv", index = False)

In [None]:
df = pd.DataFrame({'prompt': question_list, 'response': response_list})
df

Unnamed: 0,prompt,response
0,What is Md.ai command line interface tool?,The MD.ai CLI is a command line interface tool...
1,What is Md.ai?,MD.ai is a cloud platform for medical image an...
2,What should I have for dinner?,I do not know the answer to that.
3,Do you like dolphin?,I do not know the answer to that.
4,Does MD.ai support de-id?,"Yes, MD.ai supports de-id. It has a separate D..."
5,Show me the example code to download MD.ai ann...,The following code can be used to download MD....
6,How can I perform segmentation on <some anatom...,To perform segmentation on <some anatomical st...
7,How can I quickly segment <some region of inte...,To quickly segment <some region of interest> a...
8,Show me how to render images from one plane to...,To render images from one plane to other plane...
9,How would I provide sub-classification on a la...,"To provide sub-classification on a label, you ..."


In [None]:
df = pd.DataFrame({'prompt': question_list, 'response': response_list})
df

Unnamed: 0,prompt,response
0,What is Md.ai command line interface tool?,The MD.ai CLI is a convenient tool for efficie...
1,What is Md.ai?,MD.ai is a cloud platform for medical imaging ...
2,What should I have for dinner?,I do not know the answer to that.
3,Do you like dolphin?,I do not know the answer to that.
4,Does MD.ai support de-id?,"Yes, MD.ai provides a suite of tools for all y..."
5,Show me the example code to download MD.ai ann...,The code to download MD.ai annotations is as f...
6,How can I perform segmentation on <some anatom...,To perform segmentation on <some anatomical st...
7,How can I quickly segment <some region of inte...,Interpolation helps solve this problem by aski...
8,Show me how to render images from one plane to...,I do not know the answer to that.
9,How would I provide sub-classification on a la...,I do not know the answer to that.


In [None]:
df = pd.DataFrame({'prompt': question_list, 'response': response_list})
df

Unnamed: 0,prompt,response
0,What is Md.ai command line interface tool?,The MD.ai CLI is a convenient tool for efficie...
1,What is Md.ai?,MD.ai is a cloud platform for medical image an...
2,What should I have for dinner?,I do not know the answer to that.
3,Do you like dolphin?,I do not know the answer to that.
4,Does MD.ai support de-id?,"Yes, MD.ai provides a suite of tools for all y..."
5,Show me the example code to download MD.ai ann...,The following code snippet shows how to downlo...
6,How can I perform segmentation on <some anatom...,To perform segmentation on <some anatomical st...
7,How can I quickly segment <some region of inte...,To quickly segment <some region of interest> a...
8,Show me how to render images from one plane to...,To render images from one plane to other plane...
9,How would I provide sub-classification on a la...,"To provide sub-classification on a label, you ..."


In [None]:
df = pd.DataFrame({'prompt': question_list, 'response': response_list})
df

Unnamed: 0,prompt,response
0,What is Md.ai command line interface tool?,The MD.ai CLI is a command line interface tool...
1,What is Md.ai?,MD.ai is a cloud platform for medical image an...
2,What should I have for dinner?,I do not know the answer to that.
3,Do you like dolphin?,I do not know the answer to that.
4,Does MD.ai support de-id?,"Yes, MD.ai supports de-id. It provides a suite..."
5,Show me the example code to download MD.ai ann...,"To download MD.ai annotations, you can use the..."
6,How can I perform segmentation on <some anatom...,To perform segmentation on <some anatomical st...
7,How can I quickly segment <some region of inte...,Interpolation helps solve this problem by aski...
8,Show me how to render images from one plane to...,To render images from one plane to other plane...
9,How would I provide sub-classification on a la...,"To provide sub-classification on a label, you ..."


No charts were generated by quickchart


In [None]:
ask("what is the code to export annotations into DICOM SR format?")



[1m> Entering new  chain...[0m
Waiting


[1m> Entering new  chain...[0m


[1m> Entering new  chain...[0m
Prompt after formatting:
[32;1m[1;3mSYSTEM: You are an intelligent assistant helping the users with their questions on techniques related to md.ai cloud platform products.

Question: what is the code to export annotations into DICOM SR format?

There is a set of rules:

1. Use the following pieces of context to answer the question at the end. Think step-by-step and then answer.

2. If the answer is about coding, please make sure it is easy to separate text and code.

3. If there are parts that the context does not answer, and it is about coding, you can use your coding knowledge.

4. If the context has a link, always directly include the link in the response

5. But for conceptual questions, Do not try to make up an answer and strictly answer based on the context.

6. If the context is empty, just say "I do not know the answer to that."


## MD.AI Annotation to SR Export T

"The code to export annotations into DICOM SR format is as follows:  ```python import mdai.SR_SEG_Export\nmdai.SR_SEG_EXPORT.SR_SEG_EXPORT.SR_Export(   annotation_json =\n'mdai_staging_project_L1NpnQBv_annotations_2023-06-28-135404.json',   metadata_json =\n'mdai_staging_project_L1NpnQBv_annotations_2023-06-28-165932.json' ) ```"

In [None]:
ask("what is the code to export annotations into DICOM SR format?")



[1m> Entering new  chain...[0m
Waiting


[1m> Entering new  chain...[0m


[1m> Entering new  chain...[0m
Prompt after formatting:
[32;1m[1;3mSYSTEM: You are an intelligent assistant helping the users with their questions on techniques related to md.ai cloud platform products.

Question: what is the code to export annotations into DICOM SR format?

There is a set of rules: 

1. Use the following pieces of context to answer the question at the end. Think step-by-step and then answer.

2. If the answer is about coding, please make sure it is easy to separate text and code.

3. If there are parts that the context does not answer, and it is about coding, you can use your coding knowledge.

4. If the context has a link, always directly include the link in the response

5. But for conceptual questions, Do not try to make up an answer and strictly answer based on the context.

6. If the context is empty, just say "I do not know the answer to that."


## MD.AI Annotation to SR Export 

"To export annotations into DICOM SR format, you can use the following code:  ```python import mdai.SR_SEG_Export\nmdai.SR_SEG_EXPORT.SR_SEG_EXPORT.SR_Export(   annotation_json =\n'mdai_staging_project_L1NpnQBv_annotations_2023-06-28-135404.json',   metadata_json =\n'mdai_staging_project_L1NpnQBv_annotations_2023-06-28-165932.json' ) ```  This code will export the annotations in the\nspecified JSON files into DICOM SR format."

In [None]:
! gcloud ai indexes list --project="mdai-interns-summer-2023" --region="us-central1"

### This is the part that deletes and undeploy index, be careful when running it

In [None]:
# Cleanup
CLEANUP_RESOURCES = True

In [None]:
ME_INDEX_ID, ME_INDEX_ENDPOINT_ID = mengine.get_index_and_endpoint()
print(f"ME_INDEX_ID={ME_INDEX_ID}")
print(f"ME_INDEX_ENDPOINT_ID={ME_INDEX_ENDPOINT_ID}")

ME_INDEX_ID=projects/837432346917/locations/us-central1/indexes/5471935119906308096
ME_INDEX_ENDPOINT_ID=projects/837432346917/locations/us-central1/indexEndpoints/3505550932605665280


In [None]:
# undeploy
if CLEANUP_RESOURCES and "mengine" in globals():
    print(
        f"Undeploying all indexes and deleting the index endpoint {ME_INDEX_ENDPOINT_ID}"
    )
    mengine.delete_index_endpoint()

Undeploying all indexes and deleting the index endpoint projects/837432346917/locations/us-central1/indexEndpoints/3505550932605665280


In [None]:
# delete
if CLEANUP_RESOURCES and "mengine" in globals():
    print(f"Deleting the index {ME_INDEX_ID}")
    mengine.delete_index()

Deleting the index projects/837432346917/locations/us-central1/indexes/5471935119906308096


In [None]:
if CLEANUP_RESOURCES and "ME_EMBEDDING_DIR" in globals():
    print(f"Deleting contents from the Cloud Storage bucket {ME_EMBEDDING_DIR}")
    ME_EMBEDDING_BUCKET = "/".join(ME_EMBEDDING_DIR.split("/")[:3])

    shell_output = ! gsutil du -ash $ME_EMBEDDING_BUCKET
    print(
        f"Size of the bucket {ME_EMBEDDING_BUCKET} before deleting = {' '.join(shell_output[0].split()[:2])}"
    )

    # uncomment below line to delete contents of the bucket
    ! gsutil rm -r $ME_EMBEDDING_BUCKET

    shell_output = ! gsutil du -ash $ME_EMBEDDING_BUCKET
    if float(shell_output[0].split()[0]) > 0:
        print(
            "PLEASE UNCOMMENT LINE TO DELETE BUCKET. CONTENT FROM THE BUCKET NOT DELETED"
        )

    print(
        f"Size of the bucket {ME_EMBEDDING_BUCKET} after deleting = {' '.join(shell_output[0].split()[:2])}"
    )

In [None]:
! gcloud ai indexes list --project="mdai-interns-summer-2023" --region="us-central1"

Using endpoint [https://us-central1-aiplatform.googleapis.com/]
Listed 0 items.


# Information Summarization

In [None]:
# Initialize mdai client with necessary identification information
%%capture
! pip install --upgrade mdai

import mdai
import json
domain = # your domain
access_token = # your access token
mdai_client = mdai.Client(domain= domain, access_token= access_token)

In [None]:
"""
1. Use the following pieces of context to answer the question at the end. Think step-by-step and then answer.

2. If the answer is about coding, please make sure it is easy to separate text and code.

3. If there are parts that the context does not answer, and it is about coding, you can use your coding knowledge.

4. If the context has a link, always directly include the link in the response

5. But for conceptual questions, Do not try to make up an answer and strictly answer based on the context.

6. If the context is empty, just say "I do not know the answer to that."
"""

context = ["project_id = x9N2KMqZ"]
system_prompt = """You are a bot helping give summary information based on context.

There is a set of rules:

=============
{context}
=============

"""
formatted_context = system_prompt.format(context='\n'.join(context))
formatted_context




In [None]:
prompt = "what is my project id?"
messages = [ {"role": "system", "content": formatted_context},
          {"role": "user", "content": prompt}]
result = mdai_client.chat_completion.create(messages, model='gpt-3.5-turbo', temperature=0)
result["choices"][0]["message"]["content"]

"I'm sorry, but as a language model AI, I don't have access to personal information or the ability to retrieve your project ID. Please check with the relevant platform or service you are using to find your project ID."

In [None]:
prompt = "what is my project id?"
messages = [ {"role": "system", "content": formatted_context},
          {"role": "user", "content": prompt}]
result = mdai_client.chat_completion.create(messages, model='gpt-4', temperature=0)
result["choices"][0]["message"]["content"]

'Your project ID is x9N2KMqZ.'