In [None]:
%%capture
!pip install pinecone-client sentence-transformers langchain-openai datasets openai groq

In [None]:
from pinecone import Pinecone, ServerlessSpec

# Pinecone client used by the later cells to create and open the vector index.
# The api_key value is a placeholder and must be replaced before running;
# avoid committing a real key into the notebook.
pc = Pinecone(api_key='INSERT_PINECONE_API_KEY_HERE')

In [None]:
# Name of the serverless Pinecone index that stores the anatomy-book chunks.
index_name = "anatomy-book-b"

# Create the index only when it is missing, so re-running this cell is harmless.
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
  serverless_spec = ServerlessSpec(cloud='aws', region='us-east-1')
  pc.create_index(
      name=index_name,
      # 1536 is the vector length produced by text-embedding-ada-002,
      # the embedding model this notebook uses.
      dimension=1536,
      metric="cosine",
      spec=serverless_spec,
  )

# Handle used for every later query / upsert against the index.
index = pc.Index(index_name)

In [None]:
# Read the Groq API key from the `userdata` secret store.
# NOTE(review): `userdata` is not imported in any visible cell — presumably
# `google.colab.userdata`; confirm the import exists, otherwise this line
# raises NameError on a fresh kernel.
# NOTE(review): GROQ_API_KEY is not referenced again in the visible cells.
GROQ_API_KEY = userdata.get('GROQ_API_KEY')

In [None]:
from langchain_openai import OpenAIEmbeddings
# Shared embedding client (OpenAI text-embedding-ada-002) used both for
# querying the index and for embedding documents before upsert.
# The openai_api_key value is a placeholder and must be replaced before running.
embed_model = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key="INSERT_OPEN_API_KEY_HERE")

def get_relevant_context(text, k=1):
  """Return the stored text of the k index entries closest to `text`.

  Uses the module-level `embed_model` to embed the query and the
  module-level `index` to run the similarity search.

  Args:
    text: Query string to embed and search with.
    k: Number of nearest matches to request (default 1).

  Returns:
    A list containing the `metadata["text"]` value of every match, in the
    order the index returned them.
  """
  # embed_query yields a single flat vector. embed_documents([text]) would
  # yield a list holding one vector, which is not the shape `vector=` expects.
  query_vector = embed_model.embed_query(text)
  response = index.query(
      vector=query_vector,
      top_k=k,
      include_metadata=True
  )

  return [match["metadata"]["text"] for match in response["matches"]]

In [None]:
from datasets import load_dataset
# Training split of the "openlifescienceai/medmcqa" dataset, loaded through
# `datasets.load_dataset`; later cells read the question, option and
# explanation fields of each record.
dataset = load_dataset("openlifescienceai/medmcqa", split="train")

In [None]:
# One prompt-style string per MedMCQA record: the question, its four options
# (opa..opd) and the explanation field, in the order the dataset yields them.
medmcqa_texts = [
  f'''{sample["question"]} (A) {sample["opa"]}, (B) {sample["opb"]} (C) {sample["opc"]} (D) {sample["opd"]}. Answer with explanation: {sample["exp"]}'''
  for sample in dataset
]

In [None]:
# Read the whole book and split it on whitespace into a flat word list.
with open("anatomybook1.txt", "r", encoding="utf-8") as book_file:
    anatomy_book_text = book_file.read()

anatomy_book_words = anatomy_book_text.split()
total_words = len(anatomy_book_words)

# Chunk the book into windows of up to 100 words. After each aligned window a
# second window shifted by 50 words is added, so neighbouring chunks overlap.
anatomy_docs = []

start = 0
while start < total_words:
    stop = min(start + 100, total_words)
    anatomy_docs.append(" ".join(anatomy_book_words[start:stop]))

    # The shifted window is emitted only while words remain beyond its end.
    if stop + 50 < total_words:
        anatomy_docs.append(" ".join(anatomy_book_words[start + 50:stop + 50]))

    start += 100

In [None]:
from openai import OpenAI
# OpenAI client used by prompt_gpt for chat completions.
# The api_key value is a placeholder and must be replaced before running.
client = OpenAI(api_key="INSERT_OPEN_API_KEY_HERE")

def prompt_gpt(prompt):
  """Ask gpt-3.5-turbo to rewrite `prompt` and return the reply text.

  The request goes through the module-level `client` with a fixed system
  message that tells the model to answer with the rewritten text only.

  Args:
    prompt: Text sent as the user message.

  Returns:
    The `content` of the first choice in the completion response.
  """
  system_message = {"role": "system", "content": "You rewrite text in a way that is more informative and fill in missing information. You are not conversational i.e. you simply respond with the re written text and nothing else."}
  user_message = {"role": "user", "content": prompt}

  response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[system_message, user_message],
  )

  first_choice = response.choices[0]
  return first_choice.message.content

In [None]:
from tqdm import tqdm

# Prefix prepended to every raw chunk before it is sent to prompt_gpt.
instruction = (
    "The following is a piece of text scraped from an Anatomy text book. "
    "It may be containing missing information and start and end in the middle of a sentence. "
    "If the text speaks about a figure or an image, try to instead explain what it is mentioning "
    "in a way that does not require access to the image. "
    "Respond with a clean form of the information suitable for use as a document "
    "in a RAG anatomy knowledgebase: "
)

# Rewritten chunks, in the same order as anatomy_docs. Kept as an explicit
# loop so the chunks processed so far stay in refined_docs if a call fails.
refined_docs = []

for raw_chunk in tqdm(anatomy_docs):
  cleaned_chunk = prompt_gpt(instruction + raw_chunk)
  refined_docs.append(cleaned_chunk)

100%|██████████| 4440/4440 [4:03:42<00:00,  3.29s/it]


In [None]:
from tqdm import tqdm

# Documents to index and the number of documents embedded/upserted per request.
texts = refined_docs

batch_size = 100

# Walk the documents in consecutive batches; each document's id is the index
# name prefix followed by its position in `texts`.
for start in tqdm(range(0, len(texts), batch_size)):
    stop = min(start + batch_size, len(texts))
    batch = texts[start:stop]
    ids = [f"anatomy-book-b-{position}" for position in range(start, stop)]
    embeds = embed_model.embed_documents(batch)
    # Store the chunk text as metadata so a query can return it directly.
    metadata = [{'text': chunk} for chunk in batch]
    index.upsert(vectors=zip(ids, embeds, metadata))

100%|██████████| 45/45 [01:48<00:00,  2.40s/it]
