In [None]:
import os

from pinecone import Pinecone, ServerlessSpec

# Credentials are read from the environment when available so real keys never
# have to be saved inside the notebook; the placeholder literals remain as the
# fallback for anyone who prefers to paste a key in directly.
OPEN_API_KEY = os.environ.get('OPENAI_API_KEY', 'INSERT_OPEN_API_KEY_HERE')

PINECONE_KEY = os.environ.get('PINECONE_API_KEY', 'INSERT_PINECONE_API_KEY_HERE')

# Pinecone client used by the following cells to create and open the index.
pc = Pinecone(api_key=PINECONE_KEY)

In [None]:
index_name = "anatomy-definitions-1-gram"

# Create the serverless index only when it does not exist yet, so re-running
# this cell reuses the existing index instead of failing on a duplicate name.
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
  serverless_spec = ServerlessSpec(cloud='aws', region='us-east-1')
  # NOTE(review): dimension must equal the length of the vectors produced by
  # the embedding model used for upserts and queries — confirm if the model changes.
  pc.create_index(
      name=index_name,
      dimension=1536,
      metric="cosine",
      spec=serverless_spec,
  )

# Handle used for all upserts and queries in this notebook.
index = pc.Index(index_name)

In [None]:
from langchain_openai import OpenAIEmbeddings
# Embedding client shared by the query helper and the upsert loop.
embed_model = OpenAIEmbeddings(
    openai_api_key=OPEN_API_KEY,
    model="text-embedding-ada-002",
)

def get_relevant_context(text, k=1):
  """Return the stored texts of the ``k`` index entries closest to ``text``.

  Args:
    text: Query string to embed and search with.
    k: Number of nearest matches to request from the index (default 1).

  Returns:
    A list holding the ``metadata["text"]`` value of each returned match, in
    the order the index returned them.
  """
  # embed_documents returns one vector per input string; the query needs the
  # single flat vector, not the enclosing one-element list.
  query_vector = embed_model.embed_documents([text])[0]
  response = index.query(
      vector=query_vector,
      top_k=k,
      include_metadata=True
  )
  return [match["metadata"]["text"] for match in response["matches"]]

In [None]:
import json

# Load the JSON-lines file: one JSON object per non-empty line.
records = []
with open('data/dev.json', 'r', encoding='utf-8') as file:
    for line in file:
        # Skip blank/trailing lines, which json.loads would reject.
        if line.strip():
            records.append(json.loads(line))

# Keep only the records whose subject is Anatomy.
data = [d for d in records if d["subject_name"] == "Anatomy"]

# Gather the four option fields of every record (all "opa" values first, then
# "opb", "opc", "opd") — presumably the answer choices; verify against the dataset.
terms = [d[option] for option in ("opa", "opb", "opc", "opd") for d in data]

# Keep only options with fewer than two whitespace-separated words.
terms = list(filter(lambda x: len(x.split()) < 2, terms))

# Sort the de-duplicated terms: a bare set has no stable order across kernel
# restarts, so positional use of this list downstream would not be reproducible.
unique_terms = sorted(set(terms))

print(len(terms))
print(len(unique_terms))

In [None]:
from openai import OpenAI
# Chat-completions client used by prompt_gpt.
client = OpenAI(
    api_key=OPEN_API_KEY,
)

def prompt_gpt(prompt, model="gpt-3.5-turbo"):
  """Send ``prompt`` as a single user message and return the reply text.

  Args:
    prompt: Text placed in the only (user-role) message of the request.
    model: Chat model name passed to the API. Defaults to "gpt-3.5-turbo",
      the model this helper previously had hard-coded.

  Returns:
    The ``content`` of the first choice in the completion response.
  """
  completion = client.chat.completions.create(
    model=model,
    messages=[{"role": "user", "content": prompt}],
  )
  return completion.choices[0].message.content

In [None]:
from tqdm import tqdm

instruction = "Provide a short definition for the following medical term: "

# One chat request per unique term; definitions[n] is the reply obtained for
# unique_terms[n], so the two lists stay aligned by position.
definitions = []
progress = tqdm(unique_terms)
for term in progress:
  prompt = instruction + term
  definitions.append(prompt_gpt(prompt))

In [None]:
# Spot-check: show the first ten terms, each followed by a blank line.
for preview_term in unique_terms[:10]:
  print(f"{preview_term} \n")

In [None]:
# Print every generated definition, separated by blank lines, for manual review.
for definition in definitions:
  print(f"{definition} \n\n")

In [None]:
from tqdm import tqdm

texts = definitions

batch_size = 100

# Embed the definitions and upsert them into the index, batch_size at a time.
for start in tqdm(range(0, len(texts), batch_size)):
    stop = min(start + batch_size, len(texts))
    batch = texts[start:stop]
    # Vector IDs are the fixed prefix followed by the definition's position in `texts`.
    ids = [f"definition-1{position}" for position in range(start, stop)]
    embeds = embed_model.embed_documents(batch)
    # The definition text is stored as metadata under the 'text' key.
    metadata = [{'text': text} for text in batch]
    index.upsert(vectors=zip(ids, embeds, metadata))

In [None]:
import json

# Terms and definitions are paired by position. zip() would silently drop the
# unmatched tail if the lists differ in length (e.g. generation was interrupted),
# so fail loudly instead of writing an incomplete file.
if len(unique_terms) != len(definitions):
    raise ValueError(
        f"expected one definition per term, got {len(unique_terms)} terms "
        f"and {len(definitions)} definitions"
    )
dictionary = dict(zip(unique_terms, definitions))

# Persist the term -> definition mapping as pretty-printed JSON.
with open('word_defs.json', 'w') as json_file:
    json.dump(dictionary, json_file, indent=4)
