In [6]:
from datasets import load_dataset

import pandas as pd

# https://huggingface.co/datasets/AIatMongoDB/embedded\_movies

dataset = load_dataset("YBXL/MultiCaRe_PMC_Patients_PMC_CaseReport_diagnosis")

# Convert the dataset to a pandas dataframe

dataset_df = pd.DataFrame(dataset['train'])

dataset_df.head(5)

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,id,query,answer
0,PMC_Patients_Reasoning3866,Your task is to provide at least 10 accurate a...,Differential diagnosis: \n1. Chronic pancreat...
1,PMC_Patients_Reasoning3867,Your task is to provide at least 10 accurate a...,Differential diagnosis: \n1. Invasive pneumoc...
2,PMC_Patients_Reasoning3868,Your task is to provide at least 10 accurate a...,Differential diagnosis: \n1. Pulmonary valve ...
3,PMC_Patients_Reasoning3870,Your task is to provide at least 10 accurate a...,Differential diagnosis: \n1. Short stature du...
4,PMC_Patients_Reasoning3871,Your task is to provide at least 10 accurate a...,Differential diagnosis: \n1. Empty Follicle S...


In [7]:
# Remove data point where plot column is missing

dataset_df = dataset_df.dropna(subset=['query'])

print("\nNumber of missing values in each column after removal:")

print(dataset_df.isnull().sum())

# Remove the plot_embedding from each data point in the dataset as we are going to create new embeddings with the new OpenAI embedding Model "text-embedding-3-small"

# dataset_df = dataset_df.drop(columns=['id'])

dataset_df.head(5)



Number of missing values in each column after removal:
id        0
query     0
answer    0
dtype: int64


Unnamed: 0,id,query,answer
0,PMC_Patients_Reasoning3866,Your task is to provide at least 10 accurate a...,Differential diagnosis: \n1. Chronic pancreat...
1,PMC_Patients_Reasoning3867,Your task is to provide at least 10 accurate a...,Differential diagnosis: \n1. Invasive pneumoc...
2,PMC_Patients_Reasoning3868,Your task is to provide at least 10 accurate a...,Differential diagnosis: \n1. Pulmonary valve ...
3,PMC_Patients_Reasoning3870,Your task is to provide at least 10 accurate a...,Differential diagnosis: \n1. Short stature du...
4,PMC_Patients_Reasoning3871,Your task is to provide at least 10 accurate a...,Differential diagnosis: \n1. Empty Follicle S...


In [12]:
from llama_index.core.settings import Settings

from llama_index.llms.openai import OpenAI

from llama_index.embeddings.openai import OpenAIEmbedding

embed_model=OpenAIEmbedding(model="text-embedding-3-small",dimensions=256)

llm=OpenAI()

Settings.llm=llm

# embedding model initialized
Settings.embed_model=embed_model


### now we need to convert to json for Mongo to be able to understand

In [16]:
import json
from llama_index.core import Document
from llama_index.core.schema import MetadataMode

# Convert the DataFrame to a JSON string representation
documents_json = dataset_df.to_json(orient='records')

# Load the JSON string into a Python list of dictionaries
documents_list = json.loads(documents_json)

llama_documents = []

for document in documents_list:
    # Since you only have 'query' and 'answer', we don't need complex conversions.
    # However, we still ensure that metadata is in a serialized format.
    document["query"] = json.dumps(document["query"])
    document["answer"] = json.dumps(document["answer"])

    # Create a Document object for LLM and embedding models without exclusions
    llama_document = Document(
        text=document["query"] + "\n" + document["answer"],
        metadata=document,  # 'answer' will be part of metadata
        metadata_template="{key}=>{value}",  # Formatting metadata as 'key=>value'
        text_template="Metadata: {metadata_str}\n-----\nContent: {content}",  # Custom template
    )

    llama_documents.append(llama_document)

# Observing an example of what the LLM and Embedding model receive as input
print(
    "\nThe LLM sees this: \n",
    llama_documents[0].get_content(metadata_mode=MetadataMode.LLM),
)
print(
    "\nThe Embedding model sees this: \n",
    llama_documents[0].get_content(metadata_mode=MetadataMode.EMBED),
)



The LLM sees this: 
 Metadata: id=>PMC_Patients_Reasoning3866
query=>"Your task is to provide at least 10 accurate and distinct patient diagnoses based on the input case report. Ensure you provide at least 10 most likely diagnoses, listed in order of likelihood, and cover a wide range of unique possibilities. \n Follow the guidelines for a generation: 1. Each diagnosis should be precise and unique, ensuring a variety of at least 10 possibilities. 2. List one diagnosis per line. 3. Generate 10 differential diagnoses related to the input case report. Think step by step. \n \n***Output format***:Differential diagnosis: 1. \n2. \n3.\n4. \n5. \n6. \n7. \n8. \n9. \n10.INPUT: Age: 34.0\nSex: F\nTitle: Pancreaticobiliary maljunction diagnosed long after laparotomy in the neonatal period for annular pancreas: report of a case\nDescription: \nA 34-year-old woman presented to a physician with chief complaints of abdominal pain and fever. The physician suspected acute exacerbation of chronic panc

In [21]:
from llama_index.core.node_parser import SentenceSplitter

parser = SentenceSplitter(chunk_size=20000)
nodes = parser.get_nodes_from_documents(llama_documents)

for node in nodes:
    node_embedding = embed_model.get_text_embedding(
        node.get_content(metadata_mode="all")
    )
    node.embedding = node_embedding

Retrying llama_index.embeddings.openai.base.get_embedding in 0.9695657087842995 seconds as it raised APIConnectionError: Connection error..


APIConnectionError: Connection error.

In [3]:
from openai import OpenAI

client = OpenAI(
  api_key=''
)
