## Step 1: Install required libraries

In [1]:
!pip install -qU datasets llama-index llama-index-llms-openai llama-index-vector-stores-mongodb pymongo arize-phoenix "openai>=1" "openinference-instrumentation-llama-index>=2.0.0"


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


## Step 2: Set up prerequisites

In [2]:
import os
import getpass
from pymongo import MongoClient

In [3]:
# Collect the key through a hidden prompt so it is never stored in the notebook,
# then publish it through the process environment.
os.environ["OPENAI_API_KEY"] = getpass.getpass(
    prompt="Enter your OpenAI API key: "
)

Enter your OpenAI API key:  ········


In [4]:
# Collect the Phoenix key through a hidden prompt; the tracing setup later reads
# it back from the PHOENIX_API_KEY environment variable.
os.environ["PHOENIX_API_KEY"] = getpass.getpass(
    prompt="Enter your Phoenix API key: "
)

Enter your Phoenix API key:  ········


In [5]:
# The connection string is collected through a hidden prompt so it is not saved
# in the notebook.
MONGODB_URI = getpass.getpass(prompt="Enter your MongoDB URI: ")

# One shared client for the whole notebook; `appname` labels its connections.
mongodb_client = MongoClient(
    host=MONGODB_URI,
    appname="devrel.content.retrieval_strategies_llamaindex",
)

Enter your MongoDB URI:  ········


## Step 3: Set up tracing

In [6]:
from openinference.instrumentation.llama_index import LlamaIndexInstrumentor
from phoenix.otel import register
import phoenix as px

In [7]:
# Configure the hosted Phoenix endpoint and auth header through the environment.
# This must happen before register() and px.Client() are called below.
os.environ.update(
    {
        "PHOENIX_CLIENT_HEADERS": f"api_key={os.environ['PHOENIX_API_KEY']}",
        "PHOENIX_COLLECTOR_ENDPOINT": "https://app.phoenix.arize.com",
    }
)

# Create the tracer provider, then attach the LlamaIndex instrumentation to it
# so LlamaIndex operations are traced.
tracer_provider = register()
LlamaIndexInstrumentor().instrument(
    skip_dep_check=True,
    tracer_provider=tracer_provider,
)

# Client handle used later to pull recorded spans back out of Phoenix.
px_client = px.Client()

🔭 OpenTelemetry Tracing Details 🔭
|  Phoenix Project: default
|  Span Processor: SimpleSpanProcessor
|  Collector Endpoint: https://app.phoenix.arize.com/v1/traces
|  Transport: HTTP
|  Transport Headers: {'api_key': '****', 'authorization': '****'}
|  
|  Using a default SpanProcessor. `add_span_processor` will overwrite this default.
|  
|  `register` has set this TracerProvider as the global OpenTelemetry default.
|  To disable this behavior, call `register` with `set_global_tracer_provider=False`.





## Step 4: Load the dataset

In [8]:
from datasets import load_dataset
import pandas as pd
from llama_index.core import Document

In [9]:
# Open the train split in streaming mode and keep only the first 1000 records.
data = load_dataset(
    "BeIR/scifact-generated-queries",
    streaming=True,
    split="train",
)
data_head = data.take(1000)

# Materialize the sampled records into a DataFrame for inspection and reshaping.
df = pd.DataFrame(data_head)

In [10]:
# Preview the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,_id,title,text,query
0,4983,Microstructural development of human newborn c...,Alterations of the architecture of cerebral wh...,what is the diffusion coefficient of cerebral ...
1,4983,Microstructural development of human newborn c...,Alterations of the architecture of cerebral wh...,what is diffusion tensor
2,4983,Microstructural development of human newborn c...,Alterations of the architecture of cerebral wh...,what is the diffusion coefficient of the cereb...
3,5836,Induction of myelodysplasia by myeloid-derived...,Myelodysplastic syndromes (MDS) are age-depend...,which type of hematopoiesis is characterized b...
4,5836,Induction of myelodysplasia by myeloid-derived...,Myelodysplastic syndromes (MDS) are age-depend...,which cell types have hematopoiesis


In [11]:
# Number of rows loaded (should match the 1000 records taken from the stream).
df.shape[0]

1000

In [12]:
# Keep the generated queries as a plain list; this must run before the "query"
# column is dropped from the frame.
queries = df["query"].to_list()

In [13]:
# Keep only the columns needed to build documents. Reassigning (instead of
# inplace=True) and ignoring already-missing columns makes this cell safe to
# re-run: the original raised KeyError on a second execution.
df = df.drop(columns=["_id", "query"], errors="ignore")

In [14]:
# Build one LlamaIndex Document per row: the `text` column becomes the document
# body and the `title` column is carried along as metadata.
documents = [
    Document(text=text, metadata={"title": title})
    for title, text in zip(df["title"], df["text"])
]

In [15]:
# Inspect the first Document to confirm the text and title metadata were set.
documents[0]

Document(id_='994b64ec-15cf-45c0-bd25-5ae69df5e630', embedding=None, metadata={'title': 'Microstructural development of human newborn cerebral white matter assessed in vivo by diffusion tensor magnetic resonance imaging.'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A line scan diffusion-weighted magnetic resonance imaging (MRI) sequence with diffusion tensor analysis was applied to measure the apparent diffusion coefficient, to calculate relative anisotropy, and to delineate three-dimensional fiber architecture in cerebral white matter in preterm (n = 17) and full-term infants (n = 7). To assess effects of prematurity on cerebral white matter development, early gestation preterm infants (n = 10) were studied a second time at term. In the central white matter the mean apparent diffusion

## Step 5: Create MongoDB Atlas Vector Store

In [46]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
from llama_index.core.settings import Settings
from llama_index.core import VectorStoreIndex, StorageContext
from pymongo.operations import SearchIndexModel

In [17]:
# Global LlamaIndex defaults used by the index and query engine built below.
Settings.llm = OpenAI()

# The vector index defined later in this notebook declares 1536 dimensions;
# keep the two in sync if this model is changed.
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

# Chunking parameters applied when documents are split into nodes.
Settings.chunk_size, Settings.chunk_overlap = 200, 30

In [38]:
# Where the chunks are stored.
DB_NAME = "llamaindex"
COLLECTION_NAME = "retrieval_comp"

# Names of the vector search and full-text search indexes on that collection.
VS_INDEX_NAME = "vector_index"
FTS_INDEX_NAME = "fts_index"

# Direct PyMongo handle, used later to create the search indexes.
collection = mongodb_client.get_database(DB_NAME).get_collection(COLLECTION_NAME)

In [31]:
# Vector store backed by the MongoDB collection; both index names are passed so
# the store knows which vector and full-text indexes to use.
vector_store = MongoDBAtlasVectorSearch(
    mongodb_client,
    db_name=DB_NAME,
    collection_name=COLLECTION_NAME,
    vector_index_name=VS_INDEX_NAME,
    fulltext_index_name=FTS_INDEX_NAME,
)
vector_store_context = StorageContext.from_defaults(vector_store=vector_store)

# Chunk, embed and store the documents.
# NOTE(review): re-running this cell presumably inserts the same chunks again;
# clear the collection first if a clean re-ingest is needed.
vector_store_index = VectorStoreIndex.from_documents(
    documents,
    storage_context=vector_store_context,
    show_progress=True,
)

Parsing nodes:   0%|          | 0/1000 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/353 [00:00<?, ?it/s]

In [47]:
# Vector search index definition: cosine similarity over the `embedding` field.
# numDimensions must equal the output size of the configured embedding model.
vs_model = SearchIndexModel(
    name=VS_INDEX_NAME,
    type="vectorSearch",
    definition={
        "fields": [
            {
                "type": "vector",
                "path": "embedding",
                "numDimensions": 1536,
                "similarity": "cosine",
            },
        ],
    },
)

In [61]:
# Full-text search index definition: dynamic mapping is off, so only the `text`
# field is indexed (as a string).
fts_model = SearchIndexModel(
    name=FTS_INDEX_NAME,
    type="search",
    definition={
        "mappings": {
            "dynamic": False,
            "fields": {"text": {"type": "string"}},
        },
    },
)

In [63]:
import time

# Create only the indexes that do not exist yet, so re-running this cell does
# not fail because an index with the same name is already defined.
requested_models = {VS_INDEX_NAME: vs_model, FTS_INDEX_NAME: fts_model}
existing_names = {index["name"] for index in collection.list_search_indexes()}
missing_models = [
    model for name, model in requested_models.items() if name not in existing_names
]
if missing_models:
    collection.create_search_indexes(models=missing_models)

# Search indexes are built asynchronously. Wait until both report as queryable
# so the query cells that follow do not run against an index that is still
# building.
INDEX_READY_TIMEOUT_S = 300
INDEX_POLL_INTERVAL_S = 5
deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
while True:
    queryable = {
        index["name"]: index.get("queryable", False)
        for index in collection.list_search_indexes()
    }
    if all(queryable.get(name, False) for name in requested_models):
        break
    if time.monotonic() > deadline:
        raise TimeoutError(
            f"Search indexes not queryable after {INDEX_READY_TIMEOUT_S}s: {queryable}"
        )
    time.sleep(INDEX_POLL_INTERVAL_S)

# Display the index names, matching what create_search_indexes returned.
list(requested_models)

['vector_index', 'fts_index']

In [102]:
# Query engine over the vector index; retrieves the 5 most similar chunks per query.
query_engine = vector_store_index.as_query_engine(similarity_top_k=5)

In [105]:
# Run one of the dataset's generated queries (index 10) through the query engine.
response = query_engine.query(queries[10])

In [106]:
# Show the retrieved chunks (with scores) that the response was built from.
response.source_nodes

[NodeWithScore(node=TextNode(id_='12889f9e-c3e5-4f10-8f92-28829d875ff4', embedding=None, metadata={'title': 'The DNA Methylome of Human Peripheral Blood Mononuclear Cells'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='b7b52032-97e9-4eab-9beb-8db754b04e97', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'title': 'The DNA Methylome of Human Peripheral Blood Mononuclear Cells'}, hash='b8c3869580e3ad633922cecf4a10e93cfce1a5ff204391f19bf7bfae4c77571c'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='16bcbf14-86d9-439e-bf2c-056b18948016', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='e7a73fd95e55cb2492f57035fb9bc1bfd6d7e051cefd363e8a24bc8790aac0b8')}, text='DNA methylation plays an important role in biological processes in human health and disease. Recent technological advances allow unbiased whole-genome DNA methylation (methylome) analysis to be carried out on human cells. Using whol

In [93]:
# Fetch the recorded trace spans from Phoenix through the client created during
# tracing setup (presumably returned as a pandas DataFrame, per the method name).
spans_df = px_client.get_spans_dataframe()