## Step 1: Install required libraries

In [1]:
!pip install -qU datasets llama-index llama-index-llms-openai llama-index-vector-stores-mongodb pymongo arize-phoenix "openai>=1" "openinference-instrumentation-llama-index>=2.0.0"


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


## Step 2: Set up prerequisites

In [2]:
import os
import getpass
from pymongo import MongoClient

In [3]:
# Prompt for the OpenAI API key without echoing it, and store it in this
# process's environment under OPENAI_API_KEY.
os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

Enter your OpenAI API key:  ········


In [4]:
# Prompt for the Phoenix API key without echoing it; the tracing setup cell
# reads PHOENIX_API_KEY back from the environment to build the client headers.
os.environ["PHOENIX_API_KEY"] = getpass.getpass("Enter your Phoenix API key: ")

Enter your Phoenix API key:  ········


In [5]:
# Read the MongoDB connection string without echoing it into the notebook output.
MONGODB_URI = getpass.getpass("Enter your MongoDB URI: ")

# One shared client for the whole notebook; `appname` labels the connections it opens.
mongodb_client = MongoClient(
    host=MONGODB_URI,
    appname="devrel.content.retrieval_strategies_llamaindex",
)

Enter your MongoDB URI:  ········


## Step 3: Set up tracing

In [6]:
from openinference.instrumentation.llama_index import LlamaIndexInstrumentor
from phoenix.otel import register
import phoenix as px

In [7]:
# Tell Phoenix where to send traces and how to authenticate, via environment
# variables that are set before register() and px.Client() are called.
os.environ.update(
    {
        "PHOENIX_CLIENT_HEADERS": f"api_key={os.environ['PHOENIX_API_KEY']}",
        "PHOENIX_COLLECTOR_ENDPOINT": "https://app.phoenix.arize.com",
    }
)

# Create the tracer provider, then attach the LlamaIndex instrumentation to it.
tracer_provider = register()
LlamaIndexInstrumentor().instrument(
    tracer_provider=tracer_provider,
    skip_dep_check=True,
)

# Client handle used to read data back from Phoenix.
px_client = px.Client()

🔭 OpenTelemetry Tracing Details 🔭
|  Phoenix Project: default
|  Span Processor: SimpleSpanProcessor
|  Collector Endpoint: https://app.phoenix.arize.com/v1/traces
|  Transport: HTTP
|  Transport Headers: {'api_key': '****', 'authorization': '****'}
|  
|  Using a default SpanProcessor. `add_span_processor` will overwrite this default.
|  
|  `register` has set this TracerProvider as the global OpenTelemetry default.
|  To disable this behavior, call `register` with `set_global_tracer_provider=False`.





## Step 4: Load the dataset

In [8]:
from datasets import load_dataset
import pandas as pd
from llama_index.core import Document

In [9]:
# Open the dataset in streaming mode so only the rows that are consumed get downloaded.
data = load_dataset(
    "BeIR/scifact-generated-queries",
    split="train",
    streaming=True,
)

# Keep the first 1000 records and materialise them as a DataFrame.
data_head = data.take(1000)
df = pd.DataFrame(data=data_head)

In [10]:
# Preview the first rows of the sampled dataset.
df.head()

Unnamed: 0,_id,title,text,query
0,4983,Microstructural development of human newborn c...,Alterations of the architecture of cerebral wh...,what is the diffusion coefficient of cerebral ...
1,4983,Microstructural development of human newborn c...,Alterations of the architecture of cerebral wh...,what is diffusion tensor
2,4983,Microstructural development of human newborn c...,Alterations of the architecture of cerebral wh...,what is the diffusion coefficient of the cereb...
3,5836,Induction of myelodysplasia by myeloid-derived...,Myelodysplastic syndromes (MDS) are age-depend...,which type of hematopoiesis is characterized b...
4,5836,Induction of myelodysplasia by myeloid-derived...,Myelodysplastic syndromes (MDS) are age-depend...,which cell types have hematopoiesis


In [11]:
# Number of rows loaded into the DataFrame.
len(df)

1000

In [12]:
# Capture the generated queries as a plain list before the column is dropped from df.
queries = df["query"].to_list()

In [13]:
# Keep only the document fields. Rebinding `df` instead of mutating in place,
# together with errors="ignore", makes this cell safe to re-run: a second
# execution no longer raises KeyError because the columns are already gone.
df = df.drop(columns=["_id", "query"], errors="ignore")

In [53]:
# Collapse repeated rows, keeping the first occurrence of each.
df = df.drop_duplicates(keep="first")

In [62]:
# Wrap every remaining row in a LlamaIndex Document: the text is the content
# and the title travels along as metadata.
documents = [
    Document(text=doc_text, metadata={"title": doc_title})
    for doc_title, doc_text in zip(df["title"], df["text"])
]

In [63]:
# Number of Document objects built from the de-duplicated rows.
len(documents)

337

## Step 5: Create MongoDB Atlas Vector Store

In [64]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
from llama_index.core.settings import Settings
from llama_index.core import VectorStoreIndex, StorageContext
from pymongo.operations import SearchIndexModel

In [65]:
# Configure the global LlamaIndex Settings object before the index is built.
Settings.llm = OpenAI()  # OpenAI LLM constructed with the library's default arguments
# NOTE(review): the vector search index defined later declares numDimensions=1536 —
# confirm that matches the output size of this embedding model.
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
Settings.chunk_size = 200  # presumably measured in tokens — TODO confirm
Settings.chunk_overlap = 30  # overlap between consecutive chunks; same unit as chunk_size (assumed)

In [66]:
# Names of the two Atlas search indexes and of the target namespace.
VS_INDEX_NAME, FTS_INDEX_NAME = "vector_index", "fts_index"
DB_NAME, COLLECTION_NAME = "llamaindex", "retrieval_comp"

collection = mongodb_client.get_database(DB_NAME).get_collection(COLLECTION_NAME)

# Drop the collection so ingestion below starts from an empty one.
collection.drop()

In [67]:
# Vector store bound to the collection above; it is given the names of both the
# vector index and the full-text index.
vector_store = MongoDBAtlasVectorSearch(
    mongodb_client,
    db_name=DB_NAME,
    collection_name=COLLECTION_NAME,
    vector_index_name=VS_INDEX_NAME,
    fulltext_index_name=FTS_INDEX_NAME,
)

# Route index storage through the MongoDB-backed vector store.
vector_store_context = StorageContext.from_defaults(vector_store=vector_store)

# Build the index from the documents, showing progress bars while it runs.
vector_store_index = VectorStoreIndex.from_documents(
    documents,
    storage_context=vector_store_context,
    show_progress=True,
)

Parsing nodes:   0%|          | 0/337 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/807 [00:00<?, ?it/s]

In [68]:
# Single vector field: 1536-dimensional embeddings compared with cosine similarity.
vs_embedding_field = {
    "type": "vector",
    "path": "embedding",
    "numDimensions": 1536,
    "similarity": "cosine",
}

# Vector search index model, created on the collection in a later cell.
vs_model = SearchIndexModel(
    definition={"fields": [vs_embedding_field]},
    name=VS_INDEX_NAME,
    type="vectorSearch",
)

In [69]:
# Static mapping: only the `text` field is indexed, as a string.
fts_mappings = {
    "dynamic": False,
    "fields": {"text": {"type": "string"}},
}

# Full-text search index model, created on the collection in a later cell.
fts_model = SearchIndexModel(
    definition={"mappings": fts_mappings},
    name=FTS_INDEX_NAME,
    type="search",
)

In [70]:
import time

# Submit both index definitions (vector + full-text) in a single call.
search_index_names = collection.create_search_indexes(models=[vs_model, fts_model])

# Atlas builds search indexes asynchronously: create_search_indexes returns as
# soon as the request is accepted. Wait until every index reports itself as
# queryable so that the queries in the following cells do not run against an
# index that is still building (which matters for Restart & Run All).
INDEX_READY_TIMEOUT_S = 300
INDEX_POLL_INTERVAL_S = 5
index_deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
while True:
    queryable_index_names = {
        index_info["name"]
        for index_info in collection.list_search_indexes()
        if index_info.get("queryable")
    }
    if set(search_index_names) <= queryable_index_names:
        break
    if time.monotonic() >= index_deadline:
        pending = sorted(set(search_index_names) - queryable_index_names)
        raise TimeoutError(
            f"Search indexes not queryable after {INDEX_READY_TIMEOUT_S}s: {pending}"
        )
    time.sleep(INDEX_POLL_INTERVAL_S)

# Keep the created index names as the cell's displayed result.
search_index_names

['vector_index', 'fts_index']

In [71]:
# Build a query engine on top of the vector index.
# similarity_top_k=5 presumably limits retrieval to the 5 most similar nodes — confirm against the LlamaIndex docs.
query_engine = vector_store_index.as_query_engine(similarity_top_k=5)

In [72]:
# Run the first query from the dataset through the query engine.
response = query_engine.query(queries[0])

In [73]:
# Inspect the source nodes attached to the response (the displayed repr is long).
response.source_nodes

[NodeWithScore(node=TextNode(id_='a6964f73-0ada-4411-a425-fc8de915b7c4', embedding=None, metadata={'title': 'Microstructural development of human newborn cerebral white matter assessed in vivo by diffusion tensor magnetic resonance imaging.'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='058bf794-5bc9-48f4-a9f3-08baa0061356', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'title': 'Microstructural development of human newborn cerebral white matter assessed in vivo by diffusion tensor magnetic resonance imaging.'}, hash='ebcda3465172f2db2cd9c2b170681f76aefecfa8308088030e5c1db9d72e5a25'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='6be52a55-ca6b-434a-9ba9-afd596c1d56d', node_type=<ObjectType.TEXT: '1'>, metadata={'title': 'Microstructural development of human newborn cerebral white matter assessed in vivo by diffusion tensor magnetic resonance imaging.'}, hash='0b2596c57747d8bfa4d97

In [74]:
from phoenix.session.evaluation import get_retrieved_documents

# Reuse the Phoenix client created in the tracing setup cell instead of
# constructing a second, identically configured client here.
retrieved_documents = get_retrieved_documents(px_client)



In [75]:
# Display the retrieved-documents frame fetched from Phoenix.
retrieved_documents

Unnamed: 0_level_0,Unnamed: 1_level_0,context.trace_id,input,reference,document_score
context.span_id,document_position,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a2db003b1ae09acc,0,13490c026a0fdf2bc055f86cb0816ddf,which type of methylation occurs in peripheral...,DNA methylation plays an important role in bio...,0.863293
a2db003b1ae09acc,1,13490c026a0fdf2bc055f86cb0816ddf,which type of methylation occurs in peripheral...,Analysis of the PBMC methylome revealed a rich...,0.831969
72fc58a2ff8f89b1,0,241e3c912ba46af4c035fdba190d7814,which type of methylation occurs in peripheral...,DNA methylation plays an important role in bio...,0.863293
1daf632ac908b09c,0,1760b4c81d0adec54d2dd918e89b2871,which of the following transcription sites con...,The human Golli-MBP gene contains two transcri...,0.823784
84c3b3b5c7de0178,0,f8c314f52c90ab603946600c21327231,which of the following transcription sites con...,The human Golli-MBP gene contains two transcri...,0.823784
84c3b3b5c7de0178,1,f8c314f52c90ab603946600c21327231,which of the following transcription sites con...,Two human Golli (for gene expressed in the oli...,0.812735
84c3b3b5c7de0178,2,f8c314f52c90ab603946600c21327231,which of the following transcription sites con...,These findings clearly link the expression of ...,0.810222
84c3b3b5c7de0178,3,f8c314f52c90ab603946600c21327231,which of the following transcription sites con...,ID elements are short interspersed elements (S...,0.740433
2bd7e38207508be9,0,a41bbcd7f047bf1c16773f67cb6d5383,which type of methylation occurs in peripheral...,DNA methylation plays an important role in bio...,0.863293
2bd7e38207508be9,1,a41bbcd7f047bf1c16773f67cb6d5383,which type of methylation occurs in peripheral...,Analysis of the PBMC methylome revealed a rich...,0.831969
