## OpenSearch-index_search

In [None]:
# pip install opensearch-py
# pip install opensearch-py-ml

In [None]:
from opensearchpy import OpenSearch
# import opensearch_py_ml as oml

In [None]:
import certifi

import os  # local import: only this cell needs the environment

# Connection settings for the local OpenSearch node.
host = 'localhost'
port = 9200
# Credentials are read from the environment when available; the fallback is
# the demo admin account and is for local testing only.
auth = (
    os.environ.get('OPENSEARCH_USER', 'admin'),
    os.environ.get('OPENSEARCH_PASSWORD', 'admin'),
)
ca_certs_path = certifi.where() # Provide a CA bundle if you use intermediate CAs with your root CA.

# Create the client with SSL/TLS enabled.
# NOTE: verify_certs=False switches off certificate verification altogether
# (not just the hostname check), so this configuration is only suitable for a
# local test cluster.
client = OpenSearch(
    hosts = [{'host': host, 'port': port}],
    http_compress = True, # enables gzip compression for request bodies
    http_auth = auth,
    use_ssl = True,
    verify_certs = False,
    ssl_assert_hostname = False,
    ssl_show_warn = False,
    ca_certs = ca_certs_path
)


In [None]:
# Target index for the uploaded records (alternative name: 'pubmed-articles-combined').
index_name = 'pubmed-articles'
index_body = {'settings': {'index': {'number_of_shards': 4}}}

# Create the index only when it is missing; otherwise just report it.
if client.indices.exists(index=index_name):
    print("Index", index_name, "already exists.")
else:
    response = client.indices.create(index=index_name, body=index_body)
    print("Index created successfully:", index_name)

In [None]:
# Upload the data to the OpenSearch index
import pandas as pd
from tqdm import tqdm
import json


# Function to remove NaN values recursively
def remove_nan(obj):
    """Recursively drop missing values from nested dicts and lists.

    A value counts as missing when it is ``None``, a float NaN, or a string
    equal to ``'nan'`` (case-insensitive). Regular float values are kept.

    Args:
        obj: A dict, a list, or any other value.

    Returns:
        A new dict/list without missing entries; any other value is returned
        unchanged.
    """
    import math  # function-scope import keeps the definition self-contained

    def _is_missing(value):
        # Only NaN floats are missing; ordinary floats must survive.
        if value is None:
            return True
        if isinstance(value, float):
            return math.isnan(value)
        return isinstance(value, str) and value.lower() == 'nan'

    if isinstance(obj, dict):
        return {k: remove_nan(v) for k, v in obj.items() if not _is_missing(v)}
    if isinstance(obj, list):
        return [remove_nan(elem) for elem in obj if not _is_missing(elem)]
    return obj
    
file_name = 'additional_data'  # alternative dataset: 'articles'

# Read the CSV file and turn every row into a dictionary
df = pd.read_csv(f'{file_name}.csv')
docs = df.to_dict(orient='records')

# Remove missing values from the records
cleaned_data = remove_nan(docs)

# Save the cleaned data to a JSON file
with open(f'{file_name}.json', 'w') as f:
    json.dump(cleaned_data, f, indent=4)

# Reload the JSON file so the indexed documents are exactly what was saved
with open(f'{file_name}.json', 'r') as f:
    docs = json.load(f)

# Build the bulk body: every document is preceded by its action line
actions = []
for idx, doc in enumerate(docs):
    actions.append({'index': {'_index': index_name, '_id': idx}})  # row number used as document id
    actions.append(doc)

batch_size = 1000  # bulk lines per request (action line + document = 2 lines)
# An action line and its document must be sent in the same request, so the
# slice step is rounded up to an even number of lines.
step = batch_size + batch_size % 2

for i in tqdm(range(0, len(actions), step)):
    batch_actions = actions[i:i + step]
    bulk_response = client.bulk(body=batch_actions)
    # The bulk API reports per-item failures in the response body instead of
    # raising, so surface them here rather than dropping them silently.
    if bulk_response.get('errors'):
        failures = [
            item['index'] for item in bulk_response.get('items', [])
            if 'error' in item.get('index', {})
        ]
        print(f"{len(failures)} document(s) failed in the batch starting at line {i}:", failures[:3])



In [None]:
"""
  -Dataset: additional_data.csv
  -Columns: PMID, CD, source
  -Index: pubmed-articles-combined
  *****************************
  -Dataset: articles.csv
  -Columns: PMID,TI,AB,PB,FAU,FED,DP,OTO,OT,OWN,DCOM,LR,JT,MH,ISBN
  -Index: pubmed-articles
"""
# Free-text question to look up (earlier example: 'Peripheral plasma').
q = 'which article is published in 2015?'

# multi_match over the 'CD' field, limited to the top 5 hits.
query = dict(
    size=5,
    query={'multi_match': {'query': q, 'fields': ['CD']}},
)

response = client.search(index=index_name, body=query)

response


In [None]:
# Delete a single document by id. 'python-test-index' is not created anywhere
# in the visible notebook, so a 404 is ignored instead of raising
# NotFoundError and aborting a full run.
response = client.delete(
    index = 'python-test-index',
    id = '1',
    ignore = 404
)

In [None]:
# Drop the whole index. A 404 is ignored so re-running the cell after the
# index is already gone does not raise NotFoundError.
response = client.indices.delete(
    index = 'pubmed-articles',
    ignore = 404
)

## OpenSearch - Match_search - Neural_search - Hybrid_search

In [None]:
"https://opensearch.org/docs/latest/search-plugins/neural-search-tutorial/#prerequisites"
from opensearchpy import OpenSearch
# import opensearch_py_ml as oml

In [None]:
import certifi

import os  # local import: only this cell needs the environment

# Connection settings for the local OpenSearch node.
host = 'localhost'
port = 9200
# Credentials are read from the environment when available; the fallback is
# the demo admin account and is for local testing only.
auth = (
    os.environ.get('OPENSEARCH_USER', 'admin'),
    os.environ.get('OPENSEARCH_PASSWORD', 'admin'),
)
ca_certs_path = certifi.where() # Provide a CA bundle if you use intermediate CAs with your root CA.

# Create the client with SSL/TLS enabled.
# NOTE: verify_certs=False switches off certificate verification altogether
# (not just the hostname check), so this configuration is only suitable for a
# local test cluster.
client = OpenSearch(
    hosts = [{'host': host, 'port': port}],
    http_compress = True, # enables gzip compression for request bodies
    http_auth = auth,
    use_ssl = True,
    verify_certs = False,
    ssl_assert_hostname = False,
    ssl_show_warn = False,
    ca_certs = ca_certs_path
)


In [None]:
"""Prerequisites
 update ML-related cluster settings:
 using an OpenSearch-provided machine learning (ML) model and a cluster with no dedicated ML nodes.
   To ensure that this basic local setup works.
  """
# ML Commons plugin options, placed under the persistent cluster settings.
cluster_settings = {
    "persistent": {
        "plugins": {
            "ml_commons": dict(
                only_run_on_ml_node="false",
                model_access_control_enabled="true",
                native_memory_threshold="99",
            )
        }
    }
}

# Apply the settings to the cluster and show the reply.
response = client.cluster.put_settings(body=cluster_settings)

response


In [None]:
"""
register a model group with the access mode set to public
    """

# Payload describing the model group to register.
body_data = dict(
    name="NLP_model_group",
    description="A model group for NLP models",
    access_mode="public",
)

# POST the payload to the ML plugin's model-group registration endpoint.
response = client.transport.perform_request(
    "POST", "/_plugins/_ml/model_groups/_register", body=body_data
)

response



In [None]:
"""
register the model to the model group, provide the model group ID in the register request
    """

# Payload for registering the model inside the model group.
# The group ID is hard-coded; the author's note suggests
# response['model_group_id'] from the group registration as the
# alternative — TODO confirm.
body_data = dict(
    name="huggingface/sentence-transformers/msmarco-distilbert-base-tas-b",
    version="1.0.1",
    model_group_id="C83zwo0B8eW0xsky_4nW",
    model_format="TORCH_SCRIPT",
)

# POST the payload to the ML plugin's model registration endpoint.
response = client.transport.perform_request(
    "POST", "/_plugins/_ml/models/_register", body=body_data
)

response

In [None]:
"Check if the task is completed"

# Task endpoint of the ML plugin. The task ID is hard-coded; the author's
# note suggests response['task_id'] as the alternative — TODO confirm.
url = "/_plugins/_ml/tasks/X7L6zI0BvZ3wLFxzveia"

# Fetch the task document and show it.
response = client.transport.perform_request("GET", url)

response

In [None]:
"""When the model is registered, it is saved in the model index.
 Next, deploy the model. Deploying a model creates a model instance 
 and caches the model in memory."""

# Deploy endpoint for the model. The model ID is hard-coded; the author's
# note suggests response['model_id'] as the alternative — TODO confirm.
url = "/_plugins/_ml/models/ZnT6zI0B5EQu2yufwsQj/_deploy"

# POST (no body) to the deploy endpoint and show the reply.
response = client.transport.perform_request("POST", url)

response

In [None]:
"Check if the task is completed"

# Task endpoint of the ML plugin. The task ID is hard-coded; the author's
# note suggests response['task_id'] as the alternative — TODO confirm.
url = "/_plugins/_ml/tasks/UW6T640BWePCYCOKL35W"

# Fetch the task document and show it.
response = client.transport.perform_request("GET", url)

response

### Neural Search

In [None]:
"""Create an ingest pipeline for neural search
Neural search uses a language model to transform text into vector embeddings.
 During ingestion, neural search creates vector embeddings for the text fields
   in the request. During search, you can generate vector embeddings for the query
     text by applying the same model, allowing you to perform vector similarity search
       on the documents.
       
set up a text_embedding processor that creates vector embeddings from text.
 need the model_id of the model that was set up in the previous section and
   a field_map, which specifies the name of the field from which to take the text
     (text) and the name of the field in which to record embeddings (vector):
       """

# Endpoint of the ingest pipeline to create.
url = "/_ingest/pipeline/nlp-ingest-pipeline"

# One text_embedding processor mapping the "text" field to the "vector"
# field. The model ID is read from the current `response`, so the previous
# cell must have returned a body containing 'model_id'.
request_body = dict(
    description="An NLP ingest pipeline",
    processors=[
        {
            "text_embedding": {
                "model_id": "{}".format(response['model_id']),
                "field_map": {"text": "vector"},
            }
        }
    ],
)

# PUT the pipeline definition and show the reply.
response = client.transport.perform_request("PUT", url, body=request_body)

response

In [None]:
"""Create a k-NN index
create a k-NN index with a field named text, and a knn_vector field named vector,
 which contains the vector embedding of the text. Additionally, set the default ingest pipeline
   to the nlp-ingest-pipeline."""

"""
Types of splitting and embedding-max_length-Dimension
-pubmed-embedding-256-384 Models: huggingface/sentence-transformers/all-MiniLM-L6-v2
-pubmed-embedding-384-768 Models: huggingface/sentence-transformers/all-mpnet-base-v2
-pubmed-embedding-512-768 Models: huggingface/sentence-transformers/all-distilroberta-v1

"""


""" 
    Settings: This section contains configuration settings for the index.

    index: This sub-section contains index-level settings.

    knn: This setting enables K-Nearest Neighbors (KNN) functionality on the index. When set to true, it allows the index to store and search KNN vectors efficiently.

    knn.algo_param.ef_search: This parameter specifies the "ef_search" value used by the KNN algorithm. "ef_search" controls the quality of the search results and 
        the speed of the search operation. Higher values generally improve search quality but may increase search time. In this case, it's set to 100.
        "settings": {
                "index": {
                "knn": True,
                "knn.algo_param.ef_search": 100
                }

    Properties: This is where you define the fields and their properties within your index. Each property represents a field in your documents.

    id: This is a field of type text. It could be used to store a unique identifier for each document.

    vector: This is a field of type knn_vector. KNN vectors are used to store dense vectors that are suitable for similarity searches
      using K-Nearest Neighbors (KNN) algorithms.

    dimension: Specifies the dimensionality of the vectors. In this case, it's set to 768, indicating that each embedding vector has 768 dimensions.

    method: Specifies the method used to index and search the vectors.

    engine: The underlying search engine. In this case, it's set to "lucene", indicating that the Lucene engine is used.

    space_type: The space type used for similarity calculation. In this case, it's set to "l2", which refers to Euclidean distance.

    name: The name of the algorithm used for indexing and searching vectors. Here, it's set to "hnsw", which stands for Hierarchical Navigable Small World.

    parameters: Additional parameters for the chosen algorithm. These parameters control the construction process of the index structure and affect its performance and accuracy.
            "parameters": {
              "ef_construction": 128,
              "m": 24

    text: This is another field of type text. It could be used to store textual data such as the content of documents."""
# Index to create. Other index names noted by the author:
#pubmed-distilroberta-recursive-500-50  max=512 dim=768
#pubmed-mpnet-recursive-380-50          max=384 dim=768
#pubmed-minlmv6-recursive-250-25        max=256 dim=384
index_name = 'pubmed-distilroberta-recursive-400-50'


# k-NN enabled index that uses "nlp-ingest-pipeline" as its default pipeline.
index_body = dict(
    settings={
        "index.knn": True,
        "default_pipeline": "nlp-ingest-pipeline",
    },
    mappings={
        "properties": {
            "id": {"type": "text"},
            "vector": {
                "type": "knn_vector",
                "dimension": 768,
                "method": {
                    "engine": "lucene",
                    "space_type": "cosinesimil",  # previously "l2"
                    "name": "hnsw",
                    "parameters": {},
                },
            },
            "text": {"type": "text"},
            "resource": {"type": "text"},
        }
    },
)

# Create the index only when it is missing; otherwise just report it.
if client.indices.exists(index=index_name):
    print("Index", index_name, "already exists.")
else:
    response = client.indices.create(index=index_name, body=index_body)
    print(response)


In [None]:
" Get the settings of the index"
# Settings endpoint of the current index.
url = "/{}/_settings".format(index_name)

# Fetch the index settings and show them.
response = client.transport.perform_request("GET", url)

response

In [None]:
" Get the mappins of the index"
# Mappings endpoint of the current index.
url = "/{}/_mappings".format(index_name)

# Fetch the index mappings and show them.
response = client.transport.perform_request("GET", url)

response

In [None]:
# Upload the data to the OpenSearch index
import pandas as pd
from tqdm import tqdm
import json


# Function to remove NaN values recursively
def remove_nan(obj):
    """Recursively drop missing values from nested dicts and lists.

    A value counts as missing when it is ``None``, a float NaN, or a string
    equal to ``'nan'`` (case-insensitive). Regular float values are kept.

    Args:
        obj: A dict, a list, or any other value.

    Returns:
        A new dict/list without missing entries; any other value is returned
        unchanged.
    """
    import math  # function-scope import keeps the definition self-contained

    def _is_missing(value):
        # Only NaN floats are missing; ordinary floats must survive.
        if value is None:
            return True
        if isinstance(value, float):
            return math.isnan(value)
        return isinstance(value, str) and value.lower() == 'nan'

    if isinstance(obj, dict):
        return {k: remove_nan(v) for k, v in obj.items() if not _is_missing(v)}
    if isinstance(obj, list):
        return [remove_nan(elem) for elem in obj if not _is_missing(elem)]
    return obj

#data_distilroberta_recursive_500_50
#data_mpnet_recursive_380_50
#data_minilmv6_recursive_250_25

file_name = 'data_distilroberta_recursive_400_50'
# # Read the CSV file into a DataFrame
# df = pd.read_csv(f'{file_name}.csv')

# # Convert the DataFrame to a list of dictionaries (JSON format)
# docs = df.to_dict(orient='records')

# # Remove NaN values from the data
# cleaned_data = remove_nan(docs)

# # Save the cleaned data back to a JSON file
# with open(f'{file_name}.json', 'w') as f:
#     json.dump(cleaned_data, f, indent=4)

# Read the prepared JSON file
with open(f'{file_name}.json', 'r') as f:
    docs = json.load(f)

# Build the bulk body: every document is preceded by its action line, so
# documents sit at the odd positions of `actions` (later cells rely on this).
actions = []
for idx, doc in enumerate(docs):
    actions.append({'index': {'_index': index_name, '_id': doc['id']}})
    actions.append(doc)

batch_size = 10  # bulk lines per request (action line + document = 2 lines)
# An action line and its document must be sent in the same request, so the
# slice step is rounded up to an even number of lines.
step = batch_size + batch_size % 2

for i in tqdm(range(0, len(actions), step)):
    batch_actions = actions[i:i + step]
    bulk_response = client.bulk(body=batch_actions)
    # The bulk API reports per-item failures in the response body instead of
    # raising, so surface them here rather than dropping them silently.
    if bulk_response.get('errors'):
        failures = [
            item['index'] for item in bulk_response.get('items', [])
            if 'error' in item.get('index', {})
        ]
        print(f"{len(failures)} document(s) failed in the batch starting at line {i}:", failures[:3])



In [None]:
# `actions` alternates action lines and documents, so index 1 is the first
# document; display its 'text' field.
actions[1]['text']

In [None]:


# Index one document on its own: actions[3] is the second document of the
# bulk list, stored under its own 'id' with refresh=True.
response = client.index(index=index_name, id=actions[3]['id'], body=actions[3], refresh=True)

In [None]:
# Document endpoint for the first document of the bulk list.
url = "/{}/_doc/{}".format(index_name, actions[1]['id'])

# Fetch the stored document and show it.
response = client.transport.perform_request("GET", url)

response

In [None]:
"Search using a keyword search"

# Search endpoint of the current index and the question to look up.
url = "/{}/_search".format(index_name)
q = "who is Moog?"

# Plain match query on "text"; the embedding vector is left out of the hits.
request_body = dict(
    _source={"excludes": ["vector"]},
    query={"match": {"text": {"query": q}}},
)

# Run the search through the raw transport and show the reply.
response = client.transport.perform_request("GET", url, body=request_body)

response

In [None]:
"Search using a keyword search"
# Match query on the "text" field, limited to the top 3 hits.
query = {
    "size": 3,  # Number of hits to return
    "query": {
        "match": {
            "text": {
                "query": q
            }
        }
    }
}


# Assign to `response` (the original misspelled it as `respose`, so the cell
# displayed the stale result of an earlier request).
response = client.search(body = query, index= index_name)
response

In [None]:
"Search using a neural search"
# Search endpoint of the current index.
url = f"/{index_name}/_search"

# Neural query: the query text is embedded with the given model and compared
# against the "vector" field. The field name in "excludes" was misspelled as
# "vevtor", so the embedding vector was not excluded from the hits.
request_body = {
    "_source": {
        "excludes": ["vector"]
    },
    "size": 3,
    "query": {
        "neural": {
            "vector": {
                "query_text": q,
                "model_id": "ZnT6zI0B5EQu2yufwsQj",  # hard-coded model ID
                "k": 3
            }
        }
    }
}

# Send the GET request to search for similar documents
response = client.transport.perform_request(method="GET", url=url, body=request_body)

# Display the response
response

In [None]:
"Configure a search pipeline"

# Endpoint of the search pipeline to create.
url = "/_search/pipeline/nlp-search-pipeline"

# Normalization processor: min_max normalization, then an arithmetic mean
# of the sub-query scores with weights 0.3 and 0.7.
request_body = dict(
    description="Post processor for hybrid search",
    phase_results_processors=[
        {
            "normalization-processor": {
                "normalization": {"technique": "min_max"},
                "combination": {
                    "technique": "arithmetic_mean",
                    "parameters": {"weights": [0.3, 0.7]},
                },
            }
        }
    ],
)

# PUT the pipeline definition and show the reply.
response = client.transport.perform_request("PUT", url, body=request_body)

response


In [None]:
"""When the model is registered, it is saved in the model index.
 Next, deploy the model. Deploying a model creates a model instance 
 and caches the model in memory."""

# Deploy endpoint for the model. The model ID is hard-coded; the author's
# note suggests response['model_id'] as the alternative — TODO confirm.
url = "/_plugins/_ml/models/ZnT6zI0B5EQu2yufwsQj/_deploy"

# POST (no body) to the deploy endpoint and show the reply.
response = client.transport.perform_request("POST", url)

response

In [None]:


index_name = 'pubmed-distilroberta-recursive-400-50'
q = "who is Moog?"
# Search endpoint with the hybrid-search pipeline attached
url = f"/{index_name}/_search?search_pipeline=nlp-search-pipeline"

# Hybrid query: a keyword match and a neural query over the same question.
# "_source" uses the "excludes" key, consistent with the other search cells
# (the original used the singular "exclude" here).
request_body = {
    "_source": {
        "excludes": ["vector"]
    },
    "size": 3,
    "query": {
        "hybrid": {
            "queries": [
                {
                    "match": {
                        "text": {
                            "query": q
                        }
                    }
                },
                {
                    "neural": {
                        "vector": {
                            "query_text": q,
                            "model_id": "ZnT6zI0B5EQu2yufwsQj",  # hard-coded model ID
                            "k": 8
                        }
                    }
                }
            ]
        }
    }
}

# Send the GET request with the search pipeline
response = client.transport.perform_request(method="GET", url=url, body=request_body)

# Display the response
response


In [None]:
# Show the 'text' field of the first hit in the current search response.
response['hits']['hits'][0]['_source']['text']

In [None]:
import api
connector = api.gpt.OpenAIConnector()

# Collect id, text and source of every hit in the current search response;
# this list is embedded in the user prompt as context.
context = [
    {
        'id': hit['_id'],
        'text': hit['_source']['text'],
        'source': hit['_source']['resource'],
    }
    for hit in response['hits']['hits']
]

# Two prompt variants are defined; only the "1" variants are sent below.
system_prompt1 = "You are a researcher on Medical Intelligence, that can answer questions based on the provided articles."
system_prompt2 = "You are a friendly Assistant that will answer Questions based on given Contexts."
user_prompt1 = f"1- Answer the question with the regarding all chunked Contexts.2- If there are more than one answer provide all of them with resources.\n 3- If it is not possible to answer based on given contexts, Explicitly say that and answer based on your knowledge.\n4- provide the resources for each chunk at the end of your message.\n 5- If there is previous answers from you use them as well with reference \nQuestion:\n{q} \nContexts:\n{context}"
user_prompt2 = f"Answer the following question: {q} based on the following chunked texts: {context}. 1- If there is no answer based on provided texts, just say 'I cannot provide an answer based on the provided text.'"
message = [
    {"role": "system", "content": system_prompt1},
    {"role": "user", "content": user_prompt1},
]

# Request a completion and show the text of the first choice.
correction = connector.get_completions(message)
correction.choices[0].message.content


In [None]:
# response = client.delete(
#     index = 'python-test-index',
#     id = '1'
# )

In [None]:
# response = client.indices.delete(
#     index = 'pubmed-articles'
# )

In [None]:
# "Delete the ingest pipeline"
# # Define the URL for the DELETE request
# url = "/_ingest/pipeline/nlp-ingest-pipeline"

# # Send the DELETE request to remove the NLP ingest pipeline
# response = client.transport.perform_request(method="DELETE", url=url)

# response