# #Loading Data from CSV File

In [None]:
import csv
from langchain.docstore.document import Document

# Load data, store abstract text.
# Each CSV row becomes one Document: the "CD" column is the page content and
# the "PMID" / "source" columns are carried along as metadata.
batch_input = []
# newline='' is what the csv module asks for when it is handed a file object;
# without it, quoted fields that contain line breaks may be parsed incorrectly.
with open('../../../INLPT_data/additional_data.csv', encoding="utf-8", newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        batch_input.append(
            Document(
                page_content=row["CD"],
                metadata={"PMID": row["PMID"], "source": row["source"]},
            )
        )

In [None]:
# Optional: uncomment to keep only the first 10 documents while iterating.
# batch_input = batch_input[:10]
# Show the first loaded document as a sanity check.
batch_input[0]

# #Splitting + Tokenizing the documents
We depend on the tokenizer to calculate the number of tokens in one concatenation of the metadata, then we use the chunking process and the splitter functionality with overlapping to split the documents into chunks.

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter # check other splitters
from transformers import AutoTokenizer

# Tokenizer of the e5-base-v2 checkpoint; the splitter uses it to measure chunk length.
tokenizer = AutoTokenizer.from_pretrained("intfloat/e5-base-v2")

# Chunks of at most 512 tokens, with 100 tokens shared between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer,
    chunk_size=512,
    chunk_overlap=100,
)

# Split every loaded document into chunks.
splitted_documents = text_splitter.split_documents(batch_input)

In [None]:
# Spot-check a single chunk (index 34 is just a sample position).
splitted_documents[34]

In [None]:
# Total number of chunks produced by the splitter.
len(splitted_documents)

# #Tokenizing Calculations

In [None]:
def token_len(text):
    """Return the number of token ids `tokenizer.encode` produces for `text`."""
    return len(tokenizer.encode(text))

# Token count of every chunk, then the min / truncated mean / max over all chunks.
token_counts = list(map(token_len, (chunk.page_content for chunk in splitted_documents)))
min_tokens = min(token_counts)
avg_tokens = int(sum(token_counts) / len(token_counts))  # int() drops the fractional part
max_tokens = max(token_counts)

print(f"""Min: {min_tokens}
Avg: {avg_tokens}
Max: {max_tokens}""")

# #Embeddings

In [1]:
from langchain.embeddings import HuggingFaceEmbeddings

# Options are kept in named dicts so they are easy to find and tweak.
model_options = {'device': 'cpu'}  # model configuration options
encode_options = {'normalize_embeddings': False, 'batch_size': 32}  # encoding options

embeddings = HuggingFaceEmbeddings(
    model_name="intfloat/e5-base-v2",
    model_kwargs=model_options,
    encode_kwargs=encode_options,
)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Smoke test: embed a short string to confirm the embedding model is usable.
embeddings.embed_query("Hello, world!")

In [None]:
# Preview the strings that get embedded, for the first three chunks.
# A chunk that already starts with 'PMID: ' only gets its source appended;
# any other chunk is additionally prefixed with the PMID from its metadata.
preview_texts = []
for doc in splitted_documents[:3]:
    if doc.page_content.startswith('PMID: '):
        preview_texts.append(f"{doc.page_content}, Source: {doc.metadata['source']}")
    else:
        preview_texts.append(f'PMID: {doc.metadata["PMID"]}, {doc.page_content}, Source: {doc.metadata["source"]}')
preview_texts

In [None]:
# embed_documents expects a list of texts. Handing it a bare string makes it
# iterate over the string and embed every character on its own, so the single
# chunk is wrapped in a list; the result is a list holding one vector.
test_embd = embeddings.embed_documents([splitted_documents[35].page_content])

test_embd

In [None]:
def _text_for_embedding(doc):
    """Build the string embedded for one chunk: PMID prefix (if missing), content, source."""
    if doc.page_content.startswith('PMID: '):
        return f"{doc.page_content}, Source: {doc.metadata['source']}"
    return f'PMID: {doc.metadata["PMID"]}, {doc.page_content}, Source: {doc.metadata["source"]}'


# Embed every chunk in one call; the result keeps the order of splitted_documents.
embedded_documents = embeddings.embed_documents([_text_for_embedding(doc) for doc in splitted_documents])

In [None]:
# Saving the embeddings to a CSV file
# The file is written without an index column and without a header row.
import pandas as pd
# NOTE(review): the loading cell further down reads
# 'med_e5-base-v2_recursiveCharacterSplitter.csv', not 'embeddings.csv' —
# presumably the file is renamed by hand to follow the convention below; confirm.
pd.DataFrame(embedded_documents).to_csv('embeddings.csv', index=False, header=False) #### Naming Convention: med_{embedding_model}_{chainning_strategy}

# #Pushing Data to OpenSearch

In [None]:
import pandas as pd

# Loading the embeddings from a CSV file; header=None because the file has no header row.
embedded_pd = pd.read_csv(
    'med_e5-base-v2_recursiveCharacterSplitter.csv',
    header=None,
)
# One plain Python list per CSV row.
embedded_documents = embedded_pd.values.tolist()

In [None]:
# Show the number of vectors and the first few components of the first two,
# instead of echoing the whole embedding matrix into the notebook output.
(len(embedded_documents), [vector[:5] for vector in embedded_documents[:2]])

In [None]:
# Names of OpenSearch-related keyword options.
# NOTE(review): no other cell visible here reads this list; it looks like it
# mirrors the optional keyword arguments of LangChain's OpenSearchVectorSearch —
# confirm, or remove if it is only a scratch note.
keys_list = [
    "opensearch_url",
    "index_name",
    "is_appx_search",
    "vector_field",
    "text_field",
    "engine",
    "space_type",
    "ef_search",
    "ef_construction",
    "m",
    "max_chunk_bytes",
    "is_aoss",
]

In [None]:
from langchain.vectorstores import OpenSearchVectorSearch

import getpass
import os

# Credentials are read from the environment (with an interactive prompt as a
# fallback) so the password is never stored in the notebook.
auth = (
    os.environ.get("OPENSEARCH_USER", "admin"),
    os.environ.get("OPENSEARCH_PASSWORD") or getpass.getpass("OpenSearch password: "),
)

# Vectors, texts and metadata are paired purely by position, so the two lists
# must have the same length before anything is indexed.
if len(embedded_documents) != len(splitted_documents):
    raise ValueError(
        f"{len(embedded_documents)} embeddings but {len(splitted_documents)} chunks; "
        "they must match before indexing"
    )

BATCH_SIZE = 1000  # number of chunks sent per from_embeddings call / bulk request

# Walk over the actual number of vectors instead of a hard-coded row count.
for i in range(0, len(embedded_documents), BATCH_SIZE):
    print(i)  # progress: start offset of the current batch
    batch = slice(i, i + BATCH_SIZE)
    db = OpenSearchVectorSearch.from_embeddings(
        embedded_documents[batch],
        [doc.page_content for doc in splitted_documents[batch]],
        embeddings,
        [doc.metadata for doc in splitted_documents[batch]],
        opensearch_url="http://localhost:9200", bulk_size=BATCH_SIZE, use_ssl=True, verify_certs=False, http_auth=auth, index_name="med_e5_recursivechar_real", space_type="cosinesimil"
    )

In [None]:
# Disabled alternative, kept for reference: index the chunks with from_documents
# and let the vector store compute the embeddings itself.
# The password that used to be written here has been redacted; read it from the
# environment instead of hard-coding it.
# from langchain.vectorstores import OpenSearchVectorSearch
# 
# auth = ('admin', os.environ['OPENSEARCH_PASSWORD'])
# 
# db = OpenSearchVectorSearch.from_documents(
#     splitted_documents, embeddings, opensearch_url="http://localhost:9200", bulk_size=250 , use_ssl = True, verify_certs = False, http_auth = auth, index_name="med_e5_recursivechar_test", space_type="cosinesimil"
# )

# #Retrieving Data from OpenSearch - 1

In [None]:
from langchain.vectorstores import OpenSearchVectorSearch

import getpass
import os

# Credentials are read from the environment (with an interactive prompt as a
# fallback) so the password is never stored in the notebook.
auth = (
    os.environ.get("OPENSEARCH_USER", "admin"),
    os.environ.get("OPENSEARCH_PASSWORD") or getpass.getpass("OpenSearch password: "),
)

# Connect to the existing index; queries are embedded with `embeddings`.
db = OpenSearchVectorSearch(
    opensearch_url="http://localhost:9200",
    index_name="med_e5_recursivechar_real",
    embedding_function = embeddings,
    use_ssl = True,
    verify_certs = False,
    http_auth = auth,
    space_type="cosinesimil"
)

In [None]:
# Ask the vector store for the 2 chunks most similar to the question.
question = "What is a cask disorder?"
searchDocs = db.similarity_search(question, k= 2)

searchDocs

# #Retrieving Data from OpenSearch - 2

In [4]:
from opensearchpy import OpenSearch

import getpass
import os

#Initialize connection to opensearch
host = 'localhost'
port = 9200
# Credentials are read from the environment (with an interactive prompt as a
# fallback) so the password is never stored in the notebook.
auth = (
    os.environ.get("OPENSEARCH_USER", "admin"),
    os.environ.get("OPENSEARCH_PASSWORD") or getpass.getpass("OpenSearch password: "),
)

client = OpenSearch(
    hosts = [{'host': host, 'port': port}],
    http_auth = auth,
    use_ssl = True,
    verify_certs = False,  # certificate verification is disabled for this local node
    timeout=100
)
#check status
print(client.info())

{'name': 'opensearch-node1', 'cluster_name': 'opensearch-cluster', 'cluster_uuid': 'H1ADcBsFQ3-YyYDrxBXYew', 'version': {'distribution': 'opensearch', 'number': '2.12.0', 'build_type': 'tar', 'build_hash': '2c355ce1a427e4a528778d4054436b5c4b756221', 'build_date': '2024-02-20T02:18:49.874618333Z', 'build_snapshot': False, 'lucene_version': '9.9.2', 'minimum_wire_compatibility_version': '7.10.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'The OpenSearch Project: https://opensearch.org/'}




In [None]:
# Embed the raw question text through the LangChain embeddings wrapper.
embeddings.embed_query("What is a cask disorder?")

In [None]:
from sentence_transformers import SentenceTransformer

query = "What is a cask disorder?"

# Load the same checkpoint directly with sentence-transformers and encode the
# question with the "query: " prefix prepended.
query_encoder = SentenceTransformer(embeddings.model_name)
query_vector = query_encoder.encode("query: " + query)

query_vector

In [None]:
# k-NN clause for the "vector_field" field: the query vector and the number of
# nearest neighbours requested.
knn_clause = {
    "vector": query_vector,
    "k": 2,
}
knn_search_body = {
    "size": 5,  # maximum number of hits in the response
    "query": {"knn": {"vector_field": knn_clause}},
}

# Execute the search
response = client.search(index="med_e5_recursivechar_real", body=knn_search_body)

response

# Query Transformation RAG

In [None]:
# Query Transformation RAG system from #opensearch
from langchain.callbacks.manager import CallbackManagerForRetrieverRun
from langchain.chat_models import ChatOpenAI
from langchain. retrievers.multi_query import MultiQueryRetriever
import uuid

import getpass
import os

question = "What is a cask disorder?"

# The OpenAI key is read from the environment (with an interactive prompt as a
# fallback) so it never lives in the notebook. Any key that was previously
# committed here should be treated as compromised and rotated.
openai_api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")
llm = ChatOpenAI(api_key=openai_api_key, model='gpt-3.5-turbo-0125') # tempreture = 0.5

# Create an instance of MultiQueryRetriever
transformed_queries = MultiQueryRetriever.from_llm(
    retriever = db.as_retriever(), llm=llm)


In [None]:
# Set logging for the queries
import logging

import os

# Send the multi-query retriever's INFO records to logfile.log so the generated
# queries can be read back from the file later.
logging.basicConfig()
logger  = logging.getLogger("langchain.retrievers.multi_query")
logger.setLevel(logging.INFO)

# Reuse the handler for this file if the cell has already been run in this
# kernel; attaching a second one would write every record twice.
_log_path = os.path.abspath('logfile.log')
file_handler = next(
    (
        handler
        for handler in logger.handlers
        if isinstance(handler, logging.FileHandler) and handler.baseFilename == _log_path
    ),
    None,
)
if file_handler is None:
    file_handler = logging.FileHandler('logfile.log')
    logger.addHandler(file_handler)

In [None]:
# Run the multi-query retriever for the question and count the returned documents.
unique_docs = transformed_queries.get_relevant_documents(query=question)
len(unique_docs)

In [None]:
import os
import ast

logfilename = "logfile.log"

# Read back the most recent log record and recover the list of generated queries.
with open(logfilename, "r") as file:
    lines = file.readlines()

if not lines:
    raise ValueError(
        f"{logfilename} is empty; run the retriever first so the generated queries are logged"
    )

# The newest record is the last line. The list literal spans from the first '['
# to the last ']', which stays correct when a query contains ':' or an apostrophe.
line = lines[-1]
queries_str = line[line.index("["):line.rindex("]") + 1]
# The text originates from an LLM, so it is parsed with literal_eval (Python
# literals only) rather than eval, which would execute arbitrary expressions.
queries_list = ast.literal_eval(queries_str)


In [None]:
# Show the queries recovered from the log file.
queries_list

In [None]:
import warnings
import os
import numpy as np
import requests
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.util import ngrams
from nltk.translate.bleu_score import sentence_bleu

In [None]:
def load_glove_model(glove_file):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Every non-blank line is split on whitespace: the first field is the word and
    the remaining fields are parsed as float32 vector components.

    Args:
        glove_file: Path of the GloVe text file (opened as UTF-8).

    Returns:
        dict mapping each word to a 1-D numpy float32 array.
    """
    print("Loading GloVe Model")
    word_to_vec = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. an extra newline at the end of the file):
                # there is no word to index, so skip it instead of failing.
                continue
            word_to_vec[values[0]] = np.array(values[1:], dtype='float32')
    print("Done.", len(word_to_vec), " words loaded!")
    return word_to_vec

def compute_semantic_similarity(query, transformed_query, word_to_vec):
    """Cosine similarity between the mean word vectors of two texts.

    Each text is lower-cased and split on whitespace; only tokens present in
    ``word_to_vec`` contribute to that text's mean vector.

    Args:
        query: Original question text.
        transformed_query: Rewritten question text to compare against ``query``.
        word_to_vec: Mapping from word to its embedding vector.

    Returns:
        The cosine similarity of the two mean vectors, or ``0.0`` when either
        text has no known words, a mean vector is all-NaN, or a mean vector has
        zero length (the cosine would otherwise be undefined).
    """
    def _mean_vector(text):
        # Average the vectors of the known words; None signals "no known words"
        # and avoids np.mean([]), which returns NaN and emits a RuntimeWarning.
        vectors = [word_to_vec[word] for word in text.lower().split() if word in word_to_vec]
        return np.mean(vectors, axis=0) if vectors else None

    query_embedding = _mean_vector(query)
    transformed_query_embedding = _mean_vector(transformed_query)

    if query_embedding is None or transformed_query_embedding is None:
        return 0.0
    if np.all(np.isnan(query_embedding)) or np.all(np.isnan(transformed_query_embedding)):
        return 0.0

    norm_product = np.linalg.norm(query_embedding) * np.linalg.norm(transformed_query_embedding)
    if norm_product == 0:
        # A zero-length vector has no direction; dividing would produce NaN.
        return 0.0

    similarity_score = np.dot(query_embedding, transformed_query_embedding) / norm_product
    return similarity_score

In [None]:
glove_txt_path = "glove.6B.50d.txt"
word_to_vec = load_glove_model(glove_txt_path)

# Score every generated query against the original question.
semantic_similarity_scores = [
    compute_semantic_similarity(question, transformed_query, word_to_vec)
    for transformed_query in queries_list
]

# Report the scores, numbering the queries from 1.
for position, score in enumerate(semantic_similarity_scores, start=1):
    print("Semantic Similarity Score for transformed question", position, ":", score)

In [1]:
from typing import List
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from langchain. retrievers.multi_query import MultiQueryRetriever
# from pydantic import BaseModel, Field
from langchain.pydantic_v1 import BaseModel, Field


# Output parser will split the LLM result into a list of queries
class LineList(BaseModel):
    # Model holding the parsed LLM output as a list of strings.
    # "lines" is the key (attribute name) of the parsed output
    lines: List[str] = Field(description="Lines of text")


class LineListOutputParser(PydanticOutputParser):
    # Parser that turns raw LLM text into a LineList, one entry per non-blank line.

    def __init__(self) -> None:
        super().__init__(pydantic_object=LineList)

    def parse(self, text: str) -> LineList:
        """Split ``text`` on newlines and return the non-blank lines as a ``LineList``.

        Blank or whitespace-only lines (for example an empty line between two
        generated questions) are dropped so they are not treated as queries.
        """
        lines = [line for line in text.strip().split("\n") if line.strip()]
        return LineList(lines=lines)


# Parser instance that turns the LLM's text reply into a LineList.
output_parser = LineListOutputParser()

# Prompt that asks the model for alternative phrasings of {question}, separated
# by newlines. Only the un-commented `template=` below is active.
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    # The `##` lines are two earlier prompt variants, left commented out.
    ## template="""You are an AI language model assistant. Your task is to generate five
    ## different versions of the given user question to retrieve relevant documents from a vector
    ## database. By generating multiple perspectives on the user question, your goal is to help
    ## the user overcome some of the limitations of the distance-based similarity search.
    ## Provide these alternative questions separated by newlines.
    ## Original question: {question}""",
    ## template="""think step by step. My grandma got kidnapped by bad people and the only way
    ##             they free her is that you provide a clean and working solution of generating different
    ##             question versions of the given user question to retrieve relevant documents from a vector database
    ##             By generating multiple perspectives on the user question, your goal is to help
    ##             the user overcome some of the limitations of the distance-based similarity search.
    ##             Provide these alternative questions separated by newlines.
    ##             I really love my grandma so do your best. Remember that I have no hands so
    ##             provide me with the full solution, I can't add anything to it. If you succeed I will tip you 200$.
    ##             Original question: {question}"""
    # Active template. The leading spaces on its continuation lines are part of
    # the string literal, so they are part of the prompt text.
    template="""Can you simplify user question so a ten year old can understand it?
                You are an AI language model assistant. Your task is to generate five
                different versions of the given user question to retrieve relevant documents from a vector
                database. By generating multiple perspectives on the user question, your goal is to help
                the user overcome some of the limitations of the distance-based similarity search.
                Provide these alternative questions separated by newlines.
                Original question: {question}"""
)
import getpass
import os

# The OpenAI key is read from the environment (with an interactive prompt as a
# fallback) so it never lives in the notebook. Any key that was previously
# committed here should be treated as compromised and rotated.
openai_api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")
llm = ChatOpenAI(api_key=openai_api_key, model='gpt-3.5-turbo-0125')

# Chain
llm_chain = LLMChain(llm=llm, prompt=QUERY_PROMPT, output_parser=output_parser)

# Other inputs
question = "simplify the question"

  warn_deprecated(


In [None]:
# Run
# Wire the vector-store retriever to the custom query-generation chain.
retriever = MultiQueryRetriever(
    retriever=db.as_retriever(),
    llm_chain=llm_chain,
    parser_key="lines",  # "lines" is the key (attribute name) of the parsed output
)

# Results
unique_docs = retriever.get_relevant_documents(query="What is a cask disorder, and breast cancer?")
len(unique_docs)

In [None]:
# Show the documents returned by the retriever.
unique_docs

In [None]:
import langchain
# Show the installed LangChain version.
langchain.__version__

In [16]:
from sentence_transformers.util import is_sentence_transformer_model
# Check whether the checkpoint is recognised as a sentence-transformers model
# (the recorded output of this cell is True).
is_sentence_transformer_model('intfloat/e5-base-v2')

True