In [13]:
import os

from getpass import getpass
import nest_asyncio

from dotenv import load_dotenv

# Patch asyncio so nested event-loop use works inside the notebook's already-running loop
# (behaviour as described by the nest_asyncio package).
nest_asyncio.apply()

# Load variables from a local .env file into the process environment; presumably this is
# where CO_API_KEY is kept -- confirm against your .env.
load_dotenv()

True

In [14]:
# Prefer the key from the environment and prompt only when it is missing or empty.
# Indexing os.environ[...] would raise KeyError before the getpass fallback could run,
# so use .get(), which yields None for an unset variable.
CO_API_KEY = os.environ.get("CO_API_KEY") or getpass("Enter your Cohere API key: ")

In [12]:
import cohere
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import WebBaseLoader
from langchain.prompts import ChatPromptTemplate, PromptTemplate

In [6]:
def load_data(url, chunk_size=500, chunk_overlap=50):
    """Download a web page and split it into overlapping text chunks.

    Args:
        url: Address of the page to load with ``WebBaseLoader``.
        chunk_size: Maximum size of each chunk passed to the text splitter
            (default 500, the value previously hard-coded).
        chunk_overlap: Overlap between consecutive chunks (default 50, the value
            previously hard-coded).

    Returns:
        The list of chunk documents produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    documents = WebBaseLoader(url).load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)

# Create vector store and BM25 retriever

In [None]:
# Using 'nomic-ai/nomic-embed-text-v1.5' model from Hugging Face under its respective license.

def create_retrivers(chunks, k=5, weights=(0.5, 0.5)):
    """Build a hybrid retriever combining BM25 keyword search with FAISS vector search.

    Args:
        chunks: Non-empty collection of documents to index in both retrievers.
        k: Number of documents each underlying retriever returns (default 5).
        weights: Two weights, ordered ``(bm25, faiss)``, handed to the ensemble
            (default: equal weighting).

    Returns:
        An ``EnsembleRetriever`` wrapping the BM25 retriever and the FAISS retriever.

    Raises:
        ValueError: If ``chunks`` is empty or ``weights`` does not contain two values.
    """
    # Validate before loading the embedding model so bad input fails fast and clearly.
    if not chunks:
        raise ValueError("create_retrivers() needs at least one document chunk to index.")
    if len(weights) != 2:
        raise ValueError("weights must contain exactly two values: (bm25, faiss).")

    embeddings = HuggingFaceEmbeddings(
        model_name="nomic-ai/nomic-embed-text-v1.5",
        model_kwargs={'trust_remote_code': True},
    )

    # Dense (semantic) side of the hybrid search.
    faiss_vector_store = FAISS.from_documents(chunks, embeddings)

    # Sparse (keyword) side of the hybrid search.
    bm25 = BM25Retriever.from_documents(chunks)
    bm25.k = k

    ensemble_retriever = EnsembleRetriever(
        retrievers=[bm25, faiss_vector_store.as_retriever(search_kwargs={"k": k})],
        weights=list(weights),
    )

    return ensemble_retriever

In [30]:
# Pass the key gathered earlier explicitly, so a key typed at the getpass prompt is used
# rather than relying on the client discovering CO_API_KEY from the environment.
cohere_client = cohere.Client(api_key=CO_API_KEY)

In [34]:
def safe_llm_call(prompt, **kwargs):
    """Fill ``prompt`` with ``kwargs`` and ask Cohere to complete it, never raising.

    Args:
        prompt: Object with a ``format(**kwargs)`` method that yields the prompt text.
        **kwargs: Values substituted into the prompt.

    Returns:
        The text of the first generation, ``"No response generated."`` when the
        response has no generations, or a fixed error message if anything fails
        (the exception is printed, not propagated).
    """
    failure_message = "An error occurred while generating the response."
    try:
        # Assemble the request first; the prompt is formatted before the client is touched.
        request = dict(
            model='command-xlarge',
            prompt=prompt.format(**kwargs),
            max_tokens=500,
            temperature=0.5,
        )
        result = cohere_client.generate(**request)

        if result.generations:
            return result.generations[0].text
        return "No response generated."
    except Exception as err:
        # Deliberate best-effort: report the problem and hand back a placeholder string.
        print(f"Error in LLM call: {err}")
        return failure_message


In [26]:
def hybrid_search(query, enseble_retriver, llm):
    """Print the text of every document the retriever returns for ``query``.

    Args:
        query: Search text passed to ``enseble_retriver.invoke``.
        enseble_retriver: Object whose ``invoke(query)`` returns documents exposing
            ``page_content``.
        llm: Not used by this function.

    Returns:
        None. The page contents are printed, separated by blank lines.
    """
    passages = []
    for hit in enseble_retriver.invoke(query):
        passages.append(hit.page_content)

    print("\n\n".join(passages))

# Sample retrieved content

In [18]:
# NOTE: `ensemble_retriver` is created in the "getting data" cell; run that cell first.
query = "what is machine learning"
# `llm` is not defined in any visible cell, so the original call fails on a fresh kernel.
# hybrid_search() ignores its third argument; pass the Cohere client to keep the call valid.
hybrid_search(query, ensemble_retriver, cohere_client)

such as Platt scaling exist to use SVM in a probabilistic classification setting. In addition to performing linear classification, SVMs can efficiently perform a non-linear classification using what is called the kernel trick, implicitly mapping their inputs into high-dimensional feature spaces.

See also[edit]
Automated machine learning – Process of automating the application of machine learning
Big data – Extremely large or complex datasets
Deep learning — branch of ML concerned with artificial neural networks
Differentiable programming – Programming paradigm
List of datasets for machine-learning research
M-theory (learning framework)
Machine unlearning
References[edit]

than defining the field in cognitive terms. This follows Alan Turing's proposal in his paper "Computing Machinery and Intelligence", in which the question "Can machines think?" is replaced with the question "Can machines do what we (as thinking entities) can do?".[19]

^ "What is Machine Learning?". IBM. 22 September

# getting data

In [27]:
# Source page that is downloaded and indexed for the rest of the notebook.
url = "https://en.wikipedia.org/wiki/Machine_learning"
# Fetch the page and split it into chunks (see load_data).
chunks = load_data(url)

# Build the combined BM25 + FAISS retriever over those chunks (see create_retrivers).
ensemble_retriver = create_retrivers(chunks)
# Bare expression: displays the retriever's repr as the cell output.
ensemble_retriver

<All keys matched successfully>


EnsembleRetriever(retrievers=[BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x0000020BB9E10C10>, k=5), VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x0000020BB9E10460>, search_kwargs={'k': 5})], weights=[0.5, 0.5])

In [37]:
# Show only the first few chunks; displaying the whole list floods the notebook output.
chunks[:3]

[Document(metadata={'source': 'https://en.wikipedia.org/wiki/Machine_learning', 'title': 'Machine learning - Wikipedia', 'language': 'en'}, page_content='Machine learning - Wikipedia\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJump to content\n\n\n\n\n\n\n\nMain menu\n\n\n\n\n\nMain menu\nmove to sidebar\nhide\n\n\n\n\t\tNavigation\n\t\n\n\nMain pageContentsCurrent eventsRandom articleAbout WikipediaContact us\n\n\n\n\n\n\t\tContribute\n\t\n\n\nHelpLearn to editCommunity portalRecent changesUpload fileSpecial pages\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nAppearance\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nDonate\n\nCreate account\n\nLog in\n\n\n\n\n\n\n\n\nPersonal tools\n\n\n\n\n\nDonate Create account Log in'),
 Document(metadata={'source': 'https://en.wikipedia.org/wiki/Machine_learning', 'title': 'Machine learning - Wikipedia', 'language': 'en'}, page_content='Donate Create account Log in\n\n\n\n\n\

In [38]:
# Print a small sample of chunk texts instead of every chunk of the page, which would
# bury the rest of the notebook under pages of output.
PREVIEW_COUNT = 5  # raise this to inspect more chunks
for doc in chunks[:PREVIEW_COUNT]:
    print(doc.page_content)  # text content of the chunk
    print("-" * 80)  # separator line between chunks


Machine learning - Wikipedia



























Jump to content







Main menu





Main menu
move to sidebar
hide



		Navigation
	


Main pageContentsCurrent eventsRandom articleAbout WikipediaContact us





		Contribute
	


HelpLearn to editCommunity portalRecent changesUpload fileSpecial pages



















Search











Search






















Appearance
















Donate

Create account

Log in








Personal tools





Donate Create account Log in
--------------------------------------------------------------------------------
Donate Create account Log in





		Pages for logged out editors learn more



ContributionsTalk




























Contents
move to sidebar
hide




(Top)





1
History








2
Relationships to other fields




Toggle Relationships to other fields subsection





2.1
Artificial intelligence








2.2
Data compression








2.3
Data mining








2.4
Generalization








2.5
Statistics








2.6
Statisti

# Hybrid search

In [35]:
def hybrid_search_rag(query, ensemble_retriver, llm):
    """Answer ``query`` from hybrid-retrieved context and explain the retrieval approach.

    Args:
        query: Question text; sent to the retriever and substituted into both prompts.
        ensemble_retriver: Object whose ``invoke(query)`` returns documents exposing
            ``page_content``.
        llm: Not used by this function; generation goes through ``safe_llm_call``.

    Returns:
        dict with the keys ``"query"``, ``"final_answer"``,
        ``"hybrid_search_explanation"`` and ``"retrieved_context"`` (the retrieved
        page contents joined by blank lines).
    """
    # 1. Retrieve and flatten the supporting passages.
    hits = ensemble_retriver.invoke(query)
    joined_context = "\n\n".join(hit.page_content for hit in hits)

    # 2. Ask for an answer grounded in the retrieved passages.
    answer_template_text = (
        "You are an AI assistant tasked with answering questions based on the provided context. "
        "The context contains information retrieved using a hybrid search method combining keyword-based and semantic search. "
        "Please provide a comprehensive answer to the question, using the context when relevant "
        "and your general knowledge when necessary.\n\n"
        "Context:\n{context}\n\n"
        "Question: {query}\n"
        "Answer:"
    )
    answer_template = PromptTemplate.from_template(answer_template_text)
    answer_text = safe_llm_call(answer_template, context=joined_context, query=query)

    # 3. Ask for a narrative on why hybrid retrieval may have helped (query only, no context).
    rationale_template_text = (
        "Explain how the hybrid search process, combining keyword-based and semantic search, "
        "might have improved the retrieval of relevant information for answering the given query. "
        "Consider the potential benefits of this approach compared to using only one search method.\n\n"
        "Query: {query}\n"
        "Explanation:"
    )
    rationale_template = PromptTemplate.from_template(rationale_template_text)
    rationale_text = safe_llm_call(rationale_template, query=query)

    return dict(
        query=query,
        final_answer=answer_text,
        hybrid_search_explanation=rationale_text,
        retrieved_context=joined_context,
    )

In [36]:
queries = [
    "What are the main applications of machine learning in healthcare?",
    "Explain the concept of machine learning and its relationship to AI.",
    "Discuss the ethical implications of ML in decision-making processes.",
]

# Run Hybrid-search RAG for each query.
# `llm` is not defined in any visible cell, so the original call fails on a fresh kernel;
# hybrid_search_rag() ignores that argument, so pass the Cohere client instead.
for query in queries:
    print(f"\nQuery: {query}")
    result = hybrid_search_rag(query, ensemble_retriver, cohere_client)
    print("Final Answer:")
    print(result["final_answer"])
    print("\nHybrid Search Explanation:")
    print(result["hybrid_search_explanation"])
    print("\nRetrieved Context (first 300 characters):")
    print(result["retrieved_context"][:300] + "...")


Query: What are the main applications of machine learning in healthcare?
Final Answer:
 Machine learning has the potential to provide healthcare professionals with additional tools for diagnosing, treating, and planning recovery paths for patients. Some of the main applications include:

1. Diagnostic imaging - ML can help analyze images like X-rays, MRIs, and CT scans, potentially identifying abnormalities and assisting in diagnosis. 

2. Disease diagnosis and prediction - ML algorithms can analyze patient data, such as symptoms, medical history, and lab results, to assist in diagnosing diseases or predicting illness. 

3. Drug design - ML techniques can contribute to discovering and designing new drugs by analyzing large datasets of chemical structures and biological data. 

4. Clinical decision support - ML can provide clinicians with predictive models and risk assessments to assist in making informed decisions regarding patient care. 

5. Patient monitoring and remote care - ML al

In [39]:
queries = [
    "What is data mining?",
    "Explain the concept of Dimensionality reduction machine learning .",
]

# Run Hybrid-search RAG for each query; the hybrid-search explanation is intentionally
# not printed in this cell.
# `llm` is not defined in any visible cell, so the original call fails on a fresh kernel;
# hybrid_search_rag() ignores that argument, so pass the Cohere client instead.
for query in queries:
    print(f"\nQuery: {query}")
    result = hybrid_search_rag(query, ensemble_retriver, cohere_client)
    print("Final Answer:")
    print(result["final_answer"])
    print("\nRetrieved Context (first 300 characters):")
    print(result["retrieved_context"][:300] + "...")


Query: What is data mining?
Final Answer:
 Data mining is a field of study focused on exploratory data analysis through unsupervised learning techniques. It is used to discover unknown properties in data and can be considered as the analysis step of knowledge discovery in databases. This differentiates it from machine learning, which is typically focused on prediction based on known properties learned from training data. Data mining uses many machine learning methods but with different goals. Machine learning also uses data mining methods in areas such as unsupervised learning or as a pre-processing step to improve learner accuracy.

Hopefully, this answer is helpful for you! 

Retrieved Context (first 300 characters):
Machine learning and data mining often employ the same methods and overlap significantly, but while machine learning focuses on prediction, based on known properties learned from the training data, data mining focuses on the discovery of (previously) unknown properties 