### Section 1: Import Required Libraries and Modules

In [None]:
# Import Milvus utilities for connections, schema creation, and managing collections.
from pymilvus import connections, utility, Collection, CollectionSchema, FieldSchema, DataType

# Import HuggingFace embeddings to convert text into vector representations.
from langchain_community.embeddings import HuggingFaceEmbeddings  # Adjust if necessary.

# Import parser to ensure the model outputs are formatted as strings.
from langchain_core.output_parsers import StrOutputParser

# Import PromptTemplate to structure and format prompts for the language model.
from langchain_core.prompts import PromptTemplate

# Import RunnablePassthrough to allow passing data unchanged through multiple steps.
from langchain_core.runnables import RunnablePassthrough

# Import Milvus to interact with the vector database for storage and retrieval.
from langchain_milvus import Milvus

# Import hybrid search retriever from Milvus, combining dense and sparse search techniques.
from langchain_milvus.retrievers import MilvusCollectionHybridSearchRetriever

# Import BM25 for sparse embedding, typically used for keyword-based search.
from langchain_milvus.utils.sparse import BM25SparseEmbedding

# Import ChatMistralAI to enable conversational interaction with the Mistral AI model.
from langchain_mistralai.chat_models import ChatMistralAI

# Import BeautifulSoup to parse and extract text content from HTML web pages.
from bs4 import BeautifulSoup

# Import requests to make HTTP requests for web scraping or API calls.
import requests

# Import nltk (Natural Language Toolkit) for tokenizing text into sentences.
import nltk

# Import os for interacting with the operating system, such as handling environment variables.
import os

# Import urljoin to join relative and absolute URLs for web scraping.
from urllib.parse import urljoin

# Import load_dotenv to load environment variables from a .env file into the program.
from dotenv import load_dotenv

### Section 2: Load Environment Variables and Setup NLTK
* Load environment variables from a .env file to access configuration
* Download the 'punkt' tokenizer, which allows nltk to split text into sentences.


In [None]:

# Load environment variables from a local .env file (if present) so later
# cells can read configuration such as API keys through os.getenv().
load_dotenv()

# Fetch the sentence-tokenizer data used by nltk.sent_tokenize().
# NLTK 3.8.2 and later load the "punkt_tab" resource instead of the legacy
# pickled "punkt" models, so both are requested to keep the notebook working
# across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')


### Section 3: Configure Mistral API Key
* Retrieve the Mistral API key from the environment variables using os.getenv().
* Set the Mistral API key as an environment variable so it can be accessed throughout the code.

In [None]:

# Retrieve the Mistral API key from the environment and fail fast with an
# actionable message when it is missing. Without this check, assigning None
# to os.environ raises an opaque "TypeError: str expected, not NoneType".
mistral_api_key = os.getenv("MISTRAL_API_KEY")
if not mistral_api_key:
    raise RuntimeError(
        "MISTRAL_API_KEY is not set. Add it to your .env file or export it "
        "in your shell before running this notebook."
    )

# Re-export the key so code that reads it from the environment can find it.
os.environ["MISTRAL_API_KEY"] = mistral_api_key


### Section 4: Connect to Milvus and Setup Collection Schema

* Establish a connection to Milvus, using localhost as the host and the default port 19530.
* Check if the collection "my_collection" already exists in Milvus and, if it does, drop it so the notebook starts from a clean state.
* Define fields for the Milvus collection schema. The ID field acts as the primary key.
* Create a collection schema using the defined fields.
* Initialize the collection with the name "my_collection" and the created schema.


In [None]:

# Connect to the local Milvus server and (re)create the demo collection.
connections.connect(alias="default", host="localhost", port="19530")

# Start from a clean slate so re-running the notebook does not reuse a
# collection left over from a previous run.
if utility.has_collection("my_collection"):
    utility.drop_collection("my_collection")

# The vector dimension must match the embedding model used later in this
# notebook: sentence-transformers/all-MiniLM-L6-v2 produces 384-dimensional
# vectors, so a 768-dimensional field would reject every insert.
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=384)
]
schema = CollectionSchema(fields, description="My Collection for Embeddings")
collection = Collection(name="my_collection", schema=schema)


### Section 5: Generate Embeddings and Insert Data into Milvus
* Initialize the HuggingFace embeddings model, which generates vector representations from text, and assign it to `embeddings`.

* Define some sample data to insert into the Milvus collection and create a list of IDs for the data points, assigned to `ids`.

* Generate embeddings for three sample sentences and assign them to `vectors`.
* Insert the IDs and corresponding vector embeddings into the Milvus collection.

In [None]:

# Build the dense embedding model, encode a few sample sentences, and store
# the resulting vectors in Milvus under explicit integer primary keys.
sample_texts = ["Hello world", "How are you?", "Goodbye"]
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
vectors = embeddings.embed_documents(sample_texts)
ids = list(range(1, len(sample_texts) + 1))  # [1, 2, 3]: one id per sentence

# Column-ordered insert: the lists follow the schema's field order
# (id, embedding).
collection.insert([ids, vectors])


### Section 6: Load the Mistral Chat Model
* Load the ChatMistralAI model using the Mistral API key for authorization

In [None]:

# Load ChatMistralAI model
# Instantiate the Mistral chat client, authorizing it with the key that was
# read from the MISTRAL_API_KEY environment variable earlier in the notebook.
# NOTE(review): chat_model is not referenced by any later cell visible here;
# confirm whether it is meant to be wired into a chain with the prompt.
chat_model = ChatMistralAI(api_key=mistral_api_key)


### Section 7: Configure the Milvus Hybrid Search Retriever
* Set up a hybrid search retriever using the Milvus collection and various embeddings.
* `collection=collection`   Specify the Milvus collection to search.
* `dense_embedding=embeddings`  Use HuggingFace dense embeddings for semantic search.
* `sparse_embedding=BM25SparseEmbedding()`  Use BM25 for sparse, keyword-based search.
* `top_k=2`  Return the top 2 most relevant documents.


In [None]:

# Configure Milvus hybrid search retriever
# Builds a retriever over `collection` that is meant to combine the dense
# HuggingFace embeddings with a BM25 sparse embedding and return the top 2
# hits.
#
# NOTE(review): check these keyword arguments against the installed
# langchain_milvus version. The published MilvusCollectionHybridSearchRetriever
# API appears to expect `anns_fields`, `field_embeddings`, and `rerank` rather
# than `dense_embedding` / `sparse_embedding`, and BM25SparseEmbedding appears
# to require a `corpus` argument -- TODO confirm.
# NOTE(review): the collection schema defined earlier contains only `id` and
# `embedding`; a hybrid retriever presumably also needs a sparse-vector field
# and a text field to populate `page_content` -- TODO confirm.
retriever = MilvusCollectionHybridSearchRetriever(
    collection=collection,
    dense_embedding=embeddings,
    sparse_embedding=BM25SparseEmbedding(),
    top_k=2
)


### Section 8: Create a Prompt Template and Output Parser

* Define a prompt template to structure user queries for the language model
* `input_variables=["question"]` Specify the expected input variable.
* `template="Answer the following question: {question}"` Format of the prompt.
* Create an output parser to ensure the model response is converted to a string. `output_parser = StrOutputParser()`



In [None]:

# Parser that reduces the model reply to a plain string, plus the prompt that
# wraps the user's question.
output_parser = StrOutputParser()

# from_template() infers the input variables ("question") from the
# placeholders in the template string.
prompt = PromptTemplate.from_template("Answer the following question: {question}")


### Section 9: Set Up a RunnablePassthrough
* Create a passthrough to pass data unchanged through different steps of the workflow

In [None]:

# Create passthrough for data routing in the workflow
# A RunnablePassthrough forwards its input unchanged.
# NOTE(review): `passthrough` is not used by any later cell visible here;
# confirm whether it is meant to be composed into a chain.
passthrough = RunnablePassthrough()


### Section 10: Define a Web Scraping Function
* Define a function to scrape text from a given website URL.

* Send an HTTP GET request to the specified URL.
* Parse the HTML content using BeautifulSoup
* Extract and join all visible text from the HTML.
* Tokenize the extracted text into individual sentences using nltk.
* Return the list of sentences extracted from the webpage.


In [None]:

# Define a function to scrape a website
def scrape_website(url, timeout=10):
    """Fetch a web page and return its visible text split into sentences.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests.get() can block indefinitely.

    Returns:
        A list of sentence strings produced by nltk.sent_tokenize(); empty
        when the page contains no text.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with a 4xx/5xx status code.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of tokenizing an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    # stripped_strings also yields the contents of <script> and <style> tags,
    # which are not visible page text, so remove those elements first.
    for hidden in soup(['script', 'style']):
        hidden.decompose()

    text = ' '.join(soup.stripped_strings)
    return nltk.sent_tokenize(text)


### Section 11: Scrape a Website and Insert Data into Milvus

* Define a website URL for scraping (example URL).
* Scrape the website to get a list of sentences
* Generate embeddings for the scraped sentences
* Insert sequentially generated integer IDs and the sentence embeddings into the Milvus collection (the sentence text itself is not stored).



In [None]:

# Example URL to scrape
website_url = "https://example.com"
sentences = scrape_website(website_url)
sentence_embeddings = embeddings.embed_documents(sentences)

# Primary keys must not collide with the sample rows inserted earlier:
# numbering from 0 would reuse ids that are already taken. Continue after the
# highest id already used instead.
first_sentence_id = max(ids) + 1
sentence_ids = list(range(first_sentence_id, first_sentence_id + len(sentences)))

# Skip the insert when the page yielded no sentences; there is nothing to
# store in that case.
if sentences:
    collection.insert([sentence_ids, sentence_embeddings])


### Section 12: Perform a Hybrid Search and Display Results
* Define a user query to search the collection
* Use the retriever to get the most relevant documents based on the query
* Iterate through the retrieved results and print their content. 

In [None]:

# Perform a hybrid search query and display results
query = "What is this website about?"

# Retrievers are Runnables: invoke() is the supported entry point, while
# get_relevant_documents() is deprecated in langchain-core.
results = retriever.invoke(query)
for result in results:
    print(result.page_content)
