In [1]:
# [CELL-1] Load the JSON file that stores the data scraped from the parent links and their corresponding sub-links (up to a depth of 5 levels)

import json
import pandas as pd

# Crawler output, addressed relative to the notebook's working directory.
file_path = "./nvidia_crawler/output.json"

# Pass the encoding explicitly: without it open() falls back to the platform's
# locale encoding, which can mis-decode (or fail on) non-ASCII page content.
with open(file_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# Tabular view of the loaded records; the preview printed below shows the
# url / title / content columns.
df = pd.DataFrame(data)
print(df.head())

                                                 url  \
0                      https://docs.nvidia.com/cuda/   
1  https://docs.nvidia.com/cuda/cuda-toolkit-rele...   
2  https://docs.nvidia.com/cuda/cuda-features-arc...   
3       https://docs.nvidia.com/cuda/eula/index.html   
4  https://docs.nvidia.com/cuda/cuda-quick-start-...   

                                               title  \
0                    CUDA Toolkit Documentation 12.5   
1                   CUDA 12.5 Update 1 Release Notes   
2                              CUDA Features Archive   
3                                               EULA   
4  1. Introduction — Quick Start Guide 12.5 docum...   

                                             content  
0  <body class="wy-body-for-nav"> \r\n  <div clas...  
1  <body class="wy-body-for-nav"> \r\n  <div clas...  
2  <body class="wy-body-for-nav"> \n  <div class=...  
3  <body class="wy-body-for-nav"> \n  <div class=...  
4  <body class="wy-body-for-nav"> \r\n  <div clas..

In [2]:
#[CELL-2] Converting the 'content' column from HTML format to plain text and updating the DataFrame

from bs4 import BeautifulSoup
def extract_text(html_content):
    """Return the text of an HTML document as a single string.

    Args:
        html_content: HTML markup to parse. ``None`` and float values (the
            ``NaN`` pandas uses for a missing cell) are treated as
            "no content".

    Returns:
        The document text, with every text fragment stripped and the
        fragments joined by single spaces, or ``""`` when there is no
        content to parse.
    """
    # A missing cell is not markup; handing it to the parser would abort the
    # conversion of the whole column, so map it to an empty text instead.
    if html_content is None or isinstance(html_content, float):
        return ""
    soup = BeautifulSoup(html_content, 'html.parser')
    return soup.get_text(separator=" ", strip=True)

# Overwrite each row's raw HTML in 'content' with its plain-text rendering.
# NOTE: this replaces the column in place, so the original HTML is no longer
# available from df after this cell has run.
df['content'] = df['content'].map(extract_text)

In [9]:
#[CELL-3] Data Chunking

import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering

# Sentence encoder bound to the global name `model`; advanced_chunk_text below
# reads it from the global scope to embed sentences.
model = SentenceTransformer('all-MiniLM-L6-v2')


def advanced_chunk_text(text, chunk_size=128):
    """Split ``text`` into chunks of consecutive, semantically similar sentences.

    Sentences are obtained by splitting on ``'. '``, embedded with the global
    ``model`` and grouped with agglomerative clustering. A new chunk starts
    whenever the cluster label changes between neighbouring sentences or the
    current chunk already holds ``chunk_size`` sentences.

    Args:
        text: The text to split.
        chunk_size: Maximum number of sentences (not characters or tokens)
            per chunk.

    Returns:
        A list of chunk strings. Text with fewer than two non-blank sentences
        is returned unchanged as a single-element list.
    """
    fragments = text.split('. ')
    last_index = len(fragments) - 1
    # split('. ') removes the full stop from every fragment except the last;
    # put it back so the chunks keep their sentence boundaries. Blank
    # fragments (e.g. from a trailing '. ') are dropped rather than embedded.
    sentences = [
        fragment if position == last_index else fragment + '.'
        for position, fragment in enumerate(fragments)
        if fragment.strip()
    ]
    if len(sentences) <= 1:
        return [text]

    embeddings = model.encode(sentences)
    clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=1.5).fit(embeddings)

    chunks = []
    current_chunk = []
    current_cluster = None

    for sentence, cluster in zip(sentences, clustering.labels_):
        # Flush only a non-empty chunk so that no empty string is ever
        # emitted, even for chunk_size values below 1.
        if current_chunk and (cluster != current_cluster or len(current_chunk) >= chunk_size):
            chunks.append(' '.join(current_chunk))
            current_chunk = []
        current_cluster = cluster
        current_chunk.append(sentence)

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

In [5]:
#[CELL-4] importing necessary items from pymilvus

from pymilvus import (
    connections,
    utility,
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
)

In [8]:
#[CELL-5] Starting the milvus server

from milvus import default_server

# Start the Milvus server object imported from `milvus`; the cells that follow
# connect to it.
try:
    default_server.start()
    print("Milvus server started successfully!")
except Exception as e:
    print(f"Failed to start Milvus server: {str(e)}")
    # Re-raise so the cell fails visibly and a "Run All" stops here. exit() is
    # a script/REPL helper and is not a reliable way to abort a notebook run.
    raise

Milvus server started successfully!


In [9]:
#[CELL-6] Connecting to the Milvus server using the host and the default port.

from pymilvus import connections, Collection, FieldSchema, CollectionSchema, DataType, utility, MilvusException
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# The server is started by this notebook (see the `default_server.start()`
# cell), so connect over loopback on the port that server object reports
# instead of a machine-specific LAN address that breaks on any other host.
MILVUS_HOST = "127.0.0.1"
MILVUS_PORT = default_server.listen_port

try:
    connections.connect(alias="default", host=MILVUS_HOST, port=MILVUS_PORT)
    logger.info("Connected to Milvus server.")
except MilvusException as e:
    logger.error(f"Failed to connect to Milvus server: {e}")
    # Every later cell needs this connection; stop here rather than failing
    # further down with a less obvious error.
    raise


INFO:__main__:Connected to Milvus server.


In [10]:
#[CELL-7] Name of the Milvus collection that the following cells create, index and query
collection_name = "new_docs_v2"


In [11]:
#[CELL-8] Defining Schema 
# Primary key; auto_id=True means the values are not supplied on insert.
id_field = FieldSchema(
    name="id",
    dtype=DataType.INT64,
    is_primary=True,
    auto_id=True,
)
# Vector column; dim must equal the length of the embeddings inserted later.
embedding_field = FieldSchema(
    name="embedding",
    dtype=DataType.FLOAT_VECTOR,
    dim=384,
)
# Source URL of the embedded text (at most 512 characters).
# NOTE: the text itself is not stored in the collection, only its URL.
url_field = FieldSchema(
    name="url",
    dtype=DataType.VARCHAR,
    max_length=512,
)

schema = CollectionSchema(
    description="Collection for CUDA documentation",
    fields=[id_field, embedding_field, url_field],
)

collection = Collection(schema=schema, name=collection_name)
logger.info(f"Collection '{collection_name}' created with schema: {schema}")



INFO:__main__:Collection 'new_docs_v2' created with schema: {'auto_id': True, 'description': 'Collection for CUDA documentation', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}, {'name': 'url', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 512}}], 'enable_dynamic_field': False}


In [15]:
#[CELL-9] Vector Database Creation and Inserting in the collection
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

# Rebinds the global `model` to an encoder built from the same checkpoint name
# used in the chunking cell; it is passed to insert_chunks_into_milvus below.
model = SentenceTransformer('all-MiniLM-L6-v2')

def insert_chunks_into_milvus(chunks, urls, collection, model, batch_size=50):
    """Embed text chunks and insert them, with their URLs, into a collection.

    Args:
        chunks: Texts to embed.
        urls: Source URL for each entry of ``chunks`` (same length and order).
        collection: Object whose ``insert`` method receives the rows.
        model: Encoder exposing ``encode(list_of_texts)``.
        batch_size: Number of chunks passed to each ``model.encode`` call.
            All rows are still sent to the collection in a single ``insert``.

    Returns:
        The value returned by ``collection.insert``, or ``None`` when there is
        nothing to insert or an error occurred (the error is logged).

    Raises:
        ValueError: If ``chunks`` and ``urls`` differ in length.
    """
    if len(chunks) != len(urls):
        # zip() would silently drop the unmatched tail, so rows would go
        # missing without any error being reported.
        raise ValueError(
            f"chunks and urls must have the same length (got {len(chunks)} and {len(urls)})"
        )
    if len(chunks) == 0:
        logger.warning("No chunks to insert into Milvus")
        return None

    try:
        entities = []
        for start in tqdm(range(0, len(chunks), batch_size)):
            batch_urls = urls[start:start + batch_size]
            batch_embeddings = model.encode(chunks[start:start + batch_size])

            entities.extend(
                {"embedding": embedding.tolist(), "url": url}
                for url, embedding in zip(batch_urls, batch_embeddings)
            )

        ids = collection.insert(entities)
        logger.info(f"Inserted {len(entities)} entities into Milvus")
        return ids
    except Exception as e:
        logger.error(f"Error inserting entities into Milvus: {str(e)}")
        return None

# One text and one URL per DataFrame row, kept in matching order.
# NOTE: the full 'content' of each row is embedded as-is here;
# advanced_chunk_text is not applied to it.
chunks, urls = df['content'].to_list(), df['url'].to_list()

inserted_ids = insert_chunks_into_milvus(
    chunks,
    urls,
    collection,
    model,
    batch_size=50,
)

# insert_chunks_into_milvus returns None on failure, so only report success.
if inserted_ids:
    print(f"Inserted IDs: {inserted_ids}")



INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cpu
  0%|          | 0/132 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

  1%|          | 1/132 [00:21<47:46, 21.88s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

  2%|▏         | 2/132 [00:35<36:43, 16.95s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

  2%|▏         | 3/132 [00:48<32:48, 15.26s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

  3%|▎         | 4/132 [01:01<30:43, 14.40s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

  4%|▍         | 5/132 [01:18<32:27, 15.33s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

  5%|▍         | 6/132 [01:32<30:50, 14.69s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

  5%|▌         | 7/132 [01:46<30:24, 14.59s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

  6%|▌         | 8/132 [01:59<29:15, 14.16s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

  7%|▋         | 9/132 [02:13<28:54, 14.10s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

  8%|▊         | 10/132 [02:26<28:03, 13.80s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

  8%|▊         | 11/132 [02:40<27:36, 13.69s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

  9%|▉         | 12/132 [02:54<27:35, 13.80s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 10%|▉         | 13/132 [03:08<27:40, 13.95s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 11%|█         | 14/132 [03:22<27:14, 13.85s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 11%|█▏        | 15/132 [03:35<26:30, 13.60s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 12%|█▏        | 16/132 [03:49<26:26, 13.68s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 13%|█▎        | 17/132 [04:03<26:23, 13.77s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 14%|█▎        | 18/132 [04:18<27:04, 14.25s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 14%|█▍        | 19/132 [04:31<25:57, 13.78s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 15%|█▌        | 20/132 [04:44<25:12, 13.50s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 16%|█▌        | 21/132 [05:03<28:19, 15.31s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 17%|█▋        | 22/132 [05:21<29:15, 15.96s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 17%|█▋        | 23/132 [05:49<35:34, 19.58s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 18%|█▊        | 24/132 [06:17<40:06, 22.29s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 19%|█▉        | 25/132 [06:35<37:11, 20.85s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 20%|█▉        | 26/132 [06:50<34:01, 19.26s/it]

Batches:   0%|          | 0/2 [00:01<?, ?it/s]

 20%|██        | 27/132 [07:04<30:53, 17.65s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 21%|██        | 28/132 [07:17<28:01, 16.17s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 22%|██▏       | 29/132 [07:28<25:14, 14.71s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 23%|██▎       | 30/132 [07:40<23:44, 13.96s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 23%|██▎       | 31/132 [07:53<22:56, 13.63s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 24%|██▍       | 32/132 [08:06<22:08, 13.28s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 25%|██▌       | 33/132 [08:20<22:12, 13.46s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 26%|██▌       | 34/132 [08:33<21:48, 13.35s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 27%|██▋       | 35/132 [08:50<23:28, 14.52s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 27%|██▋       | 36/132 [09:10<25:41, 16.05s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 28%|██▊       | 37/132 [09:28<26:32, 16.77s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 29%|██▉       | 38/132 [09:56<31:25, 20.06s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 30%|██▉       | 39/132 [10:20<33:09, 21.39s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 30%|███       | 40/132 [10:34<29:11, 19.04s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 31%|███       | 41/132 [11:00<32:18, 21.31s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 32%|███▏      | 42/132 [11:17<29:42, 19.81s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 33%|███▎      | 43/132 [11:29<25:53, 17.45s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 33%|███▎      | 44/132 [11:39<22:29, 15.34s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 34%|███▍      | 45/132 [11:50<20:27, 14.11s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 35%|███▍      | 46/132 [12:02<19:21, 13.51s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 36%|███▌      | 47/132 [12:14<18:20, 12.95s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 36%|███▋      | 48/132 [12:26<17:46, 12.70s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 37%|███▋      | 49/132 [12:44<19:34, 14.15s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 38%|███▊      | 50/132 [13:00<20:02, 14.67s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 39%|███▊      | 51/132 [13:11<18:41, 13.85s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 39%|███▉      | 52/132 [13:24<17:46, 13.33s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 40%|████      | 53/132 [13:37<17:38, 13.40s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 41%|████      | 54/132 [13:52<17:48, 13.70s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 42%|████▏     | 55/132 [14:05<17:17, 13.48s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 42%|████▏     | 56/132 [14:20<17:54, 14.14s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 43%|████▎     | 57/132 [14:38<18:57, 15.17s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 44%|████▍     | 58/132 [14:50<17:44, 14.38s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 45%|████▍     | 59/132 [15:04<17:11, 14.13s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 45%|████▌     | 60/132 [15:15<16:02, 13.37s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 46%|████▌     | 61/132 [15:27<15:18, 12.94s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 47%|████▋     | 62/132 [15:39<14:43, 12.62s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 48%|████▊     | 63/132 [15:54<15:16, 13.28s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 48%|████▊     | 64/132 [16:11<16:26, 14.51s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 49%|████▉     | 65/132 [16:23<15:20, 13.74s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 50%|█████     | 66/132 [16:39<15:34, 14.17s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 51%|█████     | 67/132 [16:58<17:07, 15.81s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 52%|█████▏    | 68/132 [17:16<17:24, 16.32s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 52%|█████▏    | 69/132 [17:28<15:49, 15.07s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 53%|█████▎    | 70/132 [17:42<15:09, 14.67s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 54%|█████▍    | 71/132 [17:54<14:20, 14.10s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 55%|█████▍    | 72/132 [18:07<13:30, 13.51s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 55%|█████▌    | 73/132 [18:19<12:59, 13.22s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 56%|█████▌    | 74/132 [18:31<12:26, 12.87s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 57%|█████▋    | 75/132 [18:43<12:01, 12.65s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 58%|█████▊    | 76/132 [18:56<11:46, 12.62s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 58%|█████▊    | 77/132 [19:09<11:39, 12.72s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 59%|█████▉    | 78/132 [19:23<11:44, 13.05s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 60%|█████▉    | 79/132 [19:37<11:54, 13.49s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 61%|██████    | 80/132 [19:54<12:32, 14.47s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 61%|██████▏   | 81/132 [20:12<13:20, 15.70s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 62%|██████▏   | 82/132 [20:31<13:50, 16.61s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 63%|██████▎   | 83/132 [20:43<12:28, 15.28s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 64%|██████▎   | 84/132 [20:55<11:24, 14.26s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 64%|██████▍   | 85/132 [21:07<10:42, 13.66s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 65%|██████▌   | 86/132 [21:20<10:06, 13.18s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 66%|██████▌   | 87/132 [21:32<09:39, 12.87s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 67%|██████▋   | 88/132 [21:44<09:13, 12.57s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 67%|██████▋   | 89/132 [21:57<09:13, 12.86s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 68%|██████▊   | 90/132 [22:22<11:32, 16.48s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 69%|██████▉   | 91/132 [22:36<10:43, 15.69s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 70%|██████▉   | 92/132 [22:48<09:45, 14.63s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 70%|███████   | 93/132 [23:00<09:03, 13.93s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 71%|███████   | 94/132 [23:14<08:49, 13.93s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 72%|███████▏  | 95/132 [23:31<09:08, 14.82s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 73%|███████▎  | 96/132 [23:53<10:04, 16.79s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 73%|███████▎  | 97/132 [24:08<09:36, 16.47s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 74%|███████▍  | 98/132 [24:20<08:33, 15.11s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 75%|███████▌  | 99/132 [24:32<07:49, 14.24s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 76%|███████▌  | 100/132 [24:44<07:14, 13.57s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 77%|███████▋  | 101/132 [24:58<07:03, 13.66s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 77%|███████▋  | 102/132 [25:11<06:43, 13.46s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 78%|███████▊  | 103/132 [25:23<06:17, 13.00s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 79%|███████▉  | 104/132 [25:36<05:58, 12.81s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 80%|███████▉  | 105/132 [26:02<07:36, 16.90s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 80%|████████  | 106/132 [26:16<06:58, 16.09s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 81%|████████  | 107/132 [26:28<06:11, 14.84s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 82%|████████▏ | 108/132 [26:41<05:42, 14.27s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 83%|████████▎ | 109/132 [26:55<05:25, 14.16s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 83%|████████▎ | 110/132 [27:08<05:00, 13.68s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 84%|████████▍ | 111/132 [27:24<05:02, 14.39s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 85%|████████▍ | 112/132 [27:42<05:10, 15.52s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 86%|████████▌ | 113/132 [28:02<05:24, 17.08s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 86%|████████▋ | 114/132 [28:14<04:39, 15.55s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 87%|████████▋ | 115/132 [28:26<04:06, 14.49s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 88%|████████▊ | 116/132 [28:38<03:38, 13.68s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 89%|████████▊ | 117/132 [28:50<03:17, 13.18s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 89%|████████▉ | 118/132 [29:02<02:59, 12.85s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 90%|█████████ | 119/132 [29:21<03:10, 14.66s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 91%|█████████ | 120/132 [29:35<02:51, 14.29s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 92%|█████████▏| 121/132 [29:46<02:29, 13.55s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 92%|█████████▏| 122/132 [29:59<02:11, 13.15s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 93%|█████████▎| 123/132 [30:12<01:59, 13.27s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 94%|█████████▍| 124/132 [30:24<01:43, 12.91s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 95%|█████████▍| 125/132 [30:36<01:28, 12.63s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 95%|█████████▌| 126/132 [30:48<01:14, 12.42s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 96%|█████████▌| 127/132 [31:00<01:00, 12.13s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 97%|█████████▋| 128/132 [31:14<00:51, 12.92s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 98%|█████████▊| 129/132 [31:36<00:46, 15.54s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 98%|█████████▊| 130/132 [31:55<00:32, 16.48s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 99%|█████████▉| 131/132 [32:07<00:15, 15.27s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 132/132 [32:18<00:00, 14.69s/it]
INFO:__main__:Inserted 6592 entities into Milvus


Inserted IDs: (insert count: 6592, delete count: 0, upsert count: 0, timestamp: 451199856348495875, success count: 6592, err count: 0, cost: 0)


In [14]:
#[CELL-10] Flushing the inserted data, building the vector index and loading the collection

from pymilvus import connections, Collection, CollectionSchema, FieldSchema, DataType, utility
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

if not utility.has_collection(collection_name):
    logger.error(f"Collection '{collection_name}' does not exist.")
else:
    collection = Collection(name=collection_name)

    # HNSW graph index over the 'embedding' field, compared with L2 distance.
    index_params = {
        "metric_type": "L2",
        "index_type": "HNSW",
        "params": {"M": 16, "efConstruction": 200},
    }

    # The three steps below are attempted in order. Each failure is logged and
    # the next step still runs, so check the log before relying on the result.

    # Step 1: flush the inserted data.
    try:
        collection.flush()
        logger.info(f"Data in collection '{collection_name}' flushed successfully.")
    except Exception as e:
        logger.error(f"Error flushing collection '{collection_name}': {str(e)}")

    # Step 2: build the vector index.
    try:
        collection.create_index(index_params=index_params, field_name="embedding")
        logger.info(f"Index created successfully for collection '{collection_name}'.")
    except Exception as e:
        logger.error(f"Error creating index for collection '{collection_name}': {str(e)}")

    # Step 3: load the collection so it can be searched.
    try:
        collection.load()
        logger.info(f"Collection '{collection_name}' loaded successfully.")
    except Exception as e:
        logger.error(f"Error loading collection '{collection_name}': {str(e)}")



INFO:__main__:Connected to Milvus server.
INFO:__main__:Data in collection 'new_docs_v2' flushed successfully.
INFO:__main__:Index created successfully for collection 'new_docs_v2'.
INFO:__main__:Collection 'new_docs_v2' loaded successfully.


In [41]:
#[CELL-11] Checking if the data is inserted or not in the index.

# Sanity check: read the entity count the collection reports and print it so it
# can be compared with the number of rows the insert cell logged.
num_entities = collection.num_entities
print(f"Number of entities in collection: {num_entities}")


Number of entities in collection: 6592


In [58]:
#[CELL-12] Retrieval and Re-ranking

from concurrent.futures import ThreadPoolExecutor
from transformers import pipeline, DistilBertTokenizer, DistilBertForQuestionAnswering
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
from pymilvus import Collection, connections, utility

# Handle on the collection that holds the embeddings.
collection_name = "new_docs_v2"
collection = Collection(name=collection_name)

# Same HNSW / L2 settings as the index-creation cell.
index_params = dict(
    index_type="HNSW",
    params=dict(M=16, efConstruction=200),
    metric_type="L2",
)
collection.create_index(index_params=index_params, field_name="embedding")

# Encoder used for the query and for re-scoring the candidates.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Lexical (BM25) index over the whitespace-separated tokens of every row's
# 'content'.
chunks = df['content'].tolist()
tokenized_chunks = [text.split() for text in chunks]
bm25 = BM25Okapi(tokenized_chunks)

def retrieve_and_rerank(query, k=3):
    """Retrieve candidate texts for ``query`` and order them by similarity.

    Candidates come from two sources: the ``k`` highest-scoring BM25 matches
    over the global ``chunks`` and the ``k`` nearest vectors returned by the
    global Milvus ``collection``. The merged, de-duplicated candidates are
    re-scored by the dot product between their embedding and the query
    embedding (both produced by the global ``model``).

    Args:
        query: The question or search string.
        k: Number of candidates requested from each of the two retrievers.

    Returns:
        A list of candidate texts, highest similarity first. It holds at most
        ``2 * k`` entries and contains no duplicates.
    """
    bm25_scores = bm25.get_scores(query.split())
    bm25_ranked_indices = sorted(
        range(len(bm25_scores)), key=lambda i: bm25_scores[i], reverse=True
    )[:k]
    bm25_results = [chunks[i] for i in bm25_ranked_indices]

    query_embedding = model.encode(query)

    # The collection is indexed with HNSW, whose search-time parameter is `ef`
    # (it must be at least `limit`); `nprobe` belongs to the IVF index family.
    search_params = {"metric_type": "L2", "params": {"ef": max(64, k)}}
    results = collection.search(
        [query_embedding.tolist()],
        "embedding",
        param=search_params,
        limit=k,
        output_fields=["url"],
    )

    # The collection stores only the URL of each row, not its text. Map every
    # hit back to the row's content so that all candidates are texts; mixing
    # bare URLs in would have them embedded and used as answer context.
    url_to_text = dict(zip(df['url'], df['content']))
    milvus_results = []
    for hit in results[0]:
        text = url_to_text.get(hit.entity.get("url"))
        if text is not None:
            milvus_results.append(text)

    # dict.fromkeys de-duplicates while keeping first-seen order, which makes
    # the ordering of equally scored candidates deterministic (a set does not).
    combined_results = list(dict.fromkeys(bm25_results + milvus_results))

    # Encode all candidates in one call instead of one call per candidate.
    combined_embeddings = model.encode(combined_results)
    similarity_scores = [float(query_embedding @ embedding) for embedding in combined_embeddings]
    reranked_indices = sorted(
        range(len(similarity_scores)), key=lambda i: similarity_scores[i], reverse=True
    )
    return [combined_results[i] for i in reranked_indices]




INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cpu


In [59]:
#[CELL-13] Question Answering

# Hugging Face checkpoint used for extractive question answering.
qa_model_name = 'distilbert-base-cased-distilled-squad'
# Load the model and its tokenizer from the same checkpoint, via the single
# qa_model_name constant, so the two cannot drift apart when the name changes.
qa_pipeline = pipeline('question-answering', model=qa_model_name, tokenizer=qa_model_name)

def retrieve_and_answer_questions(query, k=3):
    """Answer ``query`` once per retrieved context.

    Args:
        query: The question to answer.
        k: Forwarded to ``retrieve_and_rerank`` as the number of candidates
            requested from each retriever.

    Returns:
        A list of answer strings, one per non-blank context, in the order the
        contexts were ranked by ``retrieve_and_rerank``.
    """
    answers = []
    for context in retrieve_and_rerank(query, k=k):
        # The question-answering pipeline rejects an empty context; skip such
        # candidates instead of letting one bad row abort the whole query.
        if not context or not context.strip():
            continue
        answers.append(qa_pipeline(question=query, context=context)['answer'])
    return answers


In [60]:
# #[CELL-14] Sample Question Answering
# Run the full retrieve -> re-rank -> answer pipeline for one example question
# and print the answers as a 1-based numbered list.
query = "how to install cuda?"
answers = retrieve_and_answer_questions(query)
print("Answers to '{}' based on retrieved documents:".format(query))
for position, answer in enumerate(answers, start=1):
    print("{}. {}".format(position, answer))


Batches:   0%|          | 0/1 [00:00<?, ?it/s]