In [1]:
!pip install pymupdf nltk sentence-transformers scikit-learn pymilvus openai rank_bm25


Collecting pymupdf
  Downloading PyMuPDF-1.24.9-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting pymilvus
  Downloading pymilvus-2.4.4-py3-none-any.whl.metadata (5.4 kB)
Collecting openai
  Downloading openai-1.37.0-py3-none-any.whl.metadata (22 kB)
Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting PyMuPDFb==1.24.9 (from pymupdf)
  Downloading PyMuPDFb-1.24.9-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.4 kB)
Collecting grpcio<=1.63.0,>=1.49.1 (from pymilvus)
  Downloading grpcio-1.63.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting environs<=9.5.0 (from pymilvus)
  Downloading environs-9.5.0-py2.py3-none-any.whl.metadata (14 kB)
Collecting ujson>=2.0.0 (from pymilvus)
  Downloading ujson-5.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.

In [None]:
import fitz  # PyMuPDF
import os

def extract_text_from_pdf(pdf_path):
    document = fitz.open(pdf_path)
    text = ""
    for page_num in range(len(document)):
        page = document.load_page(page_num)
        text += page.get_text()
    return text

# Directory containing PDF files
pdf_dir = '/content'

# Extract text from each PDF in the directory
all_texts = []
for pdf_file in os.listdir(pdf_dir):
    if pdf_file.endswith('.pdf'):
        pdf_path = os.path.join(pdf_dir, pdf_file)
        text = extract_text_from_pdf(pdf_path)
        all_texts.append(text)

In [3]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from sentence_transformers import SentenceTransformer

nltk.download('punkt')

model = SentenceTransformer('all-MiniLM-L6-v2')

def chunk_text(text, chunk_size=100):
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_chunk_size = 0
    for sentence in sentences:
        tokens = word_tokenize(sentence)
        if current_chunk_size + len(tokens) <= chunk_size:
            current_chunk.extend(tokens)
            current_chunk_size += len(tokens)
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = tokens
            current_chunk_size = len(tokens)
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

def embed_chunks(chunks):
    return model.encode(chunks, convert_to_tensor=True)


  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [7]:
from sklearn.mixture import GaussianMixture

def cluster_chunks(embeddings, n_clusters=10):
    gmm = GaussianMixture(n_components=n_clusters, covariance_type='full')
    gmm.fit(embeddings)
    cluster_assignments = gmm.predict_proba(embeddings)
    return cluster_assignments, gmm


In [4]:
import openai

openai.api_key = 'sk-proj-PlMfNQ6zLfp1VHT4pwb1T3BlbkFJq2egqZZWGgEYm4r0qfDh'  # Replace with your actual OpenAI API key

def summarize_clusters(cluster_texts):
    summaries = []
    for texts in cluster_texts:
        response = openai.Completion.create(
            engine="text-davinci-003",
            prompt=f"Summarize the following texts:\n{texts}",
            max_tokens=150
        )
        summary = response['choices'][0]['text'].strip()
        summaries.append(summary)
    return summaries


In [6]:
def recursive_clustering_and_summarization(texts, level=0, max_levels=3, n_clusters=10):
    if level >= max_levels:
        return texts

    chunks = [chunk_text(text) for text in texts]
    flat_chunks = [item for sublist in chunks for item in sublist]
    embeddings = embed_chunks(flat_chunks)

    cluster_assignments, gmm = cluster_chunks(embeddings, n_clusters=n_clusters)

    cluster_texts = [[] for _ in range(n_clusters)]
    for i, assignment in enumerate(cluster_assignments):
        cluster_texts[np.argmax(assignment)].append(flat_chunks[i])

    summaries = summarize_clusters([" ".join(text) for text in cluster_texts])

    return recursive_clustering_and_summarization(summaries, level + 1, max_levels, n_clusters)


In [None]:
from pymilvus import (
    connections, FieldSchema, CollectionSchema, DataType, Collection
)

def create_milvus_collection(collection_name):
    connections.connect("default", host="your-milvus-host", port="19530")

    fields = [
        FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=384),
        FieldSchema(name="metadata", dtype=DataType.VARCHAR, max_length=65535)
    ]
    schema = CollectionSchema(fields, description="RAPTOR index collection")
    collection = Collection(name=collection_name, schema=schema)
    collection.create_index(field_name="embedding", index_params={
        "metric_type": "L2",
        "index_type": "IVF_FLAT",
        "params": {"nlist": 128},
    })
    return collection

def insert_data_to_milvus(collection, embeddings, metadata):
    entities = [
        embeddings.tolist(),
        metadata
    ]
    collection.insert(entities)
    collection.load()

def search_milvus(collection, query_embedding, top_k=10):
    search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
    results = collection.search(
        [query_embedding],
        "embedding",
        param=search_params,
        limit=top_k,
        output_fields=["metadata"]
    )
    return results


In [None]:
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize

def bm25_retrieval(corpus, query, top_k=10):
    tokenized_corpus = [word_tokenize(doc.lower()) for doc in corpus]
    bm25 = BM25Okapi(tokenized_corpus)
    tokenized_query = word_tokenize(query.lower())
    scores = bm25.get_scores(tokenized_query)
    ranked_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]
    return ranked_indices, scores


In [None]:
def hybrid_retrieval(collection, corpus, query, top_k=10):
    # Step 1: BM25 Retrieval
    bm25_indices, bm25_scores = bm25_retrieval(corpus, query, top_k=top_k)
    bm25_results = [(corpus[i], bm25_scores[i]) for i in bm25_indices]

    # Step 2: BERT-based Retrieval
    query_embedding = embed_chunks([query])[0]
    milvus_results = search_milvus(collection, query_embedding, top_k=top_k)

    # Step 3: Combine Results
    combined_results = bm25_results + [(result.entity.get('metadata'), result.distance) for result in milvus_results[0]]

    # Re-rank based on combined scores (e.g., weighted sum or another fusion method)
    combined_results = sorted(combined_results, key=lambda x: x[1], reverse=True)[:top_k]
    return combined_results


In [None]:

pdf_paths = list(uploaded.keys())

collection_name = "textbook_collection"
collection = create_milvus_collection(collection_name)

all_texts = []
for pdf_path in pdf_paths:
    text = extract_text_from_pdf(pdf_path)
    all_texts.append(text)

summarized_texts = recursive_clustering_and_summarization(all_texts)

chunks = [chunk_text(text) for text in summarized_texts]
flat_chunks = [item for sublist in chunks for item in sublist]
embeddings = embed_chunks(flat_chunks)
metadata = [{"textbook": "summary", "chunk": chunk} for chunk in flat_chunks]

insert_data_to_milvus(collection, embeddings, metadata)


In [None]:
query = "what is attention is all you need?"
corpus = flat_chunks
results = hybrid_retrieval(collection, corpus, query)




In [None]:
print(results)

Attention is a powerful tool in NLP, but it is not the only thing you need to build a successful model. While attention mechanisms like the one described in the passage can help the model focus on relevant parts of the input, they do not address other important aspects of language processing, such as syntax and semantics. To build a truly robust NLP model, you will need to incorporate a variety of techniques, including attention, as well as other types of neural network layers and traditional NLP methods.


In [None]:
query = "what is YOLO?"
corpus = flat_chunks
results = hybrid_retrieval(collection, corpus, query)



In [None]:
print(results)

YOLO is a real-time object detection system that uses a single neural network to predict bounding boxes and class probabilities directly from full images. It is simple, fast, and achieves high performance on object detection tasks. YOLO is trained on full images and directly optimizes detection performance, making it different from traditional object detection methods. It is also shown to be effective in detecting objects in artwork, where other methods struggle.
