In [2]:
from sentence_transformers import SentenceTransformer
import numpy as np
from typing import List, Tuple

# Sample corpus: ten single-sentence passages on unrelated topics.
# Long literals are wrapped with implicit string concatenation; the
# resulting strings are unchanged.
documents = [
    (
        "The process of photosynthesis in plants converts light energy "
        "into chemical energy."
    ),
    (
        "Quantum mechanics is a fundamental theory in physics that provides "
        "a description of the physical properties of nature at the scale of "
        "atoms and subatomic particles."
    ),
    (
        "The American Civil War was fought from 1861 to 1865 between the "
        "Union and the Confederacy."
    ),
    (
        "Machine learning is a subset of artificial intelligence that "
        "focuses on the development of algorithms that can learn from and "
        "make predictions or decisions based on data."
    ),
    (
        "The human digestive system breaks down food into nutrients that "
        "can be absorbed by the body."
    ),
    (
        "Climate change refers to long-term shifts in global weather "
        "patterns and average temperatures."
    ),
    (
        "The Renaissance was a period of cultural, artistic, political, and "
        "economic revival following the Middle Ages."
    ),
    (
        "DNA, or deoxyribonucleic acid, is a molecule composed of two chains "
        "that coil around each other to form a double helix carrying "
        "genetic instructions."
    ),
    (
        "The theory of relativity, proposed by Albert Einstein, describes "
        "the relationship between space and time."
    ),
    (
        "Artificial neural networks are computing systems inspired by the "
        "biological neural networks that constitute animal brains."
    ),
]

# Embed a batch of texts with the supplied encoder
def generate_embeddings(texts: List[str], model: SentenceTransformer) -> np.ndarray:
    """Encode ``texts`` with ``model`` and return the encoder's output.

    Args:
        texts: The strings to embed.
        model: Encoder whose ``encode`` method is called with ``texts``.

    Returns:
        Whatever ``model.encode(texts)`` returns, unmodified. Annotated as
        an ``ndarray`` (presumably one row per input text; confirm against
        the encoder's documentation).
    """
    vectors = model.encode(texts)
    return vectors

# Function to perform semantic search
def semantic_search(query: str, documents: List[str], embeddings: np.ndarray, model: SentenceTransformer, top_k: int = 3) -> List[Tuple[int, float]]:
    """Rank precomputed document embeddings against a query by dot product.

    Args:
        query: Text to search for; it is encoded with ``model.encode``.
        documents: The texts the embeddings were computed from. Not read
            by this function; kept so existing callers keep working.
        embeddings: Document embeddings, one row per document. A single
            1-D vector is treated as a corpus of one document.
        model: Encoder exposing ``encode``. NOTE(review): scores are only
            meaningful if this is the model that produced ``embeddings``.
        top_k: Maximum number of results. Values <= 0 yield no results.

    Returns:
        Up to ``top_k`` ``(row_index, score)`` pairs ordered from highest
        to lowest score, as plain ``int`` / ``float`` values. The score is
        the raw dot product; it equals cosine similarity only when both
        the document and query vectors are unit-normalised.
    """
    doc_matrix = np.atleast_2d(np.asarray(embeddings))
    if top_k <= 0 or doc_matrix.size == 0:
        return []

    # Flatten the (1, dim) query batch to (dim,) so the product below is
    # always 1-D. The previous squeeze() collapsed a one-document corpus to
    # a 0-d array, which could not be indexed afterwards.
    query_vector = np.asarray(model.encode([query])).reshape(-1)
    similarities = doc_matrix @ query_vector

    ranked = similarities.argsort()[::-1][:top_k]
    # Convert NumPy scalars so the result matches the declared return type.
    return [(int(i), float(similarities[i])) for i in ranked]

# Load the BGE sentence-embedding model
model = SentenceTransformer("BAAI/bge-small-en-v1.5")

# Embed every sample document once, up front, so each query reuses them
document_embeddings = generate_embeddings(texts=documents, model=model)

# Natural-language questions used to exercise the search
queries = [
    "How do plants produce energy?",
    "What is the basic principle of quantum physics?",
    "Tell me about machine learning and AI.",
    "What caused global warming?",
    "Explain the structure of DNA.",
]

# For each query, print it followed by its best-scoring documents
for query in queries:
    print(f"\nQuery: {query}")
    results = semantic_search(
        query=query,
        documents=documents,
        embeddings=document_embeddings,
        model=model,
    )
    # Each result pairs an index into `documents` with its similarity score
    for idx, score in results:
        print(f"Score: {score:.4f} - {documents[idx]}")

  from tqdm.autonotebook import tqdm, trange



Query: How do plants produce energy?
Score: 0.8446 - The process of photosynthesis in plants converts light energy into chemical energy.
Score: 0.6265 - The human digestive system breaks down food into nutrients that can be absorbed by the body.
Score: 0.5665 - DNA, or deoxyribonucleic acid, is a molecule composed of two chains that coil around each other to form a double helix carrying genetic instructions.

Query: What is the basic principle of quantum physics?
Score: 0.8054 - Quantum mechanics is a fundamental theory in physics that provides a description of the physical properties of nature at the scale of atoms and subatomic particles.
Score: 0.5768 - The theory of relativity, proposed by Albert Einstein, describes the relationship between space and time.
Score: 0.5477 - DNA, or deoxyribonucleic acid, is a molecule composed of two chains that coil around each other to form a double helix carrying genetic instructions.

Query: Tell me about machine learning and AI.
Score: 0.8553 -