In [1]:
# imports
import os
import glob
from dotenv import load_dotenv
import numpy as np
import pandas as pd

from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import ndcg_score


import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import ChatOpenAI, OpenAI
from langchain_chroma import Chroma
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.chat_models import ChatOllama

import faiss

from IPython.display import display, Markdown

import gradio as gr

import warnings


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
warnings.filterwarnings('ignore')


In [3]:
load_dotenv(override=True)

True

In [4]:
# API keys and their display names
api_keys = {
    'LLAMA_API_KEY': {'name': 'Llama', 'prefix_len': 8},
    'QWEN_API_KEY': {'name': 'Qwen', 'prefix_len': 7},
    'GOOGLE_API_KEY': {'name': 'Google', 'prefix_len': 8},
    'DEEPSEEK_API_KEY': {'name': 'DeepSeek R1', 'prefix_len': 8}
}

# Check and display status for each API key
for key_name, info in api_keys.items():
    api_key = os.getenv(key_name)
    if api_key:
        print(f"{info['name']} API Key exists and begins {api_key[:info['prefix_len']]}")
    else:
        print(f"{info['name']} API Key not set")

# Alternative Ollama configuration 
"""
meta_model = "llama3.2"
meta = OpenAI(base_url='http://localhost:11434/v1', api_key='ollama')
"""

Llama API Key exists and begins sk-or-v1
Qwen API Key exists and begins sk-or-v
Google API Key exists and begins sk-or-v1
DeepSeek R1 API Key exists and begins sk-or-v1


'\nmeta_model = "llama3.2"\nmeta = OpenAI(base_url=\'http://localhost:11434/v1\', api_key=\'ollama\')\n'

### The name of the vector database that will store our document embeddings for semantic search


In [5]:
db_name = "vector_store"

In [6]:
# Read in documents using LangChain's loaders
# Take everything in all the sub-folders of our knowledgebase
folders = glob.glob("knowledge-base2/")
text_loader_kwargs = {'encoding': 'utf-8'}
# text_loader_kwargs={'autodetect_encoding': True}
documents = []
for folder in folders:  
    loader = DirectoryLoader(folder, glob="**/*.txt", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
    folder_docs = loader.load()
    for doc in folder_docs:
        file_name = os.path.basename(doc.metadata["source"])
        doc.metadata["doc_type"] = os.path.splitext(file_name)[0]
        documents.append(doc)

### The total number of documents loaded from the knowledge base folders


In [7]:
len(documents)

51

In [8]:
#documents

In [9]:
documents[0].metadata

{'source': 'knowledge-base2\\Analytical Chemistry.txt',
 'doc_type': 'Analytical Chemistry'}

In [10]:
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=5000, chunk_overlap=300)
chunks = text_splitter.split_documents(documents)

### Get the total number of text chunks after splitting the documents


In [11]:
len(chunks)

51

In [12]:
chunks[5]

Document(metadata={'source': 'knowledge-base2\\Biotechnology.txt', 'doc_type': 'Biotechnology'}, page_content="Biotechnology is a broad field that harnesses biological processes, organisms, or systems to produce products or services that benefit humanity. It integrates principles from biology, chemistry, engineering, and computer science to develop innovative solutions in various sectors, including medicine, agriculture, industry, and environmental protection. At its core, biotechnology involves manipulating living organisms or their components to achieve specific goals.\nIn medicine, biotechnology has revolutionized the diagnosis, prevention, and treatment of diseases. Recombinant DNA technology is used to produce therapeutic proteins like insulin and growth hormone. Monoclonal antibodies are engineered to target specific disease cells. Gene therapy aims to correct genetic defects. Diagnostic tests based on molecular techniques allow for early and accurate disease detection. The devel

In [13]:
doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)
print(f"Document types found: {', '.join(doc_types)}")

Document types found: Electromagnetism, Semiconductor Physics, Calculus_, fluid_dynamics, structural_engineering, organic_chemistry, pharmacokinetics, Fluid mechanics, Probability and Statistics, quantum_computing, Control Systems Engineering, Linear algebra, Numerical Methods, Thermodynamics, Immunology, Pharmaceutical Chemistry, semiconductor_physics, thermodynamics_1, immunology_1, machine_learning, Materials Science Engineering, Chemical Thermodynamics, Pharmacology, Biotechnology, Ecology, Numerical Analysis, Clinical Pharmacy, nanotechnology, Analytical Chemistry, molecular_biology, differential_equations, cryptography, Molecular biology, Organic Chemistry, Biomaterials Engineering, astrophysics, biochemistry, Data structures, neural_networks, database_systems, genetic_engineering, Renewable energy, optics, Protein Chemistry, Quantum Mechanics, Artificial Intelligence, polymer_science, Genetic Engineering, Software Engineering, Physical Chemistry, Genetics


#### This is a naive string search approach that has several limitations:
#### 1. Case sensitivity - will miss "Who is Rasheed" or "WHO IS RASHEED"
#### 2. No semantic understanding - won't find variations like "tell me about Rasheed" or "Rasheed's background"
#### 3. No fuzzy matching - won't catch typos or slight variations
#### 4. Inefficient - scans every chunk sequentially
#### 5. No ranking of results by relevance

In [14]:
# Normal search
for chunk in chunks:
    if 'who is Rasheed' in chunk.page_content:
        print(chunk.metadata['source'], chunk)
        print("--------------------------------------")

### `From the lecutres`

In [15]:

# embeddings = OpenAIEmbeddings()
# If you would rather use the free Vector Embeddings from HuggingFace sentence-transformers
# Then replace embeddings = OpenAIEmbeddings()
# with:
# A text embedding is a numerical representation of a text, usually as a dense vector of real numbers. 
# It captures semantic meaning in a format models can understand and compare.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [16]:
# A tokenizer is a tool that breaks text into smaller units called tokens, which are the basic pieces of input that a language model processes.
# The following code is only to understand tokens (It is not part of the RAG example code)
# from transformers import AutoTokenizer
# query = "who is Mohammad Rasheed?"

# tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
# tokens = tokenizer.tokenize(query)
# token_ids = tokenizer.encode(query, add_special_tokens=True)

# print("Token Strings:", tokens)
# print("Token IDs:", token_ids)

# decoded = tokenizer.decode(token_ids[1])
# print(decoded)  # You get the original text back

### Check if we already have a Chroma database - if yes, remove it so we can start fresh


In [17]:

if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

#### `Let's create our Chroma vector database!`
#### We'll store our text chunks in a vector database, where each chunk gets its own vector embedding

In [18]:
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

Vectorstore created with 51 documents


In [19]:
# To read data from the vector store
collection = vectorstore._collection
ds = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(ds['embeddings'])
docs = ds['documents']
metadatas = ds['metadatas']

# Get the dimension of our vectors
d = vectors.shape[1]  # This should be 384 for our embeddings


doc_types = [metadata['doc_type'] for metadata in ds['metadatas']]
# colors = [['blue', 'green', 'red'][['departments_and_staff', 'about', 'services'].index(t)] for t in doc_types]

#### `The dimension of the vector`

In [20]:
d

384

In [21]:
# Get one vector and find how many dimensions it has
sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")

The vectors have 384 dimensions


t-SNE stands for t-distributed Stochastic Neighbor Embedding. It is a dimensionality reduction algorithm commonly used for visualizing high-dimensional data.

t-SNE takes data with many dimensions (e.g., 100D word embeddings or image features) and reduces it to 2 or 3 dimensions, making it easier to visualize patterns and clusters.
Example: Visualizing 300-dimensional word embeddings (like Word2Vec or GloVe) in 2D space to understand how similar words are grouped.


---
### Data Visualization and Analysis Dashboard
### We have an interactive dashboard to visualize and analyze document embeddings using t-SNE dimensionality reduction
### The dashboard includes multiple plots to help understand document distributions, lengths, and clustering patterns


In [22]:
# Reduce the dimensionality of the vectors to 2D using t-SNE
tsne = TSNE(n_components=2, perplexity=5, random_state=42, 
           max_iter=1000, learning_rate=200)
reduced_vectors = tsne.fit_transform(vectors)

# Create a DataFrame for easier handling
df = pd.DataFrame({
    'x': reduced_vectors[:, 0],
    'y': reduced_vectors[:, 1],
    'doc_type': doc_types,
    'text': [doc[:200] + "..." if len(doc) > 200 else doc for doc in docs],
    'text_length': [len(doc) for doc in docs]
})

# Create color mapping for different document types
unique_types = df['doc_type'].unique()

# Combine multiple color palettes to ensure we have enough colors
all_colors = (px.colors.qualitative.Set3 + 
              px.colors.qualitative.Pastel + 
              px.colors.qualitative.Set1 + 
              px.colors.qualitative.Set2 +
              px.colors.qualitative.Dark2 +
              px.colors.qualitative.Plotly)

# Ensure we have enough colors, cycle through if needed
colors = [all_colors[i % len(all_colors)] for i in range(len(unique_types))]
color_map = dict(zip(unique_types, colors))


# Create ansubplot with document length distribution
fig_combined = make_subplots(
    rows=2, cols=2,
    subplot_titles=('t-SNE Visualization', 'Document Length Distribution', 
                   'Document Type Counts', 'Cluster Density'),
    specs=[[{"colspan": 2}, None],
           [{"type": "bar"}, {"type": "histogram"}]],
    vertical_spacing=0.12,
    horizontal_spacing=0.1
)

# Add the main scatter plot
for doc_type in unique_types:
    mask = df['doc_type'] == doc_type
    subset = df[mask]
    
    fig_combined.add_trace(go.Scatter(
        x=subset['x'],
        y=subset['y'],
        mode='markers',
        name=doc_type,
        marker=dict(
            size=10,
            color=color_map[doc_type],
            opacity=0.8,
            line=dict(width=1, color='white')
        ),
        text=[f"<b>Type:</b> {t}<br><b>Length:</b> {l} chars<br><b>Preview:</b><br>{txt}" 
              for t, l, txt in zip(subset['doc_type'], subset['text_length'], subset['text'])],
        hovertemplate='%{text}<extra></extra>',
        showlegend=True
    ), row=1, col=1)

# Add document type counts
type_counts = df['doc_type'].value_counts()
fig_combined.add_trace(go.Bar(
    x=type_counts.index,
    y=type_counts.values,
    marker_color=[color_map[t] for t in type_counts.index],
    name='Document Counts',
    showlegend=False
), row=2, col=1)

# Add text length distribution
fig_combined.add_trace(go.Histogram(
    x=df['text_length'],
    nbinsx=20,
    marker_color='lightblue',
    name='Length Distribution',
    showlegend=False
), row=2, col=2)

fig_combined.update_layout(
    title={
        'text': '2d Document Vector Analysis Dashboard ',
        'x': 0.5,
        'xanchor': 'center',
        'font': {'size': 18, 'family': 'Arial Black'}
    },
    height=800,
    width=1200,
    showlegend=True
)

# Display the figure
fig_combined.show()

In [23]:
gemma_model_name = "google/gemma-3-27b-it:free"

<a id="import"></a>
# <p style="background-color:#E7E8D1; font-family:calibri; color:white; font-size:150%; text-align:center; border-radius:15px 50px; color:#000000">Step 1 | Chat with LangChain Agent </p>



In [24]:
# create a new Chat with OpenAI / Ollama

#llama_model = ChatOllama(model="llama3.2:latest")

llm = ChatOpenAI(base_url="https://openrouter.ai/api/v1"
                ,api_key=os.getenv('GOOGLE_API_KEY'),
                model_name=gemma_model_name)

# set up the conversation memory for the chat
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# the retriever is an abstraction over the VectorStore that will be used during RAG
retriever = vectorstore.as_retriever()

# putting it together: set up the conversation chain with the GPT 4o-mini LLM, the vector store and memory
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

In [25]:
def ask_question(question, conversation_chain=None, display_format="text"):
    """
    Ask a question to the conversation chain and return/display the response.
    
    Args:
        question (str or list): The question(s) to ask. Can be a single string or list of strings
        conversation_chain: The conversation chain instance (optional)
        display_format (str): How to display the output ("text" or "markdown")
    
    Returns:
        dict or list: The full conversation chain response(s)
    """
    if conversation_chain is None:
        conversation_chain = globals().get('conversation_chain')
        if conversation_chain is None:
            raise ValueError("No conversation chain provided or found in globals")
    
    # Handle both single questions and lists of questions
    if isinstance(question, list):
        results = []
        for q in question:
            result = conversation_chain.invoke({"question": str(q)})
            results.append(result)
            
            if display_format.lower() == "markdown":
                display(Markdown(f"**Question:** {q}\n\n**Answer:** {result['answer']}"))
            else:
                print(f"\nQuestion: {q}")
                print(f"Answer: {result['answer']}\n")
        
        return results
    else:
        # Handle single question
        result = conversation_chain.invoke({"question": str(question)})
        
        if display_format.lower() == "markdown":
            display(Markdown(f"**Question:** {question}\n\n**Answer:** {result['answer']}"))
        else:
            print(f"\nQuestion: {question}")
            print(f"Answer: {result['answer']}\n")
        
        return result

In [26]:
# Ask two different questions
questions = [
    "Talk about Artificial Intelligence related to AI Agentic",
    "What are the main topics covered in the analytical chemistry documents?"
]
ask_question(question=questions, conversation_chain=conversation_chain, display_format="markdown")

**Question:** Talk about Artificial Intelligence related to AI Agentic

**Answer:** 
Okay, let's talk about AI, specifically focusing on the concept of "AI Agents" as described within these provided texts.

**AI Agents: The Core Idea**

The text defines the goal of AI as developing **intelligent agents**. These aren't robots necessarily (though they *can* be embodied in robots), but rather **systems that can perceive their environment and take actions to maximize their chances of achieving a goal.**  This is a foundational concept in AI.

Here’s a breakdown of key aspects related to AI agents, based on the provided information:

*   **Perception & Action:** Agents need to be able to *sense* things – gather information about their environment. They also need to be able to *act* – to do things that affect that environment.
*   **Goal-Oriented:** The “maximize their chances of achieving a goal” part is critical. Unlike a simple program that just executes instructions, an agent has a defined objective and tries to *achieve* it.
*   **Intelligence:** This is where the "AI" comes in. The agent's intelligence determines how effectively it can perceive, reason, plan, and act to reach its goal.

**How AI Subfields contribute to Agent Development:**

The text shows how several areas of AI contribute to building effective agents:

*   **Machine Learning (ML):** Is incredibly important. Agents often *learn* how to achieve their goals. 
    *   **Reinforcement Learning** is specifically highlighted as a way for agents to learn through “trial and error,” which is very relevant to agent behavior. An agent takes actions, gets feedback (rewards or penalties), and adjusts its strategy to improve over time.
*   **Natural Language Processing (NLP):** If an agent needs to interact with humans (understand commands, provide explanations), NLP is essential.
*   **Computer Vision:** If the agent needs to "see" the environment (e.g., a self-driving car), computer vision provides that capability.
*   **Robotics:** If the agent is a physical robot, robotics provides the design and programming aspects.



**The connection to Neural Networks**

Neural networks, particularly *deep learning*, play a huge role in modern AI agents.  They provide the pattern recognition and complex decision-making capabilities that are needed for agents to operate in real-world environments.
The text highlights that neural networks can learn *hierarchical representations from raw data*, meaning they can automatically figure out important features and patterns without explicit programming. This is a massive advantage for agents dealing with complex, unstructured environments.

**Challenges & Considerations**

The text also points out challenges relating to agent development:

*   **"Black Box" Nature:** Understanding *why* an AI agent made a particular decision can be difficult, which is a concern for applications where transparency is crucial.
*   **Data & Resources:** Training these agents, especially those using deep learning, usually requires a lot of data and computing power.



In short, AI agents represent a powerful paradigm for building intelligent systems. The combination of AI techniques (especially machine learning and neural networks) continues to drive advances in this exciting field.

**Question:** What are the main topics covered in the analytical chemistry documents?

**Answer:** 
According to the provided text, the main topics covered in analytical chemistry are:

* **Identification and quantification of chemical substances:** Determining *what* is in a sample and *how much* of it is there.
* **Classical and Instrumental Methods:** The different techniques used for analysis (gravimetric, volumetric, spectroscopy, chromatography, electrochemistry).
* **The Analytical Process:** Sampling, sample preparation, measurement, and data interpretation (including statistical analysis).
* **Error Analysis & Validation:** Ensuring reliable results through careful consideration of errors and method validation.
* **Development of New Techniques:** Ongoing research to improve sensitivity and analyze complex samples.





[{'question': 'Talk about Artificial Intelligence related to AI Agentic',
  'chat_history': [HumanMessage(content='Talk about Artificial Intelligence related to AI Agentic', additional_kwargs={}, response_metadata={}),
   AIMessage(content='\nOkay, let\'s talk about AI, specifically focusing on the concept of "AI Agents" as described within these provided texts.\n\n**AI Agents: The Core Idea**\n\nThe text defines the goal of AI as developing **intelligent agents**. These aren\'t robots necessarily (though they *can* be embodied in robots), but rather **systems that can perceive their environment and take actions to maximize their chances of achieving a goal.**  This is a foundational concept in AI.\n\nHere’s a breakdown of key aspects related to AI agents, based on the provided information:\n\n*   **Perception & Action:** Agents need to be able to *sense* things – gather information about their environment. They also need to be able to *act* – to do things that affect that environment.

In [27]:
# Define system prompt 
SYSTEM_PROMPT = """You are a helpful AI assistant with access to a knowledge base of documents. 
Your role is to:
- Provide accurate and relevant information based on the documents
- Be concise but thorough in your responses
- Acknowledge when you don't have enough information
- Maintain a professional and helpful tone
- Focus on answering the specific question asked
- You have to act as a data minning expert
- Your name is Amro 
"""

In [28]:
def chat_interface(message, history):
    # Convert history to the format expected by the conversation chain
    chat_history = []
    for user, ai in history:
        chat_history.append({"role": "user", "content": user})
        chat_history.append({"role": "assistant", "content": ai})
    
    # Add system prompt to the message
    full_message = f"{SYSTEM_PROMPT}\n\nUser question: {message}"
    
    # Get response from the conversation chain
    result = conversation_chain.invoke({"question": full_message})
    
    return result['answer']

In [29]:
# Create the Gradio interface
demo = gr.ChatInterface(
    fn=chat_interface,
    title="RAG Chat Interface",
    description="Ask questions about the documents in the knowledge base. I'll help you find relevant information!",
    theme="soft",
    examples=[
        "What is astophsics?",
        "Tell me about analytical chemistry",
        "How can I be a calculus expert"
    ]
)


In [30]:

# Launch the interface
demo.launch(share=True)



* Running on local URL:  http://127.0.0.1:7860

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




<a id="import"></a>
# <p style="background-color:#E7E8D1; font-family:calibri; color:white; font-size:150%; text-align:center; border-radius:15px 50px; color:#000000">Step 2 | Search Using FAISS LSH </p>



### LSH Evaluation Metrics - Quick Reference
### Key Metrics

### `Precision@k: Of k results returned, how many are relevant?`

### `Recall@k: Of all relevant items, how many did you find?`


### `NDCG@10: How well are relevant results ranked?`


#### `MRR: How quickly do you find the first relevant result?`


#### LSH Parameter Choice

Lower nbits (8,16): Faster, less accurate

Higher nbits (48,64): Slower, more accurate

In [31]:
test_queries = [
    ("What is analytical chemistry?", [0]),
    ("What is artificial intelligence?", [1]),
    ("What is biotechnology?", [5]),
    ("Explain linear algebra", [23]), 
]

In [32]:
nbits = 32  # Number of bits for LSH (adjust based on our evaluation technices down)
index = faiss.IndexLSH(d, nbits)

index.add(vectors)    # Add vectors to the index

In [33]:

nbits_list = [8, 16, 24, 32, 48, 64]
k_values = [5, 10, 20]
results = {}

for nbits in nbits_list:
    index = faiss.IndexLSH(vectors.shape[1], nbits)
    index.add(vectors)
    
    metrics = {f'precision@{k}': [] for k in k_values}
    metrics.update({f'recall@{k}': [] for k in k_values})
    metrics['ndcg@10'] = []
    metrics['mrr'] = []  # Mean Reciprocal Rank
    
    for query_text, expected_indices in test_queries:
        query_vector = np.array([embeddings.embed_query(query_text)], dtype='float32')
        distances, indices = index.search(query_vector, max(k_values))
        
        expected_set = set(expected_indices)
        
        for k in k_values:
            retrieved = set(indices[0][:k])
            
            # Precision@k
            precision = len(retrieved & expected_set) / k if k > 0 else 0
            metrics[f'precision@{k}'].append(precision)
            
            # Recall@k
            recall = len(retrieved & expected_set) / len(expected_set) if expected_set else 0
            metrics[f'recall@{k}'].append(recall)
        
        # NDCG@10 (relevance-based ranking quality)
        if len(expected_indices) > 0:
            y_true = [1 if idx in expected_set else 0 for idx in indices[0][:10]]
            y_score = [-distances[0][i] for i in range(min(10, len(distances[0])))]  # Use actual scores
            if sum(y_true) > 0 and len(y_true) == len(y_score):
                ndcg = ndcg_score([y_true], [y_score])
                metrics['ndcg@10'].append(ndcg)
        
        # Mean Reciprocal Rank
        for i, idx in enumerate(indices[0][:20]):
            if idx in expected_set:
                metrics['mrr'].append(1.0 / (i + 1))
                break
        else:
            metrics['mrr'].append(0.0)
    
    # Average all metrics
    avg_metrics = {metric: np.mean(values) for metric, values in metrics.items()}
    results[nbits] = avg_metrics
    
    print(f"nbits={nbits:2d}: P@5={avg_metrics['precision@5']:.3f}, "
            f"R@10={avg_metrics['recall@10']:.3f}, "
            f"NDCG@10={avg_metrics['ndcg@10']:.3f}, "
            f"MRR={avg_metrics['mrr']:.3f}")

# Best by different criteria
best_precision = max(results, key=lambda x: results[x]['precision@5'])
best_recall = max(results, key=lambda x: results[x]['recall@10'])
best_ndcg = max(results, key=lambda x: results[x]['ndcg@10'])

print(f"\nBest for precision@5: {best_precision}")
print(f"Best for recall@10: {best_recall}")
print(f"Best for NDCG@10: {best_ndcg}")

nbits= 8: P@5=0.050, R@10=0.250, NDCG@10=0.461, MRR=0.098
nbits=16: P@5=0.150, R@10=0.750, NDCG@10=0.467, MRR=0.350
nbits=24: P@5=0.100, R@10=0.750, NDCG@10=0.609, MRR=0.365
nbits=32: P@5=0.200, R@10=1.000, NDCG@10=1.000, MRR=1.000
nbits=48: P@5=0.200, R@10=1.000, NDCG@10=0.757, MRR=0.833
nbits=64: P@5=0.200, R@10=1.000, NDCG@10=0.881, MRR=1.000

Best for precision@5: 32
Best for recall@10: 32
Best for NDCG@10: 32


In [40]:
def search_and_visualize(query_text, k=3):
    print("="*60)
    print(f"   SEARCHING FOR: '{query_text}'")
    print("="*60)
    
    # Get query embedding
    query_embedding = embeddings.embed_query(query_text)
    query_vector = np.array([query_embedding]).astype('float32')
    
    # Calculate distances for ALL documents (not just LSH results)
    distances_all = np.linalg.norm(vectors - query_vector, axis=1)
    similarities = 1 / (1 + distances_all)  # Convert to similarity scores
    normalized_similarities = (similarities - similarities.min()) / (similarities.max() - similarities.min())
    
    # Get top k results based on normalized similarity (highest first)
    top_indices = np.argsort(normalized_similarities)[::-1][:k]
    
    # Get the results with enhanced formatting
    results = []
    print("\nSEARCH RESULTS (Top 3 by Normalized Similarity):")
    print("-" * 50)
    for i, idx in enumerate(top_indices):
        dist = distances_all[idx]
        norm_sim = normalized_similarities[idx]
        results.append({
            'text': docs[idx],
            'metadata': metadatas[idx],
            'distance': float(dist),
            'normalized_similarity': float(norm_sim),
            'index': idx
        })
        print(f"\nResult #{i+1}")
        print(f"   Distance: {dist:.4f}")
        print(f"   Normalized Similarity: {norm_sim:.4f}")
        print(f"   Meta Data: {metadatas[idx]}")
        print(f"   Text: {docs[idx]}")
    
    # Create visualization with enhanced aesthetics
    all_vectors = vectors.copy()
    query_point = query_vector[0]
    
    # Add query point to visualization
    viz_vectors = np.vstack([all_vectors, query_point])
    
    # Reduce dimensionality with t-SNE (enhanced parameters)
    tsne = TSNE(n_components=2, perplexity=min(30, len(viz_vectors)-1), 
                random_state=42, max_iter=1000, learning_rate=200)
    reduced_vectors = tsne.fit_transform(viz_vectors)
    
    # Separate the query point and document points
    doc_points = reduced_vectors[:-1]
    query_point_2d = reduced_vectors[-1]
    
    # Create the stunning plot :)
    fig = go.Figure()
    
    # Add document points with gradient coloring and enhanced hover
    fig.add_trace(go.Scatter(
        x=doc_points[:, 0],
        y=doc_points[:, 1],
        mode='markers',
        marker=dict(
            size=12,
            color=normalized_similarities,
            colorscale='plasma',  # More vibrant colorscale
            showscale=True,
            opacity=0.8,
            line=dict(width=1, color='white'),
            colorbar=dict(
                title=dict(
                    text="Similarity Score",
                    font=dict(size=14, family="Arial Black")
                ),
                thickness=25,
                len=0.7,
                x=1.02,
                tickfont=dict(size=12)
            )
        ),
        text=[f"<b>Type:</b> {m['doc_type']}<br><b>Similarity:</b> {sim:.2%}<br><b>Preview:</b><br>{dim[:200]}{'...' if len(dim) > 200 else ''}" 
              for m, dim, sim in zip(metadatas, docs, normalized_similarities)],
        hovertemplate='%{text}<extra></extra>',
        name='Document Corpus',
        hoverlabel=dict(
            bgcolor="rgba(255,255,255,0.95)",
            bordercolor="rgba(0,0,0,0.2)",
            font=dict(size=13, family="Arial"),
            namelength=-1
        )
    ))
    
    # Add query point with stunning styling
    fig.add_trace(go.Scatter(
        x=[query_point_2d[0]],
        y=[query_point_2d[1]],
        mode='markers+text',
        marker=dict(
            size=25,
            color='gold',
            symbol='star',
            line=dict(width=3, color='darkorange'),
            opacity=1.0
        ),
        text=['QUERY'],
        textposition="top center",
        textfont=dict(size=14, color='darkorange', family="Arial Black"),
        name='Search Query',
        hovertemplate=f'<b>Search Query</b><br>"{query_text}"<extra></extra>',
        hoverlabel=dict(
            bgcolor="gold",
            bordercolor="darkorange",
            font=dict(size=13, color='black', family="Arial Black")
        )
    ))
    
    # Add result points with enhanced styling and animations
    result_points = doc_points[top_indices]
    result_similarities = normalized_similarities[top_indices]
    
    # Create different markers for top results
    symbols = ['diamond', 'circle', 'square', 'triangle-up', 'pentagon']
    colors = ['#ff6b6b', '#4ecdc4', '#45b7d1', '#96ceb4', '#ffeaa7']
    rank_labels = ['#1', '#2', '#3', '#4', '#5']
    
    for i, (point, idx, sim) in enumerate(zip(result_points, top_indices, result_similarities)):
        rank_label = rank_labels[i] if i < 5 else f'#{i+1}'
        dist = distances_all[idx]
        
        fig.add_trace(go.Scatter(
            x=[point[0]],
            y=[point[1]],
            mode='markers+text',
            marker=dict(
                size=20,
                color=colors[i % len(colors)],
                symbol=symbols[i % len(symbols)],
                line=dict(width=3, color='white'),
                opacity=0.9
            ),
            text=[rank_label],
            textposition="middle center",
            textfont=dict(size=16, color='white', family="Arial Black"),
            name=f'Result {rank_label}',
            hovertemplate=f'<b>Result {rank_label}</b><br>' +
                         f'Distance: {dist:.4f}<br>' +
                         f'Normalized Similarity: {sim:.4f}<br>' +
                         f'Type: {metadatas[idx]["doc_type"]}<br>' +
                         f'Text: {docs[idx][:150]}{"..." if len(docs[idx]) > 150 else ""}<extra></extra>',
            hoverlabel=dict(
                bgcolor=colors[i % len(colors)],
                bordercolor='white',
                font=dict(size=12, color='white', family="Arial")
            )
        ))
    
    # Add connection lines from query to results
    for i, point in enumerate(result_points):
        fig.add_trace(go.Scatter(
            x=[query_point_2d[0], point[0]],
            y=[query_point_2d[1], point[1]],
            mode='lines',
            line=dict(
                color=colors[i % len(colors)],
                width=2,
                dash='dot'
            ),
            opacity=0.6,
            showlegend=False,
            hoverinfo='skip'
        ))
    
    # add more sytling layout  :)
    fig.update_layout(
        title=dict(
            text='Semantic Search Visualization<br><sub>Document Similarity Explorer</sub>',
            x=0.5,
            y=0.95,
            xanchor='center',
            yanchor='top',
            font=dict(size=24, family="Arial Black", color='#2c3e50')
        ),
        xaxis=dict(
            title=dict(text='t-SNE Dimension 1', font=dict(size=14, family="Arial")),
            showgrid=True,
            gridcolor='rgba(200,200,200,0.3)',
            gridwidth=1,
            zeroline=True,
            zerolinecolor='rgba(0,0,0,0.3)',
            zerolinewidth=2,
            showspikes=True,
            spikecolor='rgba(0,0,0,0.5)',
            spikesnap='cursor',
            spikemode='across',
            spikedash='solid',
            spikethickness=1
        ),
        yaxis=dict(
            title=dict(text='t-SNE Dimension 2', font=dict(size=14, family="Arial")),
            showgrid=True,
            gridcolor='rgba(200,200,200,0.3)',
            gridwidth=1,
            zeroline=True,
            zerolinecolor='rgba(0,0,0,0.3)',
            zerolinewidth=2,
            showspikes=True,
            spikecolor='rgba(0,0,0,0.5)',
            spikesnap='cursor',
            spikemode='across',
            spikedash='solid',
            spikethickness=1
        ),
        showlegend=True,
        width=1200,
        height=900,
        margin=dict(l=80, r=150, t=120, b=80),
        legend=dict(
            yanchor="top",
            y=0.98,
            xanchor="left",
            x=1.02,
            bgcolor="rgba(255,255,255,0.9)",
            bordercolor="rgba(0,0,0,0.2)",
            borderwidth=1,
            font=dict(size=11)
        ),
        plot_bgcolor='rgba(248,249,250,0.8)',
        paper_bgcolor='white',
        hovermode='closest',
        hoverdistance=50,
        font=dict(family="Arial")
    )
    
    # Add enhanced annotations with modern styling
    fig.add_annotation(
        text="<b>Visual Guide</b><br>" +
             "Gold Star = Your Query<br>" +
             "Colored Markers = Top Results<br>" +
             "Dotted Lines = Query Connections",
        xref="paper", yref="paper",
        x=0.02, y=0.02,
        showarrow=False,
        font=dict(size=11, family="Arial"),
        bgcolor="rgba(255,255,255,0.95)",
        bordercolor="rgba(0,0,0,0.2)",
        borderwidth=1,
        borderpad=8,
        align="left"
    )
    
    # Add performance metrics annotation
    fig.add_annotation(
        text=f"<b>Search Performance</b><br>" +
             f"Results Found: {k}<br>" +
             f"Query: \"{query_text[:30]}{'...' if len(query_text) > 30 else ''}\"",
        xref="paper", yref="paper",
        x=0.98, y=0.02,
        showarrow=False,
        font=dict(size=11, family="Arial"),
        bgcolor="rgba(240,248,255,0.95)",
        bordercolor="rgba(70,130,180,0.3)",
        borderwidth=1,
        borderpad=8,
        align="left"
    )
    
    print(f"\nVisualization created with {len(doc_points)} documents")
    print("="*60)

    return results, fig

In [41]:
query = "What is Supervised learning algorithm ??"
results, fig = search_and_visualize(query)
fig.show()

   SEARCHING FOR: 'What is Supervised learning algorithm ??'

SEARCH RESULTS (Top 3 by Normalized Similarity):
--------------------------------------------------

Result #1
   Distance: 0.9269
   Normalized Similarity: 1.0000
   Meta Data: {'source': 'knowledge-base2\\machine_learning.txt', 'doc_type': 'machine_learning'}
   Text: # Machine Learning
Machine learning enables computers to improve their performance on tasks through experience without explicit programming. This computational approach transforms data into actionable intelligence by identifying patterns and making decisions with minimal human intervention. From recommendation systems to autonomous vehicles, machine learning algorithms power increasingly sophisticated applications that adapt to new information and changing environments.
Supervised learning algorithms learn mappings between inputs and outputs using labeled training data. Classification algorithms assign inputs to discrete categories, exemplified by support vec

<a id="import"></a>
# <p style="background-color:#E7E8D1; font-family:calibri; color:white; font-size:150%; text-align:center; border-radius:15px 50px; color:#000000">Step 3 | Search Using Cosine Similarity </p>



In [44]:
def cosine_search_and_visualize(query_text, k=3):
    # 1. Embed the query
    query_embedding = embeddings.embed_query(query_text)
    query_vector = np.array([query_embedding]).astype('float32')
    
    # 2. Compute cosine similarity
    sims = cosine_similarity(query_vector, vectors)[0]  # shape: (num_docs,)
    
    # 3. Get top k indices
    top_k_idx = np.argsort(sims)[::-1][:k]
    
    # 4. Prepare results
    results = []
    for idx in top_k_idx:
        results.append({
            'text': docs[idx],
            'metadata': metadatas[idx],
            'score': float(sims[idx]),
            'index': idx
        })
        print(f"\nResult {len(results)}:")
        print(f"Score: {sims[idx]:.4f}")
        print(f"Meta Data: {metadatas[idx]}")
        print(f"Text: {docs[idx]}")
    
    # 5. Visualization
    # Stack query vector with all document vectors for t-SNE
    viz_vectors = np.vstack([vectors, query_vector])
    tsne = TSNE(n_components=2, perplexity=5, random_state=42)
    reduced_vectors = tsne.fit_transform(viz_vectors)
    doc_points = reduced_vectors[:-1]
    query_point_2d = reduced_vectors[-1]
    
    # Normalize similarity for coloring
    norm_sims = (sims - sims.min()) / (sims.max() - sims.min())
    
    fig = go.Figure()
    # All document points
    fig.add_trace(go.Scatter(
        x=doc_points[:, 0],
        y=doc_points[:, 1],
        mode='markers',
        marker=dict(
            size=8,
            color=norm_sims,
            colorscale='Viridis',
            showscale=True,
            colorbar=dict(
                title='Cosine Similarity',
                thickness=20,
                len=0.8,
                y=0.5
            )
        ),
        text=[f"<b>Type:</b> {m['doc_type']}<br><b>Text:</b> {dim[:150]}..." for m, dim in zip(metadatas, docs)],
        hoverinfo='text',
        name='Documents',
        hoverlabel=dict(
            bgcolor="white",
            font_size=12,
            font_family="Arial"
        )
    ))
    # Query point
    fig.add_trace(go.Scatter(
        x=[query_point_2d[0]],
        y=[query_point_2d[1]],
        mode='markers+text',
        marker=dict(
            size=18,
            color='red',
            symbol='star'
        ),
        text=["Query"],
        textposition="top center",
        name='Query',
        hoverinfo='text'
    ))
    # Top k results (highlight)
    result_points = doc_points[top_k_idx]
    fig.add_trace(go.Scatter(
        x=result_points[:, 0],
        y=result_points[:, 1],
        mode='markers+text',
        marker=dict(
            size=15,
            color='green',
            symbol='circle-open',
            line=dict(width=2, color='black')
        ),
        text=[f"Top {i+1}" for i in range(len(result_points))],
        textposition="top center",
        name='Top Results',
        hoverinfo='text',
        hovertext=[f"<b>Top {i+1}</b><br>Score: {sims[idx]:.4f}<br><b>Type:</b> {metadatas[idx]['doc_type']}<br><b>Text:</b> {docs[idx][:150]}..." 
                   for i, idx in enumerate(top_k_idx)],
        hoverlabel=dict(
            bgcolor="white",
            font_size=12,
            font_family="Arial"
        )
    ))
    fig.update_layout(
        title=dict(
            text='t-SNE Visualization of Cosine Similarity',
            x=0.5,
            font=dict(size=20)
        ),
        xaxis_title='t-SNE 1',
        yaxis_title='t-SNE 2',
        showlegend=True,
        width=1000,
        height=800,
        margin=dict(l=50, r=50, t=100, b=50),
        plot_bgcolor='white',
        paper_bgcolor='white',
        hovermode='closest'
    )
    fig.add_annotation(
        text="<b>Color Guide:</b><br>Brighter = More similar to query",
        xref="paper", yref="paper",
        x=0.02, y=0.02,
        showarrow=False,
        font=dict(size=12),
        bgcolor="white",
        bordercolor="black",
        borderwidth=1,
        borderpad=4
    )
    return results, fig

In [43]:
def cosine_search_3d(query_text, k=3):
    # Embed query and compute similarities
    query_embedding = embeddings.embed_query(query_text)
    query_vector = np.array([query_embedding]).astype('float32')
    sims = cosine_similarity(query_vector, vectors)[0]
    top_k_idx = np.argsort(sims)[::-1][:k]
    
    # Prepare results
    results = []
    for i, idx in enumerate(top_k_idx):
        results.append({
            'text': docs[idx], 'metadata': metadatas[idx],
            'score': float(sims[idx]), 'index': idx
        })
        print(f"Top {i+1} (Score: {sims[idx]:.4f}): {docs[idx][:100]}...")
    
    # 3D t-SNE visualization
    viz_vectors = np.vstack([vectors, query_vector])
    tsne_3d = TSNE(n_components=3, perplexity=min(30, len(vectors)//4), random_state=42)
    coords_3d = tsne_3d.fit_transform(viz_vectors)
    
    doc_coords, query_coord = coords_3d[:-1], coords_3d[-1]
    norm_sims = (sims - sims.min()) / (sims.max() - sims.min() + 1e-8)
    
    # Create stunning 3D plot
    fig = go.Figure()
    
    # All documents with gradient coloring
    fig.add_trace(go.Scatter3d(
        x=doc_coords[:, 0], y=doc_coords[:, 1], z=doc_coords[:, 2],
        mode='markers',
        marker=dict(
            size=6, color=norm_sims, colorscale='Plasma',
            opacity=0.7, showscale=True,
            colorbar=dict(title="Similarity", thickness=15, x=1.02)
        ),
        text=[f"{m['doc_type']}: {dim[:80]}..." for m, dim in zip(metadatas, docs)],
        name='Documents', hovertemplate='<b>%{text}</b><extra></extra>'
    ))
    
    # Query as glowing star
    fig.add_trace(go.Scatter3d(
        x=[query_coord[0]], y=[query_coord[1]], z=[query_coord[2]],
        mode='markers+text',
        marker=dict(size=15, color='gold', symbol='diamond', 
                   line=dict(width=3, color='red')),
        text=["QUERY"], textposition="top center",
        name='Query', hovertemplate='<b>Query: %{customdata}</b><extra></extra>',
        customdata=[query_text[:50] + "..."]
    ))
    
    # Top results with pulsing effect
    top_coords = doc_coords[top_k_idx]
    fig.add_trace(go.Scatter3d(
        x=top_coords[:, 0], y=top_coords[:, 1], z=top_coords[:, 2],
        mode='markers',
        marker=dict(size=12, color='lime', symbol='circle-open',
                   line=dict(width=4, color='darkgreen')),
        text=[f"#{i+1} ({sims[idx]:.3f})" for i, idx in enumerate(top_k_idx)],
        name='Top Matches',
        hovertemplate='<b>Rank %{text}</b><br>%{customdata}<extra></extra>',
        customdata=[f"{metadatas[idx]['doc_type']}: {docs[idx][:80]}..." 
                    for idx in top_k_idx]
    ))
    
    # Styling for maximum visual impact
    fig.update_layout(
        title=dict(text='🔍 3D Semantic Search Space', x=0.5, font=dict(size=24)),
        scene=dict(
            xaxis=dict(showgrid=False, zeroline=False, showticklabels=False, title=''),
            yaxis=dict(showgrid=False, zeroline=False, showticklabels=False, title=''),
            zaxis=dict(showgrid=False, zeroline=False, showticklabels=False, title=''),
            bgcolor='black', camera=dict(eye=dict(x=1.5, y=1.5, z=1.5))
        ),
        width=1000, height=700, paper_bgcolor='black', font=dict(color='white'),
        showlegend=True, legend=dict(x=0, y=1, bgcolor='rgba(0,0,0,0.5)')
    )
    
    return results, fig

In [45]:
query = "What is Supervised learning algorithm ??"
results, fig = cosine_search_and_visualize(query)
fig.show()


Result 1:
Score: 0.5704
Meta Data: {'source': 'knowledge-base2\\machine_learning.txt', 'doc_type': 'machine_learning'}
Text: # Machine Learning
Machine learning enables computers to improve their performance on tasks through experience without explicit programming. This computational approach transforms data into actionable intelligence by identifying patterns and making decisions with minimal human intervention. From recommendation systems to autonomous vehicles, machine learning algorithms power increasingly sophisticated applications that adapt to new information and changing environments.
Supervised learning algorithms learn mappings between inputs and outputs using labeled training data. Classification algorithms assign inputs to discrete categories, exemplified by support vector machines establishing decision boundaries in high-dimensional spaces and decision trees recursively partitioning feature spaces based on information gain. Regression models predict continuous values, wit

In [39]:
results, fig = cosine_search_3d(query)
fig.show()

Top 1 (Score: 0.5704): # Machine Learning
Machine learning enables computers to improve their performance on tasks through ...
Top 2 (Score: 0.5475): Artificial Intelligence (AI) in computer science is the field dedicated to creating systems that can...
Top 3 (Score: 0.3622): Data structures and algorithms are foundational concepts in computer science, crucial for efficient ...
