# Rag Basics

1. Embeddings
2. Chunking
3. Vector similarity search

## Chunk size?

<img src="./assets-resources/optimal-chunk-size.png" width=50%>

In [1]:
import ipywidgets as widgets
from IPython.display import HTML
from IPython.display import display

# Sample text to chunk
DEFAULT_TEXT = """
Natural language processing (NLP) is a field of artificial intelligence that focuses on the interaction 
between computers and human language. It enables computers to understand, interpret, and generate human 
language in a valuable way. NLP combines computational linguistics, machine learning, and deep learning 
to process and analyze large amounts of natural language data.

Some common applications of NLP include machine translation, sentiment analysis, chatbots, text 
summarization, and question-answering systems. As technology advances, NLP continues to improve and 
find new applications in various industries.
"""

def chunk_text(text, chunk_size):
    # Remove extra whitespace and split into chunks
    text = ' '.join(text.split())
    chunks = []
    start = 0

    while start < len(text):
        end = start + chunk_size
        chunk = text[start:end]
        chunks.append((start, end, chunk))
        start += chunk_size
        if start >= len(text) and end > len(text):
            chunks[-1] = (start, len(text), text[start:])
            break

    return chunks

def display_colored_chunks(chunk_size, input_text):
    # Use input text if provided, otherwise use default
    text_to_use = input_text if input_text.strip() else DEFAULT_TEXT
    
    # First get the cleaned text (with extra whitespace removed)
    cleaned_text = ' '.join(text_to_use.split())
    chunks = chunk_text(cleaned_text, chunk_size)
    print(f"Number of chunks: {len(chunks)}\n")
    
    # Define a list of colors for chunks
    colors = ['#FFB3BA', '#BAFFC9', '#4B9FE1', '#FFFFBA', '#E5BAFF', '#FFD1DC', '#1E90FF', '#FFFFD1', '#F0D1FF', '#FFE4E1', '#98FB98', '#00BFFF', '#F0E68C', '#DDA0DD']
    
    # Create HTML with colored spans
    html_output = cleaned_text
    
    # Add the chunk colors
    for i, (start_idx, end_idx, chunk) in enumerate(reversed(chunks)):
        color = colors[i % len(colors)]
        html_output = (
            html_output[:start_idx] +
            f'<span style="background-color: {color}">' +
            html_output[start_idx:end_idx] +
            '</span>' +
            html_output[end_idx:]
        )
    
    display(HTML(f"<p>{html_output}</p>"))

# Create and display the interactive widgets
chunk_size_slider = widgets.IntSlider(
    value=200,
    min=10,
    max=1000,
    step=5,
    description='Chunk Size:',
    style={'description_width': 'initial'}
)

text_area = widgets.Textarea(
    value='',
    placeholder='Enter your text here (leave empty to use default text)',
    description='Input Text:',
    layout={'width': '100%', 'height': '100px'},
    style={'description_width': 'initial'}
)

# Display the interactive widget
interactive_widget = widgets.interactive(display_colored_chunks, 
                   chunk_size=chunk_size_slider,
                   input_text=text_area)

display(interactive_widget)

interactive(children=(IntSlider(value=200, description='Chunk Size:', max=1000, min=10, step=5, style=SliderStâ€¦

# Embeddings

In [2]:
from langchain.embeddings import OpenAIEmbeddings
from sklearn.manifold import TSNE
import numpy as np
import plotly.graph_objects as go
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Sample text about transformers
text = """The Transformer architecture has become a cornerstone of modern natural language processing. 
It introduced self-attention mechanisms that allow models to weigh the importance of different words in a sequence. 
This innovation led to breakthrough models like BERT and GPT, which achieve remarkable results across many language tasks."""

# Split text into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20
)
chunks = text_splitter.split_text(text)

# Get embeddings
embeddings = OpenAIEmbeddings()
vectors = embeddings.embed_documents(chunks)

# Convert vectors list to numpy array
vectors_array = np.array(vectors)

# Reduce dimensionality with t-SNE to 3D
perplexity = min(5, len(vectors_array) - 1)  # Safeguard against too few chunks
tsne = TSNE(n_components=3, random_state=42, perplexity=perplexity)
reduced_vectors = tsne.fit_transform(vectors_array)

# Define distinct colors for better visualization
colors = ['#FF0000', '#00FF00', '#0000FF', '#FF00FF', '#FFFF00', '#00FFFF', 
          '#FFA500', '#800080', '#008000', '#FFC0CB']

# Create 3D visualization
fig = go.Figure(data=[go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers+text',
    text=[f"Chunk {i+1}" for i in range(len(chunks))],
    hovertext=chunks,
    marker=dict(
        size=12,
        color=[colors[i % len(colors)] for i in range(len(chunks))],
        opacity=0.8,
        symbol='circle'
    ),
    textposition='top center'
)])

# Update layout for better readability
fig.update_layout(
    scene=dict(
        xaxis_title="t-SNE Dimension 1",
        yaxis_title="t-SNE Dimension 2", 
        zaxis_title="t-SNE Dimension 3",
        camera=dict(
            up=dict(x=0, y=0, z=1),
            center=dict(x=0, y=0, z=0),
            eye=dict(x=1.5, y=1.5, z=1.5)
        )
    ),
    title={
        'text': "3D Visualization of Text Chunk Embeddings",
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    showlegend=False,
    margin=dict(l=0, r=0, t=30, b=0)
)

fig.show()

  embeddings = OpenAIEmbeddings()


In [4]:
import numpy as np
import plotly.graph_objects as go
from sklearn.metrics.pairwise import euclidean_distances
from langchain.embeddings import OpenAIEmbeddings
from sklearn.manifold import TSNE
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Sample text about transformers
text = """
The Transformer architecture has become a cornerstone of modern natural language processing. 
It introduced self-attention mechanisms that allow models to weigh the importance of different words in a sequence. 
This innovation led to breakthrough models like BERT and GPT, which achieve remarkable results across many language tasks."""

# Function to process the text and generate embeddings
def process_text(text, chunk_size=100, chunk_overlap=20):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_text(text)
    embeddings = OpenAIEmbeddings()
    vectors = embeddings.embed_documents(chunks)
    return chunks, np.array(vectors)

# Process the original text
chunks, vectors_array = process_text(text)

# Reduce dimensionality with t-SNE for original chunks
perplexity = min(5, len(vectors_array) - 1)
tsne = TSNE(n_components=3, random_state=42, perplexity=perplexity)
reduced_vectors = tsne.fit_transform(vectors_array)

# Define colors
colors = ['#FF0000', '#00FF00', '#0000FF', '#FF00FF', '#FFFF00', '#00FFFF', 
          '#FFA500', '#800080', '#008000', '#FFC0CB']

# Visualization function
def visualize_with_new_text(new_text, chunk_size=100, chunk_overlap=20):
    # Embed the new text
    embeddings = OpenAIEmbeddings()
    new_embedding = embeddings.embed_query(new_text)  # Use embed_query for single text
    new_embedding_array = np.array(new_embedding).reshape(1, -1)
    
    # Combine the new embedding with original embeddings for visualization
    combined_vectors = np.vstack([vectors_array, new_embedding_array])
    
    # Apply t-SNE again with the combined embeddings
    perplexity = min(5, len(combined_vectors) - 1)
    tsne = TSNE(n_components=3, random_state=42, perplexity=perplexity)
    reduced_combined_vectors = tsne.fit_transform(combined_vectors)
    
    # Extract original and new reduced vectors
    reduced_vectors = reduced_combined_vectors[:-1]
    new_reduced_vector = reduced_combined_vectors[-1]
    
    # Find the closest chunk using Euclidean distance
    distances = euclidean_distances([new_reduced_vector], reduced_vectors).flatten()
    closest_idx = np.argmin(distances)
    closest_chunk = chunks[closest_idx]
    
    # Create 3D visualization
    fig = go.Figure(data=[
        # Original chunks
        go.Scatter3d(
            x=reduced_vectors[:, 0],
            y=reduced_vectors[:, 1],
            z=reduced_vectors[:, 2],
            mode='markers+text',
            text=[f"Chunk {i+1}" for i in range(len(chunks))],
            hovertext=chunks,
            marker=dict(
                size=12,
                color=[colors[i % len(colors)] for i in range(len(chunks))],
                opacity=0.8,
                symbol='circle'
            ),
            textposition='top center'
        ),
        # New text
        go.Scatter3d(
            x=[new_reduced_vector[0]],
            y=[new_reduced_vector[1]],
            z=[new_reduced_vector[2]],
            mode='markers+text',
            text=["New Text"],
            hovertext=[new_text],
            marker=dict(
                size=14,
                color='black',
                opacity=1.0,
                symbol='diamond'
            ),
            textposition='top center'
        ),
        # Line connecting new text to closest chunk
        go.Scatter3d(
            x=[new_reduced_vector[0], reduced_vectors[closest_idx, 0]],
            y=[new_reduced_vector[1], reduced_vectors[closest_idx, 1]],
            z=[new_reduced_vector[2], reduced_vectors[closest_idx, 2]],
            mode='lines',
            line=dict(
                color='black',
                width=2,
                dash='dash'
            ),
            hoverinfo='none'
        )
    ])

    # Update layout for better readability
    fig.update_layout(
        scene=dict(
            xaxis_title="t-SNE Dimension 1",
            yaxis_title="t-SNE Dimension 2", 
            zaxis_title="t-SNE Dimension 3",
            camera=dict(
                up=dict(x=0, y=0, z=1),
                center=dict(x=0, y=0, z=0),
                eye=dict(x=1.5, y=1.5, z=1.5)
            )
        ),
        title={
            'text': "3D Visualization of Text Chunk Embeddings with Closest Chunk Highlight",
            'y':0.95,
            'x':0.5,
            'xanchor': 'center',
            'yanchor': 'top'
        },
        showlegend=False,
        margin=dict(l=0, r=0, t=30, b=0)
    )

    fig.show()
    print(f"Closest chunk to the new text: {closest_chunk}\nDistance: {distances[closest_idx]:.4f}")

# Example usage with a new text
new_text = "What is the role of self-attention in transformers?"
visualize_with_new_text(new_text)

Closest chunk to the new text: It introduced self-attention mechanisms that allow models to weigh the importance of different
Distance: 226.1416
