In [None]:
import os
from typing import Optional

# os.environ["OPEN_AI_KEY"] = "sk-..."

from utils import *

# 1. Python in 10 minutes

This interface allows you to run Python code interactively and view the results immediately, along with any visualizations or text explanations. Each block of code or text you see is contained in what we call a "cell."

## Basic Operations

- **Running a Cell**: You can run the code or render the markdown in a cell by selecting it and pressing `Shift + Enter`, or by clicking the "Run" button in the toolbar.
- **Adding New Cells**: Add a new cell by clicking the "+" button in the toolbar.
- **Cell Types**: Cells can be code cells or markdown cells. Switch the type using the dropdown in the toolbar.


In [None]:
# Simple Python Example

# Printing a message: print() writes its arguments to the cell's output
print("Hello, World!")

# Basic arithmetic: store the product in a variable, then print it.
# Passing several arguments to print() separates them with a space.
result = 7 * 6
print("7 multiplied by 6 is", result)

In [None]:
# Using Variables

# Store a value in a variable
a = 10

# Use the variable in a calculation (b becomes 20)
b = a * 2

# Print the result
print("The result of a multiplied by 2 is", b)

In [None]:
# Basic Data Structures

# List: an ordered collection of items
fruits = ["apple", "banana", "cherry"]
print("Fruits List:", fruits)

# Dictionary: key-value pairs (here each fruit name is a key and its price the value)
prices = {"apple": 0.40, "banana": 0.50, "cherry": 0.30}
print("Fruit Prices:", prices)

In [None]:
# Looping through a list
# `fruits` and `prices` come from the previous cell; every fruit in the list
# is also a key in `prices`, so the lookup below succeeds for each item.
for fruit in fruits:
    print(fruit, "costs", prices[fruit], "each")

# Conditional: if statement
# `in` checks whether the value is an element of the list
if "banana" in fruits:
    print("Yes, we have bananas!")

### Introduction to Functions

Functions are a way to organize your code into blocks that can be called multiple times throughout your program. They allow you to write cleaner, more modular code and make your scripts easier to maintain and debug. Functions in Python are defined using the `def` keyword.


In [None]:
# Defining a Simple Function


def greet(name):
    """Build the welcome message addressed to `name` and return it as a string."""
    message = f"Hello, {name}! Welcome to our notebook."
    return message


# Call the function, keep its return value, then show it
greeting = greet("Alice")
print(greeting)

In [None]:
# Function with Parameters and Return Value


def calculate_area(length, width):
    """Return the area of a rectangle: its length multiplied by its width."""
    return length * width


# Compute the area of a 10 x 5 rectangle and report it
rect_area = calculate_area(10, 5)
print("The area of the rectangle is:", rect_area)

### Leveraging Jupyter-AI for Code Generation

Jupyter-AI is an advanced feature integrated into Jupyter Notebooks that helps users write code more efficiently. It utilizes AI technology to suggest code snippets, complete code blocks, and even generate complex code structures.

#### How to Use Jupyter-AI to Write Code

1. **Initiating Code Suggestions**: Simply start typing your code or a description of the function you need in a code cell. Jupyter-AI will automatically suggest completions.
2. **Accepting Suggestions**: When a code suggestion appears, you can press `Tab` to accept it, instantly filling in the suggestion.
3. **Chat Interface**: You can also interact with Jupyter-AI using the chat interface on the left.


In [None]:
# try using the autocomplete functionality to write a function that adds two numbers
# Exercise: the body is intentionally left as `pass` (which returns None), so the
# assert below fails until the function is implemented.


def add_numbers(a: int, b: int) -> int:
    """Try having jupyter AI autocomplete this function."""
    pass


# Assert statements to check the correctness of the function
assert add_numbers(1, 2) == 3, "Function add_numbers does not work correctly!"
print("Function add_numbers works correctly!")

In [None]:
# We're not limited to simple functions. Here's a tricky function with a bug in it. Try pasting it into the chat bar on the left and asking the AI to fix it
# NOTE: the bug is intentional (see the docstring) - this cell is an exercise.


def factorial(n: int) -> int:
    """This function has a bug in it. Can you find and fix it with AI?"""
    if n == 0:
        return 1
    else:
        result = 1
        for i in range(n):
            result *= i
        return result


# Assert statements to check the correctness of the function
# (not all of these pass until the bug above is fixed)
assert factorial(0) == 1, "The factorial of 0 should be 1"
assert factorial(1) == 1, "The factorial of 1 should be 1"
assert factorial(5) == 120, "The factorial of 5 should be 120"

### Let's get started with the case study!


# High Level Architecture

The architecture of the system is as follows:

1. We chunk up the document into distinct “sections” and embed those sections.
2. Then, we embed the user query and find the most similar parts of the document.
3. We feed the original question, along with the context we found, to the LLM and receive an answer.


# 2. What exactly is an embedding?


In [None]:
from openai import OpenAI, NOT_GIVEN
import instructor
import plotly.graph_objects as go

#########################
### UTILITY FUNCTIONS ###
#########################

# instantiating the OpenAI client
# The API key is read from the OPEN_AI_KEY environment variable; os.getenv
# returns None when that variable is not set.
# NOTE(review): the OpenAI SDK's own default variable name is OPENAI_API_KEY -
# confirm the name used here is intentional.
client = instructor.patch(OpenAI(api_key=os.getenv("OPEN_AI_KEY")))
# NOTE(review): no visible cell reads this module-level value
# (get_many_embeddings uses its own batch size) - confirm it is still needed.
batch_size = 250
# Model name passed to every embeddings request below
embedding_model = "text-embedding-3-large"


# wrapper function around openai to directly return embedding of text
def get_embedding(text: str | list[str], dimensions: int = NOT_GIVEN) -> list[float]:
    """Get the embedding of the input text.

    Args:
        text: The text to embed. A list is accepted only when it holds exactly
            one string, because this function returns a single vector (the
            first item of the response). Use `get_many_embeddings` to embed
            several texts at once.
        dimensions: Optional size of the returned vector, at most 3072. When
            omitted, `NOT_GIVEN` is forwarded to the client unchanged.

    Returns:
        The embedding of `text` as a list of floats.

    Raises:
        ValueError: If `text` is a list that does not contain exactly one item.
            Previously every embedding after the first was silently discarded.
        AssertionError: If `dimensions` is greater than 3072.
    """
    if isinstance(text, list) and len(text) != 1:
        raise ValueError(
            "get_embedding returns a single vector and expects exactly one text, "
            f"but received a list of {len(text)}. "
            "Use get_many_embeddings to embed multiple texts."
        )

    # Falsy values (including the NOT_GIVEN default) skip the size check
    if dimensions:
        assert dimensions <= 3072, "The maximum number of dimensions is 3072."

    response = client.embeddings.create(
        input=text, model=embedding_model, dimensions=dimensions
    )
    return response.data[0].embedding


def get_many_embeddings(texts: list[str], batch_size: int = 250) -> list[list[float]]:
    """Get the embeddings of multiple texts.

    The texts are sent to the embeddings endpoint in consecutive slices of at
    most `batch_size` items, and the per-batch results are concatenated.

    Args:
        texts: Texts to embed. An empty list returns an empty list without
            making any request.
        batch_size: Maximum number of texts sent per request. Defaults to 250,
            the value this function previously hard-coded.

    Returns:
        The embeddings from every batch, concatenated in batch order. This
        lines up one-to-one with `texts` provided the API returns each batch's
        results in input order (presumably it does - verify if order matters).

    Raises:
        ValueError: If `batch_size` is less than 1.
    """
    # A zero step makes range() raise and a negative step silently yields no
    # batches, so reject both up front with a clear message.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer.")

    res = []
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start : start + batch_size]
        api_resp = client.embeddings.create(input=batch_texts, model=embedding_model)
        res.extend(val.embedding for val in api_resp.data)

    return res


# simple utility function to add a vector to a 3D plot
def add_vector_to_graph(
    fig: go.Figure, vector: list[float], color: str = "red", name: Optional[str] = None
) -> go.Figure:
    """Draw `vector` on `fig` as an arrow that starts at the origin.

    Two traces are appended: a line from (0, 0, 0) to the vector's end point,
    and a cone positioned at that end point whose u/v/w components are the
    vector itself.

    Args:
        fig: Figure to draw on. It is modified in place.
        vector: The three components (x, y, z) of the vector.
        color: Color used for both the line and the cone.
        name: Name given to the line trace.

    Returns:
        The same `fig` object that was passed in.

    Raises:
        AssertionError: If `vector` does not have exactly three components.
    """
    # Ensure vector has exactly three components
    assert len(vector) == 3, "Vector must have exactly 3 components."

    tip_x, tip_y, tip_z = vector

    # Shaft of the arrow: a straight line from the origin to the tip
    shaft = go.Scatter3d(
        x=[0, tip_x],
        y=[0, tip_y],
        z=[0, tip_z],
        mode="lines",
        line={"color": color, "width": 5},
        name=name,
    )
    fig.add_trace(shaft)

    # Head of the arrow: a single-color cone at the tip
    head = go.Cone(
        x=[tip_x],
        y=[tip_y],
        z=[tip_z],
        u=[tip_x],
        v=[tip_y],
        w=[tip_z],
        sizemode="scaled",
        sizeref=0.1,
        showscale=False,
        colorscale=[[0, color], [1, color]],
        hoverinfo="none",
    )
    fig.add_trace(head)

    return fig


def create_new_graph() -> go.Figure:
    """Build an empty 3D plotly figure with a fixed camera and an origin marker.

    Returns:
        A new figure whose scene uses the "cube" aspect mode and an explicit
        camera (eye, up, and center), plus one black marker trace named
        "Origin" at (0, 0, 0).
    """
    fig = go.Figure()

    # Fix the viewpoint so the plot isn't rotated
    camera = {
        "eye": {"x": 1.5, "y": 1.5, "z": 0.5},  # camera position
        "up": {"x": 0, "y": 0, "z": 1},  # the z-axis is "up"
        "center": {"x": 0, "y": 0, "z": 0},  # focus the camera on the origin
    }
    fig.update_layout(scene={"camera": camera, "aspectmode": "cube"})

    # Mark the origin with a dot
    origin_marker = go.Scatter3d(
        x=[0],
        y=[0],
        z=[0],
        mode="markers",
        marker={"size": 6, "color": "black", "symbol": "circle"},
        name="Origin",
    )
    fig.add_trace(origin_marker)

    return fig

#### Let's get started

For the purposes of this notebook, we're going to use OpenAI's models.

In [None]:
# let's try using the get_embedding function
# No `dimensions` argument is passed, so the request does not ask for a reduced size.
result = get_embedding("Hello, World!")
print(result)

That's a lot of numbers! OpenAI's embedding models support built-in dimensionality reduction — let's try using that and visualizing the result.


In [None]:
graph = create_new_graph()

text = "Atlanta"

# Get the embedding of the text
# dimensions=3 because add_vector_to_graph requires exactly three components
vector = get_embedding(text, dimensions=3)

# Add the vector to the plot (the text doubles as the trace name)
add_vector_to_graph(graph, vector, name=text)

# Show the plot
graph.show()

Let's try plotting a couple of vectors at once to see if we can spot any patterns.

In [None]:
graph = create_new_graph()

# (text to embed, color of its arrow), embedded and plotted in this order
texts_and_colors = [
    ("Atlanta", "purple"),
    ("Georgia, USA", "blue"),
    ("Skiing in japan", "red"),
]

vectors_by_text = {}
for text, color in texts_and_colors:
    vectors_by_text[text] = get_embedding(text, dimensions=3)
    add_vector_to_graph(graph, vectors_by_text[text], name=text, color=color)

# Keep the individual names: the cosine-similarity cell compares these vectors
atlanta_vector = vectors_by_text["Atlanta"]
georgia_vector = vectors_by_text["Georgia, USA"]
ski_vector = vectors_by_text["Skiing in japan"]

# Show the plot
graph.show()

How can we quantify the similarity between two vectors? One common way is to use the cosine similarity. The cosine similarity between two vectors is the cosine of the angle between them. It ranges from -1 (opposite directions) to 1 (same direction), with 0 indicating orthogonality.

In [None]:
import numpy as np


def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Return the cosine of the angle between vectors `a` and `b`.

    Computed as dot(a, b) / (|a| * |b|): 1 means the same direction, 0 means
    orthogonal, and -1 means opposite directions.

    Args:
        a: First vector.
        b: Second vector, with the same number of components as `a`.

    Returns:
        The similarity as a plain Python float. If either vector has zero
        length the cosine is undefined; 0.0 is returned in that case instead
        of the nan (and divide-by-zero warning) the bare formula produces, so
        callers can still sort by similarity.

    Raises:
        ValueError: Propagated from `np.dot` when the vectors' lengths differ.
    """
    # Compute the dot product first so mismatched lengths still raise
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        return 0.0
    return float(dot_product / norm_product)


# We can use the cosine similarity to compare the similarity between two vectors
# (atlanta_vector, georgia_vector and ski_vector were created in the plotting cell above)
similarity = cosine_similarity(atlanta_vector, georgia_vector)
print(f"The similarity between 'Atlanta' and 'Georgia, USA' is {similarity:.2f}")

# `similarity` is reused: after this line it holds the Atlanta/skiing score
similarity = cosine_similarity(atlanta_vector, ski_vector)
print(f"The similarity between 'Atlanta' and 'Skiing in Japan' is {similarity:.2f}")

### Advanced Challenges (Optional)

#### 1. Sentence Embeddings

How does adding words to a sentence affect the embedding vector? Try creating a for loop that adds a word to the text and plots the resulting embedding vector.


In [None]:
# Try it out (implemented solutions can be found in the solutions.ipynb notebook)

color_scale = [
    "#E6E6FA",  # Lavender
    "#D8BFD8",  # Thistle
    "#DDA0DD",  # Plum
    "#DA70D6",  # Orchid
    "#BA55D3",  # Medium Orchid
    "#9932CC",  # Dark Orchid
    "#9400D3",  # Dark Violet
    "#8A2BE2",  # Blue Violet
    "#800080",  # Purple
    "#4B0082"   # Indigo
] # each one is progressively darker

# choose a sentence (at most 10 words, since color_scale has 10 colors)

# split the sentence into words

# iterate over the words, hint: use enumerate

# create a sub_sentence with the first ix + 1 words ->  " ".join(words[: ix + 1])
# (enumerate starts at 0, so words[:ix] would be an empty string on the first pass)

# get the embedding of the sub_sentence (use dimensions=3 so it can be plotted)

# add the vector to the graph, set the color = color_scale[ix]

# visualize!

# 3. Parsing Documents

Large language models are currently primarily optimized for working with text. As a result, when dealing with documents like PDFs, we first need to convert them into a text format before we can feed them into the model.

We maintain a popular open source library for doing this called [openparse](https://github.com/Filimoa/open-parse/). It is simple and easy to use.


In [None]:
import openparse

#########################
### UTILITY FUNCTIONS ###
#########################


class VectorDatabase:
    """
    A simple in-memory database to store nodes along with their vectors and perform similarity search.

    Nodes are kept in insertion order in the public `nodes` list. Similarity
    search is a linear scan that scores every stored node against the query.
    """

    def __init__(self):
        self.nodes = []

    def add_node(self, node: openparse.Node) -> None:
        """Add a node along with its vector to the database.

        A node whose text equals the text of an already stored node is skipped
        (with a printed notice), so adding the same nodes twice is a no-op.

        Raises:
            AssertionError: If `node.embedding` is None.
        """
        assert node.embedding is not None, "Node must have an embedding."

        # Duplicates are detected by text, not by id, and the notice says so
        if any(existing.text == node.text for existing in self.nodes):
            print(
                f"Node {node.node_id} has the same text as an existing node. Skipping"
            )
            return

        self.nodes.append(node)

    def find_node(self, node_id: str) -> Optional[openparse.Node]:
        """Retrieve a node by its ID, or None when no stored node has that ID."""
        return next((node for node in self.nodes if node.node_id == node_id), None)

    def find_similar_node(
        self, input_vector: list[float], top_k: int = 3
    ) -> list[openparse.Node]:
        """Find the top_k nodes with the highest cosine similarity to the input_vector.

        Scoring uses the module-level `cosine_similarity` function, which must
        already be defined when this method is called.

        Args:
            input_vector: Query embedding to compare every stored node against.
            top_k: Maximum number of nodes to return. A value larger than the
                number of stored nodes returns all of them; a value below 1
                returns an empty list.

        Returns:
            Up to `top_k` nodes, ordered from most to least similar.

        Raises:
            AssertionError: If the database holds no nodes.
        """
        assert self.nodes, "Database is empty. Please add nodes first."

        # Clamp instead of failing, so asking for more results than exist
        # (e.g. for a short document) returns what is available, and a
        # negative value cannot slice from the end of the list.
        limit = min(max(top_k, 0), len(self.nodes))

        scored = [
            (node, cosine_similarity(input_vector, node.embedding))
            for node in self.nodes
        ]
        scored.sort(key=lambda pair: pair[1], reverse=True)

        return [node for node, _ in scored[:limit]]

    @property
    def num_nodes(self) -> int:
        """Return the number of nodes in the database."""
        return len(self.nodes)

    def delete_all_nodes(self) -> None:
        """Delete all nodes from the database."""
        self.nodes = []


# Shared instance used by the rest of the notebook
db = VectorDatabase()

In [None]:
import openparse

# The same file is used twice: `pdf` is passed to display_similar_nodes later
# on, while `parsed_doc` holds the parsed nodes that get embedded.
doc_path = "./docs/portland-site-assessment-phase-1.pdf"
pdf = openparse.Pdf(doc_path)
parser = openparse.DocumentParser()
parsed_doc = parser.parse(doc_path)

Nodes are parsed as markdown, so formatting such as bold text is kept. This helps LLMs understand the structure of the document. Let's try looking at one of the parsed nodes.

In [None]:
# The slice [10:11] selects a single node (index 10); widen it to preview more
for node in parsed_doc.nodes[10:11]:
    display(node)
    print("====================================")

In [None]:
# let's embed all the nodes and add to the database
raw_node_texts = [node.text for node in parsed_doc.nodes]
embeddings = get_many_embeddings(raw_node_texts)

for node, embedding in zip(parsed_doc.nodes, embeddings):
    node.embedding = embedding
    db.add_node(node)

print("=== Database now has ", db.num_nodes, " nodes ===")

# 4. Querying

In [None]:
#########################
### UTILITY FUNCTIONS ###
#########################

from IPython.display import Markdown


def get_completion(prompt: str) -> Markdown:
    """
    OpenAI returns a complex object, this is a simple wrapper function to directly return the completion text.

    Sends `prompt` as a single user message to the "gpt-4-turbo" model, prints
    the total token usage together with an estimated cost, and returns the
    first choice's message content.

    Args:
        prompt: The full text of the user message.

    Returns:
        The reply text wrapped in an IPython `Markdown` object, so it renders
        as formatted text when it is the last expression of a cell.
    """
    # USD per token. Prompt and completion tokens are priced separately; the
    # previous estimate charged every token at the prompt rate.
    # NOTE(review): $10 / 1M input and $30 / 1M output are the gpt-4-turbo list
    # prices as far as known - confirm against the current pricing page.
    input_price_per_token = 10 / 1_000_000
    output_price_per_token = 30 / 1_000_000

    completion = client.chat.completions.create(
        model="gpt-4-turbo",
        messages=[
            {
                "role": "user",
                "content": prompt,
            },
        ],
    )

    usage = completion.usage
    cost_dollars = (
        usage.prompt_tokens * input_price_per_token
        + usage.completion_tokens * output_price_per_token
    )

    print(
        f"Completion used {usage.total_tokens} tokens costing ${cost_dollars:.2f}"
    )

    return Markdown(completion.choices[0].message.content)


def display_similar_nodes(
    similar_nodes: list[openparse.Node], query_vector: list[float], pdf: openparse.Pdf
) -> None:
    """Show where `similar_nodes` sit in the PDF, labelled with their similarity.

    Each node is scored against `query_vector` with `cosine_similarity`, and
    the nodes, their pages, and the scores (rounded to 3 decimals) are handed
    to `pdf.display_with_bboxes`.

    Args:
        similar_nodes: Nodes to show; each must carry an embedding.
        query_vector: Embedding of the query the nodes were retrieved for.
        pdf: The PDF object the nodes were parsed from.
    """
    # One annotation per node, in the same order as `similar_nodes`
    annotations = [
        round(cosine_similarity(query_vector, node.embedding), 3)
        for node in similar_nodes
    ]

    # Every page on which a node starts or ends, without repeats
    page_nums = {
        page for node in similar_nodes for page in (node.start_page, node.end_page)
    }

    pdf.display_with_bboxes(similar_nodes, page_nums=page_nums, annotations=annotations)

In [None]:
# let's try asking one of our original questions

question = "Is there lead contamination into the groundwater?"

# Get the embedding of the text
# (no `dimensions` argument here, unlike the 3-component vectors used for plotting)
query_vector = get_embedding(question)

# find the 5 most similar nodes, ordered from most to least similar
similar_nodes = db.find_similar_node(query_vector, top_k=5)

# Report the page and similarity score of each retrieved node
for node in similar_nodes:
    sim = cosine_similarity(query_vector, node.embedding)
    print(
        f"Found similar node on page {node.start_page} with a similarity of {sim:.2f}"
    )

In [None]:
# create a single string of all the similar nodes
# (node texts are separated by a blank line, in the order they were retrieved)
context = "\n\n".join([node.text for node in similar_nodes])

print(context)

## 4.1 Creating Prompts 

In [None]:
# you can add a variable to a string by using format:
# `{name}` in the template is a placeholder that .format(name=...) fills in

name = "sergey"

template = """{name} is from Denver"""

# The template itself is left unchanged; format() returns a new string
print("Without calling format: ", template)
print("After calling format: ", template.format(name=name))

In [None]:
# Template with two placeholders: the user's question and the retrieved context
prompt_template = """
Using the document provided, answer the following question:

question: {question}

context: {context}
"""

# Fill the placeholders with the question and context built in the cells above
prompt = prompt_template.format(question=question, context=context)

### Ok let's try running a completion

In [None]:
# Same template and prompt as the previous cell, rebuilt here before the
# completion is requested
prompt_template = """
Using the document provided, answer the following question:

question: {question}

context: {context}
"""

prompt = prompt_template.format(question=question, context=context)

completion = get_completion(prompt)

print("Original Question:", question)
# Last expression of the cell: the Markdown answer is rendered as the cell output
completion

### We can display citations showing users exactly where we got our answer

In [None]:
# Show the retrieved nodes on their PDF pages, annotated with similarity scores
display_similar_nodes(similar_nodes, query_vector, pdf)

## 5. Structured Output

In [None]:
from pydantic import BaseModel

# Shape of the structured answer requested from the model: a single boolean.
# (Documented with comments rather than a docstring, to leave the model's
# schema exactly as it is.)
class ContainsHazards(BaseModel):
    contains_lead: bool


# `client` was wrapped with instructor.patch, which is what accepts the extra
# `response_model` argument; `response` is then a ContainsHazards instance
# (its `contains_lead` attribute is read in the next cell).
# `prompt` is the one built for the groundwater question above.
response = client.chat.completions.create(
    model="gpt-4-turbo",
    messages=[
        {
            "role": "user",
            "content": prompt,
        },
    ],
    response_model=ContainsHazards,
)

# NOTE(review): `.dict()` is deprecated in Pydantic v2 in favour of
# `.model_dump()` - confirm the installed Pydantic version before switching.
print(response.dict())

In [None]:
# Follow-up question, asked only when the structured answer reported lead.
# NOTE(review): `mitigations_performed` is defined only inside this branch; if
# `response.contains_lead` is False, the following cells that read it raise
# NameError.
if response.contains_lead:
    question = "What mitigations have been performed up to this point to deal with the lead exposure?"

    # Same retrieval steps as before: embed the question, fetch the closest
    # nodes, join their text into a context, and fill the prompt template
    query_vector = get_embedding(question)

    similar_nodes = db.find_similar_node(query_vector, top_k=5)

    context = "\n\n".join([node.text for node in similar_nodes])

    prompt = prompt_template.format(question=question, context=context)

    mitigations_performed = get_completion(prompt)

In [None]:
# Render the answer from the previous cell (defined only if its `if` branch ran)
mitigations_performed

In [None]:
# Build the next question from the previous answer.
# `mitigations_performed` is the Markdown object returned by get_completion;
# `.data` is presumably its raw text (it is concatenated onto a str here).
question = (
    "Why were the following mitigations to remove the lead from the property ineffective "
    + mitigations_performed.data
)

query_vector = get_embedding(question)

# this is a more complex question, let's expand the search to top 9 nodes
similar_nodes = db.find_similar_node(query_vector, top_k=9)

context = "\n\n".join([node.text for node in similar_nodes])

prompt = prompt_template.format(question=question, context=context)

failure_reasons = get_completion(prompt)

In [None]:
# Render the answer to the follow-up question from the previous cell
failure_reasons

## 6. Limitations to RAG

What are some limitations of this approach? Let's discuss with the class.

### Advanced Challenges (Optional)

#### 1. Let's pass the entire document to ChatGPT and see if we get a different answer

In [None]:
# Try combining all the nodes into one string,
# Hint: You can iterate across the original document nodes by using `for node in parsed_doc.nodes:`. Each Node has a `node.text` attribute!

# Create a prompt the same way we created one earlier except now pass the full document string

# Request a completion