In [32]:
!pip install openai

import openai
from google.colab import userdata

# Fetch the API key from Colab's secrets
api_key = userdata.get('OPENAI_API_KEY')

# Initialize the OpenAI client
client = openai.OpenAI(api_key=api_key)



# 1.Upload the PDF

In [1]:
from google.colab import files
import io
uploaded = files.upload()

Saving 2025-01-18-pdf-1-TechAI-Goolge-whitepaper_Prompt Engineering_v4-af36dcc7a49bb7269a58b1c9b89a8ae1.pdf to 2025-01-18-pdf-1-TechAI-Goolge-whitepaper_Prompt Engineering_v4-af36dcc7a49bb7269a58b1c9b89a8ae1.pdf


# 2. Extract the text from the pdf

In [2]:
!pip install pdfminer.six

Collecting pdfminer.six
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Downloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m45.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pdfminer.six
Successfully installed pdfminer.six-20250506


In [None]:
from pdfminer.high_level import extract_text

# Assuming the uploaded PDF is the first (and only) file in the 'uploaded' dictionary
pdf_filename = list(uploaded.keys())[0]

# Extract text from the PDF
raw_text = extract_text(pdf_filename)

# Display the first 500 characters of the extracted text
print(raw_text[:500])

# 3. Chunk the text

## 1. Hamalski way

In [6]:
def chunk_text(text, chunk_size, overlap):
    """Chunks text into smaller pieces with overlap.

    Args:
        text (str): The input text.
        chunk_size (int): The desired size of each chunk.
        overlap (int): The number of characters to overlap between chunks.

    Returns:
        list: A list of text chunks.
    """
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start += chunk_size - overlap
    return chunks

# Example usage (replace with your desired chunk_size and overlap)
chunk_size = 500 # 400 text # 100 next chunk
overlap = 100
text_chunks = chunk_text(raw_text, chunk_size, overlap)
print(f"Created {len(text_chunks)} chunks.")
print("First chunk:", text_chunks[0][:200]) # Print first 200 characters of the first chunk

Created 203 chunks.
First chunk: Prompt  
Engineering

Author: Lee Boonstra

Acknowledgements

Reviewers and Contributors

Michael Sherman

Yuan Cao

Erick Armbrust

Anant Nawalgaria

Antonio Gulli

Simone Cammel

Curators and Edito


## 2. Puctual Chunking

In [8]:
import re

def punctual_chunking(text, chunk_size=500, overlap=100):
    """Chunks text into smaller pieces based on punctuation with overlap.

    Args:
        text (str): The input text.
        chunk_size (int): The approximate desired size of each chunk.
        overlap (int): The number of characters to overlap between chunks.

    Returns:
        list: A list of text chunks.
    """
    # Split the text by punctuation that typically ends sentences or paragraphs
    split_points = r'(?<=[.!?;\n])\s*' # Split after punctuation followed by optional whitespace
    sentences = re.split(split_points, text)

    chunks = []
    current_chunk = ""
    i = 0
    while i < len(sentences):
        sentence = sentences[i]

        if len(current_chunk) + len(sentence) < chunk_size:
            current_chunk += sentence
            i += 1
        else:
            if current_chunk: # Add the current chunk if it's not empty
                chunks.append(current_chunk.strip())
                # Start the next chunk with overlap
                overlap_start = max(0, len(current_chunk) - overlap)
                current_chunk = current_chunk[overlap_start:] + sentence
                i += 1
            else: # If current_chunk is empty, the sentence is larger than chunk_size
                 # In this case, just add the sentence as a chunk (or part of it)
                 chunks.append(sentence[:chunk_size].strip())
                 sentences[i] = sentence[chunk_size:] # Keep the rest of the sentence for the next iteration
                 if not sentences[i]: # If the rest is empty, move to the next sentence
                     i += 1


    # Add the last chunk if it's not empty
    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

# Example usage (replace raw_text with your variable containing the extracted text)
punctual_chunks = punctual_chunking(raw_text, chunk_size=500, overlap=100)
print(f"Created {len(punctual_chunks)} chunks using punctual chunking.")
print("First chunk:", punctual_chunks[0][:200])

Created 213 chunks using punctual chunking.
First chunk: Prompt  
Engineering
Author: Lee Boonstra
Acknowledgements
Reviewers and Contributors
Michael Sherman
Yuan Cao
Erick Armbrust
Anant Nawalgaria
Antonio Gulli
Simone Cammel
Curators and Editors
Antonio 


## 3. AI Chunking

In [15]:
def ai_chunking(text, client, model="gpt-4o-mini", max_input_chars=5000):
    """Chunks text using an AI model to preserve context.

    Args:
        text (str): The input text (up to max_input_chars).
        client: The initialized OpenAI client.
        model (str): The OpenAI model to use for chunking.
        max_input_chars (int): The maximum number of characters to send to the model.

    Returns:
        list: A list of text chunks generated by the AI.
    """
    if len(text) > max_input_chars:
        text = text[:max_input_chars]
        print(f"Warning: Input text truncated to {max_input_chars} characters for AI processing.")

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant that chunks text while preserving context. Return the chunks as a Python list of strings in JSON format. The chunks will be used for RAG. Kepp in mind that overlap is also needed"},
                {"role": "user", "content": f"Please chunk the following text into meaningful sections, ensuring context is preserved. Provide the output as a JSON object with a key 'chunks' containing a Python list of strings:\n\n{text}"}
            ],
            response_format={"type": "json_object"}
        )
        # Assuming the model returns a JSON object with a key like "chunks" containing the list
        # You might need to inspect the model's output format and adjust accordingly.
        ai_generated_chunks = json.loads(response.choices[0].message.content).get("chunks", [])
        return ai_generated_chunks
    except Exception as e:
        print(f"An error occurred during AI chunking: {e}")
        return []

import json

# Example usage (assuming 'client' is your initialized OpenAI client and 'raw_text' is your extracted text)
# Note: Sending a large amount of text to the API will incur costs.
ai_chunks = ai_chunking(raw_text, client, max_input_chars=5000)
print(f"Created {len(ai_chunks)} chunks using AI chunking.")
if ai_chunks:
  print("First AI chunk:", ai_chunks[0][:200])

# You can then combine these chunks with the other chunks if needed:
# all_chunks = text_chunks + punctual_chunks + ai_chunks

Created 12 chunks using AI chunking.
First AI chunk: Prompt Engineering


In [None]:
# Get the number of chunks to print from the user
num_chunks_to_print = int(input("Enter the number of chunks to print: "))

# Print the first n chunks
print(f"\nFirst {num_chunks_to_print} AI chunks:")
for i, chunk in enumerate(ai_chunks[:num_chunks_to_print]):
    print(f"Chunk {i+1}:\n{chunk}\n---")

# 4. Gerate embeddings

In [20]:
import openai
import time

def generate_embeddings(chunks, client, model="text-embedding-ada-002"):
    """Generates embeddings for a list of text chunks using OpenAI.

    Args:
        chunks (list): A list of text chunks (strings).
        client: The initialized OpenAI client.
        model (str): The OpenAI embedding model to use.

    Returns:
        dict: A dictionary mapping each chunk to its embedding vector,
              or None if an error occurs.
    """
    chunk_embeddings = {}
    try:
        # OpenAI's embeddings endpoint can take a list of inputs
        response = client.embeddings.create(
            input=chunks,
            model=model
        )
        # Assuming the response structure contains 'data' which is a list of embedding objects
        for i, chunk in enumerate(chunks):
            if i < len(response.data):
                chunk_embeddings[chunk] = response.data[i].embedding
            else:
                print(f"Warning: No embedding returned for chunk {i+1}.")

        return chunk_embeddings
    except Exception as e:
        print(f"An error occurred during embedding generation: {e}")
        return None

# Example usage (assuming 'client' is your initialized OpenAI client and 'punctual_chunks' is your list of chunks)
# Note: Generating embeddings will incur costs.
embeddings = generate_embeddings(punctual_chunks, client)

if embeddings:
   print(f"Generated embeddings for {len(embeddings)} chunks.")
   # Example of accessing an embedding:
   first_chunk = list(embeddings.keys())[0]
   print("\nFirst Chunk:")
   print(first_chunk)
   print("\nEmbedding for the first chunk (first 10 elements):")
   print(embeddings[first_chunk][:10]) # Print first 10 elements of the embedding

Generated embeddings for 213 chunks.

First Chunk:
Prompt  
Engineering
Author: Lee Boonstra
Acknowledgements
Reviewers and Contributors
Michael Sherman
Yuan Cao
Erick Armbrust
Anant Nawalgaria
Antonio Gulli
Simone Cammel
Curators and Editors
Antonio Gulli
Anant Nawalgaria
Grace Mollison 
Technical Writer
Joey Haymaker
Designer
Michael Lanning 
2
Prompt EngineeringSeptember 2024Table of contents
Introduction 
Prompt engineering 
LLM output configuration 
Output length 
Sampling controls 
Temperature 
Top-K and top-P 
Putting	it	all	together

Embedding for the first chunk (first 10 elements):
[0.014929180964827538, 0.007073588203638792, -0.009895914234220982, -0.023090466856956482, -0.00027459030388854444, 0.02422792837023735, -0.010251371189951897, -0.005331850610673428, -0.04689184948801994, -0.03480631858110428]


# 5. Store the embeddings in Chroma DB

## 1. Set up Chroma DB

In [None]:
!pip install chromadb

In [30]:
import chromadb

# Create a Chroma DB client
client_db = chromadb.Client()

# Create a collection (or get an existing one)
collection = client_db.get_or_create_collection(name="my_document_embeddings")

print(f"Chroma DB client created and collection '{collection.name}' is ready.")

Chroma DB client created and collection 'my_document_embeddings' is ready.


## 2. Store the embeddings and the chunks

In [31]:
# Store embeddings in Chroma DB

# Prepare data for Chroma DB
ids = [f"chunk_{i}" for i in range(len(embeddings))]
documents = list(embeddings.keys())
embedding_vectors = list(embeddings.values())

# Add to the collection
collection.add(
    embeddings=embedding_vectors,
    documents=documents,
    ids=ids
)

print(f"Added {len(documents)} documents and embeddings to the collection.")
print(f"Collection count: {collection.count()}")

Added 213 documents and embeddings to the collection.
Collection count: 213


In [28]:
# Retrieve the first 2 items from the collection
retrieved_items = collection.get(
    ids=[f"chunk_{i}" for i in range(2)], # Assuming IDs are sequential as created
    include=['embeddings', 'documents']
)

# Print the structured output
if retrieved_items and retrieved_items['ids']:
    print("Example of the first 2 stored entries in Chroma DB:")
    for i in range(len(retrieved_items['ids'])):
        print(f"\n--- Entry {i+1} ---")
        print(f"ID: {retrieved_items['ids'][i]}")
        print(f"Document (Chunk):")
        print(retrieved_items['documents'][i][:500] + "...") # Print first 200 chars of document
        print(f"Embedding (first 10 elements):")
        print(retrieved_items['embeddings'][i][:10])
else:
    print("Could not retrieve items from the collection.")

Example of the first 2 stored entries in Chroma DB:

--- Entry 1 ---
ID: chunk_0
Document (Chunk):
Prompt  
Engineering
Author: Lee Boonstra
Acknowledgements
Reviewers and Contributors
Michael Sherman
Yuan Cao
Erick Armbrust
Anant Nawalgaria
Antonio Gulli
Simone Cammel
Curators and Editors
Antonio Gulli
Anant Nawalgaria
Grace Mollison 
Technical Writer
Joey Haymaker
Designer
Michael Lanning 
2
Prompt EngineeringSeptember 2024Table of contents
Introduction 
Prompt engineering 
LLM output configuration 
Output length 
Sampling controls 
Temperature 
Top-K and top-P 
Putting	it	all	together...
Embedding (first 10 elements):
[ 0.01492918  0.00707359 -0.00989591 -0.02309047 -0.00027459  0.02422793
 -0.01025137 -0.00533185 -0.04689185 -0.03480632]

--- Entry 2 ---
ID: chunk_1
Document (Chunk):
iguration 
Output length 
Sampling controls 
Temperature 
Top-K and top-P 
Putting	it	all	together 
Prompting techniques 
General prompting / zero shot 
One-shot & few-shot 
System, contextual and rol

# 7. Vector Search functionality

## 1. Covert the query into embeddings

In [49]:
import openai

def generate_query_embedding(query_text, client, model="text-embedding-ada-002"):
    """Generates an embedding for a text query using OpenAI.

    Args:
        query_text (str): The input query text.
        client: The initialized OpenAI client.
        model (str): The OpenAI embedding model to use.

    Returns:
        list: The embedding vector for the query text, or None if an error occurs.
    """
    try:
        response = client.embeddings.create(
            input=[query_text], # OpenAI expects a list of inputs
            model=model
        )
        # Assuming the response structure contains 'data' which is a list of embedding objects
        if response.data and len(response.data) > 0:
            return response.data[0].embedding
        else:
            print("Warning: No embedding returned for the query.")
            return None
    except Exception as e:
        print(f"An error occurred during query embedding generation: {e}")
        return None

# Example usage (assuming 'client' is your initialized OpenAI client)
query = "What is prompt engineering?"
query_embedding = generate_query_embedding(query, client)

if query_embedding:
   print(f"Generated embedding for the query (first 10 elements):")
   print(query_embedding[:10])

Generated embedding for the query (first 10 elements):
[-0.013495800085365772, -0.0033809461165219545, -0.011480874381959438, -0.011949624866247177, 0.004152284469455481, 0.007041743025183678, -0.00880130473524332, -0.007870800793170929, -0.021996265277266502, -0.024514921009540558]


## 2. Vector search

In [68]:
def vector_search(query_embedding, collection, n_results=5, distance_threshold=None):
    """Performs a vector search in Chroma DB based on a text query embedding with an optional distance threshold.

    Args:
        query_embedding (list): The embedding vector for the query.
        collection: The Chroma DB collection object.
        n_results (int): The number of similar results to retrieve.
        distance_threshold (float, optional): The maximum distance for retrieved results.
                                              Results with a distance greater than this will be excluded.
                                              Lower values mean higher similarity.

    Returns:
        tuple: A tuple containing two lists: the list of most relevant document chunks
               and the list of their corresponding distances, or None if an error occurs.
    """
    if query_embedding is None:
        print("Error: Query embedding is None.")
        return None, None

    try:
        # Build the where clause for distance filtering if a threshold is provided
        where_clause = {}
        if distance_threshold is not None:
             where_clause = {"distance": {"$lt": distance_threshold}}


        # Perform the vector search in Chroma DB
        results = collection.query(
            query_embeddings=[query_embedding],
            n_results=n_results,
            include=['documents', 'distances'], # Include documents and distances in the results
            where=where_clause # Apply the distance filter
        )

        # The results are returned as a dictionary, extract the documents and distances
        # Note: The structure is a bit nested, results['documents'][0] and results['distances'][0] are lists for the first query
        if results and results.get('documents') and results['documents'][0]:
            # Return the lists of document chunks and distances
            return results['documents'][0], results['distances'][0]
        else:
            print("No results found for the query.")
            return [], []

    except Exception as e:
        print(f"An error occurred during vector search: {e}")
        return None, None

# Example usage (assuming 'client' is your initialized OpenAI client, 'collection' is your Chroma DB collection and query_embedding is already generated)
# query_text = "What is prompt engineering?"
# query_embedding = generate_query_embedding(query_text, client) # Generate embedding separately

# if query_embedding:
#     search_results, distances = vector_search(query_embedding, collection, n_results=3, distance_threshold=0.2) # Example with a threshold

#     if search_results:
#         print(f"\nTop {len(search_results)} most relevant chunks for the query '{query_text}':")
#         for i, chunk in enumerate(search_results):
#             print(f"Result {i+1} (Distance: {distances[i]:.4f}):\n{chunk}\n---")

## 3. AI Response

In [54]:
import openai

def generate_ai_response(query_text, search_results, client, model="gpt-4o-mini"):
    """Generates an AI response based on a query and retrieved document chunks.

    Args:
        query_text (str): The original user query.
        search_results (list): A list of relevant document chunks retrieved from the vector database.
        client: The initialized OpenAI client.
        model (str): The OpenAI model to use for response generation.

    Returns:
        str: The AI-generated response, or None if an error occurs.
    """
    if not search_results:
        return "Could not find relevant information to answer the query."

    # Combine the retrieved chunks into a single context string
    context = "\n\n".join(search_results)

    try:
        response = client.chat.completions.create(
            model=model,
            temperature=0.1,
            messages=[
                {"role": "system", "content": "You are a helpful assistant that answers questions based on the provided context. You should use only the information from the provided context. If the information is avaliable in the context the say: I dont know"},
                {"role": "user", "content": f"Based on the following context, answer the query:\n\nContext:\n{context}\n\nQuery: {query_text}"}
            ]
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"An error occurred during AI response generation: {e}")
        return None

# Example usage (assuming 'client' is your initialized OpenAI client, 'search_results' are from the vector search)
# query = "What is prompt engineering?"
# search_results = vector_search(query, client, collection, n_results=3) # Get search results first

# if search_results:
#     ai_response = generate_ai_response(query, search_results, client)
#     if ai_response:
#         print("\nAI Response:")
#         print(ai_response)
# else:
#     print("No search results to generate a response.")

# 7. Ask questions

In [69]:
# Step 1: Accept a user question
query = input("Please enter your question about the document: ")
print(f"\nUser Query: {query}")

# Step 2: Generate embedding for the query
query_embedding = generate_query_embedding(query, client)

if query_embedding:
    print(f"\nQuery Embedding (first 10 elements): {query_embedding[:10]}...")

    # Step 3: Perform vector search in Chroma DB
    # You can adjust n_results and distance_threshold as needed
    search_results, distances = vector_search(query_embedding, collection, n_results=5, distance_threshold=0.4)

    if search_results:
        print(f"\nVector Search Results ({len(search_results)} chunks found):") # Print the number of chunks found
        # Print the first search result and its distance
        print(f"Result 1 (Distance: {distances[0]:.4f}):\n{search_results[0][:200]}...") # Print first 200 characters of the first result
        print(f"Result 1 (Distance: {distances[1]:.4f}):\n{search_results[1][:200]}...") # Print first 200 characters of the first result
        # Step 4: Generate AI response based on search results
        ai_response = generate_ai_response(query, search_results, client)

        if ai_response:
            print("\nAI Response:")
            print(ai_response)
        else:
            print("Failed to generate AI response.")
    else:
        print("No relevant information found in the document.")
else:
    print("Failed to generate query embedding.")

Please enter your question about the document: kakwo e каварма

User Query: kakwo e каварма

Query Embedding (first 10 elements): [-0.0029184683226048946, 0.00248140306212008, -0.004828866571187973, -0.04328356310725212, -0.01948465220630169, -0.00897393748164177, -0.02202245220541954, -0.04057657718658447, -0.006524256896227598, -0.005688898265361786]...
No results found for the query.
No relevant information found in the document.
