In [140]:
import os
import numpy as np
import fitz
import time
import polars as pl

from dotenv import load_dotenv
from google import genai
from google.genai import types


load_dotenv()

client = genai.Client(api_key=os.getenv("API_KEY"))

### Pre-processing and splitting

In [247]:
def remove_headers_and_footers(
    pdf_path, header_height_pt=70, footer_height_pt=70
):
    """
    Extracts text from a PDF, excluding the header and footer areas.

    Args:
        pdf_path (str): Path to the PDF file.
        header_height_pt (int, optional): Height of the header area in points.
            Defaults to 50 (a reasonable starting value).
        footer_height_pt (int, optional): Height of the footer area in points.
            Defaults to 50 (a reasonable starting value).

    Returns:
        str: Extracted text without the header and footer.
    """
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        page_rect = page.rect  # Get the page rectangle
        page_height = page_rect.height
        # Define the clipping rectangle, excluding header and footer
        clip_rect = fitz.Rect(
            page_rect.x0,
            header_height_pt,
            page_rect.x1,
            page_height - footer_height_pt,
        )
        text += page.get_text(clip=clip_rect) + "\n"  # Add newline between pages
    doc.close()
    return text

# Example usage:
pdf_file_1 = "test.pdf"  # Replace with your PDF file
pdf_file_2 = "test_2.pdf"
text_1 = remove_headers_and_footers(pdf_file_1)
text_2 = remove_headers_and_footers(pdf_file_2)
extracted_text = text_2

In [142]:
print(type(extracted_text))

<class 'str'>


In [248]:
# Splitting text into sentences (basic split)
sentences = extracted_text.split(". ")

### Create semantic embeddings

In [None]:
def get_embedding(text, model="text-embedding-004", delay = 0.6):
    """
    Creates an embedding for the given text using OpenAI.

    Args:
    text (str): Input text.
    model (str): Embedding model name.

    Returns:
    np.ndarray: The embedding vector.
    """
    time.sleep(delay)
    response = client.models.embed_content(
              model=model, 
              contents=text)
    
    return np.array(response.embeddings[0].values)

test = test_get_embedding("Describe love")
print(test.shape)


(768,)


In [None]:
# Generate embeddings for each sentence
embeddings = [get_embedding(sentence) for sentence in sentences]

print(f"Generated {len(embeddings)} sentence embeddings.")

Generated 145 sentence embeddings.


In [250]:
def cosine_similarity(vec1, vec2):
    """
    Computes cosine similarity between two vectors.

    Args:
    vec1 (np.ndarray): First vector.
    vec2 (np.ndarray): Second vector.

    Returns:
    float: Cosine similarity.
    """
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

# Compute similarity between consecutive sentences
similarities = [cosine_similarity(embeddings[i], embeddings[i + 1]) for i in range(len(embeddings) - 1)]

In [251]:
def compute_breakpoints(similarities, method="percentile", threshold=90):
    """
    Computes chunking breakpoints based on similarity drops.

    Args:
    similarities (List[float]): List of similarity scores between sentences.
    method (str): 'percentile', 'standard_deviation', or 'interquartile'.
    threshold (float): Threshold value (percentile for 'percentile', std devs for 'standard_deviation').

    Returns:
    List[int]: Indices where chunk splits should occur.
    """
    # Determine the threshold value based on the selected method
    if method == "percentile":
        # Calculate the Xth percentile of the similarity scores
        threshold_value = np.percentile(similarities, threshold)
    elif method == "standard_deviation":
        # Calculate the mean and standard deviation of the similarity scores
        mean = np.mean(similarities)
        std_dev = np.std(similarities)
        # Set the threshold value to mean minus X standard deviations
        threshold_value = mean - (threshold * std_dev)
    elif method == "interquartile":
        # Calculate the first and third quartiles (Q1 and Q3)
        q1, q3 = np.percentile(similarities, [25, 75])
        # Set the threshold value using the IQR rule for outliers
        threshold_value = q1 - 1.5 * (q3 - q1)
    else:
        # Raise an error if an invalid method is provided
        raise ValueError("Invalid method. Choose 'percentile', 'standard_deviation', or 'interquartile'.")

    # Identify indices where similarity drops below the threshold value
    return [i for i, sim in enumerate(similarities) if sim < threshold_value]

# Compute breakpoints using the percentile method with a threshold of 90
breakpoints = compute_breakpoints(similarities, method="percentile", threshold=90)

In [252]:
def split_into_chunks(sentences, breakpoints):
    """
    Splits sentences into semantic chunks.

    Args:
    sentences (List[str]): List of sentences.
    breakpoints (List[int]): Indices where chunking should occur.

    Returns:
    List[str]: List of text chunks.
    """
    chunks = []  # Initialize an empty list to store the chunks
    start = 0  # Initialize the start index

    # Iterate through each breakpoint to create chunks
    for bp in breakpoints:
        # Append the chunk of sentences from start to the current breakpoint
        chunks.append(". ".join(sentences[start:bp + 1]) + ".")
        start = bp + 1  # Update the start index to the next sentence after the breakpoint

    # Append the remaining sentences as the last chunk
    chunks.append(". ".join(sentences[start:]))
    return chunks  # Return the list of chunks

# Create chunks using the split_into_chunks function
text_chunks = split_into_chunks(sentences, breakpoints)

# Print the number of chunks created
print(f"Number of semantic chunks: {len(text_chunks)}")

# Print the first chunk to verify the result
print("\nFirst text chunk:")
print(text_chunks[0])

Number of semantic chunks: 130

First text chunk:
28623831
4649393
The Summer 
That Never 
Was
Campaign 
Overview
Chapter 1:  
To Light the 
Night
Chapter 2:  
Reclaiming 
Willowshore
Chapter 3:  
The 
Willowshore 
Curse
Chapter 4:  
The Wall of 
Ghosts
Willowshore
Adventure  
Toolbox
less important than defeating the creatures that now 
control downtown Willowshore.
Ugly Cute: Granny Hu did hear from some survivors 
that the spider statue has gone missing, but she finds 
the idea of the stone statue “waking up on 
its own” to be ridiculous superstition.


In [None]:
def create_embeddings(text_chunks):
    """
    Creates embeddings for each text chunk.

    Args:
    text_chunks (List[str]): List of text chunks.

    Returns:
    List[np.ndarray]: List of embedding vectors.
    """
    # Generate embeddings for each text chunk using the get_embedding function
    return [get_embedding(chunk) for chunk in text_chunks]

# Create chunk embeddings using the create_embeddings function
chunk_embeddings = create_embeddings(text_chunks)

### Vector store

In [255]:
class VectorStore:
    def __init__(self):
        self.vectors = []
        self.texts = []
        self.metadata = []
    #tagit bort np.array framför embedding i append
    def add_item(self, text, embedding, metadata=None):
        self.vectors.append(np.array(embedding))
        self.texts.append(text)
        self.metadata.append(metadata or {})

    def semantic_search(self, query_embedding, k=5):
        if not self.vectors:
            return []
        query_vector = np.array(query_embedding)

        similarities = []
        for i, vector in enumerate(self.vectors):
            similarity = np.dot(query_vector, vector) / (np.linalg.norm(query_vector) * np.linalg.norm(vector))
            similarities.append((i, similarity))
        
        similarities.sort(key=lambda x: x[1], reverse=True)

        results = []

        for i in range(min(k, len(similarities))):
            idx, score = similarities[i]
            results.append({"text": self.texts[idx],
                            "metadata": self.metadata[idx],
                            "similarity": score
                            })
        
        return results
    
    def save(self, filename="embeddings.parquet"):
        df = pl.DataFrame(
            dict(
                vectors=self.vectors,
                texts=self.texts,
                metadata=self.metadata))
        df.write_parquet(filename)

    def load(self, file):
        df = pl.read_parquet(file, columns=["vectors", "texts", "metadata"])
        self.vectors = df["vectors"].to_list()
        self.texts = df["texts"].to_list()
        self.metadata = df["metadata"].to_list()

In [256]:
vector_store = VectorStore()
for sentence in sentences:
    vector_store.add_item(text=sentence, embedding=chunk_embeddings, metadata=["metadata"])

In [257]:
print((vector_store.vectors[0].shape))

(130, 768)


In [258]:
vector_store.save("embeddings.parquet")

ValueError: cannot parse numpy data type dtype('O') into Polars data type

In [179]:
vector_store_2 = VectorStore()
vector_store_2.load("embeddings.parquet")

### New semantic search

In [None]:
def semantic_search_2(query, text_chunks, chunk_embeddings, k=5):
    """
    Finds the most relevant text chunks for a query.

    Args:
    query (str): Search query.
    text_chunks (List[str]): List of text chunks.
    chunk_embeddings (List[np.ndarray]): List of chunk embeddings.
    k (int): Number of top results to return.

    Returns:
    List[str]: Top-k relevant chunks.
    """
    # Generate an embedding for the query
    query_embedding = get_embedding(query)
    
    # Calculate cosine similarity between the query embedding and each chunk embedding
    similarities = [cosine_similarity(query_embedding, emb) for emb in chunk_embeddings]
    
    # Get the indices of the top-k most similar chunks
    top_indices = np.argsort(similarities)[-k:][::-1]
    
    # Return the top-k most relevant text chunks
    return [text_chunks[i] for i in top_indices]

In [None]:
def semantic_search(query, text_chunks, chunk_embeddings, k=5):
    """
    Finds the most relevant text chunks for a query.

    Args:
    query (str): Search query.
    text_chunks (List[str]): List of text chunks.
    chunk_embeddings (List[np.ndarray]): List of chunk embeddings.
    k (int): Number of top results to return.

    Returns:
    List[str]: Top-k relevant chunks.
    """
    # Generate an embedding for the query
    query_embedding = get_embedding(query)
    
    # Calculate cosine similarity between the query embedding and each chunk embedding
    similarities = [cosine_similarity(query_embedding, emb) for emb in chunk_embeddings]
    
    # Get the indices of the top-k most similar chunks
    top_indices = np.argsort(similarities)[-k:][::-1]
    
    # Return the top-k most relevant text chunks
    return [text_chunks[i] for i in top_indices]

In [260]:
def generate_user_prompt(query):
    context = "\n".join(semantic_search(query, text_chunks, chunk_embeddings))
    user_prompt = f"The question is {query}. This is the context: {context}."
    return user_prompt

In [261]:
# Define the system prompt for the AI assistant
system_prompt = "You are an AI assistant that strictly answers based on the given context. If the answer cannot be derived directly from the provided context, respond with: 'I do not have enough information to answer that.'"

def generate_response(system_prompt, user_message, model="gemini-2.0-flash"):
    """
    Generates a response from the AI model based on the system prompt and user message.

    Args:
    system_prompt (str): The system prompt to guide the AI's behavior.
    user_message (str): The user's message or query.
    model (str): The model to be used for generating the response. Default is "meta-llama/Llama-2-7B-chat-hf".

    Returns:
    dict: The response from the AI model.
    """
    response = client.models.generate_content(
        model=model,
        config=types.GenerateContentConfig(
        system_instruction=system_prompt),
        contents=generate_user_prompt(user_message)
        )
    return response

# Create the user prompt based on the top chunks
user_prompt = "\n".join([f"Context {i + 1}:\n{chunk}\n=====================================\n" for i, chunk in enumerate(top_chunks)])
user_prompt = f"{user_prompt}\nQuestion: {query}"

# Generate AI response
ai_response = generate_response(system_prompt, user_prompt)

<class 'numpy.ndarray'>


In [263]:
print(generate_response(system_prompt, "What is granny Hu's objective?").text)

<class 'numpy.ndarray'>
Granny Hu's objectives are:

1.  To ensure that Willowshore still has a physician when everything is over.
2.  To retake the barracks downtown for its armory.
3.  To find her missing grandchildren.
