# IN_SYS - SW13 Exercise 5

## Retrieval Augmented Generation

#### RAG Pipeline with GPT-4 and OpenAI Embeddings

In [1]:
from openai import OpenAI
import tiktoken
import numpy as np
import os
from dotenv import load_dotenv

# Pull settings from a local .env file into the process environment
# (python-dotenv), then build the API client from OPENAI_API_KEY.
load_dotenv()
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

#### Initialize tokenizer

In [2]:
# Load the tiktoken encoding registered for the "gpt-4" model. The cells below
# use it to show how the question, the RAG prompt and the answer split into tokens.
llm_tokenizer = tiktoken.encoding_for_model("gpt-4")  # GPT-4 tokenizer

#### HSLU Knowledge Base

In [3]:
# Toy knowledge base: each entry is one self-contained fact ("chunk") that is
# embedded and retrieved independently. Chunk numbers printed later are
# 1-based positions in this list.
hslu_chunks = [
    # Overview of all schools
    "HSLU has seven schools: Engineering and Architecture, Business, Computer Science, Social Work, Design Film and Art, Music, and Health Sciences.",

    # One chunk per school / department
    "The Engineering and Architecture school offers Business Engineering, Energy Engineering, and Digital Construction programs.",
    "The Business school provides Business Administration, International Management, and Accounting degrees.",
    "Computer Science department includes Software Engineering, Data Science, and IT programs.",
    "Social Work focuses on counseling, social services, and social pedagogy.",
    "Design Film and Art covers Graphic Design, Film production, and Fine Arts.",
    "Music school has Music Performance, Composition, and Music Education.",
    "Health Sciences offers Nursing, Physiotherapy, and Health Management programs.",
]

#### We will create chunk embeddings

In [4]:
def create_embeddings(texts, model="text-embedding-3-large", batch_size=256):
    """Embed a collection of texts with the OpenAI embeddings endpoint.

    Texts are sent in batches (one request per batch) instead of one request
    per text, which avoids a network round trip for every chunk.

    Args:
        texts: Iterable of strings to embed.
        model: Name of the embedding model to use.
        batch_size: Maximum number of texts sent in a single request.

    Returns:
        A list with one embedding (list of floats) per input text, in the
        same order as ``texts``. An empty input yields an empty list without
        calling the API.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    texts = list(texts)  # accept any iterable and make it sliceable
    embeddings = []

    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        response = client.embeddings.create(model=model, input=batch)
        # Every returned item carries the index of the input it belongs to;
        # sort on it so the output order never depends on response order.
        ordered = sorted(response.data, key=lambda item: item.index)
        embeddings.extend(item.embedding for item in ordered)

    return embeddings

# Embed the whole knowledge base once; the vectors are reused for every query.
print("Creating embeddings for all chunks...")
chunk_embeddings = create_embeddings(hslu_chunks)

# Report how many vectors came back and their dimensionality, then a blank line.
print(
    f"Created {len(chunk_embeddings)} embeddings",
    f"Each embedding: {len(chunk_embeddings[0])} dimensions",
    "",
    sep="\n",
)

Creating embeddings for all chunks...
Created 8 embeddings
Each embedding: 3072 dimensions



#### We will simulate asking a question

In [5]:
# User query: it is tokenized, embedded for retrieval, and later inserted into
# the RAG prompt in the cells below.
question = "Which school offers Business Engineering program?"
print(f"Question: '{question}'")

Question: 'Which school offers Business Engineering program?'


#### Let's see how the question is tokenized

In [6]:
# Encode the question into GPT-4 token IDs, then decode each ID on its own to
# see which piece of text it stands for.
llm_question_token_ids = llm_tokenizer.encode(question)
llm_question_token_texts = [
    llm_tokenizer.decode([token_id]) for token_id in llm_question_token_ids
]

print(
    "GPT-4 Tokenization:",
    f"  Token IDs: {llm_question_token_ids}",
    f"  Token Texts: {llm_question_token_texts}",
    f"  Token Count: {len(llm_question_token_ids)}",
    sep="\n",
)

GPT-4 Tokenization:
  Token IDs: [23956, 2978, 6209, 8184, 17005, 2068, 30]
  Token Texts: ['Which', ' school', ' offers', ' Business', ' Engineering', ' program', '?']
  Token Count: 7


#### Create the question embedding

In [7]:
# Embed the question with the same model that embedded the chunks, so the
# question vector and the chunk vectors are directly comparable.
question_embedding_response = client.embeddings.create(
    model="text-embedding-3-large",
    input=question,
)
question_embedding = question_embedding_response.data[0].embedding

print("Question embeddings vector created:")
print(f"    Vector dimensions: {len(question_embedding)}")
print(f"    Sample (first 10): {question_embedding[:10]}")
print()

# Note: the embedding endpoint tokenizes its input server-side. For
# text-embedding-3-large that encoding is cl100k_base, which is the same
# encoding tiktoken uses for GPT-4 (it is not a CLIP tokenizer).
print("Embedding model uses the cl100k_base tokenizer internally, which is the same encoding as the GPT-4 tokenizer")

Question embeddings vector created:
    Vector dimensions: 3072
    Sample (first 10): [-0.00805139821022749, 0.025078123435378075, -0.029882565140724182, 0.008935731835663319, -0.00026934235938824713, -0.01706632412970066, 0.02156718634068966, 0.026054851710796356, 0.006309127900749445, -0.04073215276002884]

Embedding model uses CLIP tokenizer internally, which is different from GPT-4 tokenizer


#### Vector similarity search

In [8]:
def cosine_similarity(a, b):
    """Calculate cosine similarity between two vectors.

    Args:
        a: First vector (any sequence or array of numbers).
        b: Second vector, same length as ``a``.

    Returns:
        The cosine of the angle between ``a`` and ``b``. If either vector has
        zero length (norm 0) the similarity is undefined; 0.0 is returned
        instead of NaN so that results stay sortable.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # Avoid 0/0 -> NaN (and a RuntimeWarning) for all-zero vectors.
        return 0.0
    return np.dot(a, b) / denominator

# Score every chunk against the question; each entry is
# (chunk position, similarity score, chunk text).
similarities = [
    (position, cosine_similarity(question_embedding, vector), hslu_chunks[position])
    for position, vector in enumerate(chunk_embeddings)
]

# Best match first
similarities = sorted(similarities, key=lambda entry: entry[1], reverse=True)

print("Similarity Scores:")
for rank, (idx, score, _chunk_text) in enumerate(similarities):
    # Chunk positions are unique, so only the first entry is the top result.
    indicator = " ← TOP RESULT" if rank == 0 else ""
    print(f"  Chunk {idx+1}: {score:.4f}{indicator}")

Similarity Scores:
  Chunk 2: 0.6647 ← TOP RESULT
  Chunk 3: 0.5223
  Chunk 1: 0.3881
  Chunk 4: 0.3543
  Chunk 8: 0.3536
  Chunk 7: 0.2505
  Chunk 6: 0.1862
  Chunk 5: 0.1410


#### Get top-k chunks

In [9]:
# Keep only the k best-scoring chunks from the ranked list.
k = 2
top_k = similarities[:k]

print(f"Question: '{question}'")
print(f"\nTop-{k} most similar chunks:")
for idx, score, chunk in top_k:
    print(f"\nChunk {idx+1} (score: {score:.4f}):", f"  '{chunk}'", sep="\n")

# Join the selected chunk texts, separated by blank lines, to form the context
retrieved_chunks = "\n\n".join(entry[2] for entry in top_k)

Question: 'Which school offers Business Engineering program?'

Top-2 most similar chunks:

Chunk 2 (score: 0.6647):
  'The Engineering and Architecture school offers Business Engineering, Energy Engineering, and Digital Construction programs.'

Chunk 3 (score: 0.5223):
  'The Business school provides Business Administration, International Management, and Accounting degrees.'


#### Tokenize retrieved chunks with GPT-4 model

#### Build the full RAG prompt from the question and the retrieved chunks (context), then tokenize it

In [10]:
# Build RAG prompt
prompt = f"""Context: {retrieved_chunks}

Question: {question}"""

print("Full Prompt:")
print("-"*40)
print(prompt)
print("-"*40)
print()

# Tokenize entire prompt with LLM
llm_prompt_tokens = llm_tokenizer.encode(prompt)
llm_prompt_token_texts = [llm_tokenizer.decode([t]) for t in llm_prompt_tokens]

print(f"LLM Prompt Tokenization:")
print(f"  Total tokens: {len(llm_prompt_tokens)}")
print(f"  Tokens: {llm_prompt_tokens}")
print(f"  Texts: {llm_prompt_token_texts}")
print()

# Check token count (GPT-4 has 8k context window)
print(f"Token count check:")
print(f"  Prompt tokens: {len(llm_prompt_tokens)}")
print(f"  GPT-4 context window: 8192 tokens")
print(f"  Status: {'Within limit' if len(llm_prompt_tokens) < 8000 else 'Close to limit'}")
print()

Full Prompt:
----------------------------------------
Context: The Engineering and Architecture school offers Business Engineering, Energy Engineering, and Digital Construction programs.

The Business school provides Business Administration, International Management, and Accounting degrees.

Question: Which school offers Business Engineering program?
----------------------------------------

LLM Prompt Tokenization:
  Total tokens: 42
  Tokens: [2014, 25, 578, 17005, 323, 38943, 2978, 6209, 8184, 17005, 11, 12634, 17005, 11, 323, 14434, 24987, 7620, 382, 791, 8184, 2978, 5825, 8184, 17128, 11, 7327, 9744, 11, 323, 45344, 12628, 382, 14924, 25, 16299, 2978, 6209, 8184, 17005, 2068, 30]
  Texts: ['Context', ':', ' The', ' Engineering', ' and', ' Architecture', ' school', ' offers', ' Business', ' Engineering', ',', ' Energy', ' Engineering', ',', ' and', ' Digital', ' Construction', ' programs', '.\n\n', 'The', ' Business', ' school', ' provides', ' Business', ' Administration', ',', ' I

#### Get the answer from LLM (GPT-4)

In [11]:
print("Calling GPT-4 API...")

# The system message tells the model to answer from the provided context; the
# user message is the RAG prompt (retrieved chunks followed by the question).
response = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {"role": "system", "content": "You are a helpful assistant that answers questions based on the provided context."},
        {"role": "user", "content": prompt}
    ],
    temperature=0.7,
    max_tokens=150
)

# Extract the answer. The SDK types `content` as optional, so fall back to an
# empty string; otherwise tokenizing a missing answer below would raise.
answer = response.choices[0].message.content or ""

print("   GPT-4 Answer:")
print(f"   '{answer}'")
print()

# Tokenize the answer to see GPT-4's output tokens
llm_answer_token_ids = llm_tokenizer.encode(answer)
print("Answer Tokenization:")
print(f"  Token Count: {len(llm_answer_token_ids)}")
print(f"  Token IDs: {llm_answer_token_ids[:10]}...")
print()

Calling GPT-4 API...
   GPT-4 Answer:
   'The Engineering and Architecture school offers the Business Engineering program.'

Answer Tokenization:
  Token Count: 11
  Token IDs: [791, 17005, 323, 38943, 2978, 6209, 279, 8184, 17005, 2068]...

