# Simplest RAG -- Embedding Search

### Simple Setup and Test

In [28]:
# imports
import ast  # for converting embeddings saved as strings back to arrays
from openai import OpenAI # for calling the OpenAI API
import pandas as pd  # for storing text and embeddings data
import tiktoken  # for counting tokens
import os # for getting API token from env variable OPENAI_API_KEY
from scipy import spatial  # for calculating vector similarities for search

# API key: prefer the OPENAI_API_KEY environment variable so the key does not
# have to live in a source file; fall back to the local SECRET module so
# existing setups keep working unchanged.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    from SECRET import OPENAI_API_KEY

# chat model used to generate answers
GPT_MODEL = "gpt-4o"
# embedding model used to embed search queries
# NOTE(review): the embeddings stored in the CSV are assumed to come from this same model -- confirm
EMBEDDING_MODEL = "text-embedding-3-small"

# single shared client used by every cell below
client = OpenAI(api_key=OPENAI_API_KEY)

In [29]:
# Baseline: an example question about the 2024 Summer Olympics, sent with no supporting article
query = "Which athletes won the most number of gold medals in 2024 Summer Olympics?"

response = client.chat.completions.create(
    model=GPT_MODEL,
    temperature=0,
    messages=[
        {"role": "system", "content": "You answer questions about the 2024 Games or latest events. And the output should change line by line."},
        {"role": "user", "content": query},
    ],
)

# show only the text of the first returned choice
print(response.choices[0].message.content)

I'm sorry, but I don't have information on the results of the 2024 Summer Olympics.  
The event is scheduled to take place in Paris, France, from July 26 to August 11, 2024.  
For the latest updates, you might want to check official Olympic sources or news outlets.


### Simplest Approach: Inserting Knowledge into the Prompt

In [None]:
# TODO: paste the plain text of the Wikipedia article on the 2024 Summer Olympics
# between the triple quotes. The article text is interpolated into the prompt
# built in the following cell, so an empty string means the model gets no context.
wikipedia_article = """
"""

In [30]:
# Build a prompt that embeds the article text ahead of the question
query = f"""

Use the below article on the 2024 Summer Olympics to answer the subsequent question. 
If the answer cannot be found, write "I don't know."

Article:
\"\"\"
{wikipedia_article}
\"\"\"

Question: Which countries won the maximum number of gold, silver and bronze medals respectively at 2024 Summer Olympics? 
List the countries in the order of gold, silver and bronze medals.
"""

response = client.chat.completions.create(
    model=GPT_MODEL,
    temperature=0,
    messages=[
        {"role": "system", "content": "You answer questions about the recent events."},
        {"role": "user", "content": query},
    ],
)

# show only the text of the first returned choice
print(response.choices[0].message.content)

The countries that won the maximum number of gold, silver, and bronze medals at the 2024 Summer Olympics are as follows:

- Gold medals: United States and China (tied with 40 gold medals each)
- Silver medals: United States (44 silver medals)
- Bronze medals: United States (42 bronze medals)


### Retrieval-Augmented Generation with Embedding Search

In [21]:
# CSV holding the article texts together with their embeddings
embeddings_path = "data/winter_olympics_2022.csv"
# The CSV stores each embedding as a string; parse it back into a Python list
df = pd.read_csv(embeddings_path).assign(
    embedding=lambda frame: frame["embedding"].map(ast.literal_eval)
)

#### Search Function

In [20]:
def search_articles(query: str, df: pd.DataFrame, top_n: int = 3) -> list[str]:
    """
    Return the texts of the articles most similar to the query.

    The query is embedded with EMBEDDING_MODEL through the shared OpenAI client,
    compared to each row's stored embedding by cosine similarity, and the texts
    of the best-scoring rows are returned, most similar first.

    Args:
        query: Text to search for.
        df: DataFrame with a "text" column and an "embedding" column holding
            one vector per row.
        top_n: Maximum number of articles to return. Values <= 0 yield an
            empty list.

    Returns:
        Up to top_n article texts, ordered by descending similarity.
    """
    # A negative top_n would otherwise slice from the end (e.g. [:-1] keeps all
    # but the last item); treat any non-positive request as "no results" and
    # skip the embedding API call entirely.
    if top_n <= 0:
        return []

    # Embed the query
    query_embedding = client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=query,
    ).data[0].embedding

    # Walk the two columns directly instead of building a Series for every row
    scored_articles = [
        (text, 1 - spatial.distance.cosine(query_embedding, embedding))
        for text, embedding in zip(df["text"], df["embedding"])
    ]

    # Highest similarity first; the sort is stable, so ties keep DataFrame order
    scored_articles.sort(key=lambda pair: pair[1], reverse=True)
    return [text for text, _ in scored_articles[:top_n]]

#### Ask Function

In [34]:
def ask(query: str, knowledge_base: pd.DataFrame, top_n: int = 3, model: str = GPT_MODEL) -> str:
    """
    Answer a question with GPT, grounded in articles retrieved from a knowledge base.

    The most relevant articles are looked up with search_articles, wrapped in a
    prompt together with the question, and sent to the chat completions API.

    Args:
        query: The question to answer.
        knowledge_base: DataFrame passed to search_articles as the article store.
        top_n: Number of articles to retrieve and include in the prompt.
        model: Name of the chat model to call.

    Returns:
        The message content of the first choice in the API response.
    """
    # Search the caller-supplied knowledge base rather than a notebook-level
    # global, so the function works with whichever DataFrame is passed in.
    selected_articles = search_articles(query, knowledge_base, top_n)

    # Construct the prompt for GPT
    introduction = (
        "The following articles provide information about the 2022 Winter Olympics. "
        "Please answer the question based on these articles. "
        "If the answer is not found in the articles, respond with 'I am not allowed to tell you that.'"
    )
    # Each retrieved article becomes its own delimited section of the prompt
    articles_text = "".join(
        f'\n\nWikipedia article section:\n"""\n{article}\n"""'
        for article in selected_articles
    )
    message_text = f"{introduction}{articles_text}\n\nQuestion: {query}"

    messages = [
        {"role": "system", "content": "You only answer questions about the 2022 Winter Olympics."},
        {"role": "user", "content": message_text},
    ]

    # Call the GPT API to get the answer
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0
    )
    return response.choices[0].message.content

#### Try

In [32]:
# A question about the 2022 Winter Olympics, answered from the loaded knowledge base
ask(
    query='Which athletes won the gold medal in curling at the 2022 Winter Olympics? Just give me his name and contry.',
    knowledge_base=df,
)

'Niklas Edin, Sweden.'

In [35]:
# Same question, but about a "2020 Winter Olympics"; the recorded reply is the fallback phrase from the prompt
ask(
    query='Which athletes won the gold medal in curling at the 2020 Winter Olympics? Just give me his name and contry.',
    knowledge_base=df,
)

'I am not allowed to tell you that.'