In [43]:
import os
from openai import OpenAI
import numpy as np

# API key read from the environment; this is None when OPENAI_API_KEY is not set.
OPEN_AI_KEY = os.environ.get("OPENAI_API_KEY")


In [44]:


# Build the API client with the key loaded from the environment.
client = OpenAI(api_key=OPEN_AI_KEY)

# Send one sample embedding request (this performs a real API call; it does not
# merely "declare" a model).
# NOTE(review): `response` is not read by any cell visible here — presumably a
# connectivity check; confirm before removing.
response = client.embeddings.create(
    model="text-embedding-3-small",
    input="Your text string goes here",
)



In [45]:
# chunk the text into 50-character chunks (done further down in this cell)
# Question to answer: it is embedded for retrieval and inserted into the final prompt.
query = "What is Messi's name?"

def chunk_text(text, chunk_size):
    """Split ``text`` into consecutive, non-overlapping pieces of ``chunk_size`` characters.

    Args:
        text: String to split.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        List of substrings in their original order. The last chunk may be
        shorter than ``chunk_size``; an empty ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    # A negative size would make range() empty and silently drop all of the
    # text; zero would raise an opaque range() error. Reject both up front.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

# Read the whole source document and split it into 50-character chunks.
# The context manager closes the file handle as soon as the text has been read,
# instead of leaving it open for the lifetime of the kernel.
with open("messi.txt", encoding='utf8') as f:
    sentences = chunk_text(f.read(), 50)
print(sentences)

['Lionel Andrés Messi[note 1] (Spanish pronunciation', ': [ljoˈnel anˈdɾes ˈmesi] ⓘ; born 24 June 1987), a', 'lso known as Leo Messi, is an Argentine profession', 'al footballer who plays as a forward for and capta', 'ins both Major League Soccer club Inter Miami and ', 'the Argentina national team. Widely regarded as on', 'e of the greatest players of all time, Messi set n', 'umerous records for individual accolades won throu', 'ghout his professional footballing career such as ', "eight Ballon d'Or awards and eight times being nam", "ed the world's best player by FIFA.[note 2] He is ", 'the most decorated player in the history of profes', 'sional football having won 45 team trophies,[note ', '3] including twelve league titles, four UEFA Champ', 'ions Leagues, two Copa Américas, and one FIFA Worl', 'd Cup. Messi holds the records for most European G', 'olden Shoes (6), most goals for a single club (672', ', with Barcelona), most goals (474), hat-tricks (3', '6) and assists (192) in La

In [46]:
# use embedding model on query and txt
query_embedding = client.embeddings.create(
    input=query,
    model="text-embedding-3-small"
).data[0].embedding

# The embeddings endpoint limits how many inputs one request may carry
# (2048 per the API reference), so the chunks are sent in batches and the
# per-chunk results are concatenated in their original order. With no chunks
# at all, no request is made and the result is an empty list.
EMBED_BATCH_SIZE = 2048
resource_responses = [
    item
    for start in range(0, len(sentences), EMBED_BATCH_SIZE)
    for item in client.embeddings.create(
        input=sentences[start:start + EMBED_BATCH_SIZE],
        model="text-embedding-3-small"
    ).data
]


In [47]:

# cosine similarity between two vectors
def cosine_simularity(A, B):
    """Return the cosine similarity of vectors ``A`` and ``B``.

    Computed as the dot product of the two vectors divided by the product
    of their Euclidean norms.
    """
    dot_product = np.dot(A, B)
    norm_product = np.linalg.norm(A) * np.linalg.norm(B)
    return dot_product / norm_product

# Maps a similarity score to the text chunk that produced it; filled in by a later cell.
simularities = dict()


In [48]:

# find the top 5 most similar sentences
TOP_K = 5

# Score every chunk against the query. (score, chunk) pairs are used for the
# ranking instead of a dict keyed by score, so two chunks that happen to get
# the same score are both kept rather than one overwriting the other.
scored_chunks = [
    (cosine_simularity(query_embedding, np.array(item.embedding)), sentence)
    for item, sentence in zip(resource_responses, sentences)
]

# Rebuilt from scratch on every run so re-executing this cell never keeps
# entries left over from a previous query or document. Kept as a
# score -> chunk mapping for anything that still reads it.
simularities = dict(scored_chunks)

# Ascending by score; the sort is stable, so ties stay in document order.
ranked = sorted(scored_chunks, key=lambda pair: pair[0])
sorted_simularities = [score for score, _ in ranked]
length = len(sorted_simularities)

# Slicing from the end is safe when there are fewer than TOP_K chunks; the old
# range(length-5, length) wrapped around to negative indices and repeated chunks.
top_ranked = ranked[-TOP_K:]
print([score for score, _ in top_ranked])

# Least similar first, most similar last (same ordering as before).
top_simularities = [sentence for _, sentence in top_ranked]

rag_query = "answer the question: " + query + ". Given this information (please cite the information used): " + " ".join(top_simularities)
print(rag_query)


[0.5716238837863366, 0.5979521272010121, 0.636203199617289, 0.6587311758914197, 0.6664274657421646]
answer the question: What is Messi's name?. Given this information (please cite the information used): ics. After his senior debut in 2005, Messi became  earnings.

Early life
Messi was born on 24 June 19 ational, Messi is the national team's all-time lea Lionel Andrés Messi[note 1] (Spanish pronunciation lso known as Leo Messi, is an Argentine profession


In [49]:
from openai import OpenAI
# Reuse the `client` configured near the top of the notebook instead of
# constructing a second, differently-configured one that shadows it.

# create completion: ask the chat model to answer using the retrieved chunks
completion = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": rag_query},
    ],
)

# Show only the text of the first (and, by default, only) returned choice.
print(completion.choices[0].message.content)

Messi's full name is Lionel Andrés Messi. This information is derived from the text you provided, which states: "Lionel Andrés Messi... is also known as Leo Messi, is an Argentine professional."
