In [20]:
import os
from openai import OpenAI
import requests
import pandas as pd
from dateutil import parser
from scipy.spatial.distance import cosine as cosine_distance
import numpy as np

In [21]:
## Load credentials from the environment; secrets must never be stored in the notebook.
## NOTE: the key that used to be hardcoded in this cell has to be treated as leaked
## and revoked in the OpenAI dashboard.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before starting the notebook kernel."
    )
## The organization id is optional and may be None when the variable is unset.
organization = os.environ.get("OPENAI_ORG_ID")
model = "gpt-3.5-turbo"

In [22]:
## Make the key available through the OPENAI_API_KEY environment variable;
## the client in the next cell is created without an explicit api_key argument.
os.environ["OPENAI_API_KEY"] = api_key

In [23]:
## Create the API client for the configured organization.
client = OpenAI(organization=organization)

In [24]:
## Baseline conversation: the bare question, with no extra context supplied.
system_message = {"role": "system", "content": "You are a helpful assistant."}
user_message = {"role": "user", "content": "When did Russia invade Ukraine?"}
messages = [system_message, user_message]

In [25]:
## Ask the chat model the baseline question.
response = client.chat.completions.create(messages=messages, model=model)

In [26]:
## Show the text of the first returned choice.
first_choice = response.choices[0]
first_choice.message.content

'Russia invaded Ukraine in February 2014, when Russian forces entered the Crimean Peninsula and eventually annexed it, sparking the ongoing conflict in eastern Ukraine.'

In [27]:
## This is a deliberately simple example, but it shows how to use the OpenAI API with the Python SDK. It also shows that the model does not answer the question correctly.
## Russia's full-scale invasion of Ukraine began in February 2022; 2014 was the year Russia annexed Crimea. The reason for the wrong answer is that the model lacks the context needed to answer the question.
## The data used to train the model is not up to date.

In [28]:
## Download the plain-text Wikipedia extract for each year article.
WIKIPEDIA_API_URL = "https://en.wikipedia.org/w/api.php"


def fetch_year_page(year, timeout=30):
    """Request the plain-text extract of the Wikipedia article titled `year`.

    Args:
        year: Article title to fetch, e.g. 2021.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The `requests.Response`; call `.json()` on it to get the payload.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within `timeout` seconds.
    """
    page_response = requests.get(
        WIKIPEDIA_API_URL,
        params={
            "action": "query",
            "prop": "extracts",
            "exlimit": 1,
            "titles": str(year),
            "explaintext": 1,
            "formatversion": 2,
            "format": "json",
        },
        timeout=timeout,  # without a timeout a stalled connection blocks the kernel forever
    )
    # Surface HTTP errors here instead of as a confusing KeyError when parsing later.
    page_response.raise_for_status()
    return page_response


y2021 = fetch_year_page(2021)
y2022 = fetch_year_page(2022)
y2023 = fetch_year_page(2023)

In [29]:
## Turn each year's plain-text extract into one row per dated event.
years = [2021, 2022, 2023]
year_data = [y2021.json(), y2022.json(), y2023.json()]


def extract_event_lines(extract, year):
    """Return the dated event lines of one extract, each prefixed with the year.

    Lines are cleaned BEFORE the prefix is added, otherwise the empty-line and
    heading checks could never match: surrounding whitespace is stripped, blank
    lines and section headings (lines starting with "==") are dropped, and only
    lines containing an en dash ("–") are kept.

    Args:
        extract: Plain-text article extract with newline-separated lines.
        year: Value prepended to every kept line as "<year> - ".

    Returns:
        A list of strings in the original line order.
    """
    event_lines = []
    for raw_line in extract.split("\n"):
        line = raw_line.strip()
        if not line or line.startswith("==") or "–" not in line:
            continue
        event_lines.append(f"{year} - {line}")
    return event_lines


event_texts = []
for year, payload in zip(years, year_data):
    event_texts.extend(
        extract_event_lines(payload["query"]["pages"][0]["extract"], year)
    )

## Build the frame once (no concat in a loop) so it gets a unique 0..n-1 index.
df = pd.DataFrame({"text": event_texts})

In [31]:
## Print every event line, one per row of the frame.
for event_text in df["text"]:
    print(event_text)

2021 - January 1 – The African Continental Free Trade Area comes into effect.
2021 - January 4 – The border between Qatar and Saudi Arabia reopens.
2021 -  January 6 – Supporters of US President Donald Trump attack the US Capitol, disrupting certification of the 2020 presidential election, and forcing Congress to evacuate. Five people die during the ensuing riot. The event is classified as a domestic terrorist attack, and draws international condemnation.
2021 - January 10 – Kim Jong-un is elected as the General Secretary of the ruling Workers' Party of Korea, inheriting the title from his father Kim Jong-il, who died in 2011.
2021 - January 13 – In Lyon, France, the first transplant of both arms and shoulders is performed on an Icelandic patient at the Édouard Herriot Hospital.
2021 - January 14 – The 2021 Ugandan general election is held. Incumbent President Yoweri Museveni, who has ruled since 1986, wins reelection.
2021 - January 20 – Joe Biden and Kamala Harris are inaugurated as 

In [32]:
## List the models available from the OpenAI API.
available_models = client.models.list()
available_models.data

[Model(id='gpt-4-vision-preview', created=1698894917, object='model', owned_by='system'),
 Model(id='dall-e-3', created=1698785189, object='model', owned_by='system'),
 Model(id='text-embedding-3-large', created=1705953180, object='model', owned_by='system'),
 Model(id='gpt-3.5-turbo-instruct-0914', created=1694122472, object='model', owned_by='system'),
 Model(id='dall-e-2', created=1698798177, object='model', owned_by='system'),
 Model(id='whisper-1', created=1677532384, object='model', owned_by='openai-internal'),
 Model(id='gpt-3.5-turbo-16k-0613', created=1685474247, object='model', owned_by='openai'),
 Model(id='babbage-002', created=1692634615, object='model', owned_by='system'),
 Model(id='text-embedding-ada-002', created=1671217299, object='model', owned_by='openai-internal'),
 Model(id='gpt-3.5-turbo-16k', created=1683758102, object='model', owned_by='openai-internal'),
 Model(id='gpt-3.5-turbo-0125', created=1706048358, object='model', owned_by='system'),
 Model(id='gpt-3.5-

In [33]:
## Embedding model name passed to every client.embeddings.create call in this notebook
embeddings_model = "text-embedding-3-large"

In [34]:
## Embed the event texts, sending `batch_size` texts per API request.
batch_size = 10
embeddings = []

for start in range(0, len(df), batch_size):
    batch_texts = df["text"].iloc[start:start + batch_size].tolist()
    # Use a dedicated name so the chat `response` from earlier cells is not overwritten.
    embedding_response = client.embeddings.create(
        input=batch_texts,
        model=embeddings_model,
    )
    # Order by the index each item reports so vectors line up with batch_texts.
    ordered_items = sorted(embedding_response.data, key=lambda item: item.index)
    embeddings.extend(item.embedding for item in ordered_items)

## Fail with a clear message rather than attach misaligned vectors to the rows.
if len(embeddings) != len(df):
    raise ValueError(
        f"Expected {len(df)} embeddings but received {len(embeddings)}."
    )

## add the embeddings to the dataframe
df["embedding"] = embeddings

In [35]:
## now we can use the embeddings to improve the model's response using RAG with Cosine Similarity
from scipy.spatial.distance import cosine as cosine_distance
import numpy as np

In [37]:
## example: cosine distance between the first two event embeddings
cosine_distance(df["embedding"].iloc[0], df["embedding"].iloc[1])

0.6224922136301136

In [38]:
## optimized function to calculate the cosine similarity of a query with all the embeddings

## stack the per-row embedding lists into a single numpy array (one row per text)
embeddings_array = np.array(df["embedding"].tolist())

# vectorised comparison of one query vector against every stored embedding
def get_cosine_similarity(query_embedding, embeddings_array):
    """Score every row of `embeddings_array` against `query_embedding`.

    Despite the name, the returned value is 1 - cosine similarity (i.e. the
    cosine distance), so SMALLER values mean MORE similar rows.

    Args:
        query_embedding: 1-D vector to compare against.
        embeddings_array: 2-D array with one embedding per row.

    Returns:
        1-D numpy array with one score per row of `embeddings_array`.
    """
    dot_products = np.dot(embeddings_array, query_embedding)
    row_norms = np.linalg.norm(embeddings_array, axis=1)
    query_norm = np.linalg.norm(query_embedding)
    return 1 - dot_products / (row_norms * query_norm)

In [39]:
## embed the question with the same model so it can be compared with the stored embeddings
query = "When did Russia invade Ukraine?"

query_response = client.embeddings.create(model=embeddings_model, input=query)
query_embedding = np.array(query_response.data[0].embedding)

In [40]:
## score every stored embedding against the query
similarities = get_cosine_similarity(
    query_embedding=query_embedding,
    embeddings_array=embeddings_array,
)

In [41]:
## keep the five rows with the smallest scores (argsort is ascending)
closest_positions = np.argsort(similarities)[:5]
most_similar_text = df.iloc[closest_positions]

In [42]:
## baseline: plain keyword search for "Russia" in the event text
mentions_russia = df["text"].str.contains("Russia")
russia_text = df[mentions_russia]

In [43]:
## print each keyword match
for matched_text in russia_text["text"]:
    print(matched_text)

2021 - February 20 – 2020–21 H5N8 outbreak: 7 people test positive for H5N8 bird flu at a poultry farm in southern Russia, becoming the first known human cases.
2021 - April 2 – Russia warns NATO against sending any troops to aid Ukraine, amid reports of a large Russian military build-up on its borders.
2021 - June 10 – An annular solar eclipse is visible from Canada, Greenland, the North Pole, and the Russian Far East.
2021 - September 19 – The 2021 Russian legislative election is held, with the United Russia party winning nearly 50% of the vote.
2021 - November 16 – Russia draws international condemnation following an anti-satellite weapon test that creates a cloud of space debris, threatening the International Space Station.
2022 - January 4 – The five permanent members of the UN Security Council—China, France, Russia, the United Kingdom and the United States—issue a rare joint statement affirming that "a nuclear war cannot be won and must never be fought."
2022 - January 6 – The CS

In [44]:
## We see several entries that contain the word "Russia" and can instantly spot some about an invasion.
## Let's use the get_cosine_similarity function to get the most similar text to the query "When did Russia invade Ukraine?"

In [45]:
## show what the semantic search returned, followed by blank lines after each hit
for hit_text in most_similar_text["text"]:
    print(hit_text, end="\n\n\n")

2022 - February 21 – February 24 – Russian President Vladimir Putin signs a decree declaring the Luhansk People's Republic and Donetsk People's Republic as independent from Ukraine, and, despite international condemnation and sanctions, begins a full-scale invasion of Ukraine; at dawn on 24 February missiles strike Kyiv. Ukraine severs diplomatic relations with Russia, followed by the Federated States of Micronesia on 25 February.


2022 - April 18 – Russian invasion of Ukraine: The battle of Donbas begins, leading to the deaths of several thousand military personnel and civilians.


2022 - October 8 – Russian invasion of Ukraine: An explosion occurs on the Crimean Bridge connecting Crimea and Russia, killing three and causing a partial collapse of the only road bridge between the Crimean Peninsula and the Russian mainland. Two days later, retaliatory missile strikes are conducted by Russia across Ukraine, the most widespread since the start of the invasion, notably including attacks o

In [46]:
## one retrieved event per line, ready to be inserted into the prompt
retrieved_lines = list(most_similar_text["text"])
context_string = "\n".join(retrieved_lines)

In [47]:
## Build the retrieval-augmented prompt. The question is taken from `query` so it
## always matches the text that was embedded for the similarity search.
## NOTE: the misspelled name `conext_prompt` is kept because the messages cell reads it.
conext_prompt = """
Answer the question based on the context below, and if the
question can not be answered based on the context, say "I don't know".

Context:

{context}

-----------------------------------
Question: {question}
""".format(context=context_string, question=query)

In [48]:
## the retrieved text looks promising, so send it to the model together with the question
system_message = {"role": "system", "content": "You are a helpful assistant."}
user_message = {"role": "user", "content": conext_prompt}
messages = [system_message, user_message]

In [49]:
## query the chat model again, this time with the context-augmented prompt
response = client.chat.completions.create(messages=messages, model=model)

In [50]:
## answer produced with the retrieved context
augmented_choice = response.choices[0]
augmented_choice.message.content

'Russia invaded Ukraine on February 24, 2022.'