In [2]:
#OpenAI Word Embeddings, Semantic Search
#Word embeddings are a way of representing words and phrases as vectors. They can be used for a variety of tasks, including semantic search, anomaly detection, 
# and classification. In the video on OpenAI Whisper, I mentioned how words whose vectors are numerically similar are also similar in semantic meaning. 
# In this tutorial, we will learn how to implement semantic search using OpenAI embeddings. Understanding the Embeddings concept will be crucial to the next 
# several videos in this series since we will use it to build several practical applications.

In [3]:
!pip install openai -q

In [1]:
import openai
import pandas as pd
import numpy as np
from getpass import getpass

openai.api_key = getpass()

In [4]:
#Read Data File Containing Words

#Now that we have configured OpenAI, let's start with a simple CSV file with familiar words. From here we'll build up to a more complex semantic search 
# using sentences from the Fed speech. Save the linked "words.csv" as a CSV and upload it to Google Colab. Once the file is uploaded, let's read it 
# into a pandas dataframe using the code below:

df = pd.read_csv('words.csv')
print(df)

            text
0            red
1       potatoes
2           soda
3         cheese
4          water
5           blue
6         crispy
7      hamburger
8         coffee
9          green
10          milk
11      la croix
12        yellow
13     chocolate
14  french fries
15         latte
16          cake
17         brown
18  cheeseburger
19      espresso
20    cheesecake
21         black
22         mocha
23         fizzy
24        carbon
25        banana


In [5]:
#Calculate Word Embeddings

#To use word embeddings for semantic search, you first compute the embeddings for a corpus of text using a word embedding algorithm. What does 
# this mean? We are going to create a numerical representation of each of these words. To perform this computation, we'll use OpenAI's 'get_embedding' function.

#Since we have our words in a pandas dataframe, we can use "apply" to apply the get_embedding function to each row in the dataframe. 
# We then store the calculated word embeddings in a new text file called "word_embeddings.csv" so that we don't have to call OpenAI again to perform these calculations.

from openai.embeddings_utils import get_embedding

df['embedding'] = df['text'].apply(lambda x: get_embedding(x, engine='text-embedding-ada-002'))
df.to_csv('word_embeddings.csv')

In [6]:
get_embedding("the fox crossed the road", engine='text-embedding-ada-002')

[-0.0006075783167034388,
 0.00036217979504726827,
 -0.020174754783511162,
 0.007051064632833004,
 -0.01384962908923626,
 0.025376256555318832,
 -0.022232631221413612,
 -0.022321006283164024,
 0.013874879106879234,
 -0.033128008246421814,
 0.029239507392048836,
 0.008850127458572388,
 0.032976508140563965,
 -0.015831753611564636,
 0.0048732515424489975,
 -0.0010605002753436565,
 0.01954350620508194,
 -0.0010549768339842558,
 0.018306255340576172,
 -0.030678758397698402,
 -0.009860128164291382,
 0.029441507533192635,
 0.0023498288355767727,
 -0.02365925721824169,
 -0.003907438833266497,
 0.006457689218223095,
 0.015238379128277302,
 -0.013824379071593285,
 -0.002794860163703561,
 -0.011589753441512585,
 0.008092626929283142,
 0.0018953286344185472,
 -0.028810258954763412,
 0.00047462122165597975,
 -0.013862254098057747,
 -0.017409879714250565,
 -0.0018827036255970597,
 -0.008660752326250076,
 0.01124256569892168,
 -0.027421507984399796,
 0.027143757790327072,
 0.005277251359075308,
 -0.0

In [7]:
#Semantic Search
#Now that we have our word embeddings stored, let's load them into a new dataframe and use it for semantic search. Since the 'embedding' in the CSV is 
# stored as a string, we'll use apply() and to interpret this string as Python code and convert it to a numpy array so that we can perform calculations on it.

df = pd.read_csv('word_embeddings.csv')
df['embedding'] = df['embedding'].apply(eval).apply(np.array)
df

Unnamed: 0.1,Unnamed: 0,text,embedding
0,0,red,"[1.8579006791696884e-05, -0.024676261469721794..."
1,1,potatoes,"[0.004971202462911606, -0.031133046373724937, ..."
2,2,soda,"[0.025859493762254715, -0.007452284451574087, ..."
3,3,cheese,"[-0.0038699328433722258, -0.009271041490137577..."
4,4,water,"[0.019031280651688576, -0.01257743313908577, 0..."
5,5,blue,"[0.005490605719387531, -0.007445123512297869, ..."
6,6,crispy,"[-0.0010195652721449733, -0.005407913122326136..."
7,7,hamburger,"[-0.013206875883042812, -0.0018223668448626995..."
8,8,coffee,"[-0.0007566261338070035, -0.0194522924721241, ..."
9,9,green,"[0.01538460049778223, -0.010931522585451603, 0..."


In [8]:
#Let's now prompt ourselves for a search term that isn't in the dataframe. We'll use word embeddings to perform a semantic 
# search for the words that are most similar to the word we entered. I'll first try the word "hot dog". Then we'll come back and try the word "yellow".

search_term = input('Enter a search term: ')


In [9]:
# semantic search

#Now that we have a search term, let's calculate an embedding or vector for that search term using the OpenAI get_embedding function.

search_term_vector = get_embedding(search_term, engine="text-embedding-ada-002")
search_term_vector

[-0.021754980087280273,
 -0.00431280629709363,
 -0.0095869405195117,
 -0.0313945971429348,
 -0.021504772827029228,
 0.003371231956407428,
 -0.027839001268148422,
 -0.013708798214793205,
 -0.009435499086976051,
 -0.025692474097013474,
 0.017501434311270714,
 0.03186867758631706,
 -0.006597605999559164,
 0.004655197262763977,
 -0.0196347925812006,
 0.010429748333990574,
 0.04032309353351593,
 -0.010864321142435074,
 0.028392093256115913,
 0.00474737910553813,
 -0.021570617333054543,
 0.0036444859579205513,
 0.011200128123164177,
 -0.01926606334745884,
 -0.002057636622339487,
 0.0014979594852775335,
 0.010179540142416954,
 -0.005425576586276293,
 -0.004019799176603556,
 -0.020543444901704788,
 0.041850682348012924,
 0.010976256802678108,
 -0.022439762949943542,
 -0.000737868482246995,
 -0.0058634416200220585,
 -0.012530184350907803,
 -0.0007872518035583198,
 -0.002612375421449542,
 0.009343316778540611,
 0.00021728643332608044,
 0.0028708146419376135,
 -0.006008299067616463,
 0.0032313126

In [10]:
#Once we have a vector representing that word, we can see how similar it is to other words in our dataframe by calculating the cosine similarity of our search 
# term's word vector to each word embedding in our dataframe.
from openai.embeddings_utils import cosine_similarity

df["similarities"] = df['embedding'].apply(lambda x: cosine_similarity(x, search_term_vector))

df

Unnamed: 0.1,Unnamed: 0,text,embedding,similarities
0,0,red,"[1.8579006791696884e-05, -0.024676261469721794...",0.81207
1,1,potatoes,"[0.004971202462911606, -0.031133046373724937, ...",0.816827
2,2,soda,"[0.025859493762254715, -0.007452284451574087, ...",0.820797
3,3,cheese,"[-0.0038699328433722258, -0.009271041490137577...",0.824113
4,4,water,"[0.019031280651688576, -0.01257743313908577, 0...",0.798268
5,5,blue,"[0.005490605719387531, -0.007445123512297869, ...",0.787098
6,6,crispy,"[-0.0010195652721449733, -0.005407913122326136...",0.820531
7,7,hamburger,"[-0.013206875883042812, -0.0018223668448626995...",0.876765
8,8,coffee,"[-0.0007566261338070035, -0.0194522924721241, ...",0.799683
9,9,green,"[0.01538460049778223, -0.010931522585451603, 0...",0.785477


In [11]:
#Sorting By Similarity
#Now that we have calculated the similarities to each term in our dataframe, we simply sort the similarity values to find the terms 
# that are most similar to the term we searched for. Notice how the foods are most similar to "hot dog". Not only that, it puts fast 
# food closer to hot dog. Also some colors are ranked closer to hot dog than others. Let's go back and try the word "yellow" and walk through the results.

df.sort_values("similarities", ascending=False).head(20)

Unnamed: 0.1,Unnamed: 0,text,embedding,similarities
7,7,hamburger,"[-0.013206875883042812, -0.0018223668448626995...",0.876765
18,18,cheeseburger,"[-0.018216600641608238, 0.005054354667663574, ...",0.856907
14,14,french fries,"[0.0014476682990789413, -0.016491735354065895,...",0.838477
3,3,cheese,"[-0.0038699328433722258, -0.009271041490137577...",0.824113
2,2,soda,"[0.025859493762254715, -0.007452284451574087, ...",0.820797
6,6,crispy,"[-0.0010195652721449733, -0.005407913122326136...",0.820531
1,1,potatoes,"[0.004971202462911606, -0.031133046373724937, ...",0.816827
13,13,chocolate,"[0.0015315973432734609, -0.012976923026144505,...",0.816716
0,0,red,"[1.8579006791696884e-05, -0.024676261469721794...",0.81207
16,16,cake,"[-0.013637279160320759, -0.01684682071208954, ...",0.812014


In [12]:
#Adding Words Together
#What's even more interesting is that we can add word vectors together. What happens when we add the numbers for milk and espresso, 
# then search for the word vector most similar to milk + espresso? Let's make a copy of the original dataframe and call it food_df. 
# We'll operate on this copy. Let's try adding word together. Let's add milk + espresso and store the results in milk_espresso_vector.

food_df = df.copy()

milk_vector = food_df['embedding'][10]
espresso_vector = food_df['embedding'][19]

milk_espresso_vector = milk_vector + espresso_vector
milk_espresso_vector

array([-0.02157659, -0.03206679, -0.01620988, ..., -0.00423221,
        0.00078145, -0.02898556])

In [13]:
#Now let's find the words most similar to milk + espresso. If you have never done this before, it's pretty surprising that you 
# can add words together like this and find similar words using numbers.


food_df["similarities"] = food_df['embedding'].apply(lambda x: cosine_similarity(x, milk_espresso_vector))
food_df.sort_values("similarities", ascending=False)

Unnamed: 0.1,Unnamed: 0,text,embedding,similarities
19,19,espresso,"[-0.02250584401190281, -0.012747502885758877, ...",0.960501
10,10,milk,"[0.0009292512550018728, -0.019319288432598114,...",0.960501
15,15,latte,"[-0.015634099021553993, -0.003942839801311493,...",0.922975
22,22,mocha,"[-0.012487593106925488, -0.026140518486499786,...",0.899327
8,8,coffee,"[-0.0007566261338070035, -0.0194522924721241, ...",0.895382
3,3,cheese,"[-0.0038699328433722258, -0.009271041490137577...",0.885243
13,13,chocolate,"[0.0015315973432734609, -0.012976923026144505,...",0.883493
2,2,soda,"[0.025859493762254715, -0.007452284451574087, ...",0.874156
4,4,water,"[0.019031280651688576, -0.01257743313908577, 0...",0.866049
7,7,hamburger,"[-0.013206875883042812, -0.0018223668448626995...",0.852722


In [14]:
#Microsoft Earnings Call Transcript
#Let's tie this back to finance. I have attached some text from a recent Microsoft earnings call here. Click on "raw" and save the file 
# as a CSV. Upload it to Google Colab as microsoft-earnings.csv. Let's use what we just learned to perform a semantic search on sentences in 
# the Microsoft earnings call. We'll start by reading the paragraphs into a pandas dataframe.


earnings_df = pd.read_csv('microsoft-earnings.csv')
earnings_df

Unnamed: 0,text
0,"Thank you, Brett. To start, I want to outline ..."
1,"With that context, this quarter, the Microsoft..."
2,It helps them align their spend with demand an...
3,We are the platform of choice for customers' S...
4,Now to data and AI. With our Microsoft Intelli...
...,...
57,Other income and expense should be roughly $10...
58,"And finally, as a reminder, for Q2 cash flow, ..."
59,And FX should decrease COGS and operating expe...
60,With the high margins in our Windows OEM busin...


In [15]:
#Once we have the dataframe, we'll once again compute the embeddings for each line in our CSV file.

earnings_df['embedding'] = earnings_df['text'].apply(lambda x: get_embedding(x, engine='text-embedding-ada-002'))
earnings_df.to_csv('earnings-embeddings.csv')

In [16]:
#If you download the earnings_embeddings.csv file locally and open it up, you'll see that our embeddings are for 
# entire paragraphs - not just words. This means that we'll be able to search on similar sentences even if there isn't an exact
# match for the string we search for. We are searching on meaning.


#artificial intelligence demand cloud products
earnings_search = input("Search earnings for a sentence:")

In [17]:
earnings_search_vector = get_embedding(earnings_search, engine="text-embedding-ada-002")
earnings_search_vector

[-0.010804958641529083,
 -0.025622181594371796,
 -0.002305292524397373,
 0.0036567922215908766,
 0.020399197936058044,
 0.027522727847099304,
 -0.022961417213082314,
 -0.00862988829612732,
 0.0006629915442317724,
 -0.035476867109537125,
 0.0227502454072237,
 0.022905103862285614,
 0.01010105200111866,
 -0.004406452178955078,
 -0.017358323559165,
 0.006007838528603315,
 0.02035696431994438,
 0.007158725056797266,
 0.0043184636160731316,
 -0.02153952606022358,
 -0.010706411674618721,
 0.02504497766494751,
 0.015218449756503105,
 -0.004378295969218016,
 -0.009622395969927311,
 -0.014655324630439281,
 0.028142165392637253,
 -0.0365186482667923,
 0.0013163044350221753,
 0.01137512270361185,
 0.024242525920271873,
 -0.017682120203971863,
 -0.00769369350746274,
 -0.005669963546097279,
 -0.010277029126882553,
 0.0027241164352744818,
 -0.0033805088605731726,
 -0.012149418704211712,
 0.018737979233264923,
 -0.006708224769681692,
 0.01596459001302719,
 0.013613543473184109,
 0.014120356179773808,

In [18]:
earnings_df["similarities"] = earnings_df['embedding'].apply(lambda x: cosine_similarity(x, earnings_search_vector))

earnings_df

Unnamed: 0,text,embedding,similarities
0,"Thank you, Brett. To start, I want to outline ...","[-0.009509023278951645, -0.00374739826656878, ...",0.749826
1,"With that context, this quarter, the Microsoft...","[-0.0016295332461595535, -0.028975291177630424...",0.800685
2,It helps them align their spend with demand an...,"[0.008828130550682545, -0.03199512138962746, 0...",0.796974
3,We are the platform of choice for customers' S...,"[0.011994918808341026, -0.024179909378290176, ...",0.800597
4,Now to data and AI. With our Microsoft Intelli...,"[-0.004754434805363417, 0.0038801338523626328,...",0.820928
...,...,...,...
57,Other income and expense should be roughly $10...,"[-0.01832527294754982, -0.014160438440740108, ...",0.688981
58,"And finally, as a reminder, for Q2 cash flow, ...","[-0.012947804294526577, -0.010494815185666084,...",0.715621
59,And FX should decrease COGS and operating expe...,"[0.0009834554512053728, -0.015616976656019688,...",0.771283
60,With the high margins in our Windows OEM busin...,"[0.010544494725763798, -0.03846913203597069, -...",0.757930


In [19]:
earnings_df.sort_values("similarities", ascending=False)

Unnamed: 0,text,embedding,similarities
5,"Cosmos DB now supports postscript SQL, making ...","[-0.00441406574100256, -0.005979578942060471, ...",0.846157
12,Our cloud for sustainability is off to a fast ...,"[0.008838257752358913, -0.016278056427836418, ...",0.823559
4,Now to data and AI. With our Microsoft Intelli...,"[-0.004754434805363417, 0.0038801338523626328,...",0.820928
11,"All up more than 400,000 organizations now use...","[-0.0011049157474189997, -0.020452041178941727...",0.813871
9,Power Automate has more than seven million mon...,"[-0.025379547849297523, -0.03403877094388008, ...",0.806160
...,...,...,...
29,"Thank you, Satya, and good afternoon, everyone...","[0.01388876885175705, -0.014524796977639198, -...",0.718285
44,Operating expenses increased 2% and 5% in cons...,"[0.017512913793325424, 0.008054900914430618, 0...",0.716947
58,"And finally, as a reminder, for Q2 cash flow, ...","[-0.012947804294526577, -0.010494815185666084,...",0.715621
57,Other income and expense should be roughly $10...,"[-0.01832527294754982, -0.014160438440740108, ...",0.688981


In [20]:
#Sentences of the Fed Speech
#Let's use the Fed Speech example once more. Let's calculate the word embeddings for a particular sentence in the November 2nd speech that we 
# discussed in the OpenAI Whisper tutorial. Then we'll take a new sentence from a future speech that isn't in our dataset, and find the most similar 
# sentence in our dataset. Here is the sentence we will use to search for similarity:

#"the inflation is too damn high"
#As we did previously, take the linked CSV file and upload it to Google Colab as fed-speech.csv. We'll once again read it into a pandas dataframe.

fed_df = pd.read_csv('fed-speech.csv')
fed_df

Unnamed: 0,text
0,Good afternoon
1,My colleagues and I are strongly committed to ...
2,We have both the tools that we need and the re...
3,Price stability is the responsibility of the F...
4,"Without price stability, the economy does not ..."
5,"In particular, without price stability, we wil..."
6,"Today, the FOMC raised our policy interest rat..."
7,We are moving our policy stance purposefully t...
8,"In addition, we are continuing the process of ..."
9,Restoring price stability will likely require ...


In [21]:
#We'll once again calculate the embeddings and save them in a new CSV file.
fed_df['embedding'] = fed_df['text'].apply(lambda x: get_embedding(x, engine='text-embedding-ada-002'))
fed_df.to_csv('fed-embeddings.csv')

In [22]:
#We'll then enter the new sentence that we want to find similarity for:
#"We will continue to increase interest rates and tighten monetary policy"
fed_sentence = input('Enter something Jerome Powell said: ')


In [23]:
#Enter something Jerome Powell said: the inflation is too damn high
#Again we'll get the vector for this sentence, find the cosine similarity, and sort by most similar.

fed_sentence_vector = get_embedding(fed_sentence, engine="text-embedding-ada-002")
fed_sentence_vector

[-0.0337650440633297,
 -0.017830269411206245,
 0.017046798020601273,
 -0.008112709037959576,
 0.003573003923520446,
 0.016831975430250168,
 -0.03047952428460121,
 -0.013647547923028469,
 -0.007329238578677177,
 -0.03032788448035717,
 0.01838628016412258,
 0.015795772895216942,
 0.020597688853740692,
 -0.019018111750483513,
 -0.0005271839327178895,
 -0.004969350062310696,
 0.018272550776600838,
 -0.01646551489830017,
 0.02802802063524723,
 -0.027724741026759148,
 -0.014784843660891056,
 0.016617152839899063,
 -0.0032381336204707623,
 -0.013369542546570301,
 -0.02044604904949665,
 0.011474049650132656,
 0.0051557403057813644,
 -0.01780499517917633,
 -0.0077209738083183765,
 0.001310259336605668,
 0.0008040048414841294,
 -0.012674528174102306,
 -0.002502050483599305,
 -0.0071396897546947,
 -0.009357416070997715,
 -0.028963129967451096,
 -0.018196729943156242,
 0.016528697684407234,
 -0.004722936078906059,
 -0.03348704054951668,
 0.012567116878926754,
 0.02663799189031124,
 0.0126429367810

In [24]:
fed_df = pd.read_csv('fed-embeddings.csv')
fed_df['embedding'] = fed_df['embedding'].apply(eval).apply(np.array)
fed_df


Unnamed: 0.1,Unnamed: 0,text,embedding
0,0,Good afternoon,"[-0.017543727532029152, 0.020710282027721405, ..."
1,1,My colleagues and I are strongly committed to ...,"[-0.027029043063521385, -0.01242210902273655, ..."
2,2,We have both the tools that we need and the re...,"[0.003941578324884176, -0.015006175264716148, ..."
3,3,Price stability is the responsibility of the F...,"[0.009378707036376, -0.016561055555939674, -0...."
4,4,"Without price stability, the economy does not ...","[-0.003026996273547411, -0.014454687014222145,..."
5,5,"In particular, without price stability, we wil...","[-0.03618694841861725, -0.008898851461708546, ..."
6,6,"Today, the FOMC raised our policy interest rat...","[-0.024621201679110527, -0.02114815264940262, ..."
7,7,We are moving our policy stance purposefully t...,"[-0.025701606646180153, -0.012234759517014027,..."
8,8,"In addition, we are continuing the process of ...","[-0.03149143233895302, 0.0019273122306913137, ..."
9,9,Restoring price stability will likely require ...,"[-0.010953230783343315, -0.020290518179535866,..."


In [25]:
fed_df["similarities"] = fed_df['embedding'].apply(lambda x: cosine_similarity(x, fed_sentence_vector))

fed_df

Unnamed: 0.1,Unnamed: 0,text,embedding,similarities
0,0,Good afternoon,"[-0.017543727532029152, 0.020710282027721405, ...",0.73577
1,1,My colleagues and I are strongly committed to ...,"[-0.027029043063521385, -0.01242210902273655, ...",0.836351
2,2,We have both the tools that we need and the re...,"[0.003941578324884176, -0.015006175264716148, ...",0.806917
3,3,Price stability is the responsibility of the F...,"[0.009378707036376, -0.016561055555939674, -0....",0.816067
4,4,"Without price stability, the economy does not ...","[-0.003026996273547411, -0.014454687014222145,...",0.783453
5,5,"In particular, without price stability, we wil...","[-0.03618694841861725, -0.008898851461708546, ...",0.803627
6,6,"Today, the FOMC raised our policy interest rat...","[-0.024621201679110527, -0.02114815264940262, ...",0.891531
7,7,We are moving our policy stance purposefully t...,"[-0.025701606646180153, -0.012234759517014027,...",0.858911
8,8,"In addition, we are continuing the process of ...","[-0.03149143233895302, 0.0019273122306913137, ...",0.834526
9,9,Restoring price stability will likely require ...,"[-0.010953230783343315, -0.020290518179535866,...",0.846409


In [26]:
fed_df.sort_values("similarities", ascending=False)

Unnamed: 0.1,Unnamed: 0,text,embedding,similarities
6,6,"Today, the FOMC raised our policy interest rat...","[-0.024621201679110527, -0.02114815264940262, ...",0.891531
34,34,"With today's action, we've raised interest rat...","[-0.031383171677589417, -0.013122319243848324,...",0.882609
35,35,We anticipate that ongoing increases in the ta...,"[-0.027336647734045982, -0.01710493117570877, ...",0.879648
41,41,"Even so, we still have some ways to go, and in...","[-0.015159842558205128, -0.018926262855529785,...",0.873755
38,38,That's why we say in our statement that in det...,"[-0.0264048520475626, -0.025579296052455902, 0...",0.872831
52,52,We at the Fed will do everything we can to ach...,"[-0.03184179216623306, -0.0009344873833470047,...",0.866995
33,33,"At today's meeting, the committee raised the t...","[-0.030012447386980057, -0.013376826420426369,...",0.866519
40,40,There is significant uncertainty around that l...,"[-0.011700413189828396, -0.01555322390049696, ...",0.863046
36,36,Financial conditions have tightened significan...,"[-0.02959049679338932, -0.02223173901438713, 0...",0.861405
7,7,We are moving our policy stance purposefully t...,"[-0.025701606646180153, -0.012234759517014027,...",0.858911


In [27]:
#Calculating Cosine Similarity
#We used the Cosine Similarity function, but how does it actually work? Cosine similarity is just calculating the similarity between two vectors. 
#There is a mathematical equation for calculating the angle between two vector.

v1 = np.array([1,2,3])
v2 = np.array([4,5,6])

# (1 * 4) + (2 * 5) + (3 * 6)
dot_product = np.dot(v1, v2)
dot_product

32

In [28]:
# square root of (1^2 + 2^2 + 3^2) = square root of (1+4+9) = square root of 14
np.linalg.norm(v1)

3.7416573867739413

In [29]:
# square root of (4^2 + 5^2 + 6^2) = square root of (16+25+36) = square root of 14
np.linalg.norm(v2)

8.774964387392123

In [30]:
magnitude = np.linalg.norm(v1) * np.linalg.norm(v2)
magnitude

32.83291031876401

In [31]:
dot_product / magnitude


0.9746318461970762

In [32]:
from scipy import spatial

result = 1 - spatial.distance.cosine(v1, v2)

result

0.9746318461970761