# OpenAI API: Embeddings

### Install the necessary libraries.

In [None]:
# Use the explicit %pip magic: a bare `pip install` only works through IPython's
# automagic, while %pip always targets the environment of the running kernel.
%pip install openai

Collecting openai
  Downloading openai-1.35.13-py3-none-any.whl.metadata (21 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting pydantic<3,>=1.9.0 (from openai)
  Downloading pydantic-2.8.2-py3-none-any.whl.metadata (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.2/125.2 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting tqdm>4 (from openai)
  Downloading tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting annotated-types>=0.4.0 (from pydantic<3,>=1.9.0->openai)
  Downloading annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Collecting pydantic-core==2.20.1 (from pydantic<3,>=1.9.0->openai)
  Downloading pydantic_core-2.20.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading openai-1.35.13-py3-none-any.whl (328 kB)
[2K

In [None]:
# %pip installs into the running kernel's environment; the extras spec is quoted
# so the shell cannot interpret the square brackets as a glob pattern.
%pip install "openai[datalib]"

Collecting pandas-stubs>=1.1.0.11 (from openai[datalib])
  Downloading pandas_stubs-2.2.2.240603-py3-none-any.whl.metadata (10 kB)
Collecting types-pytz>=2022.1.1 (from pandas-stubs>=1.1.0.11->openai[datalib])
  Downloading types_pytz-2024.1.0.20240417-py3-none-any.whl.metadata (1.5 kB)
Downloading pandas_stubs-2.2.2.240603-py3-none-any.whl (157 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m157.0/157.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hDownloading types_pytz-2024.1.0.20240417-py3-none-any.whl (5.2 kB)
Installing collected packages: types-pytz, pandas-stubs
Successfully installed pandas-stubs-2.2.2.240603 types-pytz-2024.1.0.20240417
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Explicit %pip magic (does not depend on automagic); urllib3 stays pinned to 1.26.6.
%pip install urllib3==1.26.6

Collecting urllib3==1.26.6
  Downloading urllib3-1.26.6-py2.py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading urllib3-1.26.6-py2.py3-none-any.whl (138 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.5/138.5 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: urllib3
  Attempting uninstall: urllib3
    Found existing installation: urllib3 2.0.7
    Uninstalling urllib3-2.0.7:
      Successfully uninstalled urllib3-2.0.7
Successfully installed urllib3-1.26.6
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Explicit %pip magic so the package is installed for the running kernel.
%pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
[0mSuccessfully installed python-dotenv-1.0.1
Note: you may need to restart the kernel to use updated packages.


### Import the libraries and load the environment file to gain access to the OpenAI API key
#### The key can be generated here: https://platform.openai.com/account/api-keys

In [None]:
import os

import openai
from dotenv import load_dotenv, find_dotenv
from openai import OpenAI
_ = load_dotenv(find_dotenv()) # read local .env file

### Authenticate to the API using the API Key
#### Pull the key from environment variables (recommended), or pass api_key="your_key_here" to hardcode it — never commit or share a notebook that contains a hardcoded key

In [None]:
# Shared API client used by the rest of the notebook. The key is read from the
# OPENAI_API_KEY environment variable; a missing variable raises KeyError here.
client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

#### Helper functions

In [None]:
def get_embeddings(word, model):
    """Request embeddings for ``word`` from the OpenAI embeddings endpoint.

    Args:
        word: Text to embed. Cells in this notebook pass either a single
            string or a list of strings.
        model: Name of the embedding model, e.g. "text-embedding-ada-002".

    Returns:
        The response object returned by ``client.embeddings.create``; callers
        read the vectors from ``response.data[i].embedding``.

    Raises:
        openai.APIError: Re-raised after its details are printed, so a failed
            request stops at the failing cell instead of handing an error
            payload to code that expects ``response.data``.
    """
    try:
        return client.embeddings.create(input=word, model=model)
    except openai.APIError as e:
        # openai>=1.0 exposes `status_code` only on HTTP status errors;
        # connection/timeout errors carry none, hence the getattr fallback.
        print(getattr(e, "status_code", None))
        print(e.message)
        raise


### Cosine Similarity

**Cosine similarity** is a metric used to measure the similarity between two non-zero vectors in an inner product space. It is often used in NLP to compare embeddings by calculating the cosine of the angle between two vectors, resulting in a similarity score between -1 and 1.

#### Formula
$$ \text{cosine similarity}(\vec{A}, \vec{B}) = \frac{\vec{A} \cdot \vec{B}}{\|\vec{A}\| \, \|\vec{B}\|} $$
where:
- $\vec{A} \cdot \vec{B}$ is the dot product of vectors A and B.
- $\|\vec{A}\|$ and $\|\vec{B}\|$ are the magnitudes (or Euclidean norms) of vectors A and B.

#### Interpretation
- **1**: The vectors point in exactly the same direction (maximum similarity); they need not be identical in magnitude.
- **0**: The vectors are orthogonal (no similarity).
- **-1**: The vectors are diametrically opposed.


In [None]:
import numpy as np
from numpy.linalg import norm

# Cosine similarity helper
def compute_cosine_similarity(embeddings1, embeddings2):
    """Return the cosine of the angle between two embedding vectors.

    The dot product of the two inputs is divided by the product of their
    Euclidean norms.
    """
    dot_product = np.dot(embeddings1, embeddings2)
    magnitude_product = norm(embeddings1) * norm(embeddings2)
    return dot_product / magnitude_product

# Embeddings for single words

### Embeddings for NLP

**Embeddings** are dense vector representations of words, sentences, or documents, designed to capture the semantic meaning and relationships between them. Traditional NLP techniques, such as the bag-of-words model or TF-IDF, represent words as sparse vectors, which often fail to capture the contextual meaning. Embeddings address this limitation by representing words in a continuous vector space, where similar words have similar vectors.

#### Types of Embeddings

1. **Word Embeddings**:
   - **Word2Vec**: Developed by Google, Word2Vec uses neural networks to learn word associations from large datasets. It creates embeddings using two main approaches: Continuous Bag of Words (CBOW) and Skip-gram.
   - **GloVe**: Developed by Stanford, GloVe (Global Vectors for Word Representation) creates embeddings by factorizing the word co-occurrence matrix, capturing global statistical information.
   - **FastText**: Developed by Facebook, FastText extends Word2Vec by considering subword information, allowing it to handle rare and out-of-vocabulary words better.

2. **Contextualized Word Embeddings**:
   - **ELMo**: Developed by AllenNLP, ELMo (Embeddings from Language Models) generates word representations that capture context, meaning a word can have different embeddings depending on its context in a sentence.
   - **BERT**: Developed by Google, BERT (Bidirectional Encoder Representations from Transformers) is a transformer-based model that generates context-aware embeddings for words. It considers the bidirectional context of words in a sentence.
   - **GPT-3**: Developed by OpenAI, GPT-3 (Generative Pre-trained Transformer 3) is a large-scale transformer model that generates highly contextualized embeddings for various NLP tasks.

3. **Sentence and Document Embeddings**:
   - **InferSent**: Developed by Facebook, InferSent generates sentence-level embeddings that can be used for various downstream tasks.
   - **Universal Sentence Encoder**: Developed by Google, this encoder creates embeddings for sentences and paragraphs, designed for easy integration into various NLP applications.
   - **Doc2Vec**: An extension of Word2Vec, Doc2Vec generates embeddings for entire documents.

#### Applications in NLP
- **Document Similarity**: Comparing the similarity between documents, articles, or any text data.
- **Information Retrieval**: Ranking search results based on the similarity of the query and documents.
- **Recommendation Systems**: Recommending items (e.g., products, movies) based on textual descriptions or user reviews.
- **Clustering**: Grouping similar documents or text data for tasks such as topic modeling.
- **Semantic Search**: Enhancing search engines to understand the semantic meaning of queries and documents.

### Example

Consider two sentences:
1. "The cat sat on the mat."
2. "The dog sat on the rug."

Using embeddings, each word or the entire sentence is represented as a vector. Cosine similarity can then be calculated between these vectors to determine how similar the sentences are.

#### Calculation Steps
1. **Obtain Embeddings**: Generate embeddings for the sentences using a model like BERT.
2. **Compute Cosine Similarity**: Apply the cosine similarity formula to the embeddings of the two sentences.

By leveraging embeddings and cosine similarity, we can perform sophisticated text analysis and comparisons, capturing the nuances of human language more effectively than traditional methods.

Generate embeddings for single words

In [None]:
response = get_embeddings("sun","text-embedding-ada-002")

In [None]:
sun_embeddings = response.data[0].embedding

In [None]:
print(sun_embeddings)

[0.024731263518333435, -0.0025623245164752007, -0.0014253035187721252, -0.009048685431480408, -0.02318471111357212, 0.025301046669483185, -0.014719375409185886, -0.028299186378717422, -0.00608785217627883, -0.022777725011110306, 0.007088362704962492, 0.01802953891456127, 0.008553517051041126, 0.0018670543795451522, -0.01535698864609003, -0.005039860028773546, 0.043113525956869125, -0.025708032771945, 0.026562707498669624, -0.021624593064188957, -0.009516720660030842, 0.007027314510196447, -0.005148389842361212, -0.002426662016659975, -0.0025928483810275793, 0.004181794822216034, -9.199609485222027e-05, -0.019508259370923042, 0.004093614406883717, -0.02043076418340206, 0.02226220816373825, -0.01585894078016281, -0.017771780490875244, -0.005623208358883858, -4.1838087781798095e-05, -0.01137529592961073, -0.005636774469166994, -0.000401052093366161, -0.0009386145393364131, -0.007169760297983885, 0.0011098884278908372, 0.01702563650906086, -0.0012396156089380383, -0.006223514676094055, 0.0

In [None]:
response = get_embeddings("moon","text-embedding-ada-002")

In [None]:
moon_embeddings = response.data[0].embedding

In [None]:
print(moon_embeddings)

[0.017546771094202995, -0.009298109449446201, 0.0021268813870847225, -0.01645534485578537, -0.02790132537484169, 0.008332617580890656, -0.020723100751638412, -0.012418468482792377, 0.005124804563820362, -0.034002114087343216, 0.006667492911219597, 0.017910579219460487, 0.015629779547452927, -0.0035331416875123978, -0.013866706751286983, 0.00218285177834332, 0.039879024028778076, -0.021002953872084618, 0.009466021321713924, -0.016329411417245865, 0.014972125180065632, -0.002448712009936571, 0.016161499544978142, -0.003799001919105649, -0.007118755951523781, 0.0034509350080043077, 0.00931909866631031, -0.003522647311910987, -0.0007901469361968338, -0.013104107230901718, 0.016497323289513588, -0.01695908047258854, -0.028195170685648918, -0.0015365667641162872, -0.007744926493614912, -0.0020201874431222677, 0.004970885347574949, -0.007863864302635193, 0.0010372044052928686, -0.004530117381364107, 0.013467916287481785, 0.012628357857465744, 0.0008290639379993081, 0.005205261986702681, -0.00

### Compare the vectors

#### Vectors need to be the same length for the comparison

In [None]:
len(sun_embeddings)

1536

In [None]:
len(moon_embeddings)

1536

#### Cosine similarity is a measure of similarity between two non-zero vectors. The value ranges from -1 to 1; the closer the value is to 1, the more similar the vectors are.

In [None]:
cosine = compute_cosine_similarity(sun_embeddings, moon_embeddings)
print("Cosine Similarity:", cosine)

Cosine Similarity: 0.879386766617019


### Generate embeddings for dissimilar words

In [None]:
# NOTE(review): the word embedded here is "cloud", but the vector is stored as
# `computer_embeddings` — confirm which word was intended.
response = get_embeddings("cloud","text-embedding-ada-002")
computer_embeddings = response.data[0].embedding
# Prints the whole response object, including the full embedding vector.
print(response)

CreateEmbeddingResponse(data=[Embedding(embedding=[-0.004539640620350838, -0.012169663794338703, -0.0006128515233285725, -0.011888720095157623, 0.014622782357037067, 0.03237020969390869, -0.009079281240701675, -0.032479844987392426, 0.006924237124621868, -0.03670085594058037, 0.014746123924851418, 0.025298647582530975, 0.0021430531051009893, 0.0030766772106289864, -0.017377402633428574, 0.005296818446367979, 0.027450265362858772, 0.000763173564337194, -0.006331514101475477, -0.031164206564426422, -0.0003280532837379724, 0.011039036326110363, -0.0034741100389510393, -0.010059159249067307, -0.01614399068057537, -0.0019203537376597524, 0.03508371487259865, -0.01541764847934246, 0.007393618579953909, -0.01065530814230442, 0.00714008416980505, 0.0006680981023237109, -0.009298554621636868, -0.005982047412544489, -0.010319545865058899, -0.015088737942278385, 0.0060711270198225975, -0.010840320028364658, -0.0035066583659499884, 0.0057730525732040405, 0.010799205861985683, -0.003200018545612693

In [None]:
response = get_embeddings("hero","text-embedding-ada-002")
hero_embeddings = response.data[0].embedding
print(response)

CreateEmbeddingResponse(data=[Embedding(embedding=[-0.011492114514112473, -0.016904346644878387, -0.011065955273807049, -0.009297390468418598, -0.006960613653063774, 0.01399935595691204, -0.017898719757795334, -0.019503923133015633, -0.005021585617214441, -0.0020402411464601755, 0.028666364029049873, 0.027643579989671707, 0.015625866129994392, -0.0028783557936549187, -0.009183748625218868, -0.00737967062741518, 0.04542865604162216, 0.00021851799101568758, 0.03852486610412598, -0.015341760590672493, -0.010916799306869507, 0.009105619043111801, -0.0027451806236058474, 0.0018839823314920068, -0.00875758845359087, 0.010206532664597034, 0.005415783263742924, -0.044434282928705215, 0.020597733557224274, -0.015725303441286087, 0.003732451470568776, -0.01424084696918726, -0.02664920501410961, -0.01656341925263405, -0.02058352902531624, -0.013203857466578484, -6.009521894156933e-05, -0.0046415929682552814, 0.010547460056841373, 0.009638318791985512, 0.023722907528281212, -0.0052275629714131355,

In [None]:
print("Cosine Similarity:", compute_cosine_similarity(computer_embeddings, hero_embeddings))

Cosine Similarity: 0.7829414235083916


# Embedding for sentences

### Determine textual entailment for similar sentences

In [None]:
# The triple-quoted premise keeps its line breaks and leading indentation,
# so that whitespace is part of the text sent for embedding.
text_premise = '''
                The astronaut completed her spacewalk outside
                the International Space Station.
               '''
text_hypothesis = "The spacewalk occurred in space."

sentences = [text_premise, text_hypothesis]

# Both sentences are sent in a single request (list input).
# Presumably response.data[i] corresponds to sentences[i] — TODO confirm.
response = get_embeddings(sentences,"text-embedding-ada-002")

# Text clustering

**Text clustering** is a technique used in natural language processing (NLP) and text mining to group a set of documents or text data into clusters based on their similarity. The main goal is to organize large amounts of unstructured text data into meaningful clusters, making it easier to understand, analyze, and extract insights. Here are some common applications and uses of text clustering:

1. **Document Organization and Management**:
   - Automatically organizing documents into categories or topics.
   - Simplifying the process of finding and retrieving documents on similar topics.

2. **Topic Discovery and Modeling**:
   - Identifying underlying themes or topics within a large corpus of text.
   - Understanding the main subjects discussed in a set of documents.

3. **Search Optimization**:
   - Enhancing search engines by grouping similar search results together.
   - Providing users with more relevant and organized search results.

4. **Customer Feedback Analysis**:
   - Analyzing customer reviews, feedback, or survey responses by clustering similar comments.
   - Identifying common issues, themes, or sentiments expressed by customers.

5. **Social Media Analysis**:
   - Grouping similar posts, tweets, or comments to understand trends and public opinion.
   - Identifying and tracking emerging topics or events on social media platforms.

6. **Recommendation Systems**:
   - Improving recommendation systems by clustering user-generated content.
   - Suggesting similar articles, products, or services based on text similarity.

7. **Spam Detection and Filtering**:
   - Grouping similar spam messages to improve filtering algorithms.
   - Identifying patterns in spam content to enhance detection methods.

8. **Market Research and Competitive Analysis**:
   - Clustering text data from market research reports, news articles, or competitor analysis.
   - Extracting insights about industry trends, market dynamics, and competitive strategies.

9. **Healthcare and Biomedical Research**:
   - Analyzing and clustering medical records, research papers, or clinical trial data.
   - Identifying patterns in patient symptoms, treatment outcomes, or biomedical literature.

10. **Sentiment Analysis**:
    - Grouping text based on sentiment to understand overall positive, negative, or neutral trends.
    - Analyzing customer sentiment towards products, brands, or services.

Text clustering helps to reduce the complexity of analyzing large volumes of text data, enabling more efficient and effective extraction of useful information and insights.

### Cluster similar words

In [None]:
from sklearn.cluster import KMeans

# Words to cluster
words = ['apple', 'banana', 'car', 'bike', 'grape', 'truck']

# Keep the clustering response under its own name. The sentence-comparison
# cells further down still read `response` from the premise/hypothesis request;
# rebinding `response` here would silently make them compare 'apple' and
# 'banana' instead of the two sentences.
cluster_response = client.embeddings.create(
    input=words,
    model="text-embedding-ada-002")

# One vector per entry of the response; entry i is taken as the embedding of
# words[i], exactly as the original index-based loop did.
embeddings = [item.embedding for item in cluster_response.data]

In [None]:
# Apply KMeans clustering

# Number of clusters, one per expected category: fruit and vehicle.
k = 2

# random_state=0 makes the results reproducible.
# n_init is the number of times the k-means algorithm is run.
kmeans = KMeans(n_clusters=k, random_state=0, n_init=10).fit(embeddings)
# Cluster index assigned to each row of `embeddings`, in input order.
labels = kmeans.labels_

In [None]:
# Show which cluster each word was assigned to
for word, label in zip(words, labels):
    print(f"{word}: Cluster {label}")

apple: Cluster 1
banana: Cluster 1
car: Cluster 0
bike: Cluster 0
grape: Cluster 1
truck: Cluster 0


### Compare the vectors

#### Vectors need to be the same length for the comparison

In [None]:
len(response.data[0].embedding)

1536

In [None]:
len(response.data[1].embedding)

1536

#### Cosine similarity is a measure of similarity between two non-zero vectors. The value ranges from -1 to 1; the closer the value is to 1, the more similar the vectors are.

In [None]:
cosine = compute_cosine_similarity(response.data[0].embedding, response.data[1].embedding)
print("Cosine Similarity:", cosine)

Cosine Similarity: 0.9000503698739672


### Determine textual entailment for dissimilar sentences

In [None]:
# Reuses the names text_premise / text_hypothesis / sentences from the earlier
# sentence cell, replacing their values with an unrelated sentence pair.
text_premise = "A group of students is studying in the library."
text_hypothesis = "It is raining outside."

sentences = [text_premise, text_hypothesis]

# Rebinds `response`; the cells below read response.data[0] and response.data[1].
response = get_embeddings(sentences,"text-embedding-ada-002")

In [None]:
len(response.data[0].embedding)

1536

In [None]:
len(response.data[1].embedding)

1536

In [None]:
print("Cosine Similarity:", compute_cosine_similarity(response.data[0].embedding,
                                                      response.data[1].embedding))

Cosine Similarity: 0.7909503880133563
