## Word Embeddings

#### 1. Define Model

In [1]:
import ast
import os

import matplotlib.pyplot as plt
import numpy as np
import openai
import pandas as pd
# Azure OpenAI connection settings.
base_url = "https://khipus-aoai.openai.azure.com"
api_version = "2023-05-15"
# The key is looked up by environment-variable NAME. Never paste the secret itself
# here: os.environ.get() would treat it as a variable name, return None, and the
# secret would be committed with the notebook. Any key previously pasted into this
# cell should be considered exposed and rotated.
api_key = os.environ.get("AZURE_OPENAI_API_KEY")
if not api_key:
    # Non-fatal on purpose: the cells that work from cached embeddings need no key.
    print("AZURE_OPENAI_API_KEY is not set; embedding requests will fail until it is.")
deployment_name = "text-embedding-ada-002" # change to your deployment name

# Point the openai module at the Azure resource.
openai.api_type = "azure"
openai.api_base = base_url
openai.api_version = api_version
openai.api_key = api_key

# `client` is an alias for the configured openai module.
client = openai

#### 2. Load Data and Generate Embeddings

In [2]:
import pandas as pd

# Load the tab-separated BBC news dataset and preview its first rows
df = pd.read_csv("data/bbc-news-data.csv", sep="\t")
df.head()

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...


In [3]:
# Example: Load data (update with your actual data source)
# df_subset = pd.read_csv("your_data.csv")

# Example: Assuming you have a column "text" and you generate embeddings:
def get_embedding(text):
    """Return the embedding vector for ``text`` from the configured deployment.

    Args:
        text: The text to embed.

    Returns:
        The value found at ``response["data"][0]["embedding"]``.
    """
    # `client` is the openai module itself, which has no callable `embeddings`;
    # the request goes through `Embedding.create`, and Azure selects the
    # deployment via `engine`.
    response = client.Embedding.create(input=text, engine=deployment_name)
    return response["data"][0]["embedding"]




In [4]:
def generate_embedding(input):
    """Generate an embedding for the provided input text using Azure OpenAI."""
    # Azure deployments are addressed with `engine` rather than `model`.
    request_args = {"input": input, "engine": deployment_name}
    result = openai.Embedding.create(**request_args)
    first_record = result["data"][0]
    return first_record["embedding"]


In [6]:
# Smoke test: embed the content of the first document and print the vector
print(generate_embedding(df["content"].iloc[0]))

AuthenticationError: No API key provided. You can set your API key in code using 'openai.api_key = <API-KEY>', or you can set the environment variable OPENAI_API_KEY=<API-KEY>). If your API key is stored in a file, you can point the openai module at it with 'openai.api_key_path = <PATH>'. You can generate API keys in the OpenAI web interface. See https://platform.openai.com/account/api-keys for details.

In [None]:
def generate_embedding(input):
    """Generate an embedding for ``input`` using the Azure OpenAI deployment.

    Note: this redefines the ``generate_embedding`` declared in an earlier cell;
    whichever definition ran last is the one in effect.

    Args:
        input: The text to embed.

    Returns:
        The embedding stored in the first item of the response's ``data`` list.
    """
    response = client.Embedding.create(
        input=input,
        # With api_type "azure" the deployment must be passed as `engine`
        # (or `deployment_id`); `model` alone is rejected by the SDK.
        engine=deployment_name,
    )

    # Subscript access, matching the other embedding helpers in this notebook.
    return response["data"][0]["embedding"]

# Example: embed the content of the first document and print the resulting vector
print(generate_embedding(df["content"].iloc[0]))

In [None]:
# Generate embeddings for a random sample of documents (not the full dataset).
# Each row triggers one embedding request, so this cell is slow to re-run.
# random_state pins the sample so repeated runs embed the same documents, and
# the size is capped at the number of available rows so sampling cannot fail
# on a smaller dataset.
df_subset = df.sample(n=min(800, len(df)), random_state=42).copy()
df_subset["embedding"] = df_subset["content"].apply(generate_embedding)

In [None]:
# Preview the first ten sampled rows together with their new "embedding" column
df_subset.head(10)

In [None]:
# Write the sampled documents and their embeddings to CSV, omitting the index.
# CSV cells are text, so the embeddings have to be parsed again when the file is read back.
df_subset.to_csv("data/bbc-news-data-embeddings.csv", index=False)

#### 3. Visualise Embeddings

In [None]:
# Read the cached embeddings back from disk
df_subset = pd.read_csv("data/bbc-news-data-embeddings.csv")
# Each embedding comes back from the CSV as the text of a Python list.
# ast.literal_eval parses literals only, whereas eval would execute any code
# that happened to be in the file.
df_subset["embedding"] = df_subset.embedding.apply(ast.literal_eval).apply(np.array)
df_subset.head()

In [None]:
# Reduce dimensionality
from sklearn.decomposition import PCA

# Project every embedding onto its first three principal components.
# random_state is fixed because, for a matrix of this size, the default
# svd_solver="auto" may pick the randomized solver, which would otherwise give
# slightly different coordinates on every run.
pca = PCA(n_components=3, random_state=42)
dims_pca = pca.fit_transform(df_subset.embedding.to_list())
print("shape=", dims_pca.shape)
print(dims_pca)

In [None]:
# Store each document's three PCA coordinates as a list in a "pca3" column
pca_coordinates = dims_pca.tolist()
df_subset["pca3"] = pca_coordinates
df_subset.head()

In [None]:
import matplotlib.pyplot as plt

# One 3-D axes on a square canvas; colours are drawn from the "tab20" palette
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(projection="3d")
cmap = plt.get_cmap("tab20")

categories = sorted(df_subset.category.unique())
n_categories = len(categories)

# Draw one scatter series per category so each gets its own colour and legend entry
for idx, name in enumerate(categories):
    in_category = df_subset["category"] == name
    points = np.array(df_subset.loc[in_category, "pca3"].to_list())
    xs, ys, zs = points[:, 0], points[:, 1], points[:, 2]
    point_colors = [cmap(idx / n_categories)] * len(points)
    _ = ax.scatter(xs, ys, zs=zs, zdir="z", c=point_colors, label=name)

# Label the three axes, then add the legend
for set_label, text in ((ax.set_xlabel, "x"), (ax.set_ylabel, "y"), (ax.set_zlabel, "z")):
    _ = set_label(text)
_ = ax.legend()

#### 4. Cosine Similarity

In [None]:
# Reference vector: the embedding stored in the first row of the frame
first_item = df_subset["embedding"].iloc[0]

# Cosine similarity between two numpy arrays
def cosine_similarity(a, b):
    """Return the dot product of ``a`` and ``b`` divided by the product of their norms."""
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return np.dot(a, b) / norm_product

# Score every row's embedding against the reference item, in frame order
cosine_similarities = [
    cosine_similarity(first_item, embedding)
    for embedding in df_subset["embedding"]
]

# Keep the scores next to the documents they belong to
df_subset["cosine_similarity"] = cosine_similarities

# Show category and score for the first 20 rows, ordered by ascending similarity
first_twenty = df_subset[["category", "cosine_similarity"]].head(20)
first_twenty.sort_values(by="cosine_similarity")