# Get Embeddings

In [1]:
# Reload imported modules automatically so edits to local .py files are picked up live.
%load_ext autoreload
%autoreload 2

# Display the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Set up Azure OpenAI

In [4]:
import re
import requests
import sys
import os
from openai import AzureOpenAI
import tiktoken
from dotenv import load_dotenv
# Populate os.environ with the Azure OpenAI settings stored in the local env file.
load_dotenv(dotenv_path="environment_variables.env")

True

In [8]:
# Azure OpenAI client; endpoint, key and API version are all read from the environment.
client = AzureOpenAI(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT_TEXT"),
    api_key=os.getenv("AZURE_OPENAI_KEY_TEXT"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
)

## Load Data

In [5]:
import pandas as pd

# The source file is tab-separated. Keep the frame exactly as read in `df_orig`
# and do all further work on an independent copy.
df_orig = pd.read_csv("../data/bbc-news-data.csv", sep="\t")
df = df_orig.copy()
df

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...
...,...,...,...,...
2220,tech,397.txt,BT program to beat dialler scams,BT is introducing two initiatives to help bea...
2221,tech,398.txt,Spam e-mails tempt net shoppers,Computer users across the world continue to i...
2222,tech,399.txt,Be careful how you code,A new European directive could put software w...
2223,tech,400.txt,US cyber security chief resigns,The man making sure US computer networks are ...


## Get Embeddings
Reference: [Azure OpenAI embeddings tutorial](https://learn.microsoft.com/en-us/azure/cognitive-services/openai/tutorials/embeddings?tabs=bash)

In [11]:
# Embed a single sample string with the configured embedding deployment.
# Legacy (openai<1.0) equivalent, kept for reference:
#   openai.Embedding.create(input="Your text goes here", deployment_id=deployment_id)
embedding = client.embeddings.create(
    input="Your text goes here",
    model=os.environ.get("AZURE_OPENAI_EMBEDDING_DEPLOYMENT"),
).data[0].embedding
# `embedding` is the vector itself; len(embedding) gives its length.

In [15]:
# Embed the 'content' of every row and store the resulting vector in a new
# 'embedding' column. Rows whose request fails keep the '' placeholder and are
# reported with their index, so one bad document does not abort the whole run.
embedding_model = os.environ.get('AZURE_OPENAI_EMBEDDING_DEPLOYMENT')  # loop-invariant lookup

# Explicit object dtype so each cell can hold a Python list of floats.
df['embedding'] = pd.Series('', index=df.index, dtype=object)

for i, content in df['content'].items():
    try:
        embedding = client.embeddings.create(input=content, model=embedding_model).data[0].embedding
        # Write with .at: the chained form df['embedding'][i] = ... triggers
        # SettingWithCopyWarning and, under pandas copy-on-write, assigns to a
        # temporary object and leaves df unchanged.
        df.at[i, 'embedding'] = embedding
    except Exception as err:
        # Include the row index so the failing document can be located and retried.
        print(f"Row {i}: Unexpected {err=}, {type(err)=}")

## Save embeddings
The following notebooks require the embeddings generated in this notebook.

In [16]:
# Preview the first three rows to confirm the 'embedding' column was filled in.
df.head(n=3)

Unnamed: 0,category,filename,title,content,embedding
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...,"[-0.021155867725610733, -0.01682022027671337, ..."
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...,"[-0.02455304190516472, -0.012937315739691257, ..."
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...,"[-0.021608198061585426, -0.036890748888254166,..."


In [17]:
# Write the frame, including the new 'embedding' column, to a tab-separated file
# without the row index.
df.to_csv(
    "../data/bbc-news-data-embedding.csv",
    sep="\t",
    index=False,
)