In [2]:
import pandas as pd
import numpy as np 

import os
import sys

# Put the parent of the current working directory (the project root) on the
# import path so that the project's own packages can be imported below.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from utilities import utils
from utilities.vectorizer_pipeline import VectorizerPipeline
%load_ext autoreload
%autoreload 2

DATA_PATH = utils.get_datapath('data')

**Table of contents**<a id='toc0_'></a>    
- [**4. Generating OpenAI Embeddings**](#toc1_)    
- [Loading the Dataset](#toc2_)    
- [Initialize Connection to OpenAI API](#toc3_)    
- [Checking for Lyrics Over the Max Token Limit](#toc4_)    
- [Generating Document Embeddings](#toc5_)    
- [Conclusion](#toc6_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

# <a id='toc1_'></a>[**4. Generating OpenAI Embeddings**](#toc0_)

The final transformation of the lyrics that we try is an OpenAI embedding model. Specifically, we use the second-generation Ada embedding model (`text-embedding-ada-002`), which boasts high performance on text classification tasks. Find more info [here](https://openai.com/blog/new-and-improved-embedding-model).

# <a id='toc2_'></a>[Loading the Dataset](#toc0_)

We will embed the lyrics that are not stemmed and do not have stop words removed. This decision was made as the embeddings themselves can capture the differences between words that stemming would reduce to the same word. For example, the embedding will provide different results for `run` and `running` but stemming would reduce both these words to `run`. 

In [3]:
# Load the cleaned English lyrics and display the frame's (rows, columns).
df = pd.read_csv(filepath_or_buffer=DATA_PATH / 'clean_lyrics_english.csv')
df.shape

(33842, 11)

# <a id='toc3_'></a>[Initialize Connection to OpenAI API](#toc0_)


The first step to generate the document embeddings is to initialize our connection to the OpenAI API. 

In [None]:
import tiktoken
import openai

# Reading the OpenAI API key from a local text file (first line of the file).
# strip() removes the trailing newline that readline() keeps; a key that still
# carries it is not a valid credential.
with open(DATA_PATH / 'open_ai_key.txt', 'r') as file:
    openai_API_key = file.readline().strip()

# The openai client reads its credential from the lowercase `api_key` attribute.
# Assigning to `openai.API_key` only creates an unused attribute, so the key
# would never be sent with requests.
openai.api_key = openai_API_key

# Setting the model parameters for the embeddings.
embedding_model = 'text-embedding-ada-002'
embedding_encoding = 'cl100k_base' # Name of the tiktoken encoding (tokenizer) used with the above Ada embedding model.
max_tokens = 8000 # Token cap applied to each lyric before embedding. NOTE(review): the model's documented input limit appears to be 8191 tokens, so 8000 looks like a conservative round figure - confirm.

# <a id='toc4_'></a>[Checking for Lyrics Over the Max Token Limit](#toc0_)

After initializing our connection to the OpenAI API, we need to make sure all our lyrics stay below the 8000-token limit set for the embedding model. 

In [None]:
# Look up the tokenizer named by `embedding_encoding`; it is used below to count tokens per lyric.
encoding = tiktoken.get_encoding(embedding_encoding)

In [None]:
# Tokenize every lyric and record how many tokens it produces.
df['n_tokens'] = [len(encoding.encode(lyric)) for lyric in df['cleaned_lyrics']]

In [None]:
# Keep only the lyrics whose token count does not exceed `max_tokens`, the cap
# chosen for the second generation ada embedding model.
df = df.loc[df['n_tokens'].le(max_tokens)]

In [None]:
# Display (rows, columns) of the frame after the token-limit filter.
df.shape

(35901, 15)

# <a id='toc5_'></a>[Generating Document Embeddings](#toc0_)

Now that we have filtered out all the songs that are above the token limit, we can proceed to access the API and get the document embeddings for our dataset. 

In [None]:
# tenacity provides retry helpers: a request that fails (e.g. because the API rate limit was hit) is retried after a wait instead of aborting the run.
from tenacity import retry, stop_after_attempt, wait_exponential

In [None]:
# Request an Ada embedding for one piece of text.
# The tenacity decorator re-issues a failed request (for example when the rate
# limit is reached), waiting between 4 and 30 seconds with exponential growth,
# and gives up after 5 attempts.
@retry(wait=wait_exponential(multiplier=1, min=4, max=30), stop=stop_after_attempt(5))
def get_embedding(text, engine="text-embedding-ada-002"):
    """Return the embedding of `text` produced by the OpenAI embedding endpoint.

    Parameters
    ----------
    text : the document to embed; it is sent as a single-element input list.
    engine : name of the embedding model passed to the API as `model`.

    Returns
    -------
    The `embedding` field of the first item in the response's `data` list.
    """
    api_response = openai.Embedding.create(model=engine, input=[text])
    first_record = api_response['data'][0]
    return first_record['embedding']

In [None]:
from tqdm import tqdm
# Register tqdm with pandas so that `progress_apply` (used below) is available and shows a progress bar.
tqdm.pandas()

In [None]:
# Embed every lyric with one API request per row; the progress bar tracks the
# run, which is long (see the timing in the output below).
df['ada_embeddings'] = df['cleaned_lyrics'].progress_apply(get_embedding)

100%|██████████| 35901/35901 [2:07:10<00:00,  4.70it/s]  


In [None]:
# Persist the frame, including the new `ada_embeddings` column, for later use.
df.to_csv(path_or_buf=DATA_PATH / 'clean_lyrics_ada.csv')

# <a id='toc6_'></a>[Conclusion](#toc0_)

From the above we have successfully generated document embeddings for all our lyrics using a second generation Ada text embedding from OpenAI. 