In [None]:
import os

# Credentials are read from the environment so real keys never have to be saved
# inside the notebook. Both values fall back to '' when the variable is unset,
# which is the same "not configured" value this cell used before.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', '')
OPENAI_ORG = os.environ.get('OPENAI_ORG', '')

# Using GPT for Product Recommendation Engines

Today we will be using SingleStore Kai™ for MongoDB, along with OpenAI, to put together a simple product recommendation engine in Python.

**note:** this notebook is a WIP and should be completed in the next day or so.

## Import Dataset

First, we'll import a dataset into our database. Let's use [Open Library's Works](https://openlibrary.org/data/ol_dump_works_latest.txt.gz) dataset.

For the sake of brevity for the webinar, we've downloaded the extremely large dataset linked above and restricted it to science-fiction books (`books_scifi.txt`).

### Download Dataset

In [None]:
import requests

dataset_url = 'https://raw.githubusercontent.com/singlestore-labs/webinar-code-examples/main/kai-product-rec/books_scifi.txt'

def download_file(dataset_url, timeout=30):
    """Stream the file at ``dataset_url`` into the current working directory.

    Args:
        dataset_url: URL to fetch. The text after its last '/' is used as the
            local file name.
        timeout: Seconds requests waits for the server before giving up.
            Without a timeout a stalled server would hang the cell forever.

    Returns:
        The local file name the response body was written to.

    Raises:
        requests.HTTPError: if the server answers with an error status
            (raised by ``raise_for_status``).
        requests.Timeout: if the server does not respond within ``timeout``.
    """
    local_filename = dataset_url.split('/')[-1]
    with requests.get(dataset_url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            # Write the body in 8 KiB chunks instead of loading it all at once.
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    return local_filename

local_dataset = download_file(dataset_url)

### Read text file into variable

In [None]:
# Read the downloaded dataset into a list with one entry per line.
# The context manager closes the file even if reading raises.
with open(local_dataset) as file:
    data = file.readlines()

In [None]:
# Budget settings: price in dollars per 1,000 tokens and the spending cap in dollars.
cost_per_1k = 0.004
dollar_limit = 20.00
# The spending cap converted from dollars into tokens.
budget_tokens = dollar_limit / cost_per_1k * 1000
# Running total of tokens consumed so far.
token_usage = 0


def budget_status(token_usage):
    """Return 'ok' while ``token_usage`` is below ``budget_tokens``, else 'spent'."""
    return 'ok' if token_usage < budget_tokens else 'spent'

### Create SQL Table

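Here we are creating the table to store our books in. We will have three columns: an auto-incrementing `_id`, the book `title`, and the book's `embedding` stored as a packed vector in a `BLOB`.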

In [None]:
%%sql
CREATE TABLE IF NOT EXISTS products (
  -- Auto-generated integer key for each stored book.
  _id INT AUTO_INCREMENT PRIMARY KEY,
  -- Book title as inserted by the loader cell.
  title VARCHAR(255) NOT NULL,
  -- Embedding vector; the loader cell inserts it through JSON_ARRAY_PACK_F32.
  embedding BLOB NOT NULL
);

## Helpers

### Track Token Usage

You should be mindful to track your token usage to ensure you don't blow your budget on any particular project. This is rudimentary, but should do the trick for our simple project today.

**Note:** Pricing changes often, check [here](https://openai.com/pricing) and adjust the pricing as necessary.

The `budget_status` function returns 'ok' while our budget hasn't been spent yet, and 'spent' once it has.

### Create Embeddings and Load into S2

Now we need to loop through the dataset to create the embeddings using the OpenAI API. 

This process will track token usage and bail if we've spent our budget.

In [None]:
!pip install openai

import openai
import ast
from sqlalchemy import *

openai.organization = OPENAI_ORG
openai.api_key = OPENAI_API_KEY

conn = create_engine(connection_url)

model_id = 'text-embedding-ada-002'

ds_with_embeddings = []
total_items = len(data)

def request_embedding(text, token_usage):
    """Request an embedding for ``text`` from OpenAI unless the budget is spent.

    Args:
        text: The input string to embed.
        token_usage: Tokens consumed so far; checked with ``budget_status``.

    Returns:
        A ``(embedding, tokens, status)`` tuple where ``status`` is one of:

        * ``'success'`` - ``embedding`` and ``tokens`` come from the API response.
        * ``'failed'`` - the API key is not set or the API call raised;
          ``embedding`` is ``''`` and ``tokens`` is ``0``.
        * ``'budget_spent'`` - the budget check did not return 'ok';
          ``embedding`` is ``''`` and ``tokens`` is ``0``.

        A tuple is returned on every path so callers can always unpack it.
    """
    if budget_status(token_usage) != 'ok':
        print('Budget Spent: {}/{}'.format(token_usage,budget_tokens))
        return '', 0, 'budget_spent'

    if not OPENAI_API_KEY:
        # Report the missing key as a failure instead of falling through and
        # returning None, which would break tuple unpacking in the caller.
        print('You need to set your OpenAI API Key to the variable OPENAI_API_KEY')
        return '', 0, 'failed'

    try:
        response = openai.Embedding.create(input=text,model=model_id)
        embedding = response['data'][0]['embedding']
        tokens = response['usage']['total_tokens']
    except Exception as e:
        # Best effort: print the error and let the caller skip this item.
        print(e)
        return '', 0, 'failed'

    return embedding, tokens, 'success'

def write_to_db(data):
    """Insert one book's title and embedding into the ``products`` table.

    Args:
        data: Mapping with a "title" string and an "embedding" value. The
            embedding is sent as its ``str()`` form for JSON_ARRAY_PACK_F32
            to pack on the database side.

    Both values are sent as bound parameters, so the title is stored verbatim
    (apostrophes included). Failures are printed rather than raised so one bad
    row does not stop the caller's loop.
    """
    # `text` comes from the notebook's `from sqlalchemy import *`.
    query = text(
        "INSERT INTO products (title, embedding) "
        "VALUES (:title, JSON_ARRAY_PACK_F32(:embedding))"
    )
    params = {"title": data["title"], "embedding": str(data["embedding"])}

    try:
        # `conn` is the Engine returned by create_engine(); an Engine is not a
        # context manager itself. begin() opens a connection in a transaction
        # that commits on success and rolls back on error.
        with conn.begin() as connection:
            connection.execute(query, params)
        print("Wrote item")
    except Exception as e:
        print(e)
    

# Number of books embedded and handed to write_to_db so far.
completed_items = 0
print('Requesting embeddings. I will update you every 1000 embeddings.')
for b in data:
    try:
        embedding,tokens,status = request_embedding(b, token_usage)
        if status == 'budget_spent':
            # Every later request would be refused too, so stop here.
            print('Getting embedding failed because the budget is spent.')
            break
        if status == 'failed':
            print('Getting embedding for this book failed:\n{}'.format(b))
            continue

        # Count tokens as soon as the API call succeeds, so a later parse or
        # write error cannot make the budget under-report real usage.
        token_usage += tokens

        # Each dataset line is parsed as a Python literal; write_to_db reads
        # its "title" key.
        book = ast.literal_eval(b)
        book['embedding'] = embedding
        write_to_db(book)

        completed_items += 1
        if completed_items % 1000 == 0:
            print('Completed {}/{}'.format(completed_items,total_items))
            print('Token usage: {}/{}'.format(token_usage,budget_tokens))
    except Exception as e:
        print(e)

# `conn` is an Engine (from create_engine); it has no close() method.
# dispose() releases its pooled connections and the engine stays usable.
conn.dispose()


In [None]:
query = 'The Martian'

# Embed the search phrase with the same helper used for the stored books, and
# count its tokens against the budget.
query_embedding, query_tokens, query_status = request_embedding(query, token_usage)
token_usage += query_tokens

# Rank stored books by Euclidean distance to the query embedding, closest first.
# The column is `embedding` (see the CREATE TABLE cell) and the vector is packed
# with JSON_ARRAY_PACK_F32 to match how rows were inserted.
sql_query = text(
    'SELECT EUCLIDEAN_DISTANCE(embedding, JSON_ARRAY_PACK_F32(:query_embedding)) AS euclidean_distance, title '
    'FROM products '
    'ORDER BY euclidean_distance '
    'LIMIT 5'
)

if query_status == 'success':
    with conn.connect() as connection:
        matches = connection.execute(
            sql_query, {'query_embedding': str(query_embedding)}
        ).fetchall()
    for euclidean_distance, title in matches:
        print('{:.4f}  {}'.format(euclidean_distance, title))
else:
    print('Could not embed the query (status: {})'.format(query_status))

