In [3]:
from openai import OpenAI
import os
from dotenv import load_dotenv
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
import json
import faiss
from IPython.display import display, Markdown

from helpers import flatten_data

In [10]:
"""
Load openai API
"""
# Automatically load environment variables from the .env file in the current working directory
load_dotenv()

# Get the API key from the environment variable
openai_api_key = os.getenv('OPENAI_API_KEY')

client = OpenAI(
    # This is the default and can be omitted
    api_key=openai_api_key,
)

In [4]:
# Read the JSON data from a file
with open('users_products_sales.json', 'r') as file:
    data = json.load(file)

# Convert into list and flattened the key, val
data_strings = flatten_data(data)

In [5]:
"""
Load pre-trained model and tokenizer from HF
"""
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

In [6]:
"""
Generate embeddings
"""
def generate_embeddings(text_list):
    inputs = tokenizer(
        text_list,
        return_tensors='pt',
        padding=True, 
        truncation=True
    )
    with torch.no_grad():
        model_output = model(**inputs)

    return model_output.last_hidden_state.mean(dim=1).numpy()

In [7]:
embeddings = generate_embeddings(data_strings)

In [8]:
"""
Store embedding in FAISS
"""
dimensions = embeddings.shape[1]
# Create index
index = faiss.IndexFlatL2(dimensions)
# Add embedding to index, please have a look at other methods to get familiar
index.add(embeddings)

In [16]:
"""
Query vector database
"""
def query_vector_database(query_text):
    # Generate an embedding for the query
    query_embedding = generate_embeddings([query_text])
    # Seach FAISS index
    distances, indices = index.search(query_embedding, k=3)

    return indices[0], distances[0]

In [25]:
"""
Chat with custom data using chatGPT
"""
def chat_with_custom_data(user_input):
    # Query the vector database
    indices, distances = query_vector_database(user_input)
    # Retrieve corresponding data
    results = [data_strings[i] for i in indices]
    
    # Format the results as reponse
    response_text = "Here are the top results:\n"
 
    # Generate answer using ChatGPT
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": response_text                
            },
            {
                "role": "user",
                "content": user_input + " please write in nice markdown"
            }
    
        ],
        model="gpt-3.5-turbo"
    )
    
    message_content = chat_completion.choices[0].message.content

    return message_content

In [35]:
# Example usage
user_input = "who bought the most and what product sell the most?"
response = chat_with_custom_data(user_input)

In [41]:
user_input = "who is the youngest user?"
indices, distances = query_vector_database(user_input)
# Retrieve corresponding data
results = [data_strings[i] for i in indices]

In [42]:
results

['sale_id: 1005, user_id: 1, sales_product_id: 102, quantity: 1, total_price: 699.99, sale_date: 2023-05-18',
 'sale_id: 1004, user_id: 4, sales_product_id: 104, quantity: 3, total_price: 599.97, sale_date: 2023-04-12',
 'sale_id: 1003, user_id: 3, sales_product_id: 103, quantity: 1, total_price: 299.99, sale_date: 2023-03-05']