In [1]:
#!pip install -r requirements.txt

In [37]:
from openai import OpenAI
import os
from dotenv import load_dotenv
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
import json
import faiss
from IPython.display import display, Markdown

In [3]:
"""
Load openai API
"""
# Automatically load environment variables from the .env file in the current working directory
load_dotenv()

# Get the API key from the environment variable
openai_api_key = os.getenv('OPENAI_API_KEY')

In [4]:
"""
Load JSON data
"""
with open('./simple_vector_db/data.json', 'r') as file:
    data = json.load(file)

In [5]:
"""
Convert each row as string
"""
data_strings = [f"Name: {row['Name']}, Age: {row['Age']}, Department: {row['Department']}, Salary: {row['Salary']}" for row in data]

In [6]:
"""
Load pre-trained model and tokenizer from HF
"""
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

In [7]:
"""
Generate embeddings
"""
def generate_embeddings(text_list):
    inputs = tokenizer(
        text_list,
        return_tensors='pt',
        padding=True, 
        truncation=True
    )
    with torch.no_grad():
        model_output = model(**inputs)

    return model_output.last_hidden_state.mean(dim=1).numpy()

In [8]:
embeddings = generate_embeddings(data_strings)

In [9]:
embeddings

array([[-0.16853712,  0.09782203, -0.3504172 , ..., -0.47833598,
        -0.09226134, -0.02919849],
       [-0.13911068,  0.03090546, -0.21950956, ..., -0.42540738,
        -0.17673936, -0.03296225],
       [-0.3203677 , -0.22014733, -0.06118219, ..., -0.4841047 ,
        -0.39684767,  0.11418969],
       [-0.21524845, -0.04571941,  0.02459079, ..., -0.45589957,
        -0.05748866, -0.01889198]], dtype=float32)

In [10]:
"""
Store embedding in FAISS
"""
dimensions = embeddings.shape[1]
# Create index
index = faiss.IndexFlatL2(dimensions)
# Add embedding to index, please have a look at other methods to get familiar
index.add(embeddings)

In [11]:
"""
Query vector database
"""
def query_vector_database(query_text):
    # Generate an embedding for THE QUERY
    query_embedding = generate_embeddings([query_text])
    # Search FAISS index
    distances, indices = index.search(query_embedding, k=3)

    return indices[0], distances[0]

In [12]:
client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
)

In [45]:
"""
Chat with custom data using chatGPT
"""
# Set up ppenAPI client
client = OpenAI(
    api_key=openai_api_key

)
def chat_with_custom_data(user_input):
    # Query the vector database
    indices, distances = query_vector_database(user_input)
    # Retrieve corresponding data
    results = [data[i] for i in indices]
    
    # Format the results as reponse
    response_text = "Here are the top results:\n"
    for result in results:
        response_text += f"Name: {result['Name']}, Age: {result['Age']}, Department: {result['Department']}, Salary: {result['Salary']}\n"

    # Generate answer using ChatGPT
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": response_text                
            },
            {
                "role": "user",
                "content": user_input + " please write in nice markdown"
            }
    
        ],
        model="gpt-3.5-turbo"
    )
    
    message_content = chat_completion.choices[0].message.content

    return message_content

In [46]:
# Example usage
user_input = "What is the average salary in the IT department?"
response = chat_with_custom_data(user_input)

In [47]:
display(Markdown(response))

The average salary in the IT department is calculated as follows:

(total salary of Bob + total salary of Diana) / total number of employees in the IT department = (70000 + 72000) / 2 = 71000

Therefore, the average salary in the IT department is $71,000.