# Constructing a Knowledge Graph-Based Deep Lake Vector Store for Semantic Search with LlamaIndex and OpenAI

copyright 2024, Denis Rothman

**A Practical Guide to Building a  Graph-Based Semantic Search Engine with Deep Lake, LlamaIndex, and OpenAI**

**Summary**

*   Pipeline 1 : Collecting and preparing the documents
*   Pipeline 2 : Creating and populating a Deep Lake Vector Store
*   Pipeline 3:  Index-based RAG.

**Topics**
*   Knowledge graph index-based semantic search and LLM response
*   Re-ranking
*   Metrics calculations and display




# Installing the environment

In [None]:
#Google Drive option to store API Keys
#Store you key in a file and read it(you can type it directly in the notebook but it will be visible for somebody next to you)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
import PIL
import subprocess

# Check current version of Pillow
current_version = PIL.__version__

# Define the required version
required_version = "10.2.0"

# Function to parse version strings
def version_tuple(version):
    return tuple(map(int, (version.split("."))))

# Compare current and required version
if version_tuple(current_version) < version_tuple(required_version):
    print(f"Current Pillow version {current_version} is less than {required_version}. Updating...")
    # Uninstall current version of Pillow
    subprocess.run(['pip', 'uninstall', 'pillow', '-y'])
    # Install the required version of Pillow
    subprocess.run(['pip', 'install', f'pillow=={required_version}'])
else:
    print(f"Current Pillow version {current_version} meets the requirement.")

Current Pillow version 10.4.0 meets the requirement.


Restart session before continuing to meet the Pillow version requirement:   
Go to Runtime->Restart Session  
You can then select Run all or run the program cell by cell.

In [None]:
!pip install llama-index-vector-stores-deeplake==0.1.2

Collecting llama-index-vector-stores-deeplake==0.1.2
  Downloading llama_index_vector_stores_deeplake-0.1.2-py3-none-any.whl (4.3 kB)
Collecting llama-index-core<0.11.0,>=0.10.1 (from llama-index-vector-stores-deeplake==0.1.2)
  Downloading llama_index_core-0.10.52.post2-py3-none-any.whl (15.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.4/15.4 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json (from llama-index-core<0.11.0,>=0.10.1->llama-index-vector-stores-deeplake==0.1.2)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl (28 kB)
Collecting deprecated>=1.2.9.3 (from llama-index-core<0.11.0,>=0.10.1->llama-index-vector-stores-deeplake==0.1.2)
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)
Collecting dirtyjson<2.0.0,>=1.0.8 (from llama-index-core<0.11.0,>=0.10.1->llama-index-vector-stores-deeplake==0.1.2)
  Downloading dirtyjson-1.0.8-py3-none-any.whl (25 kB)
Collecting httpx (from llama-index-core<0.11.0,>=0.10.1->l

LlamaIndex supports Deep Lake vector stores through the DeepLakeVectorStore class.

In [None]:
!pip install deeplake==3.9.8

Collecting deeplake==3.9.8
  Downloading deeplake-3.9.8.tar.gz (593 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m593.5/593.5 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting boto3 (from deeplake==3.9.8)
  Downloading boto3-1.34.140-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.2/139.2 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
Collecting pathos (from deeplake==3.9.8)
  Downloading pathos-0.3.2-py3-none-any.whl (82 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.1/82.1 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting humbug>=0.3.1 (from deeplake==3.9.8)
  Downloading humbug-0.3.2-py3-none-any.whl (15 kB)
Collecting lz4 (from deeplake==3.9.8)
  Downloading lz4-4.3.3-cp310-cp310-manylinux_2_17_x86

In [None]:
!pip install llama-index==0.10.37

Collecting llama-index==0.10.37
  Downloading llama_index-0.10.37-py3-none-any.whl (6.8 kB)
Collecting llama-index-agent-openai<0.3.0,>=0.1.4 (from llama-index==0.10.37)
  Downloading llama_index_agent_openai-0.2.7-py3-none-any.whl (12 kB)
Collecting llama-index-cli<0.2.0,>=0.1.2 (from llama-index==0.10.37)
  Downloading llama_index_cli-0.1.12-py3-none-any.whl (26 kB)
Collecting llama-index-embeddings-openai<0.2.0,>=0.1.5 (from llama-index==0.10.37)
  Downloading llama_index_embeddings_openai-0.1.10-py3-none-any.whl (6.2 kB)
Collecting llama-index-indices-managed-llama-cloud<0.2.0,>=0.1.2 (from llama-index==0.10.37)
  Downloading llama_index_indices_managed_llama_cloud-0.1.6-py3-none-any.whl (6.7 kB)
Collecting llama-index-legacy<0.10.0,>=0.9.48 (from llama-index==0.10.37)
  Downloading llama_index_legacy-0.9.48-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting llama-index-ll

Next, let's import the required modules and set the needed environmental variables:

In [None]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Document
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
from llama_index.core import StorageContext



In [None]:
!pip install pyvis==0.3.2

Collecting pyvis==0.3.2
  Downloading pyvis-0.3.2-py3-none-any.whl (756 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m756.0/756.0 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Collecting jedi>=0.16 (from ipython>=5.3.0->pyvis==0.3.2)
  Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jedi, pyvis
Successfully installed jedi-0.19.1 pyvis-0.3.2


In [None]:
#Retrieving and setting the OpenAI API key
f = open("drive/MyDrive/files/api_key.txt", "r")
API_KEY=f.readline()
f.close()

#The OpenAI KeyActiveloop and OpenAI API keys
import os
import openai
os.environ['OPENAI_API_KEY'] =API_KEY
openai.api_key = os.getenv("OPENAI_API_KEY")

In [None]:
#Retrieving and setting the Activeloop API token
f = open("drive/MyDrive/files/activeloop.txt", "r")
API_token=f.readline()
f.close()
ACTIVELOOP_TOKEN=API_token
os.environ['ACTIVELOOP_TOKEN'] =ACTIVELOOP_TOKEN

In [None]:
# For Google Colab and Activeloop while waiting for Activeloop (April 2024) pending new version
#This line writes the string "nameserver 8.8.8.8" to the file. This is specifying that the DNS server the system
#should use is at the IP address 8.8.8.8, which is one of Google's Public DNS servers.
with open('/etc/resolv.conf', 'w') as file:
   file.write("nameserver 8.8.8.8")

## GitHub

In [9]:
def download(directory, filename):
    # The base URL of the image files in the GitHub repository
    base_url = 'https://raw.githubusercontent.com/Denis2054/RAG-Driven-Generative-AI/main/'

    # Complete URL for the file
    file_url = f"{base_url}{directory}/{filename}"

    # Use curl to download the file, including an Authorization header for the private token
    try:
        # Prepare the curl command with the Authorization header
        curl_command = f'curl -o {filename} {file_url}'

        # Execute the curl command
        subprocess.run(curl_command, check=True, shell=True)
        print(f"Downloaded '{filename}' successfully.")
    except subprocess.CalledProcessError:
        print(f"Failed to download '{filename}'. Check the URL, your internet connection and the file path")

# Scenario


In [10]:
#File name for file management
graph_name="Marketing"

# Path for vector store and dataset
db="hub://denis76/marketing01"
vector_store_path = db
dataset_path = db

#if True upserts data; if False, passes upserting and goes to connection
pop_vs=True
# if pop_vs==True, overwrite=True will overwrite dataset, False will append it:
ow=True

# Pipeline 1 : Collecting and preparing the documents

In [5]:
!mkdir data

In [11]:
# Define your variables
if pop_vs==True:
  directory = "Chapter07/citations"
  file_name = graph_name+"_urls.txt"
  download(directory,file_name)

Downloaded 'Marketing_urls.txt' successfully.


In [None]:
# Read URLs from the file
import requests
from bs4 import BeautifulSoup
import re
import os

if pop_vs==True:
  directory = "Chapter07/citations"
  file_name = graph_name+"_urls.txt"

  with open(file_name, 'r') as file:
      urls = [line.strip() for line in file]

  # Display the URLs
  print("Read URLs:")
  for url in urls:
      print(url)

In [None]:
import requests
import re
import os
from bs4 import BeautifulSoup

def clean_text(content):
    # Remove references and unwanted characters
    content = re.sub(r'\[\d+\]', '', content)   # Remove references
    content = re.sub(r'[^\w\s\.]', '', content)  # Remove punctuation (except periods)
    return content

def fetch_and_clean(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise exception for bad responses (e.g., 404)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Prioritize "mw-parser-output" but fall back to "content" class if not found
        content = soup.find('div', {'class': 'mw-parser-output'}) or soup.find('div', {'id': 'content'})
        if content is None:
            return None

        # Remove specific sections, including nested ones
        for section_title in ['References', 'Bibliography', 'External links', 'See also', 'Notes']:
            section = content.find('span', id=section_title)
            while section:
                for sib in section.parent.find_next_siblings():
                    sib.decompose()
                section.parent.decompose()
                section = content.find('span', id=section_title)

        # Extract and clean text
        text = content.get_text(separator=' ', strip=True)
        text = clean_text(text)
        return text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching content from {url}: {e}")
        return None  # Return None on error

if pop_vs==True:
  # Directory to store the output files
  output_dir = './data/'  # More descriptive name
  os.makedirs(output_dir, exist_ok=True)

  # Processing each URL (and skipping invalid ones)
  for url in urls:
      article_name = url.split('/')[-1].replace('.html', '')  # Handle .html extension
      filename = os.path.join(output_dir, f"{article_name}.txt")

      clean_article_text = fetch_and_clean(url)
      if clean_article_text:  # Only write to file if content exists
          with open(filename, 'w', encoding='utf-8') as file:
              file.write(clean_article_text)
  print(f"Content(ones that were possible) written to files in the '{output_dir}' directory.")

In [None]:
if pop_vs==True:
  # load documents
  documents = SimpleDirectoryReader("./data/").load_data()
  # Print the first document
  print(documents[0])

# Pipeline 2 : Creating and populating the Deep Lake Vector Store

In [None]:
if pop_vs==True:
    # Create an index over the documents
    # overwrite=True will overwrite dataset, False will append it
    if ow==True:
       vector_store = DeepLakeVectorStore(dataset_path=dataset_path, overwrite=True)
    else:
        vector_store = DeepLakeVectorStore(dataset_path=dataset_path, overwrite=False)

    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    # Create an index over the documents
    index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)

In [None]:
import deeplake
ds = deeplake.load(dataset_path)  # Loads the dataset

|

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/denis76/marketing01



|

hub://denis76/marketing01 loaded successfully.



 

In [None]:
ds.summary()

Dataset(path='hub://denis76/marketing01', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype       shape      dtype  compression
  -------    -------     -------    -------  ------- 
 embedding  embedding  (468, 1536)  float32   None   
    id        text      (468, 1)      str     None   
 metadata     json      (468, 1)      str     None   
   text       text      (468, 1)      str     None   


In [None]:
import json
import pandas as pd
import numpy as np

# Assuming 'ds' is your loaded Deep Lake dataset

# Create a dictionary to hold the data
data = {}

# Iterate through the tensors in the dataset
for tensor_name in ds.tensors:
    tensor_data = ds[tensor_name].numpy()

    # Check if the tensor is multi-dimensional
    if tensor_data.ndim > 1:
        # Flatten multi-dimensional tensors
        data[tensor_name] = [np.array(e).flatten().tolist() for e in tensor_data]
    else:
        # Convert 1D tensors directly to lists and decode text
        if tensor_name == "text":
            data[tensor_name] = [t.tobytes().decode('utf-8') if t else "" for t in tensor_data]
        else:
            data[tensor_name] = tensor_data.tolist()

# Create a Pandas DataFrame from the dictionary
df = pd.DataFrame(data)

In [None]:
# Function to display a selected record
def display_record(record_number):
    record = df.iloc[record_number]
    display_data = {
        "ID": record.get("id", "N/A"),
        "Metadata": record.get("metadata", "N/A"),
        "Text": record.get("text", "N/A"),
        "Embedding": record.get("embedding", "N/A")
    }

    # Print the ID
    print("ID:")
    print(display_data["ID"])
    print()

    # Print the metadata in a structured format
    print("Metadata:")
    metadata = display_data["Metadata"]
    if isinstance(metadata, list):
        for item in metadata:
            for key, value in item.items():
                print(f"{key}: {value}")
            print()
    else:
        print(metadata)
    print()

    # Print the text
    print("Text:")
    print(display_data["Text"])
    print()

    # Print the embedding
    print("Embedding:")
    print(display_data["Embedding"])
    print()

# Example usage
rec = 0  # Replace with the desired record number
display_record(rec)


ID:
['a61734be-fe23-421e-9a8b-db6593c48e08']

Metadata:
file_path: /content/data/24-hour_news_cycle.txt
file_name: 24-hour_news_cycle.txt
file_type: text/plain
file_size: 2763
creation_date: 2024-07-05
last_modified_date: 2024-07-05
_node_content: {"id_": "a61734be-fe23-421e-9a8b-db6593c48e08", "embedding": null, "metadata": {"file_path": "/content/data/24-hour_news_cycle.txt", "file_name": "24-hour_news_cycle.txt", "file_type": "text/plain", "file_size": 2763, "creation_date": "2024-07-05", "last_modified_date": "2024-07-05"}, "excluded_embed_metadata_keys": ["file_name", "file_type", "file_size", "creation_date", "last_modified_date", "last_accessed_date"], "excluded_llm_metadata_keys": ["file_name", "file_type", "file_size", "creation_date", "last_modified_date", "last_accessed_date"], "relationships": {"1": {"node_id": "d77731c4-0a8b-46bc-a534-89122924b04e", "node_type": "4", "metadata": {"file_path": "/content/data/24-hour_news_cycle.txt", "file_name": "24-hour_news_cycle.txt", "f

## Original documents

In [None]:
# Ensure 'text' column is of type string
df['text'] = df['text'].astype(str)
# Create documents with IDs
documents = [Document(text=row['text'], doc_id=str(row['id'])) for _, row in df.iterrows()]

# Pipeline 3:Knowledge Graph Index-based RAG

## Generating the Knowledge Graph Index

In [None]:
from llama_index.core import KnowledgeGraphIndex
import time
# Start the timer
start_time = time.time()

#graph index with embeddings
graph_index = KnowledgeGraphIndex.from_documents(
    documents,
    max_triplets_per_chunk=2,
    include_embeddings=True,
)

# Stop the timer
end_time = time.time()

# Calculate and print the execution time
elapsed_time = end_time - start_time
print(f"Index creation time: {elapsed_time:.4f} seconds")

Index creation time: 365.2075 seconds


In [None]:
print(type(graph_index))

<class 'llama_index.core.indices.knowledge_graph.base.KnowledgeGraphIndex'>


In [None]:
#similarity_top_k
k=3
#temperature
temp=0.1
#num_output
mt=1024
graph_query_engine = graph_index.as_query_engine(similarity_top_k=k, temperature=temp, num_output=mt)

### Displaying the graph

In [None]:
## create graph
from pyvis.network import Network

g = graph_index.get_networkx_graph()
net = Network(notebook=True, cdn_resources="in_line", directed=True)
net.from_nx(g)

# Set node and edge properties: colors and sizes
for node in net.nodes:
    node['color'] = 'lightgray'
    node['size'] = 10

for edge in net.edges:
    edge['color'] = 'black'
    edge['width'] = 1

In [None]:
fgraph="Knowledge_graph_"+ graph_name + ".html"
net.write_html(fgraph)
print(fgraph)

Knowledge_graph_Marketing.html


In [None]:
from IPython.display import HTML

# Load the HTML content from a file and display it
with open(fgraph, 'r') as file:
    html_content = file.read()

# Display the HTML in the notebook
display(HTML(html_content))

## Interacting with the Knowledge graph index

### User input and RAG functions

In [None]:
import time
import textwrap

def execute_query(user_input, k=3, temp=0.1, mt=1024):

    # Start the timer
    start_time = time.time()

    # Execute the query with additional parameters
    response = graph_query_engine.query(user_input)

    # Stop the timer
    end_time = time.time()

    # Calculate and print the execution time
    elapsed_time = end_time - start_time
    print(f"Query execution time: {elapsed_time:.4f} seconds")

    # Print the response, wrapped to 100 characters per line
    print(textwrap.fill(str(response), 100))
    return response

In [None]:
user_query="What is the primary goal of marketing for the consumer market?"

In [None]:
import time
import textwrap
import sys
import io
# Start the timer
start_time = time.time()
# Capture the output
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
response = execute_query(user_query)
# Restore stdout
sys.stdout = old_stdout
# Stop the timer
end_time = time.time()
# Calculate and print the execution time
elapsed_time = end_time - start_time
print(f"Query execution time: {elapsed_time:.4f} seconds")

print(textwrap.fill(str(response), 100))

Query execution time: 1.9763 seconds
The primary goal of marketing for the consumer market is to attract and retain customers by
effectively reaching them with messages that are relevant to their interests and needs. This
involves creating brand awareness, engaging consumers with products or services that align with
their preferences, and ultimately driving them to make purchase decisions that satisfy their desires
and requirements.


## Installing the similarity score packages and defining the functions

Install the package(s) that fit your project.

In [None]:
from google.colab import userdata
userdata.get('HF_TOKEN')

In [None]:
!pip install sentence-transformers==3.0.1

Collecting sentence-transformers==3.0.1
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/227.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m225.3/227.1 kB[0m [31m8.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers==3.0.1)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers==3.0.1)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers==3.0.1)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-many

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

def calculate_cosine_similarity_with_embeddings(text1, text2):
    embeddings1 = model.encode(text1)
    embeddings2 = model.encode(text2)
    similarity = cosine_similarity([embeddings1], [embeddings2])
    return similarity[0][0]

In [None]:
import time
import textwrap
import sys
import io

# Re-ranking

In [None]:
user_query="Which experts are often associated with marketing theory?"
# Start the timer
start_time = time.time()
# Capture the output
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
response = execute_query(user_query)
# Restore stdout
sys.stdout = old_stdout
# Stop the timer
end_time = time.time()
# Calculate and print the execution time
elapsed_time = end_time - start_time
print(f"Query execution time: {elapsed_time:.4f} seconds")

print(textwrap.fill(str(response), 100))

Query execution time: 1.5566 seconds
Marketing researchers are often associated with marketing theory.


In [None]:
# prompt: read the code above and write a python code to find out what type of object "response" is
type(response)

llama_index.core.base.response.schema.Response

In [None]:
text2=user_query # User query scenario
#text2=user_query # human feedbak

In [None]:
# Assuming 'response' is the object containing the source_nodes
best_rank=""
best_score=0
best_text=""
for idx, node_with_score in enumerate(response.source_nodes):
    node = node_with_score.node
    print(f"Node {idx + 1}:")
    print(f"Score: {node_with_score.score}")
    print(f"ID to rank: {node.id_}")
    print("Relationships:")
    for relationship, info in node.relationships.items():
        print(f"  Relationship: {relationship}")
        print(f"    Node ID: {info.node_id}")
        print(f"    Node Type: {info.node_type}")
        print(f"    Metadata: {info.metadata}")
        print(f"    Hash: {info.hash}")
    #print(f"Text to rank: {node.text}")
    print(textwrap.fill(str(node.text), 100))
    print(f"Mimetype: {node.mimetype}")
    print(f"Start Char Index: {node.start_char_idx}")
    print(f"End Char Index: {node.end_char_idx}")
    print(f"Text Template: {node.text_template}")
    print(f"Metadata Template: {node.metadata_template}")
    print(f"Metadata Separator: {node.metadata_seperator}")
    text1=node.text
    #text2=user_query
    similarity_score3=calculate_cosine_similarity_with_embeddings(text1, text2)
    print(f"Cosine Similarity Score with sentence transformer: {similarity_score3:.3f}")
    if similarity_score3>best_score:
      best_score=similarity_score3
      best_rank=idx + 1
      best_text=node.text
      print(f"Best Rank: {best_rank}")
      print(f"Best Score: {best_score}")
      print(f"Best Text: {best_text}")
    print("\n" + "="*40 + "\n")

print(f"Best Rank: {best_rank}")
print(f"Best Score: {best_score}")
#print(f"Best Text: {best_text}")
print(textwrap.fill(str(best_text), 100))

Node 1:
Score: 1000.0
ID to rank: 9c57de6e-04a3-4dc1-8c21-947ca9f7657e
Relationships:
  Relationship: 1
    Node ID: ['ca7aedf7-09e6-4b4d-98af-bf0010ce57c8']
    Node Type: 4
    Metadata: {}
    Hash: e81b5402c44be87570f97f9273bacbb1cc29cc31e8c79c0abe4950d93b935882
['In the absence of relevant information consumers response to marketing programs cannot be
predicted reliably or accurately. Ongoing marketing research programs provide information on
controllable and noncontrollable factors and consumers this information enhances the effectiveness
of decisions made by marketing managers.  Traditionally marketing researchers were responsible for
providing the relevant information and marketing decisions were made by the managers. However the
roles are changing and marketing researchers are becoming more involved in decision making whereas
marketing managers are becoming more involved with research. The role of marketing research in
managerial decision making is explained further using the 

# Examples for metrics

In [None]:
import numpy as np
import sys
# create an empty array score human feedback scores:
rscores =[]
# create an empty score for similarity function scores
scores=[]

## 1

In [None]:
user_query="Which experts are often associated with marketing theory?"
# Start the timer
start_time = time.time()
# Capture the output
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
response = execute_query(user_query)
# Restore stdout
sys.stdout = old_stdout
# Stop the timer
end_time = time.time()
# Calculate and print the execution time
elapsed_time = end_time - start_time
print(f"Query execution time: {elapsed_time:.4f} seconds")
print(textwrap.fill(str(response), 100))

In [None]:
text1=str(response)
text2=user_query
similarity_score3=calculate_cosine_similarity_with_embeddings(text1, text2)
print(f"Cosine Similarity Score with sentence transformer: {similarity_score3:.3f}")
scores.append(similarity_score3)
human_feedback=0.75
rscores.append(human_feedback)

## 2

In [None]:
user_query="How does marketing boost sales?"
# Start the timer
start_time = time.time()
# Capture the output
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
response = execute_query(user_query)
# Restore stdout
sys.stdout = old_stdout
# Stop the timer
end_time = time.time()
# Calculate and print the execution time
elapsed_time = end_time - start_time
print(f"Query execution time: {elapsed_time:.4f} seconds")
print(textwrap.fill(str(response), 100))

In [None]:
text1=str(response)
text2=user_query
similarity_score3=calculate_cosine_similarity_with_embeddings(text1, text2)
print(f"Cosine Similarity Score with sentence transformer: {similarity_score3:.3f}")
scores.append(similarity_score3)
human_feedback=0.5
rscores.append(human_feedback)

## 3

In [None]:
user_query="What is the difference between B2B and B2C?"
# Start the timer
start_time = time.time()
# Capture the output
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
response = execute_query(user_query)
# Restore stdout
sys.stdout = old_stdout
# Stop the timer
end_time = time.time()
# Calculate and print the execution time
elapsed_time = end_time - start_time
print(f"Query execution time: {elapsed_time:.4f} seconds")
print(textwrap.fill(str(response), 100))

In [None]:
text1=str(response)
text2=user_query
similarity_score3=calculate_cosine_similarity_with_embeddings(text1, text2)
print(f"Cosine Similarity Score with sentence transformer: {similarity_score3:.3f}")
scores.append(similarity_score3)
human_feedback=0.8
rscores.append(human_feedback)

## 4

In [None]:
user_query="What are the 4Ps? What do they stand for?"
# Start the timer
start_time = time.time()
# Capture the output
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
response = execute_query(user_query)
# Restore stdout
sys.stdout = old_stdout
# Stop the timer
end_time = time.time()
# Calculate and print the execution time
elapsed_time = end_time - start_time
print(f"Query execution time: {elapsed_time:.4f} seconds")
print(textwrap.fill(str(response), 100))

In [None]:
text1=str(response)
text2=user_query
similarity_score3=calculate_cosine_similarity_with_embeddings(text1, text2)
print(f"Cosine Similarity Score with sentence transformer: {similarity_score3:.3f}")
scores.append(similarity_score3)
human_feedback=0.9
rscores.append(human_feedback)

## 5

In [None]:
user_query="What are the 4Cs? What do they stand for?"
# Start the timer
start_time = time.time()
# Capture the output
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
response = execute_query(user_query)
# Restore stdout
sys.stdout = old_stdout
# Stop the timer
end_time = time.time()
# Calculate and print the execution time
elapsed_time = end_time - start_time
print(f"Query execution time: {elapsed_time:.4f} seconds")
print(textwrap.fill(str(response), 100))

In [None]:
text1=str(response)
text2=user_query
similarity_score3=calculate_cosine_similarity_with_embeddings(text1, text2)
print(f"Cosine Similarity Score with sentence transformer: {similarity_score3:.3f}")
scores.append(similarity_score3)
human_feedback=0.65
rscores.append(human_feedback)

## 6

In [None]:
user_query="What is the difference between the 4Ps and 4Cs?"
# Start the timer
start_time = time.time()
# Capture the output
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
response = execute_query(user_query)
# Restore stdout
sys.stdout = old_stdout
# Stop the timer
end_time = time.time()
# Calculate and print the execution time
elapsed_time = end_time - start_time
print(f"Query execution time: {elapsed_time:.4f} seconds")
print(textwrap.fill(str(response), 100))

In [None]:
text1=str(response)
text2=user_query
similarity_score3=calculate_cosine_similarity_with_embeddings(text1, text2)
print(f"Cosine Similarity Score with sentence transformer: {similarity_score3:.3f}")
scores.append(similarity_score3)
human_feedback=0.8
rscores.append(human_feedback)

## 7

In [None]:
user_query="What commodity programs does the Agricultural Marketing Service (AMS) maintain?"
# Start the timer
start_time = time.time()
# Capture the output
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
response = execute_query(user_query)
# Restore stdout
sys.stdout = old_stdout
# Stop the timer
end_time = time.time()
# Calculate and print the execution time
elapsed_time = end_time - start_time
print(f"Query execution time: {elapsed_time:.4f} seconds")
print(textwrap.fill(str(response), 100))

In [None]:
text1=str(response)
text2=user_query
similarity_score3=calculate_cosine_similarity_with_embeddings(text1, text2)
print(f"Cosine Similarity Score with sentence transformer: {similarity_score3:.3f}")
scores.append(similarity_score3)
human_feedback=0.9
rscores.append(human_feedback)

## 8

In [None]:
user_query="What kind of marketing is Got Milk?"
# Start the timer
start_time = time.time()
# Capture the output
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
response = execute_query(user_query)
# Restore stdout
sys.stdout = old_stdout
# Stop the timer
end_time = time.time()
# Calculate and print the execution time
elapsed_time = end_time - start_time
print(f"Query execution time: {elapsed_time:.4f} seconds")
print(textwrap.fill(str(response), 100))

In [None]:
text1=str(response)
text2=user_query
similarity_score3=calculate_cosine_similarity_with_embeddings(text1, text2)
print(f"Cosine Similarity Score with sentence transformer: {similarity_score3:.3f}")
scores.append(similarity_score3)
human_feedback=0.2
rscores.append(human_feedback)

## 9

In [None]:
user_query="What an is industry trade group, business association, sector association or industry body?"
# Start the timer
start_time = time.time()
# Capture the output
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
response = execute_query(user_query)
# Restore stdout
sys.stdout = old_stdout
# Stop the timer
end_time = time.time()
# Calculate and print the execution time
elapsed_time = end_time - start_time
print(f"Query execution time: {elapsed_time:.4f} seconds")
print(textwrap.fill(str(response), 100))

In [None]:
text1=str(response)
text2=user_query
similarity_score3=calculate_cosine_similarity_with_embeddings(text1, text2)
print(f"Cosine Similarity Score with sentence transformer: {similarity_score3:.3f}")
scores.append(similarity_score3)
human_feedback=0.2
rscores.append(human_feedback)

## 10

In [None]:
user_query="How many members are there in the American Marketing Association (AMA), theassociation for marketing professionals?"
# Start the timer
start_time = time.time()
# Capture the output
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
response = execute_query(user_query)
# Restore stdout
sys.stdout = old_stdout
# Stop the timer
end_time = time.time()
# Calculate and print the execution time
elapsed_time = end_time - start_time
print(f"Query execution time: {elapsed_time:.4f} seconds")
print(textwrap.fill(str(response), 100))

In [None]:
text1=str(response)
text2=user_query
similarity_score3=calculate_cosine_similarity_with_embeddings(text1, text2)
print(f"Cosine Similarity Score with sentence transformer: {similarity_score3:.3f}")
scores.append(similarity_score3)
human_feedback=0.9
rscores.append(human_feedback)

## Metrics calculation and display

In [None]:
print(len(scores), scores)
print(len(rscores), rscores)

Mean, Median, Standard Deviation, Variance, Minimum, Maximum, Range, Percentile (Q1), 75th Percentile and Interquartile Range (IQR)

In [None]:
# Calculating metrics
mean_score = np.mean(scores)
median_score = np.median(scores)
std_deviation = np.std(scores)
variance = np.var(scores)
min_score = np.min(scores)
max_score = np.max(scores)
range_score = max_score - min_score
percentile_25 = np.percentile(scores, 25)
percentile_75 = np.percentile(scores, 75)
iqr = percentile_75 - percentile_25

# Printing the metrics with 2 decimals
print(f"Mean: {mean_score:.2f}")
print(f"Median: {median_score:.2f}")
print(f"Standard Deviation: {std_deviation:.2f}")
print(f"Variance: {variance:.2f}")
print(f"Minimum: {min_score:.2f}")
print(f"Maximum: {max_score:.2f}")
print(f"Range: {range_score:.2f}")
print(f"25th Percentile (Q1): {percentile_25:.2f}")
print(f"75th Percentile (Q3): {percentile_75:.2f}")
print(f"Interquartile Range (IQR): {iqr:.2f}")