# Embedding-Based Retrieval with Activeloop and OpenAI

Copyright 2024 Denis Rothman



# 1. Installing the environment

Mount a drive or implement the method that best fits your project to retrieve API tokens.

In [1]:
# Google Drive option to store API keys.
# Store your key in a file and read it (you can type it directly in the notebook,
# but it will then be visible to anybody next to you).
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


grequests.py contains a function to download files from GitHub

In [2]:
#GitHub grequests.py
#Script to download files from the GitHub repository.
#A token is only needed while the repository is private. It is read from the
#GITHUB_TOKEN environment variable so that it is never stored in the notebook.
#NOTE: the token that used to be hard-coded in this cell has been exposed and
#should be revoked.

import os
import subprocess

# Optional private token: set GITHUB_TOKEN before running this cell if required
private_token = os.environ.get("GITHUB_TOKEN", "").strip()
url = "https://raw.githubusercontent.com/Denis2054/RAG-Driven-Generative-AI/main/commons/grequests.py"
output_file = "grequests.py"

# Prepare the curl command.
# --fail makes curl exit with a non-zero status on HTTP errors (401, 404, ...);
# without it the error page is written to output_file and check=True never fires.
curl_command = ["curl", "--fail", "-o", output_file]
if private_token:
    # Only send the Authorization header when a token is actually available
    curl_command += ["-H", f"Authorization: token {private_token}"]
curl_command.append(url)

# Execute the curl command
try:
    subprocess.run(curl_command, check=True)
    print("Download successful.")
except subprocess.CalledProcessError:
    print("Failed to download the file.")


Download successful.


### List of Dependencies for `deeplake`:

deeplake==3.9.2 has a main package and a list of dependencies:

- **deeplake (3.9.2)**: Main package being installed.
- **numpy (1.25.2)**: Required for numerical operations within deeplake.
- **pillow (10.2.0)**: Used for image manipulation and processing.
- **boto3 (1.34.69)**: Amazon Web Services (AWS) SDK for Python, used for working with AWS services.
- **pathos (0.3.2)**: Provides utilities for parallel processing.
- **humbug (0.3.2)**: Reporting tool for bugs and usage metrics.
- **lz4 (4.3.3)**: Provides LZ4 compression for fast data packing.
- **pyjwt (2.3.0)**: Allows for encoding, decoding, and verification of JWTs.
- **pydantic (2.7.1)**: Data validation by using Python type hints.
- **libdeeplake (0.0.123)**: Likely a core library for deeplake's functionalities.
- **aioboto3 (12.4.0)**: Asynchronous SDK for AWS services, allowing non-blocking AWS operations.
- **dill (0.3.8)**: Extends python’s `pickle` module for serializing and deserializing python objects.
- **aiobotocore (2.12.3)**: Core component of aioboto3, providing low-level interface to AWS services asynchronously.
- **aioitertools (0.11.0)**: Tools and helper functions to make async iterations easy.
- **botocore (1.34.69)**: Low-level interface to AWS, used by boto3 and aiobotocore.
- **jmespath (1.0.1)**: Allows declarative JSON querying.
- **s3transfer (0.10.1)**: Amazon S3 Transfer Manager for boto3.
- **ppft (1.7.6.8)**, **pox (0.3.4)**, **multiprocess (0.70.16)**: Components of pathos, handling parallel processing infrastructure.

### May 3, 2024: Explanation of the Pillow version requirement, which conflicts with Google Colab until Google Colab upgrades Pillow

**Pillow (10.2.0)** is specified in the deeplake dependencies as `pillow~=10.2.0`. This version specifier means that deeplake requires a version of Pillow that is compatible with 10.2.0. The tilde equals (`~=`) is a version specifier that matches the most recent version that is compatible with the specified version, meaning it allows versions that are the same as 10.2.x where x is equal to or greater than 0, but less than the next significant release which would be 10.3.

This specific version constraint ensures that deeplake uses features or functionalities that are present only in the Pillow 10.2.x releases. If Pillow were to be updated beyond this range, it could potentially introduce breaking changes that deeplake may not be compatible with, or it could remove features that deeplake relies on. The constraint thereby protects the application from unexpected behavior or errors due to incompatible library versions.

Checking whether Google Colab has the right version of Pillow. If not, Pillow is uninstalled and a version compatible with Deep Lake is installed.

In [3]:
import PIL
import subprocess

# Pillow version that the check below compares against
required_version = "10.2.0"

# Pillow version currently importable in this kernel
current_version = PIL.__version__

# Function to parse version strings
def version_tuple(version):
    """Convert a dotted version string into a tuple of ints for comparison.

    Only the leading numeric release components are kept, so versions that
    carry a suffix no longer raise ValueError:
    "10.2.0" -> (10, 2, 0), "10.3.0.dev0" -> (10, 3, 0), "9.5.0rc1" -> (9, 5, 0).
    Pre-release/dev markers are ignored, i.e. "10.2.0rc1" compares equal to
    "10.2.0".

    Args:
        version: Version string such as PIL.__version__.

    Returns:
        Tuple of ints holding the numeric release components, in order.
    """
    import re  # local import: the notebook does not import re at the top

    parts = []
    for component in version.strip().split("."):
        leading_digits = re.match(r"[0-9]+", component)
        if leading_digits is None:
            # Non-numeric component ("dev0", "post1", ...): release part is over
            break
        parts.append(int(leading_digits.group()))
        if leading_digits.end() != len(component):
            # Component like "0rc1": keep the number, drop the suffix and stop
            break
    return tuple(parts)

# Compare current and required version
import sys  # needed to run pip with the same interpreter as this kernel

if version_tuple(current_version) < version_tuple(required_version):
    print(f"Current Pillow version {current_version} is less than {required_version}. Updating...")
    # "python -m pip" guarantees the packages are changed in the environment of
    # the running kernel; a bare "pip" may resolve to another Python installation.
    pip_command = [sys.executable, '-m', 'pip']
    # Uninstall current version of Pillow
    subprocess.run(pip_command + ['uninstall', 'pillow', '-y'])
    # Install the required version of Pillow
    subprocess.run(pip_command + ['install', f'pillow=={required_version}'])
    # NOTE: PIL was already imported in this kernel, so the previously loaded
    # module stays in memory until the runtime is restarted.
else:
    print(f"Current Pillow version {current_version} meets the requirement.")

Current Pillow version 9.4.0 is less than 10.2.0. Updating...


In [4]:
!pip install deeplake==3.9.2

Collecting deeplake==3.9.2
  Downloading deeplake-3.9.2.tar.gz (590 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/590.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/590.0 kB[0m [31m1.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━[0m [32m501.8/590.0 kB[0m [31m7.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.0/590.0 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting boto3 (from deeplake==3.9.2)
  Downloading boto3-1.34.96-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
Collecting pathos 

In [5]:
!pip install openai==1.25.0

Collecting openai==1.25.0
  Downloading openai-1.25.0-py3-none-any.whl (312 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m312.9/312.9 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai==1.25.0)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai==1.25.0)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai==1.25.0)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: h11, httpcore, httpx, openai
Successfully insta

In [6]:
# Workaround for Google Colab and Activeloop (April 2024) while a new Activeloop version is pending.
# The file /etc/resolv.conf is overwritten with the single entry "nameserver 8.8.8.8",
# which tells the system to use the DNS server at 8.8.8.8, one of Google's Public DNS servers.
with open('/etc/resolv.conf', 'w') as resolv_conf:
    resolv_conf.write("nameserver 8.8.8.8")

In [7]:
#Retrieving and setting the OpenAI API key
# readline() keeps the trailing newline of the first line; strip it so the key
# does not carry whitespace into the environment variable and request headers.
# The with-block closes the file even if reading fails.
with open("drive/MyDrive/files/api_key.txt", "r") as f:
    API_KEY = f.readline().strip()

#Exposing the OpenAI API key through the environment and the openai module
import os
import openai
os.environ['OPENAI_API_KEY'] = API_KEY
openai.api_key = os.getenv("OPENAI_API_KEY")

In [8]:
#Retrieving and setting the Activeloop API token
# readline() keeps the trailing newline of the first line; strip it so the
# token stored in the environment is exactly the token text.
# The with-block closes the file even if reading fails.
with open("drive/MyDrive/files/activeloop.txt", "r") as f:
    API_token = f.readline().strip()
ACTIVELOOP_TOKEN = API_token
os.environ['ACTIVELOOP_TOKEN'] = ACTIVELOOP_TOKEN

# Retrieval Augmented Generation

### Initiating the query process

In [9]:
# Hub path of the dataset; reused by the deeplake.load() and VectorStore() cells
vector_store_path = "hub://denis76/space_exploration_v4"

In [10]:
from deeplake.core.vectorstore.deeplake_vectorstore import VectorStore
import deeplake.util
# Load the dataset stored at vector_store_path.
# `ds` is not referenced again by the cells visible in this notebook.
ds = deeplake.load(vector_store_path)

/

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/denis76/space_exploration_v4



|

hub://denis76/space_exploration_v4 loaded successfully.



 

In [11]:
# Open the vector store at the same path; this object is queried by the search cell
vector_store = VectorStore(path=vector_store_path)
# Printed unconditionally once the constructor has returned without raising
print("Vector store exists")

Deep Lake Dataset in hub://denis76/space_exploration_v4 already exists, loading from the storage
Vector store exists


## Input and Query Retrieval

## Input

### Retrieval query

the embedding function

In [12]:
def embedding_function(texts, model="text-embedding-ada-002"):
    """Embed one text or a list of texts with the OpenAI embeddings endpoint.

    Args:
        texts: A single string or a list of strings.
        model: Name of the embedding model passed to the API.

    Returns:
        A list with one embedding per input text, in input order.
    """
    # A lone string is treated as a batch of one
    batch = [texts] if isinstance(texts, str) else texts
    # Newlines are replaced by spaces before the texts are sent
    cleaned = [entry.replace("\n", " ") for entry in batch]
    response = openai.embeddings.create(input = cleaned, model=model)
    return [item.embedding for item in response.data]

the user prompt

In [13]:
def get_user_prompt():
    """Ask for the search query on standard input and return the typed text."""
    query = input("Enter your search query: ")
    return query

# The interactive call is left disabled; a fixed query is used instead
#user_prompt = get_user_prompt()
user_prompt = "Tell me about space exploration on the Moon."

search and store the result in `search_results`

In [15]:
# Query the vector store with the raw prompt and the embedding function defined above.
# The result is indexed further down by its 'score', 'text' and 'metadata' keys.
search_results = vector_store.search(embedding_data=user_prompt, embedding_function=embedding_function)

displaying the user prompt and the formatted response

In [16]:
# Echo the query that was used for the search
print(user_prompt)

Tell me about space exploration on the Moon.


In [17]:
# Function to wrap text to a specified width
def wrap_text(text, width=80):
    """Greedily wrap `text` so that no output line is longer than `width`.

    Lines are broken at the last space that still lets the line fit; a run of
    characters without any space is hard-split after `width` characters.
    Whitespace around each break is removed from the remainder.

    Args:
        text: The string to wrap.
        width: Maximum line length; must be positive.

    Returns:
        The wrapped text, lines joined with newline characters.

    Raises:
        ValueError: If `width` is not positive (the loop below could never
            make progress and would run forever).
    """
    if width <= 0:
        raise ValueError("width must be a positive integer")
    lines = []
    while len(text) > width:
        # Search up to and including index `width`: a space sitting right after
        # `width` characters is a valid break point and lets the line use the
        # full width instead of wrapping one word early.
        split_index = text.rfind(' ', 0, width + 1)
        if split_index == -1:
            # No space available: hard-split the long run
            split_index = width
        lines.append(text[:split_index])
        text = text[split_index:].strip()
    lines.append(text)
    return '\n'.join(lines)

In [18]:
import textwrap

# The first entry of each result list is taken as the best match
# (assumes the search results are ordered with the top result first)
top_score = search_results['score'][0]
top_text = search_results['text'][0].strip()
top_metadata = search_results['metadata'][0]['source']

# Header lines of the report, printed one per line, followed by the wrapped text
report_header = [
    "Top Search Result:",
    f"Score: {top_score}",
    f"Source: {top_metadata}",
    "Text:",
]
for header_line in report_header:
    print(header_line)
print(wrap_text(top_text))

Top Search Result:
Score: 0.8819366693496704
Source: llm.txt
Text:
Exploration of space, planets, and moons "Space Exploration" redirects here.
For the company, see SpaceX . For broader coverage of this topic, see
Exploration . Buzz Aldrin taking a core sample of the Moon during the Apollo 11
mission Self-portrait of Curiosity rover on Mars 's surface Part of a series on
Spaceflight History History of spaceflight Space Race Timeline of spaceflight
Space probes Lunar missions Mars missions Applications Communications Earth
observation Exploration Espionage Military Navigation Settlement Telescopes
Tourism Spacecraft Robotic spacecraft Satellite Space probe Cargo spacecraft
Crewed spacecraft Apollo Lunar Module Space capsules Space Shuttle Space
stations Spaceplanes Vostok Space launch Spaceport Launch pad Expendable and
reusable launch vehicles Escape velocity Non-rocket spacelaunch Spaceflight
types Sub-orbital Orbital Interplanetary Interstellar Intergalactic List of
space organizatio

## Augmented Input

In [19]:
# Augmented input = user prompt, a single space, then the text of the top search result
augmented_input=user_prompt+" "+top_text

In [20]:
# Display the augmented input that is passed to the generation step
print(augmented_input)

Tell me about space exploration on the Moon. Exploration of space, planets, and moons "Space Exploration" redirects here. For the company, see SpaceX . For broader coverage of this topic, see Exploration . Buzz Aldrin taking a core sample of the Moon during the Apollo 11 mission Self-portrait of Curiosity rover on Mars 's surface Part of a series on Spaceflight History History of spaceflight Space Race Timeline of spaceflight Space probes Lunar missions Mars missions Applications Communications Earth observation Exploration Espionage Military Navigation Settlement Telescopes Tourism Spacecraft Robotic spacecraft Satellite Space probe Cargo spacecraft Crewed spacecraft Apollo Lunar Module Space capsules Space Shuttle Space stations Spaceplanes Vostok Space launch Spaceport Launch pad Expendable and reusable launch vehicles Escape velocity Non-rocket spacelaunch Spaceflight types Sub-orbital Orbital Interplanetary Interstellar Intergalactic List of space organizations Space agencies Spac

# Generation and output

In [22]:
from openai import OpenAI
# No key is passed explicitly; presumably the client reads OPENAI_API_KEY from
# the environment variable set in an earlier cell — TODO confirm
client = OpenAI()

import time


# Model name sent with every chat completion request below
gpt_model = "gpt-4-turbo"  # or "gpt-3.5-turbo"
# Note: the timer starts here, before the request function is defined and called
start_time = time.time()  # Start timing before the request

def call_gpt4_with_full_text(itext):
    """Send the given content to the chat model and return its reply as text.

    Args:
        itext: Either a single string, which is used as-is, or an iterable of
            strings (lines), which are joined with newline characters.

    Returns:
        The stripped message content of the first choice. If the request
        raises, the exception text is returned instead of being re-raised.
    """
    # A plain string must not be passed to str.join: join iterates over its
    # argument, so a string would be split into single characters and every
    # character would end up on its own line in the prompt.
    if isinstance(itext, str):
        text_input = itext
    else:
        text_input = '\n'.join(itext)
    prompt = f"Please summarize or elaborate on the following content:\n{text_input}"

    try:
        response = client.chat.completions.create(
            model=gpt_model,
            messages=[
                {"role": "system", "content": "You are a space exploration expert."},
                {"role": "assistant", "content": "You can read the input and answer in detail."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.1  # Fine-tune parameters as needed
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Best-effort: the caller receives the error message as the "response"
        return str(e)

# Send the augmented input (user prompt + top retrieved text) to the model
gpt4_response = call_gpt4_with_full_text(augmented_input)

# Wall-clock time elapsed since start_time was recorded in the setup above
response_time = time.time() - start_time  # Measure response time
print(f"Response Time: {response_time:.2f} seconds")  # Print response time

# Show the model name followed by the raw, unwrapped response text
print(gpt_model, "Response:", gpt4_response)

Response Time: 13.01 seconds
gpt-4-turbo Response: The text provided discusses various aspects of space exploration, focusing on the exploration of the Moon, planets, and moons in general. It mentions historical events like Buzz Aldrin taking a core sample of the Moon during the Apollo 11 mission and the Curiosity rover's self-portrait on Mars. The content also references broader topics in spaceflight history, including the Space Race and timelines of spaceflight, as well as different types of space missions (lunar, Mars, etc.) and applications such as communications, Earth observation, espionage, military, navigation, settlement, and tourism.

Additionally, the text covers various types of spacecraft, including robotic spacecraft, satellites, cargo spacecraft, crewed spacecraft, and specific models like the Apollo Lunar Module, space capsules, space shuttles, and space stations. It also discusses space launch technologies, including launch pads, expendable and reusable launch vehicles

### Formatted response

In [None]:
import textwrap

def print_formatted_response(response):
    """Print `response` wrapped to 80 columns between a header and a footer rule.

    Args:
        response: The text to display.
    """
    rule = "---------------"
    # Wrap to 80 columns; adjust the width as needed
    wrapped_text = textwrap.fill(response, width=80)

    # Header, wrapped body, then footer followed by one blank line
    print("GPT-4 Response:")
    print(rule)
    print(wrapped_text)
    print(rule + "\n")

# 'gpt4_response' is expected to hold the text returned by the previous GPT-4 call
print_formatted_response(gpt4_response)


# Evaluating the output with Cosine Similarity

with initial user prompt

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_cosine_similarity(text1, text2):
    """Return the cosine similarity between the TF-IDF vectors of two texts.

    A fresh TfidfVectorizer is fitted on just the two texts on every call.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The single similarity value for the pair.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform([text1, text2])
    # Row 0 holds text1, row 1 holds text2; slices keep them two-dimensional
    first_row, second_row = tfidf_matrix[0:1], tfidf_matrix[1:2]
    return cosine_similarity(first_row, second_row)[0][0]

# Compare the initial user prompt with the model response (TF-IDF cosine similarity)
similarity_score = calculate_cosine_similarity(user_prompt, gpt4_response)

# Report the score with three decimals
print(f"Cosine Similarity Score: {similarity_score:.3f}")

with augmented user prompt

In [None]:
# Compare the augmented input with the model response; overwrites the previous similarity_score
similarity_score = calculate_cosine_similarity(augmented_input, gpt4_response)

# Report the score with three decimals
print(f"Cosine Similarity Score: {similarity_score:.3f}")

Install sentence-transformers at the end of this session to avoid potential dependency conflicts with the RAG pipeline requirements.

In [None]:
!pip install sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer
# Load the 'all-MiniLM-L6-v2' model once; the similarity function below reuses this global
model = SentenceTransformer('all-MiniLM-L6-v2')

def calculate_cosine_similarity_with_embeddings(text1, text2):
    """Return the cosine similarity between the model embeddings of two texts.

    Both texts are encoded with the module-level `model`, text1 first.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The single similarity value for the pair.
    """
    first_vector = model.encode(text1)
    second_vector = model.encode(text2)
    # cosine_similarity expects 2-D inputs, hence the one-element lists
    pairwise = cosine_similarity([first_vector], [second_vector])
    return pairwise[0][0]


# Compare the augmented input with the model response using the embedding-based similarity
similarity_score = calculate_cosine_similarity_with_embeddings(augmented_input, gpt4_response)
# Report the score with three decimals
print(f"Cosine Similarity Score: {similarity_score:.3f}")