In [29]:
import random

In [62]:
# load_dotenv() will load the environment variables from .env file
# this is where we store our API keys and other secrets

from dotenv import load_dotenv

load_dotenv()

True

In [4]:
# tiktoken helps us determine how many tokens we are using

import tiktoken

tokenizer = tiktoken.get_encoding("cl100k_base")

In [16]:
# helper functions for extracting text from pdfs and splitting into chunks

from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar

def extract_pdf_text(file_path, token_limit = -1):
    extracted_text = ''

    for page_layout in extract_pages(file_path):
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                for text_line in element:
                    try:
                        for character in text_line:
                            if isinstance(character, LTChar):
                                extracted_text += character.get_text()
                    except:
                        pass
            # at each element, check token count
            token_count = len(tokenizer.encode(extracted_text))
            if token_count > token_limit and token_limit != -1:
                return extracted_text
    return extracted_text

def create_text_chunks(text, chunk_size):
    chunks = []
    chunk = ""
    # splitting on periods in an attempt to keep sentences together in chunks
    for sentence in text.split('.'):
        if len(tokenizer.encode(chunk)) + len(tokenizer.encode(sentence)) < chunk_size:
            chunk += sentence + '.'
        else:
            chunks.append(chunk)
            chunk = sentence + '.'
    chunks.append(chunk)
    return chunks

In [13]:
pdf = "constitution.pdf"
extracted_text = extract_pdf_text(pdf)
print(extracted_text[:1000] + "...")

THE CONSTITUTION of the United States NATIONAL CONSTITUTION CENTER We the People of the United States, in Order to form a more perfect Union, establish Justice, insure domestic Tranquility, provide for the common defence, promote the general Welfare, and secure the Blessings of Liberty to ourselves and our Posterity, do ordain and establish this Constitution for the United States of America Article.  I. SECTION. 1 All legislative Powers herein granted shall be vested in a Congress of the United States, which shall consist of a Sen- ate and House of Representatives. SECTION. 2 The House of Representatives shall be composed of Mem- bers chosen every second Year by the People of the several States, and the Electors in each State shall have the Qualifi- cations requisite for Electors of the most numerous Branch of the State Legislature. No Person shall be a Representative who shall not have attained to the Age of twenty five Years, and been seven Years a Citizen of the United States, and w

In [42]:
extracted_text_chunks = create_text_chunks(extracted_text, chunk_size=250)
print("Total chunks: " + str(len(extracted_text_chunks)))
print(extracted_text_chunks[random.randint(0, len(extracted_text_chunks))])

Total chunks: 53
 But in choos- ing the President, the votes shall be taken by states, the representation from each state having one vote; a quorum for this purpose shall consist of a member or members from two-thirds of the states, and a majority of all the states shall be necessary to a choice. [And if the House of Representa- tives shall not choose a President whenever the right of choice shall devolve upon them, before the fourth day of March next following, then the Vice-President shall act as President, as in case of the death or other constitutional disability of the President.-]* The person having the greatest number of votes as Vice-President, shall be the Vice-Presi- dent, if such number be a majority of the whole number of Electors appointed, and if no person have a majority, then from the two highest numbers on the list, the Senate shall choose the Vice-President; a quorum for the purpose shall consist of two-thirds of the whole number of Senators, and a majority of the who

In [44]:
# now, we will put all of these chunks into a dataframe for viewing and analysis
import pandas as pd
df = pd.DataFrame(extracted_text_chunks, columns=['text'])
df.head()

Unnamed: 0,text
0,THE CONSTITUTION of the United States NATIONAL...
1,[Representatives and direct Taxes shall be ap...
2,The House of Representatives shall chuse thei...
3,]* C O N S T I T U T I O N O F T H E U N I T E...
4,"No Senator or Representative shall, during th..."


In [46]:
# Tokenize the text and save the number of tokens to a new column
df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))
df.head()

Unnamed: 0,text,n_tokens
0,THE CONSTITUTION of the United States NATIONAL...,224
1,[Representatives and direct Taxes shall be ap...,230
2,The House of Representatives shall chuse thei...,205
3,]* C O N S T I T U T I O N O F T H E U N I T E...,249
4,"No Senator or Representative shall, during th...",228


In [52]:
# print sum of all tokens
total_tokens = df.n_tokens.sum()
print("Sum of all tokens: " + str(total_tokens))
dollars_per_thousand_tokens = 0.0001
total_cost = total_tokens/1000 * dollars_per_thousand_tokens
total_cost = round(total_cost, 4)
print("Total cost to generate embeddings: $" + str(total_cost))

Sum of all tokens: 11342
Total cost to generate embeddings: $0.0011


In [68]:
# now we will use the OpenAI API to generate embeddings for each chunk of text
# we will save these embeddings to a new column in the dataframe
# the dataframe will be saved to a csv file for later use

from openai import OpenAI
import os
from dotenv import dotenv_values

config = dotenv_values(".env")

client = OpenAI(
  api_key=config["OPENAI_API_KEY"],
)

def get_embedding(text, model="text-embedding-ada-002"):
   text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=model).data[0].embedding

df['embedding'] = df.text.apply(lambda x: get_embedding(x, model='text-embedding-ada-002'))
df.to_csv('output/constitution_embeddings.csv', index=False)
df.head()

In [11]:
# now we will load the dataframe from the csv file, so if we need to restart the notebook,
# we can just load the csv file instead of re-generating the embeddings

import pandas as pd
import numpy as np

df = pd.read_csv('output/constitution_embeddings.csv')
df['embedding'] = df.embedding.apply(eval).apply(np.array)
df.head()

Unnamed: 0,text,n_tokens,embedding
0,THE CONSTITUTION of the United States NATIONAL...,224,"[-0.0059715150855481625, 5.742223220295273e-05..."
1,[Representatives and direct Taxes shall be ap...,230,"[-0.021560681983828545, 0.005383715499192476, ..."
2,The House of Representatives shall chuse thei...,205,"[-0.015850668773055077, -0.011499756015837193,..."
3,]* C O N S T I T U T I O N O F T H E U N I T E...,249,"[-0.01521310955286026, 0.004529268480837345, 0..."
4,"No Senator or Representative shall, during th...",228,"[-0.014630777761340141, -0.01587194949388504, ..."


In [4]:
from scipy import spatial
from typing import List, Optional

def distances_from_embeddings(
    query_embedding: List[float],
    embeddings: List[List[float]],
    distance_metric="cosine",
) -> List[List]:
    """Return the distances between a query embedding and a list of embeddings."""
    distance_metrics = {
        "cosine": spatial.distance.cosine,
        "L1": spatial.distance.cityblock,
        "L2": spatial.distance.euclidean,
        "Linf": spatial.distance.chebyshev,
    }
    distances = [
        distance_metrics[distance_metric](query_embedding, embedding)
        for embedding in embeddings
    ]
    return distances

def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [9]:
# at this point, we have embeddings that are ready to be uploaded into a vector database, 
# along with any metadata in the dataframe

# for now, we will use cosine similarity locally to test the embeddings
from openai import OpenAI
import os
from dotenv import dotenv_values

config = dotenv_values(".env")

client = OpenAI(
  api_key=config["OPENAI_API_KEY"],
)

def get_embedding(text, model="text-embedding-ada-002"):
   text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=model).data[0].embedding

query = "taxes"
query_embedding = get_embedding(query)
distances = distances_from_embeddings(query_embedding, df.embedding, distance_metric="cosine")

In [15]:
query_df = pd.DataFrame(df.text)
query_df['distance'] = distances
query_df.sort_values(by=['distance'], inplace=True)

for index, row in query_df.head(5).iterrows():
    print(row['text'][:500] + "...")
    print("Distance: " + str(row['distance']))
    print("")

 The Congress shall have Power To lay and collect Taxes, Duties, Imposts and Excises, to pay the Debts and provide for the common Defence and general Welfare of the United States; but all Duties, Imposts and Excises shall be uniform throughout the United States; To borrow Money on the credit of the United States; To regulate Commerce with foreign Nations, and among the several States, and with the Indian Tribes; To establish an uniform Rule of Naturalization, and uni- form Laws on the subject of...
Distance: 0.1893593888877414

 No Bill of Attainder or ex post facto Law shall be passed. [No Capitation, or other direct, Tax shall be laid, unless in Proportion to the Census or Enumeration herein before directed to be taken.]* No Tax or Duty shall be laid on Articles exported from any State No Preference shall be given by any Regulation of Com- merce or Revenue to the Ports of one State over those of another: nor shall Vessels bound to, or from, one State, be obliged to enter, clear, or p

In [17]:
# let's create a method to get the nearest matches for a query
def get_nearest_matches(query, df, n_matches=5):
    query_embedding = get_embedding(query)
    distances = distances_from_embeddings(query_embedding, df.embedding, distance_metric="cosine")
    query_df = pd.DataFrame(df.text)
    query_df['distance'] = distances
    query_df.sort_values(by=['distance'], inplace=True)
    return query_df.head(n_matches)

In [19]:
matches = get_nearest_matches("presidential oath", df)
for index, row in matches.iterrows():
    print(row['text'][:500] + "...")
    print("Distance: " + str(row['distance']))
    print("")

]* The President shall, at stated Times, receive for his Services, a Compensation, which shall neither be increased nor diminished during the Period for which he shall have been elected, and he shall not receive within that Period any other Emolument from the United States, or any of them. Before he enter on the Execution of his Office, he shall take the following Oath or Affirmation:- “I do solemnly swear (or affirm) that I will faithfully execute the Office of President of the United States, a...
Distance: 0.1491908615091867

 The President shall be Commander in Chief of the Army and Navy of the United States, and of the Militia of the several States, when called into the actual Service of the United States; he may require the Opinion, in writing, of the principal Officer in each of the executive Departments, upon any Subject relating to the Duties of their respective Offices, and he shall have Power to grant Reprieves and Pardons for Offenses against the United States, except in Cas

In [22]:
# upserting vectors into Pinecone
import pinecone
from dotenv import dotenv_values
config = dotenv_values(".env")

pinecone.init(
    api_key=config["PINECONE_API_KEY"],
    environment=config["PINECONE_ENVIRONMENT"]
)

index = pinecone.Index('openai-index')

In [30]:
pinecone_df = pd.read_json("./pdf-data.json", orient="records")
pinecone_df.head()

Unnamed: 0,title,text,n_tokens,vector
0,Combined Officer PME Handbook,eSchool of Graduate PME Officer Professional M...,500,"[-0.0097800773, 0.0036683725, -0.0137623027000..."
1,Combined Officer PME Handbook,.................................................,500,"[0.009529138900000001, -0.0008782578000000001,..."
2,Combined Officer PME Handbook,.................................................,496,"[-0.0028723259, 0.0007146763, 0.0027650606, -0..."
3,Combined Officer PME Handbook,14 Table 8. eSchool Technical (Hardware and S...,485,"[-0.0053947526, 0.0010639088, -0.0172659121, -..."
4,Combined Officer PME Handbook,mil/eSchool/ This Handbook provides details o...,490,"[-0.0102074854, 0.0018237374, -0.0229328163000..."


In [31]:
# generate ids for each row
# id will be air_u_test_1, air_u_test_2, etc.
pinecone_df['id'] = "air_u_test_" + (pinecone_df.index).astype(str)
pinecone_df.head()

Unnamed: 0,title,text,n_tokens,vector,id
0,Combined Officer PME Handbook,eSchool of Graduate PME Officer Professional M...,500,"[-0.0097800773, 0.0036683725, -0.0137623027000...",air_u_test_0
1,Combined Officer PME Handbook,.................................................,500,"[0.009529138900000001, -0.0008782578000000001,...",air_u_test_1
2,Combined Officer PME Handbook,.................................................,496,"[-0.0028723259, 0.0007146763, 0.0027650606, -0...",air_u_test_2
3,Combined Officer PME Handbook,14 Table 8. eSchool Technical (Hardware and S...,485,"[-0.0053947526, 0.0010639088, -0.0172659121, -...",air_u_test_3
4,Combined Officer PME Handbook,mil/eSchool/ This Handbook provides details o...,490,"[-0.0102074854, 0.0018237374, -0.0229328163000...",air_u_test_4


In [32]:
# upsert
batch_size = 100
namespace = "air_u_test"

for i in range(0, len(pinecone_df), batch_size):
    vectors = []
    for item in pinecone_df[i:i+batch_size].itertuples():
        vector = {}
        vector["id"] = item.id
        vector["values"] = item.vector.tolist()
        vector["metadata"] = {
            "text": item.text,
            "title": item.title,
        }
        vectors.append(vector)
    upsert_response = index.upsert(vectors=vectors, namespace=namespace)
    print(upsert_response)

AttributeError: 'Pandas' object has no attribute 'embedding'