In [1]:
import os

from dotenv import load_dotenv, find_dotenv

In [2]:
# Load the variables from the located .env file into the process environment;
# override=True lets them replace values that are already set.
load_dotenv(find_dotenv(), override=True)

True

In [4]:
from pinecone import Pinecone

In [39]:
# Credentials for both services come from the environment (populated from .env
# above); a variable that is not set yields None rather than raising.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [6]:
# Pinecone client, authenticated with the key read from the environment.
pine = Pinecone(api_key=PINECONE_API_KEY)

In [8]:
# Show the indexes currently visible to this client.
pine.list_indexes()

{'indexes': []}

In [9]:
from pinecone import ServerlessSpec

In [12]:
# Name of the index used for the basic Pinecone operations that follow.
index_name = "langchain"

# Create the index only when it is missing, so re-running this cell is harmless.
existing_index_names = pine.list_indexes().names()
if index_name not in existing_index_names:
    print(f"Creating index {index_name}")

    # Serverless index; 1536 matches the length of the vectors upserted later.
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pine.create_index(
        name=index_name,
        dimension=1536,
        metric="cosine",
        spec=serverless_spec,
    )

    print(f"Index {index_name} created")

Creating index langchain
Index langchain created


In [13]:
# Handle for the data operations on the index (upsert, fetch, delete, query).
index = pine.Index(index_name)

In [14]:
# Inspect the index: dimension, fullness, per-namespace and total vector counts.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [15]:
import random

# Five ids, each paired with its own random 1536-dimensional vector.
ids = ["a", "b", "c", "d", "e"]
vectors = [[random.random() for _ in range(1536)] for _ in range(5)]

index_name = "langchain"
index = pine.Index(index_name)

# zip() yields one (id, vector) pair per record to upsert.
index.upsert(vectors=zip(ids, vectors))

{'upserted_count': 5}

In [21]:
# Upsert under the existing id 'c': the stored vector is replaced by 1536 values of 0.5.
index.upsert(vectors=[('c', [0.5] * 1536)])

{'upserted_count': 1}

In [20]:
# Fetch the record stored under id "a" and display its vector values.
# NOTE(review): this renders every component of the vector; slicing the
# result (e.g. [:5]) would keep the cell output short.
index.fetch(ids=["a"]).vectors["a"].values

[0.23729898,
 0.985395133,
 0.263809741,
 0.702729702,
 0.953879118,
 0.238443837,
 0.527686477,
 0.77764231,
 0.951725543,
 0.72390449,
 0.529577076,
 0.368944764,
 0.992309153,
 0.492987096,
 0.497626811,
 0.482107967,
 0.552660227,
 0.247973546,
 0.281994313,
 0.758099556,
 0.0282133427,
 0.504692495,
 0.690535,
 0.92677927,
 0.346552938,
 0.588260889,
 0.563542426,
 0.723368943,
 0.380051196,
 0.431026161,
 0.544169545,
 0.840601265,
 0.9648453,
 0.99228394,
 0.756762326,
 0.678016841,
 0.104129888,
 0.0409975462,
 0.443414778,
 0.463496923,
 0.648556,
 0.0366317853,
 0.736634135,
 0.797796488,
 0.791157365,
 0.625115454,
 0.540475786,
 0.505380809,
 0.999245226,
 0.544915318,
 0.948356628,
 0.259998828,
 0.603539288,
 0.481973171,
 0.541835129,
 0.296688557,
 0.0364025459,
 0.539036512,
 0.987723053,
 0.970428646,
 0.566822529,
 0.588276803,
 0.115532376,
 0.220133573,
 0.932952702,
 0.0213260688,
 0.645722032,
 0.186477363,
 0.805423915,
 0.173410341,
 0.0966291,
 0.440703064,
 0

In [22]:
# Fetch the record stored under id "c" to check the values written by the upsert.
# NOTE(review): this renders every component of the vector; slicing the
# result (e.g. [:5]) would keep the cell output short.
index.fetch(ids=["c"]).vectors["c"].values

[0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5

In [23]:
# Remove the record stored under id "c".
index.delete(ids=["c"])

{}

In [24]:
# Check the vector counts again after the delete.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 4}},
 'total_vector_count': 4}

In [25]:
# Random 1536-dimensional vector to use as the similarity query.
query_vector = [random.random() for _ in range(1536)]

In [26]:
# Ask for the two best matches to query_vector; include_values=False keeps the
# stored vector values out of the response.
index.query(vector=query_vector, top_k=2, include_values=False)

{'matches': [{'id': 'e', 'score': 0.754713, 'values': []},
             {'id': 'b', 'score': 0.749025643, 'values': []}],
 'namespace': '',
 'usage': {'read_units': 5}}

In [27]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [28]:
# Splitter configured for chunks of up to 100 units with a 20-unit overlap;
# lengths are measured with len().
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20, length_function=len)

In [31]:
import requests

# Plain-text source document (the "We shall fight" speech, per the file name).
url = "https://raw.githubusercontent.com/egodat/Churchill/master/We_shall_fight.txt"

# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops on an HTTP error instead of letting an error page
# be treated as the speech text further down.
response = requests.get(url, timeout=30)
response.raise_for_status()
churchill_speech = response.text

In [35]:
# Split the speech into Document chunks using the splitter configured above.
chunks = text_splitter.create_documents([churchill_speech])

# Preview the first few chunks. Slicing (rather than indexing 0..4) avoids an
# IndexError when the text produced fewer than five chunks.
for chunk in chunks[:5]:
    print(chunk.page_content)

From the moment that the French defenses at Sedan and on the Meuse were broken at the end of the
at the end of the second week of May, only a rapid retreat to Amiens and the south could have saved
could have saved the British and French Armies who had entered Belgium at the appeal of the Belgian
of the Belgian King; but this strategic fact was not immediately realized. The French High Command
French High Command hoped they would be able to close the gap, and the Armies of the north were


In [36]:
def print_embedding_cost(texts, model="text-embedding-ada-002", usd_per_1k_tokens=0.0004):
    """Print the token count of ``texts`` and an estimated embedding cost.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
        model: Model name passed to ``tiktoken.encoding_for_model`` to select
            the tokenizer.
        usd_per_1k_tokens: Price in USD per 1,000 tokens. The default is the
            rate that was previously hard-coded here.
            NOTE(review): confirm it against current pricing before relying
            on the figure.

    Returns:
        None. Two lines are printed: the total token count and the cost.
    """
    # Imported locally so the dependency is only needed when the function runs.
    import tiktoken

    enc = tiktoken.encoding_for_model(model)
    # A generator avoids building a temporary list of per-document counts.
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)

    print(f"Total tokens: {total_tokens}")
    print(f"Cost: {total_tokens / 1000 * usd_per_1k_tokens:.6f} USD")

In [37]:
# Estimate the cost of embedding every chunk of the speech.
print_embedding_cost(chunks)

Total tokens: 5389
Cost: 0.002156 USD


In [38]:
from langchain.embeddings import OpenAIEmbeddings

In [46]:
# OpenAI embeddings client, authenticated with the key read from the environment.
embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY)

In [47]:
# Embed the text of the first chunk.
vector = embeddings.embed_query(chunks[0].page_content)

# NOTE(review): this prints the entire embedding; printing len(vector) and a
# short slice would be easier to read.
print(vector)

[-0.010211220814272481, 0.0007125094909197949, -0.0004776955987905259, 0.014987327901661632, -0.011006106484375009, 0.03956763024139059, -0.025341219881993882, -0.04149709589848755, 0.011162365646627872, -0.03155083914733523, 0.01740595303980682, 0.02712121944361197, 0.011760228165898263, -0.024199846828225464, 0.0049425572467868325, 0.008166259052179365, -0.0014878625329100745, 0.024267784946545797, 0.006474580068832531, -0.030626868444720897, 0.0035532060564621853, -0.026672823019820456, -0.003593969346549537, 0.011692289116255368, 0.011508854892938887, -0.0030436638826324023, 0.004979923304995271, -0.044432058372712274, -0.01574824358128143, -0.003566793819824636, 0.0006110257159594685, -0.018656030063119968, -0.019919693219981084, -0.023235113999676985, -0.012860839093732533, -0.014538930546547557, 0.0016551621749004643, -0.011434121845199447, 0.013377174567166939, -0.0037706105030920365, 0.029268091177153267, -0.01493297684821183, 0.005071640882314793, 0.012894808152892697, -0.025

# Insert Embeddings in Pinecone Index

In [42]:
import pinecone

from langchain_community.vectorstores import Pinecone

In [43]:
# Build the client through the module path: the bare name `Pinecone` now refers
# to the LangChain vector store imported in the cell above.
pine = pinecone.Pinecone(api_key=PINECONE_API_KEY)

In [63]:
# WARNING: this deletes every index the client can list, not only the ones
# created in this notebook.
# The loop variable is deliberately not named `index`, so the Index handle
# bound to that name in earlier cells is not overwritten with a string.
for existing_index_name in pine.list_indexes().names():
    print(f"Deleting index {existing_index_name}")
    pine.delete_index(existing_index_name)
    print(f"Index {existing_index_name} deleted")

Deleting index churchill-speech
Index churchill-speech deleted


In [65]:
# Index that will hold the embedded chunks of the speech.
index_name = "churchill-speech"

# Skip creation when an index with this name already exists.
known_index_names = pine.list_indexes().names()
if index_name not in known_index_names:
    print(f"Creating index {index_name}")

    # Serverless index using the cosine metric over 1536-dimensional vectors.
    speech_index_spec = pinecone.ServerlessSpec(cloud="aws", region="us-east-1")
    pine.create_index(
        name=index_name,
        dimension=1536,
        metric="cosine",
        spec=speech_index_spec,
    )

    print(f"Index {index_name} created")

Creating index churchill-speech
Index churchill-speech created


In [66]:
import requests

# Plain-text source document (the "We shall fight" speech, per the file name).
url = "https://raw.githubusercontent.com/egodat/Churchill/master/We_shall_fight.txt"

# Bound the wait with a timeout and fail on an HTTP error, so an error page is
# never split and embedded as if it were the speech.
response = requests.get(url, timeout=30)
response.raise_for_status()
churchill_speech = response.text

# Split the downloaded text into Document chunks with the splitter defined earlier.
chunks = text_splitter.create_documents([churchill_speech])

In [67]:
from langchain.embeddings import OpenAIEmbeddings

In [68]:
# Embeddings client that is handed to the vector store calls that follow.
embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY)

In [70]:
# Build the vector store from the chunks, using `embeddings` and targeting the
# index named by index_name.
vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)

In [73]:
# Attach to the index that already exists under index_name instead of building
# it from documents (presumably for sessions where the data is already loaded).
vector_store = Pinecone.from_existing_index(index_name=index_name, embedding=embeddings)

# Similarity Search