# Diving into Pinecone

## Vector database

Learn with https://docs.pinecone.io/docs/overview

## Install dependencies

In [4]:
pip install -r ./requirements.txt

Note: you may need to restart the kernel to use updated packages.


### Verify Pinecone is installed

In [1]:
import tqdm
import os
from dotenv import load_dotenv, find_dotenv
from pinecone import Pinecone
# Read the API keys (Cohere, Pinecone) from the .env file located by find_dotenv();
# override=True lets values from .env replace variables that are already set
load_dotenv(find_dotenv(), override=True)

# Build the Pinecone client from the API key found in the environment
pinecone = Pinecone(api_key=os.environ.get('PINECONE_API_KEY'))

  from tqdm.autonotebook import tqdm


## Pinecone Indexes

Learn with https://docs.pinecone.io/docs/indexes

In [2]:
# Ask the client for every index it can see and show the result as the cell output
available_indexes = pinecone.list_indexes()
available_indexes

{'indexes': [{'dimension': 4096,
              'host': 'churchill-speech-v9fxrdg.svc.gcp-starter.pinecone.io',
              'metric': 'cosine',
              'name': 'churchill-speech',
              'spec': {'pod': {'environment': 'gcp-starter',
                               'pod_type': 'starter',
                               'pods': 1,
                               'replicas': 1,
                               'shards': 1}},
              'status': {'ready': True, 'state': 'Ready'}}]}

### Creating an index

First, go to the section "Deleting an index" below and delete the index from lesson 01, `churchill-speech`.

In [10]:
from pinecone import PodSpec

# Specify name for index
index_name = 'langchain-pinecone'

# Check if index already exists.
# list_indexes() returns index descriptions rather than plain strings, so the
# membership test must run against .names(); testing the raw result never
# matches and the "already exists" branch would be unreachable.
if index_name not in pinecone.list_indexes().names():
    print(f'Creating index {index_name} ....')

    # Create index with parameters
    pinecone.create_index(
        index_name,
        # Vector dimension - the number of dimensions for vectors in this index
        dimension=1536,
        # Similarity metric - distance measure used to compare vectors;
        # 'cosine' measures the cosine similarity between vectors
        metric='cosine',
        spec=PodSpec(environment="gcp-starter"),
    )

    print('Done')
else:
    print(f'Index {index_name} already exists!')

Creating index langchain-pinecone ....
Done


In [11]:
# Look up the configuration and status of the index named by index_name
index_description = pinecone.describe_index(index_name)
index_description

{'dimension': 1536,
 'host': 'langchain-pinecone-v9fxrdg.svc.gcp-starter.pinecone.io',
 'metric': 'cosine',
 'name': 'langchain-pinecone',
 'spec': {'pod': {'environment': 'gcp-starter',
                  'pod_type': 'starter',
                  'pods': 1,
                  'replicas': 1,
                  'shards': 1}},
 'status': {'ready': True, 'state': 'Ready'}}

### Deleting an index

In [3]:
# Get index name to delete from user input.
# Surrounding whitespace is stripped so a padded entry still matches an index name.
index_name = input('Enter Pinecone index to delete : ').strip()

# Query the existing indexes once and reuse the result for the check below
list_indexes = pinecone.list_indexes()

# Check if index exists
if index_name in list_indexes.names():
    print(f'Deleting index {index_name} ... ')
    pinecone.delete_index(index_name)
    print('Done')
else:
    print(f'Index {index_name} does not exist!')

Enter Pinecone index to delete :  churchill-speech


Deleting index churchill-speech ... 
Done


### Getting index statistics

In [13]:
index_name = 'langchain-pinecone'

# Create the index only when it is missing: an unconditional create_index call
# fails if the index already exists (for example after running the
# "Creating an index" cell), which would break a top-to-bottom run.
if index_name not in pinecone.list_indexes().names():
    pinecone.create_index(
        index_name,
        # Vector dimension - the number of dimensions for vectors in this index
        dimension=1536,
        # Similarity metric - distance measure used to compare vectors;
        # 'cosine' measures the cosine similarity between vectors
        metric='cosine',
        spec=PodSpec(environment="gcp-starter"),
    )

# index object
index = pinecone.Index(index_name)

# Retrieve usage statistics for the index
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

### Inserting into an index

In [14]:
import random

# Demo: push a handful of random vectors into a Pinecone index

# One 1536-dim vector of random floats per ID (five in total)
vectors = [
    [random.random() for _ in range(1536)]
    for _ in range(5)
]

# IDs 'a'..'e', paired with the vectors by position
ids = list('abcde')

# Handle to the target index
index_name = 'langchain-pinecone'
index = pinecone.Index(index_name)

# Send (id, vector) pairs to the index; the response is shown as the cell output
index.upsert(vectors=zip(ids, vectors))

{'upserted_count': 5}

### Updating a vector

In [15]:
# Updating is just an upsert under an ID that already exists

# ID whose values should be replaced
id_to_update = 'c'

# Replacement values: 1536 entries, all 0.3
new_vector = [0.3 for _ in range(1536)]

# Write the (id, values) pair back to the index
index.upsert(vectors=[(id_to_update, new_vector)])

{'upserted_count': 1}

### Fetching a vector

In [16]:
# Handle to the index used throughout this section
index = pinecone.Index('langchain-pinecone')

# IDs whose stored vectors should be returned
ids_to_fetch = list('cd')

# Look the vectors up by ID; the response is shown as the cell output
index.fetch(ids=ids_to_fetch)

{'namespace': '',
 'usage': {'read_units': 1},
 'vectors': {'c': {'id': 'c',
                   'values': [0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
             

### Deleting vectors

In [17]:
# IDs of the vectors that should be removed
ids_to_delete = list('bc')

# Remove those vectors from the index
index.delete(ids=ids_to_delete)

{}

In [18]:
# Re-read the index statistics after the delete.
# NOTE(review): the recorded output still reports 5 vectors right after two
# were deleted, so these counts may lag behind deletes — confirm.
stats_after_delete = index.describe_index_stats()
stats_after_delete

{'dimension': 1536,
 'index_fullness': 5e-05,
 'namespaces': {'': {'vector_count': 5}},
 'total_vector_count': 5}

In [19]:
# 'b' was removed by the delete call above, so the response is expected to hold no vectors
deleted_id = 'b'
index.fetch(ids=[deleted_id])

{'namespace': '', 'usage': {'read_units': 1}, 'vectors': {}}

In [20]:
# Clear out whatever vectors are still left in the index
delete_response = index.delete(delete_all=True)
delete_response

{}

## Splitting and Embedding Text Using LangChain

https://python.langchain.com/docs/modules/data_connection/document_transformers/
https://python.langchain.com/docs/integrations/text_embedding/cohere

**Text Splitting**

- Splitting large text documents into smaller pieces called chunks
- Makes large texts more manageable to process 
- Common splitting approaches:
  - Split by fixed character length 
  - Split at semantic boundaries like sentences or topics
  - Use a sliding window to create overlapping chunks
- Output is a list of text chunks from the original document

**Text Embedding**

- Encoding text into numeric vectors that capture semantic meaning
- Steps:
  1. Turn text into chunks (splitting)
  2. Map chunks to vector embeddings
  3. Store the chunk embeddings in a vector database
 

**Goals**
- The goal of splitting is to divide large documents into manageable sizes for processing
- The goal of embedding is to encode semantic meaning in a way that allows for semantic search 
and comparison
- Together, splitting and embedding enable semantic search, QA, and analysis of large text corpora by indexing the vectorized content


### Split document

In [21]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Open text file and read contents into churchill_speech.
# The explicit encoding keeps the read independent of the platform's default
# locale (assumes the file is stored as UTF-8 — confirm).
with open('documents/churchill_speech.txt', encoding='utf-8') as f:
    churchill_speech = f.read()

# Create text splitter instance
# check this video about chunk - https://youtu.be/n0uPzvGTFI0?feature=shared
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # maximum size of a text chunk, in number of characters
    chunk_overlap=20,  # specifies the number of overlapping characters between adjacent chunks:
                       # if chunk 1 ends at character 100, chunk 2 will start at character 80
)

# Split the text into chunks (each one exposes its text as .page_content)
chunks = text_splitter.create_documents([churchill_speech])

# Print specific chunks - you can test it
# print(chunks[2])
# print(chunks[10].page_content)
print(f'Now you have {len(chunks)}')

Now you have 23


### Create Embeddings

In [22]:
from langchain.embeddings import CohereEmbeddings

# Embedding client with default settings
# (presumably picks up the Cohere key loaded from .env earlier — verify)
embeddings = CohereEmbeddings()

# Use only the first chunk for this demonstration
first_chunk = chunks[0]
chunk_text = first_chunk.page_content

# Turn the chunk text into its embedding vector
vector = embeddings.embed_query(chunk_text)

# Show the chunk text, then the vector computed for it
print(chunk_text)
print(vector)

Winston Churchill Speech - We Shall Fight on the Beaches
We Shall Fight on the Beaches
June 4, 1940
House of Commons
From the moment that the French defenses at Sedan and on the Meuse were broken at the end of the
second week of May, only a rapid retreat to Amiens and the south could have saved the British and
French Armies who had entered Belgium at the appeal of the Belgian King; but this strategic fact was
not immediately realized. The French High Command hoped they would be able to close the gap, and
the Armies of the north were under their orders. Moreover, a retirement of this kind would have
involved almost certainly the destruction of the fine Belgian Army of over 20 divisions and the
abandonment of the whole of Belgium. Therefore, when the force and scope of the German
penetration were realized and when a new French Generalissimo, General Weygand, assumed
command in place of General Gamelin, an effort was made by the French and British Armies in
[-1.1396484, -0.71191406, 0.213

### Inserting the Embeddings into a Pinecone Index

In [23]:
import os
from pinecone import Pinecone, PodSpec
from langchain.vectorstores import Pinecone as Pinecone_langchain

# Initialize Pinecone client
pinecone = Pinecone(
    api_key=os.environ.get('PINECONE_API_KEY')
)

# Delete any existing indexes
# WARNING: destructive — every index returned by list_indexes() is removed
indexes = pinecone.list_indexes().names()
for i in indexes:
    print('Deleting all indexes ... ', end='')
    pinecone.delete_index(i)
    print('Done')

# Create a new index.
# list_indexes() returns index descriptions rather than plain strings, so the
# membership test must run against .names() to actually detect an existing index.
index_name = 'churchill-speech'
if index_name not in pinecone.list_indexes().names():
    print(f'Creating index {index_name} ...')
    # dimension=4096 is assumed to match the size of the vectors produced by
    # `embeddings` — confirm against the embedding model in use
    pinecone.create_index(
        index_name,
        dimension=4096,
        metric='cosine',
        spec=PodSpec(environment="gcp-starter"),
    )
    print('Done!')

# Embed the text chunks with `embeddings` and index them into Pinecone
vector_store = Pinecone_langchain.from_documents(chunks, embeddings, index_name=index_name)
print("Vector store created !")

Deleting all indexes ... Done
Creating index churchill-speech ...
Done!
Vector store created !


## Asking Questions (Similarity Search)

In [24]:
# Question to search for
query = 'What are the emotions of the speech?'

# Run a similarity search for the question against the vector store
result = vector_store.similarity_search(query)

# Raw view: the full list of returned documents
print(result)

# Readable view: the text of each returned document, framed by separator lines
separator = '-' * 50
print(separator)
for doc in result:
    print(doc.page_content)
    print(separator)

[Document(page_content='excited and befooled the imaginations of many Continental tyrants. Many are the tales that are told.\nWe are assured that novel methods will be adopted, and when we see the originality of malice, the\ningenuity of aggression, which our enemy displays, we may certainly prepare ourselves for every kind\nof novel stratagem and every kind of brutal and treacherous maneuver. I think that no idea is so\noutlandish that it should not be considered and viewed with a searching, but at the same time, I hope,\nwith a steady eye. We must never forget the solid assurances of sea power and those which belong to\nair power if it can be locally exercised.\nI have, myself, full confidence that if all do their duty, if nothing is neglected, and if the best\narrangements are made, as they are being made, we shall prove ourselves once again able to defend\nour Island home, to ride out the storm of war, and to outlive the menace of tyranny, if necessary for'), Document(page_content=

## Answering in Natural Language using an LLM

In [25]:
# RetrievalQA chain plus the Cohere LLM wrapper
from langchain.chains import RetrievalQA
from langchain.llms import Cohere

# Cohere model, with the API key taken from the environment
llm = Cohere(
    temperature=0.75,
    cohere_api_key=os.environ.get('COHERE_API_KEY'),
)

# Retriever on top of the vector store:
# similarity search, returning the 10 most similar chunks per question
top_k_settings = {'k': 10}
retriever = vector_store.as_retriever(
    search_type='similarity',
    search_kwargs=top_k_settings,
)

# QA chain of type "stuff" that uses the retriever to fetch context for questions
# more details - https://chat.langchain.com/
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)

# Ask the question through the chain and show the returned answer
query = "What are the emotions of this speech?"
answer = chain.invoke(query)

print(answer)

{'query': 'What are the emotions of this speech?', 'result': ' This speech by Winston Churchill to the House of Commons on June 4, 1940, conveys a tone of determination, resilience, and defiance in the face of adversity. Churchill acknowledges the severity of the situation, including the recent military losses and the potential threat of invasion, but insists on the importance of maintaining a defensive stance until the threat has been thoroughly dealt with. Throughout the speech, Churchill emphasizes the importance of unity and cooperation in order to successfully overcome the challenges that lie ahead. '}
