#### Splitting and Embedding Text Using LangChain

In [1]:
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

True

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

with open('files/churchill_speech.txt') as f:
    churchill_speech = f.read()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len
)

In [4]:
chunks = text_splitter.create_documents([churchill_speech])
print(chunks[0])

page_content='Winston Churchill Speech - We Shall Fight on the Beaches\nWe Shall Fight on the Beaches\nJune 4, 1940'


In [7]:
print(chunks[1].page_content)

June 4, 1940
House of Commons


In [8]:
print(f'Length of chunks: {len(chunks)}')

Length of chunks: 300


#### Embedding Cost

In [9]:
def print_embedding_cost(texts):
    import tiktoken
    enc = tiktoken.encoding_for_model('text-embedding-ada-002')
    total_token = sum([len(enc.encode(page.page_content)) for page in texts])
    print(f'Total Token: {total_token}')
    print(f'Embedding Cost in USD: {total_token / 1000 * 0.0004:.6f}')

print_embedding_cost(chunks)

Total Token: 4820
Embedding Cost in USD: 0.001928


In [10]:
from langchain.embeddings import OpenAIEmbeddings
embedding = OpenAIEmbeddings()

In [11]:
vector = embedding.embed_query(chunks[0].page_content)
print(vector)

[-0.04454859252972275, -0.03779107625713776, -0.002879912206270699, -0.008045266214119192, 0.015746282084248255, 0.022516546299967938, -0.02843255862727964, -0.009734645282265439, 0.0010447052554574756, 0.007229264674373191, 0.007854016333454415, 0.032742067486658956, 0.007420515020699245, -0.011723648511527381, 0.006314450277188572, -0.005351823642668397, 0.013234526573466103, -0.0025308802194518626, 0.013502277151454833, -0.011003272377775045, -0.008140891620112858, -0.02685155477090687, 0.029554560162353798, -0.0037612576647914073, -0.01436927884564262, -0.01841103805835576, 0.010837522108669883, -0.01865328809346232, 0.0030615999654312587, -0.014356529039862813, 0.007114514559709814, -0.008568017564316847, -0.016536783081112085, 0.005144635573456307, -0.018296287943692383, -0.023880797870148653, -0.022465545214203596, -0.008727392930532107, 0.0226057968030717, -0.012743650669040521, 0.013629778003220573, 0.00470475982347251, 0.00875289254209172, 0.0029516310279353096, -0.0279225570

#### Inserting the Embeddings into a Pinecone Index

In [15]:
import os
import pinecone
from langchain.vectorstores import Pinecone

pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENV'))

In [16]:
# deleting all indexex

indexes = pinecone.list_indexes()

for i in indexes:
    print('Deleting all indexes ... ', end='')
    pinecone.delete_index(i)
    print('Done')

Deleting all indexes ... Done


In [18]:
index_name = 'churchill-speech'
if index_name not in pinecone.list_indexes():
    print('Creating indexx {index_name} ...')
    pinecone.create_index(index_name, dimension=1536, metric='cosine')
    print('Done!')

Creating indexx {index_name} ...
Done!


In [19]:
vector_store = Pinecone.from_documents(chunks, embedding, index_name=index_name)

#### Asking Questions (Similarity Search)

In [20]:
query = 'Where should we fight?'
result = vector_store.similarity_search(query)
print(result)

[Document(page_content='shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and'), Document(page_content='front, now on that, fighting'), Document(page_content='end, we shall fight in France, we shall fight on the seas and oceans, we shall fight with growing'), Document(page_content='Winston Churchill Speech - We Shall Fight on the Beaches\nWe Shall Fight on the Beaches\nJune 4, 1940')]


In [21]:
for r in result:
    print(r.page_content)
    print('-' * 50)

shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and
--------------------------------------------------
front, now on that, fighting
--------------------------------------------------
end, we shall fight in France, we shall fight on the seas and oceans, we shall fight with growing
--------------------------------------------------
Winston Churchill Speech - We Shall Fight on the Beaches
We Shall Fight on the Beaches
June 4, 1940
--------------------------------------------------


In [22]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=1)

retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})

chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

In [25]:
# query = 'Where should we fight?'
# query = 'Who was the king of Belgium at that time?'
query = 'What about the French Armies?'
answer = chain.run(query)
print(answer)

The French Armies were also involved in the fighting. They held certain positions along the front lines and were tasked with advancing across the Somme in large numbers to seize territory.
