In [1]:
from dotenv import load_dotenv
import os
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
import pinecone

  from tqdm.autonotebook import tqdm


In [2]:
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Indexing os.environ raises KeyError right here when a key is missing,
# instead of failing later inside an API call.
openai_api_key = os.environ["OPENAI_API_KEY"]
pinecone_api_key = os.environ["PINECONE_API_KEY"]

# Never echo the secrets themselves: cell outputs are stored inside the
# notebook file and get committed/shared together with it.
print("OpenAI API key:", "<redacted>" if openai_api_key else "<missing>")
print("Pinecone API key:", "<redacted>" if pinecone_api_key else "<missing>")

OpenAI API key: <redacted>
Pinecone API key: <redacted>


In [3]:
# Name of the Pinecone index that the following cells (re)create and fill.
index_name = "h2p"

# The Pinecone environment depends on the account/project, so read it from
# PINECONE_ENVIRONMENT (e.g. defined in .env) and fall back to the value that
# used to be hard-coded here.
pinecone_environment = os.environ.get("PINECONE_ENVIRONMENT", "gcp-starter")

pinecone.init(api_key=pinecone_api_key, environment=pinecone_environment)

In [4]:
import time

# Upper bound (seconds) on how long we poll for the new index to become ready,
# so a stuck index creation raises instead of hanging the kernel forever.
INDEX_READY_TIMEOUT_S = 300

if index_name is not None:
    # WARNING: destructive. An existing index with this name is dropped so the
    # notebook always starts from an empty index.
    if index_name in pinecone.list_indexes():
        pinecone.delete_index(index_name)

    # we create a new index
    pinecone.create_index(
        name=index_name,
        metric='dotproduct',
        dimension=1536  # 1536 dim of text-embedding-ada-002
    )

    # wait for index to be initialized (kept inside the guard: describing an
    # index only makes sense when a name was given and the index was created)
    deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
    while not pinecone.describe_index(index_name).status['ready']:
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Pinecone index {index_name!r} not ready after {INDEX_READY_TIMEOUT_S}s"
            )
        time.sleep(1)

In [5]:
# Open a gRPC handle on the index; later cells upsert through `index`.
index = pinecone.GRPCIndex(index_name)

# Show the index statistics as the cell output.
index_stats = index.describe_index_stats()
index_stats

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 0}},
 'total_vector_count': 0}

In [6]:
# Directory scanned for PDF files.
folder_path = "./resources/"

# Load the PDFs into langchain documents.
loader = PyPDFDirectoryLoader(folder_path)
documents = loader.load()

# Split the documents into chunks (chunk_size=1000, chunk_overlap=20).
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
chuncks = text_splitter.split_documents(documents)

# Quick sanity check: number of chunks, then the metadata of the first one.
num_chunks = len(chuncks)
print(f"Num chuncks: {num_chunks}")
first_chunk_metadata = chuncks[0].metadata
print(f"Metadata: {first_chunk_metadata}")


Num chuncks: 2124
Metadata: {'source': 'resources/10-2020-presentation_pth2-in-fc-hdvs_by_2030.pdf', 'page': 0}


In [7]:
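#######
# WARNING: debug-only cell - remove it before sharing this notebook.
#######
# Fake (random) embeddings with the same width as the real ones (1536),
# presumably kept to exercise the pipeline without computing real embeddings:
# import numpy as np
# embeddings = np.random.rand(len(chuncks), 1536).astype(np.float32)
# print(embeddings.shape)
#######
# WARNING: debug-only cell - remove it before sharing this notebook.
#######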

In [8]:
# Embedding client, authenticated with the key loaded earlier.
embed = OpenAIEmbeddings(openai_api_key=openai_api_key)

# Embed the text of every chunk; the result has one vector per chunk.
chunk_texts = [doc.page_content for doc in chuncks]
embeddings = embed.embed_documents(chunk_texts)

# Report how many vectors came back and the width of the first one.
num_vectors = len(embeddings)
first_vector_dim = len(embeddings[0])
print(f"res shape: {num_vectors}")
print(f"res 0 shape: {first_vector_dim}")

res shape: 2124
res 0 shape: 1536


In [9]:
# Peek at the metadata of one arbitrary chunk (index 100), rendered as a string.
sample_chunk = chuncks[100]
str(sample_chunk.metadata)

"{'source': 'resources/2023_IEA_Hydrogen_patents_for_a_clean_energy_future.pdf', 'page': 59}"

In [11]:
# Prepare the records to upsert into Pinecone: one row per chunk with
#   id       - the chunk position, as a string
#   values   - the embedding vector of the chunk
#   metadata - the stringified chunk metadata plus the chunk text
import pandas as pd

chunk_ids = [str(position) for position in range(len(chuncks))]
chunk_payloads = [
    {"metadata": str(doc.metadata), "text": doc.page_content}
    for doc in chuncks
]

df = pd.DataFrame(
    {
        "id": chunk_ids,
        "values": embeddings,
        "metadata": chunk_payloads,
    }
)

# Preview the metadata column of the first rows.
print(df['metadata'].head())


0    {'metadata': '{'source': 'resources/10-2020-pr...
1    {'metadata': '{'source': 'resources/10-2020-pr...
2    {'metadata': '{'source': 'resources/10-2020-pr...
3    {'metadata': '{'source': 'resources/10-2020-pr...
4    {'metadata': '{'source': 'resources/10-2020-pr...
Name: metadata, dtype: object


In [12]:
# Send every row of the dataframe to the index, 100 rows per request;
# the call's return value is displayed as the cell output.
upsert_response = index.upsert_from_dataframe(
    df,
    batch_size=100,
    show_progress=False,
)
upsert_response

collecting async responses: 100%|██████████| 22/22 [02:55<00:00,  7.99s/it]


upserted_count: 2124