In [None]:
!pip install langchain
!pip install tqdm
!pip install langchain-pinecone



In [None]:
import os

from google.colab import userdata

# Fetch each secret from the Colab secret store and export it to the process
# environment straight away.
PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')
os.environ.update(PINECONE_API_KEY=PINECONE_API_KEY)

# The secret is stored under the name GEMINI_API_KEY but exported as GOOGLE_API_KEY.
GOOGLE_API_KEY = userdata.get('GEMINI_API_KEY')
os.environ.update(GOOGLE_API_KEY=GOOGLE_API_KEY)

# Used as the `region` of the serverless index spec when the index is created.
PINECONE_ENVIRONMENT = 'us-east-1'

In [None]:
import time

from pinecone import Pinecone, ServerlessSpec

# Pinecone client authenticated with the API key loaded in the secrets cell.
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "online-rag-project"  # any index name of your choosing
# Vector length the index stores; it has to equal the length of the vectors
# returned by the embedding model used in this notebook (768).
EMBEDDING_DIMENSION = 768
# Upper bound (seconds) on how long to wait for a new index to come online.
INDEX_READY_TIMEOUT_S = 120

# Create the index only when it is missing, so re-running the cell is harmless.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=EMBEDDING_DIMENSION,
        metric="cosine",  # alternatives: "euclidean" or "dotproduct"
        spec=ServerlessSpec(
            cloud="aws",
            region=PINECONE_ENVIRONMENT,  # use your environment's region
        ),
    )
    # A freshly created index is not usable right away: poll its status until
    # it is reported ready, but give up after a bounded wait instead of hanging.
    deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
    while not pc.describe_index(index_name).status["ready"]:
        if time.monotonic() > deadline:
            raise TimeoutError(
                f"Index {index_name!r} was not ready after {INDEX_READY_TIMEOUT_S} seconds"
            )
        time.sleep(1)

# Connect to the index
index = pc.Index(name=index_name)
print(f"Successfully connected to index: {index_name}")

Successfully connected to index: online-rag-project


In [None]:
!pip install -q -U langchain-google-genai

In [None]:
from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings

# Export the Google key again so this cell also works when run on its own
# (presumably the GenAI client reads it from the environment - confirm).
os.environ.update(GOOGLE_API_KEY=GOOGLE_API_KEY)

# Embedding model used for both indexing and querying.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [None]:
# Embed one sample sentence and show how many components the vector has;
# this number has to agree with the `dimension` the index was created with.
vector = embeddings.embed_query(
    "I am building Online-Rag-Project"
)
len(vector)

768

In [None]:
# Display the raw embedding (a list of floats, as the recorded output shows).
# NOTE(review): this prints every component; the next cell shows a short slice instead.
vector

[0.027731068432331085,
 -0.029856368899345398,
 -0.03635884448885918,
 0.004079781472682953,
 -0.003126587253063917,
 0.009454244747757912,
 0.05576081573963165,
 -0.03277921676635742,
 -0.027367549017071724,
 0.04850129783153534,
 0.021311910822987556,
 0.027686292305588722,
 -0.03834576904773712,
 -0.0244741328060627,
 0.033721767365932465,
 -0.02759343758225441,
 0.016235828399658203,
 -0.011612839996814728,
 0.00014874755288474262,
 -0.041069094091653824,
 -0.03649775683879852,
 0.003475440200418234,
 0.00986570119857788,
 -0.04896118491888046,
 0.0038396024610847235,
 0.030029181391000748,
 0.022098086774349213,
 -0.05315674468874931,
 0.0021939112339168787,
 -0.016850685700774193,
 -0.05763878673315048,
 0.021723872050642967,
 -0.08397433906793594,
 0.013725591823458672,
 -0.005550370551645756,
 -0.03003622218966484,
 0.0169448833912611,
 0.027266938239336014,
 -0.01290828362107277,
 -0.017905378714203835,
 0.007893874309957027,
 -0.07130870968103409,
 -0.02608097530901432,
 -0.0

In [None]:
# Peek at the first five components of the embedding only.
vector[:5]

[0.027731068432331085,
 -0.029856368899345398,
 -0.03635884448885918,
 0.004079781472682953,
 -0.003126587253063917]

In [None]:
from langchain_pinecone import PineconeVectorStore

# LangChain vector store built from the Pinecone index handle and the
# embedding model created in the earlier cells.
vector_store = PineconeVectorStore(embedding=embeddings, index=index)

In [None]:
# Build a single dummy document to see what a LangChain Document looks like.
from langchain_core.documents import Document

document_1 = Document(
    metadata={"source": "tweet"},
    page_content="I had chocolate chip pinecake and scrambled eggs.",
)

In [None]:
# Display the Document repr (its metadata and page_content).
document_1


Document(metadata={'source': 'tweet'}, page_content='I had chocolate chip pinecake and scrambled eggs.')

In [None]:
# Data to save: six short sample texts, each wrapped in a Document tagged as a "tweet".
from uuid import uuid4
from langchain_core.documents import Document

_tweet_texts = [
    "I had chocolate chip pinecake and scrambled eggs.",
    "The weather forcast for tomorrow is cloudy and overcast.",
    "Building an existing new project with LangChain - come check it out!",
    "Robber Broke into the city.",
    "That was an amazing movie .",
    "Is the new Iphone ",
]

# Every document gets its own metadata dict (nothing is shared between them).
documents = [
    Document(page_content=text, metadata={"source": "tweet"})
    for text in _tweet_texts
]

# Keep the individual names available, in the same order as the list.
(document_1, document_2, document_3,
 document_4, document_5, document_6) = documents


# Data retrieval is done in the cells further below.



In [None]:
# Sanity check: number of documents about to be indexed.
len(documents)

6

In [None]:
from uuid import uuid4  # generator for random IDs

# Show what one freshly generated ID looks like.
uuid4()

UUID('af2cc038-cc4f-4c91-b22e-399a78db9c02')

In [None]:
# Add Documents into vector store
from uuid import NAMESPACE_URL, uuid5

# Derive each ID from the document text rather than drawing a random one.
# With random IDs every re-run of this cell stored the same six documents
# again under new IDs; a deterministic ID makes the write overwrite the
# existing record, so the cell can be re-run safely.
# Caveat: two documents with identical text would share one ID.
uuids = [str(uuid5(NAMESPACE_URL, doc.page_content)) for doc in documents]

vector_store.add_documents(documents=documents, ids=uuids)

print(uuids)

['1223156f-a817-443f-8dfc-0a42f58a28d4', 'a88bccc5-ea6d-41b3-8c75-2bb26ea63218', '2c08d278-cc4f-439c-8d9c-ccd09a1806b7', 'e5a491cc-d320-40ea-b0bd-08623a1087d8', '8dc0e3f5-3d9c-4c14-88a0-a412e065dd3d', '1e115acb-9f4d-4788-8b86-708fdbf366ed']


In [None]:
# Data retrieval: plain similarity search with the store's default settings.
query = "LangChain provides abstractions to make working with LLMs easy"
results = vector_store.similarity_search(query)

# One line per hit: the text followed by its metadata dict.
for res in results:
    print(f"*{res.page_content} [{res.metadata}]")

*Building an existing new project with LangChain - come check it out! [{'source': 'tweet'}]
*That was an amazing movie . [{'source': 'tweet'}]
*Is the new Iphone  [{'source': 'tweet'}]
*I had chocolate chip pinecake and scrambled eggs. [{'source': 'tweet'}]


In [None]:
# Same query, limited to the two best hits whose metadata has source == "tweet".
query = "LangChain provides abstractions to make working with LLMs easy"
results = vector_store.similarity_search(
    query,
    k=2,
    filter={"source": "tweet"},
)

# One line per hit: the text followed by its metadata dict.
for res in results:
    print(f"*{res.page_content} [{res.metadata}]")

*Building an existing new project with LangChain - come check it out! [{'source': 'tweet'}]
*That was an amazing movie . [{'source': 'tweet'}]


In [None]:
# Scored search: every result is a (document, score) tuple, so it has to be
# unpacked before reading page_content (reading it off the tuple itself is
# what raises "'tuple' object has no attribute 'page_content'").
# NOTE(review): all sample documents in this notebook are tagged
# source="tweet", so the "news" filter may match nothing - confirm the intent.
results = vector_store.similarity_search_with_score(
    "will it be hot tomorrow",
    k=1,
    filter={"source": "news"},
)

# Say so explicitly when nothing matched instead of printing nothing at all.
if not results:
    print("No documents matched the query and filter.")

for doc, score in results:
    print(f"* [SIM={score:.3f}] {doc.page_content} [{doc.metadata}]")

AttributeError: 'tuple' object has no attribute 'page_content'