## Install Dependencies

In [None]:
!pip install langchain langchain-community langchain-core pinecone fastembed langchain-pinecone

In [None]:
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
from langchain.embeddings.base import Embeddings
from google.colab import userdata
import time



## Load embedding model

In [None]:
class MultilingualE5Embeddings(Embeddings):
    """LangChain ``Embeddings`` adapter around a SentenceTransformer E5 model.

    The wrapper prepends the "passage: " / "query: " prefixes before encoding
    and converts the encoder output to plain Python lists.
    """

    def __init__(self, model_name: str = "intfloat/multilingual-e5-large"):
        """Load the SentenceTransformer model identified by ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        """Embed a collection of document texts.

        Args:
            texts: Iterable of strings to embed as documents.

        Returns:
            A list with one embedding (a list of floats) per input text, in
            input order. An empty input yields an empty list.
        """
        # E5 requires "passage:" prefix for documents
        prefixed = [f"passage: {text}" for text in texts]
        if not prefixed:
            # Nothing to encode; skip the model call entirely.
            return []
        # A single encode() call over the whole list lets the model batch the
        # texts internally instead of paying one call per document.
        return self.model.encode(prefixed).tolist()

    def embed_query(self, text):
        """Embed a single query string and return it as a list of floats."""
        # E5 requires "query:" prefix for queries
        return self.model.encode(f"query: {text}").tolist()

In [None]:
# Build the embedding wrapper with its default model name; the constructor
# loads the SentenceTransformer model, so this cell can take a while.
embedding_model = MultilingualE5Embeddings()

## Create index in vector store

In [None]:
# Pinecone client; the API key is read from the Colab secrets store.
pc = Pinecone(api_key=userdata.get('PINECONE_API_KEY'))
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
pc_index_name = "gic-me5"

# Start from a clean slate: drop an index of the same name left by an earlier run.
if pc_index_name in existing_indexes:
  print(f"Vector store index '{pc_index_name}' found. Deleting existing...")
  pc.delete_index(pc_index_name)


print("Creating new vector store...")

pc.create_index(
  name=pc_index_name,
  dimension=1024,  # must match the length of the vectors upserted later
  metric="cosine",
  spec=ServerlessSpec(
    cloud="aws",
    region="us-east-1"
  )
)

while not pc.describe_index(pc_index_name).status['ready']:  # Wait for the index to be ready
  time.sleep(1)

# Report success exactly once, after the readiness loop has finished. Inside
# the loop the message would repeat on every poll and never appear at all when
# the index is ready on the first check.
print(f"Successfully created vector store index '{pc_index_name}'.")


Vector store index 'gic-me5' found. Deleting existing...
Creating new vector store...


## Store contexts in vector store index

In [None]:
import pandas as pd

# Load the contexts CSV from the working directory and report its row count.
data = pd.read_csv("gic_data_contexts_final_with_meta.csv")
print(len(data))

1699


In [None]:
# Preview the first rows to check the column layout used when building documents.
data.head()

Unnamed: 0,context_id,context_title,context_content,main_category_id,main_category_name,sub_category_name,sub_category_id,service_name,service_id,service_link,context_with_metadata
0,1-87-665-1,Educational Publications Department Book Sales...,"There are 6 sales outlets of books, successful...",1,Education & Training,Education Publications,87,Sales Outlets of Books,665,https://gic.gov.lk/gic/index.php/en/component/...,Service: Sales Outlets of Books\nMain Category...
1,1-87-1481-1,Publication Prices - Sinhala Publications,Publication Price Sinhala Publications Rs. Cts...,1,Education & Training,Education Publications,87,Museum Publications,1481,https://gic.gov.lk/gic/index.php/en/component/...,Service: Museum Publications\nMain Category: E...
2,1-87-1481-2,Publication Prices - English Publications,English Publications\n21. Some Sinhala Combati...,1,Education & Training,Education Publications,87,Museum Publications,1481,https://gic.gov.lk/gic/index.php/en/component/...,Service: Museum Publications\nMain Category: E...
3,1-87-1481-3,Publication Prices - English and Tamil Publica...,36. The Pleistocene of Ceylon 390 00\n37. Colo...,1,Education & Training,Education Publications,87,Museum Publications,1481,https://gic.gov.lk/gic/index.php/en/component/...,Service: Museum Publications\nMain Category: E...
4,1-87-662-1,Departmental Administration and Record Keeping,1. Maintaining the personal files of the offic...,1,Education & Training,Education Publications,87,Services of Establishment Unit of Educational ...,662,https://gic.gov.lk/gic/index.php/en/component/...,Service: Services of Establishment Unit of Edu...


In [None]:
from langchain_core.documents import Document


def _row_to_document(row):
    """Convert one row of the contexts table into a LangChain Document.

    The page content is the row's "context_with_metadata" column; the id
    columns are cast to int and every document is tagged with language "en".
    """
    metadata = {
        "context_id": row["context_id"],
        "context_title": row["context_title"],
        "main_category_id": int(row["main_category_id"]),
        "main_category_name": row["main_category_name"],
        "sub_category_id": int(row["sub_category_id"]),
        "sub_category_name": row["sub_category_name"],
        "service_id": int(row["service_id"]),
        "service_name": row["service_name"],
        "service_link": row["service_link"],
        "language": "en"
    }
    return Document(page_content=row["context_with_metadata"], metadata=metadata)


docs = []

for row_label, row in data.iterrows():
    # Progress marker every 20 rows.
    if not row_label % 20:
        print(f"Adding context {row_label}")
    docs.append(_row_to_document(row))

print(len(docs))

Adding context 0
Adding context 20
Adding context 40
Adding context 60
Adding context 80
Adding context 100
Adding context 120
Adding context 140
Adding context 160
Adding context 180
Adding context 200
Adding context 220
Adding context 240
Adding context 260
Adding context 280
Adding context 300
Adding context 320
Adding context 340
Adding context 360
Adding context 380
Adding context 400
Adding context 420
Adding context 440
Adding context 460
Adding context 480
Adding context 500
Adding context 520
Adding context 540
Adding context 560
Adding context 580
Adding context 600
Adding context 620
Adding context 640
Adding context 660
Adding context 680
Adding context 700
Adding context 720
Adding context 740
Adding context 760
Adding context 780
Adding context 800
Adding context 820
Adding context 840
Adding context 860
Adding context 880
Adding context 900
Adding context 920
Adding context 940
Adding context 960
Adding context 980
Adding context 1000
Adding context 1020
Adding context 1

In [None]:
# Handle to the Pinecone index named by pc_index_name.
index = pc.Index(pc_index_name)
# LangChain vector-store wrapper over that index, using the E5 wrapper as its
# embedding function.
vector_store = PineconeVectorStore(index=index, embedding=embedding_model)

In [None]:
# Read back the dimension of the index in use. Referring to pc_index_name
# (instead of repeating the literal name) keeps this check pointed at the same
# index as the rest of the notebook if the name is ever changed.
pc.describe_index(pc_index_name)["dimension"]

1024

In [None]:
embeddings = []
for position, document in enumerate(docs):
    # Progress marker every 20 documents.
    if not position % 20:
        print(f"Computing embedding {position}")
    # embed_documents works on lists, so wrap the single text and take the
    # single result back out.
    single_result = embedding_model.embed_documents([document.page_content])
    embeddings.append(single_result[0])

Computing embedding 0
Computing embedding 20
Computing embedding 40
Computing embedding 60
Computing embedding 80
Computing embedding 100
Computing embedding 120
Computing embedding 140
Computing embedding 160
Computing embedding 180
Computing embedding 200
Computing embedding 220
Computing embedding 240
Computing embedding 260
Computing embedding 280
Computing embedding 300
Computing embedding 320
Computing embedding 340
Computing embedding 360
Computing embedding 380
Computing embedding 400
Computing embedding 420
Computing embedding 440
Computing embedding 460
Computing embedding 480
Computing embedding 500
Computing embedding 520
Computing embedding 540
Computing embedding 560
Computing embedding 580
Computing embedding 600
Computing embedding 620
Computing embedding 640
Computing embedding 660
Computing embedding 680
Computing embedding 700
Computing embedding 720
Computing embedding 740
Computing embedding 760
Computing embedding 780
Computing embedding 800
Computing embedding 82

In [None]:
# Number of embeddings computed; should equal the number of documents.
print(len(embeddings))

1699


In [None]:
# zip() stops at the shorter input, so an interrupted or partially re-run
# embedding cell would silently drop documents here. Fail loudly instead.
if len(docs) != len(embeddings):
    raise ValueError(
        f"Expected one embedding per document, got {len(docs)} documents "
        f"and {len(embeddings)} embeddings"
    )

# Each record is (id, values, metadata); the id is the document's position in
# `docs` as a string. The page content is stored under the "text" metadata key,
# which is the key PineconeVectorStore reads document text from by default, so
# the uploaded records are usable through `vector_store` immediately.
vectors = [
    (str(i), emb, {**doc.metadata, "text": doc.page_content})
    for i, (doc, emb) in enumerate(zip(docs, embeddings))
]

In [None]:
batch_size = 25
index = pc.Index(pc_index_name)

# Upload `vectors` in consecutive slices of at most `batch_size` records.
batch_starts = range(0, len(vectors), batch_size)
for start in batch_starts:
    chunk = vectors[start:start + batch_size]
    # The upper bound uses the real chunk length so the last, shorter batch is
    # reported accurately.
    print(f"Uploading batch {start} - {start + len(chunk)}")
    index.upsert(vectors=chunk)

Uploading batch 0 - 25
Uploading batch 25 - 50
Uploading batch 50 - 75
Uploading batch 75 - 100
Uploading batch 100 - 125
Uploading batch 125 - 150
Uploading batch 150 - 175
Uploading batch 175 - 200
Uploading batch 200 - 225
Uploading batch 225 - 250
Uploading batch 250 - 275
Uploading batch 275 - 300
Uploading batch 300 - 325
Uploading batch 325 - 350
Uploading batch 350 - 375
Uploading batch 375 - 400
Uploading batch 400 - 425
Uploading batch 425 - 450
Uploading batch 450 - 475
Uploading batch 475 - 500
Uploading batch 500 - 525
Uploading batch 525 - 550
Uploading batch 550 - 575
Uploading batch 575 - 600
Uploading batch 600 - 625
Uploading batch 625 - 650
Uploading batch 650 - 675
Uploading batch 675 - 700
Uploading batch 700 - 725
Uploading batch 725 - 750
Uploading batch 750 - 775
Uploading batch 775 - 800
Uploading batch 800 - 825
Uploading batch 825 - 850
Uploading batch 850 - 875
Uploading batch 875 - 900
Uploading batch 900 - 925
Uploading batch 925 - 950
Uploading batch 950 

In [None]:
# Fetch the index statistics after the upload and print them; the vector
# counts in the output show how many records the index now holds.
stats = index.describe_index_stats()
print("Pinecone index stats:", stats)

Pinecone index stats: {'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 1699}},
 'total_vector_count': 1699,
 'vector_type': 'dense'}


## Add "text" metadata to the stored vectors

In [None]:
# Re-create the Pinecone client and index name (same values as in the
# index-creation cell) so this section can reconnect to the existing index.
# NOTE(review): the cells below still rely on `docs` and `embedding_model`
# from earlier cells, so this section is not runnable on a fresh kernel alone.
pc = Pinecone(api_key=userdata.get('PINECONE_API_KEY'))
pc_index_name = "gic-me5"

In [None]:
# Handle to the existing index, plus the LangChain wrapper over it.
index = pc.Index(pc_index_name)
vector_store = PineconeVectorStore(index=index, embedding=embedding_model)

In [None]:
# Index statistics before the metadata update, for comparison afterwards.
index.describe_index_stats()

{'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 1699}},
 'total_vector_count': 1699,
 'vector_type': 'dense'}

In [None]:
# Trial run on a single record: set the "text" metadata of the vector with
# id "0" to the page content of the first document.
index.update(
    id="0",
    set_metadata={
        "text": docs[0].page_content,
    }
)

{}

In [None]:
# Set the "text" metadata on every stored vector, one update call per record.
# Vector ids are the documents' positions in `docs`, as strings.
for position, document in enumerate(docs):
    if not position % 20:
        print(f"Updating text {position}")
    text_metadata = {"text": document.page_content}
    index.update(id=str(position), set_metadata=text_metadata)

Updating text 0
Updating text 20
Updating text 40
Updating text 60
Updating text 80
Updating text 100
Updating text 120
Updating text 140
Updating text 160
Updating text 180
Updating text 200
Updating text 220
Updating text 240
Updating text 260
Updating text 280
Updating text 300
Updating text 320
Updating text 340
Updating text 360
Updating text 380
Updating text 400
Updating text 420
Updating text 440
Updating text 460
Updating text 480
Updating text 500
Updating text 520
Updating text 540
Updating text 560
Updating text 580
Updating text 600
Updating text 620
Updating text 640
Updating text 660
Updating text 680
Updating text 700
Updating text 720
Updating text 740
Updating text 760
Updating text 780
Updating text 800
Updating text 820
Updating text 840
Updating text 860
Updating text 880
Updating text 900
Updating text 920
Updating text 940
Updating text 960
Updating text 980
Updating text 1000
Updating text 1020
Updating text 1040
Updating text 1060
Updating text 1080
Updating te

In [None]:
# Index statistics after the metadata update; the vector count should be
# unchanged because update() modifies existing records rather than adding new ones.
index.describe_index_stats()

{'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 1699}},
 'total_vector_count': 1699,
 'vector_type': 'dense'}