## Install Dependencies

In [1]:
!pip install langchain langchain-community langchain-core pinecone fastembed langchain-pinecone



In [2]:
from langchain_community.embeddings import FastEmbedEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
import time

## Load embedding model

In [3]:
embedding_model = FastEmbedEmbeddings()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

model_optimized.onnx:   0%|          | 0.00/66.5M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/706 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

## Create index in vector store

In [5]:
pc = Pinecone(api_key="pcsk_4NLPph_SjWyn1UiMYviUr7RDm38Xgnuj2NdEoHoMi9EjVUhBBKcEnRaBrYBCwnHmdDau6x")
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
pc_index_name = "gic-fastembed"

if pc_index_name in existing_indexes:
  print(f"Vector store index '{pc_index_name}' found. Deleting existing...")
  pc.delete_index(pc_index_name)


print(f"Creating new vector store...")

pc.create_index(
  name=pc_index_name,
  dimension=384,
  metric="cosine",
  spec=ServerlessSpec(
    cloud="aws",
    region="us-east-1"
  )
)

while not pc.describe_index(pc_index_name).status['ready']:  # Wait for the index to be ready
  time.sleep(1)
  print(f"Successfully created vector store index '{pc_index_name}'.")


Vector store index 'gic-fastembed' found. Deleting existing...
Creating new vector store...


## Store contexts in vector store index

In [6]:
import pandas as pd

data = pd.read_csv("gic_data_contexts_final_with_meta.csv")

# contexts = data['context'].values

print(len(data))

1699


In [7]:
data.head()

Unnamed: 0,context_id,context_title,context_content,main_category_id,main_category_name,sub_category_name,sub_category_id,service_name,service_id,service_link,context_with_metadata
0,1-87-665-1,Educational Publications Department Book Sales...,"There are 6 sales outlets of books, successful...",1,Education & Training,Education Publications,87,Sales Outlets of Books,665,https://gic.gov.lk/gic/index.php/en/component/...,Service: Sales Outlets of Books\nMain Category...
1,1-87-1481-1,Publication Prices - Sinhala Publications,Publication Price Sinhala Publications Rs. Cts...,1,Education & Training,Education Publications,87,Museum Publications,1481,https://gic.gov.lk/gic/index.php/en/component/...,Service: Museum Publications\nMain Category: E...
2,1-87-1481-2,Publication Prices - English Publications,English Publications\n21. Some Sinhala Combati...,1,Education & Training,Education Publications,87,Museum Publications,1481,https://gic.gov.lk/gic/index.php/en/component/...,Service: Museum Publications\nMain Category: E...
3,1-87-1481-3,Publication Prices - English and Tamil Publica...,36. The Pleistocene of Ceylon 390 00\n37. Colo...,1,Education & Training,Education Publications,87,Museum Publications,1481,https://gic.gov.lk/gic/index.php/en/component/...,Service: Museum Publications\nMain Category: E...
4,1-87-662-1,Departmental Administration and Record Keeping,1. Maintaining the personal files of the offic...,1,Education & Training,Education Publications,87,Services of Establishment Unit of Educational ...,662,https://gic.gov.lk/gic/index.php/en/component/...,Service: Services of Establishment Unit of Edu...


In [9]:
from langchain_core.documents import Document

docs = []

for i, row in data.iterrows():
    if i % 20 == 0:
        print(f"Adding context {i}")

    doc = Document(
        page_content=row["context_with_metadata"],
        metadata={
            "context_id": row["context_id"],
            "context_title": row["context_title"],
            "main_category_id": int(row["main_category_id"]),
            "main_category_name": row["main_category_name"],
            "sub_category_id": int(row["sub_category_id"]),
            "sub_category_name": row["sub_category_name"],
            "service_id": int(row["service_id"]),
            "service_name": row["service_name"],
            "service_link": row["service_link"],
            "language": "en"
        }
    )

    docs.append(doc)

print(len(docs))

Adding context 0
Adding context 20
Adding context 40
Adding context 60
Adding context 80
Adding context 100
Adding context 120
Adding context 140
Adding context 160
Adding context 180
Adding context 200
Adding context 220
Adding context 240
Adding context 260
Adding context 280
Adding context 300
Adding context 320
Adding context 340
Adding context 360
Adding context 380
Adding context 400
Adding context 420
Adding context 440
Adding context 460
Adding context 480
Adding context 500
Adding context 520
Adding context 540
Adding context 560
Adding context 580
Adding context 600
Adding context 620
Adding context 640
Adding context 660
Adding context 680
Adding context 700
Adding context 720
Adding context 740
Adding context 760
Adding context 780
Adding context 800
Adding context 820
Adding context 840
Adding context 860
Adding context 880
Adding context 900
Adding context 920
Adding context 940
Adding context 960
Adding context 980
Adding context 1000
Adding context 1020
Adding context 1

In [10]:
index = pc.Index(pc_index_name)
vector_store = PineconeVectorStore(index=index, embedding=embedding_model)

In [11]:
vector_store.add_documents(
    documents=docs,
    batch_size=50
)

['5edaf7a2-d19e-4095-8d10-76cd82c494b4',
 '1fe3614c-b38c-47df-a4cc-13a08007443c',
 'a9d3e053-879e-486e-ab2f-c628f3e21276',
 'cf7b6370-c7b7-42e4-977d-dfa684a65155',
 'f54028ac-55d2-428f-bb36-79c506815d5a',
 '559c5666-2488-4a9f-a3df-add60c405cfe',
 'a327ebcb-686b-4d8c-aaf7-3508e9a20c79',
 '66356e60-850a-4125-981e-2c3aed49894e',
 '85902f37-22c2-4e63-b211-ca07294afee6',
 'b11eda87-bc47-4388-9c88-c0d954f6121e',
 'bb89e718-0d89-46e1-ac15-cd8d23639f77',
 '1d415c73-fd3b-4c18-b5c6-649397a59e88',
 'e60aef7c-56a2-4f46-a565-aaebd1d51881',
 'a74cd4dc-d6c4-4b16-b55d-623df729addd',
 '2d9cffe7-8af4-4029-8429-a15e3f302264',
 'd5e48261-5c75-4b99-9e0a-0ec0fda676f0',
 '3df7c28b-305d-4af7-ba59-be60ee8a62b1',
 '3614113a-66ad-4c71-8266-63902783a3a8',
 '245f39b5-8df5-482d-8715-a1b3136d4f5b',
 '02ecdc26-99de-4beb-93fb-05e34849d021',
 '8afbc598-f182-4ace-8bf8-2431c56f9cbb',
 '474d67e1-5968-4737-842b-673e662f0cea',
 'b4a45b60-f32d-4018-826e-937084124f3e',
 'ae957aad-6d79-4e50-8ab8-a56bb635a806',
 '85911a77-5eb9-