<a href="https://colab.research.google.com/github/Bhargav-017/Faiss-embedding/blob/main/FAISS_EMBEDDING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import numpy as np
import boto3
import faiss
import os
import dotenv
from langchain_core.documents import Document
# Load environment variables from a .env file as soon as this cell/module runs.
# NOTE(review): r"path" looks like a placeholder -- point it at the real .env
# file. Presumably this supplies the AWS credentials used by the boto3 client
# created below; confirm.
dotenv.load_dotenv(dotenv_path=r"path")
class EmbeddingCreation:
    """Build, persist, reload and query a FAISS index of course embeddings.

    On construction the instance either loads a previously saved index (and
    its side-car JSON of documents) from ``dst_path`` or, when none exists,
    creates documents from ``json_path``, embeds them through Amazon Bedrock,
    writes the index/documents to disk and loads them back.

    Files written under ``dst_path``:
        * ``{name}_{index_type}.index`` -- the serialized FAISS index.
        * ``{name}.json``               -- page content + metadata per document.
    """

    def __init__(self, dst_path, name, json_path, index_type="flatindex",
                 M=10, efConstruction=10, efSearch=10):
        """Create the Bedrock client and load or build the index.

        Args:
            dst_path: Directory where the index and document files live.
            name: Base name used for the files written to ``dst_path``.
            json_path: Source JSON with a top-level ``"courses"`` list. May be
                ``None`` only when a saved index already exists.
            index_type: ``"flatindex"`` builds ``faiss.IndexFlatL2``; any other
                value (e.g. ``"hnsw"``) builds ``faiss.IndexHNSWFlat``.
            M: HNSW connectivity parameter passed to ``IndexHNSWFlat``.
            efConstruction: Value assigned to ``index.hnsw.efConstruction``.
            efSearch: Value assigned to ``index.hnsw.efSearch``.

        Raises:
            ValueError: If no saved index exists and ``json_path`` is ``None``.
        """
        self.bedrock = boto3.client(service_name="bedrock-runtime",
                                    region_name="us-east-1")
        self.dst_path = dst_path
        self.name = name
        # Honour the caller's choice (previously hard-coded to "flatindex").
        self.index_type = index_type
        self.json_path = json_path
        # The HNSW parameters must be set *before* the index is built, because
        # save_embeddings() reads them.
        self.M = M
        self.efConstruction = efConstruction
        self.efSearch = efSearch
        self.get_index_documents()

    def _index_path(self):
        """Return the on-disk path of the FAISS index file."""
        return os.path.join(self.dst_path, f"{self.name}_{self.index_type}.index")

    def _documents_path(self):
        """Return the on-disk path of the JSON file holding the documents."""
        return os.path.join(self.dst_path, f"{self.name}.json")

    def create_documents_from_json(self, json_path):
        """Read the courses JSON and return one ``Document`` per course.

        Each course entry must provide ``CourseName``, ``Course_Price``,
        ``Course_Link`` and ``Course_lessons`` (an iterable of strings).
        """
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        documents = []
        for course in data["courses"]:
            # NOTE: the text layout (including the indentation of the inner
            # lines) is what gets embedded and persisted; keep it stable.
            content = f"""
                        Course Name: {course['CourseName']}
                        Course Price: {course['Course_Price']}
                        Course Link: {course['Course_Link']}
                        Course Lessons: {', '.join(course['Course_lessons'])}"""
            documents.append(Document(page_content=content.strip(),
                                      metadata={"course_name": course['CourseName']}))
        return documents

    def get_embeddings(self, text):
        """Embed ``text`` with the ``amazon.titan-embed-text-v1`` Bedrock model.

        Returns:
            A 1-D ``float32`` NumPy array with the embedding vector.
        """
        response = self.bedrock.invoke_model(
            modelId="amazon.titan-embed-text-v1",
            body=json.dumps({"inputText": text}),
            accept="application/json",
            contentType="application/json",
        )
        r_body = json.loads(response["body"].read())
        return np.array(r_body["embedding"], dtype="float32")

    def create_embeddings_for_documents(self, documents):
        """Embed every document and stack the vectors into a 2-D array.

        Returns:
            Array of shape ``(len(documents), embedding_dim)``.

        Raises:
            ValueError: If ``documents`` is empty.
        """
        print("Creating New embeddings")
        if not documents:
            # np.vstack([]) would raise an opaque ValueError; say what is wrong.
            raise ValueError("No documents to embed")
        return np.vstack([self.get_embeddings(doc.page_content) for doc in documents])

    def save_embeddings(self, embeddings, documents):
        """Build a FAISS index from ``embeddings`` and persist it with the documents."""
        print("Saving Embeddings")
        # FAISS expects C-contiguous float32 input.
        embeddings = np.ascontiguousarray(embeddings, dtype="float32")
        dimension = embeddings.shape[1]
        if self.index_type == "flatindex":
            index = faiss.IndexFlatL2(dimension)
        else:
            index = faiss.IndexHNSWFlat(dimension, self.M)
            index.hnsw.efConstruction = self.efConstruction
            index.hnsw.efSearch = self.efSearch
        index.add(embeddings)

        # Make sure the destination directory exists before writing into it.
        if self.dst_path:
            os.makedirs(self.dst_path, exist_ok=True)

        # Save the index.
        faiss.write_index(index, self._index_path())

        # Save the raw documents alongside it so search hits can be mapped back.
        jdata = [{"page_content": doc.page_content,
                  "metadata": doc.metadata} for doc in documents]
        with open(self._documents_path(), "w", encoding="utf-8") as fl:
            json.dump(jdata, fl, indent=2)

    def load_data(self):
        """Load the saved FAISS index and documents from ``dst_path``.

        Returns:
            Tuple ``(index, documents)``.
        """
        index = faiss.read_index(self._index_path())

        # Explicit UTF-8 so reading matches how the file was written.
        with open(self._documents_path(), "r", encoding="utf-8") as fl:
            raw_data = json.load(fl)

        documents = [
            Document(page_content=d["page_content"], metadata=d["metadata"])
            for d in raw_data
        ]
        return index, documents

    def get_index_documents(self):
        """Populate ``self.index`` / ``self.documents``, building them if needed.

        Raises:
            ValueError: If nothing is saved yet and ``json_path`` is ``None``.
        """
        # Look for the same file names save_embeddings() writes; both files are
        # required for load_data() to succeed.
        if os.path.exists(self._index_path()) and os.path.exists(self._documents_path()):
            print("INFO : Found existing index loading from disk ")
            self.index, self.documents = self.load_data()
            return

        if self.json_path is None:
            raise ValueError("json_path must be provided to create new index")
        documents = self.create_documents_from_json(self.json_path)
        embeddings = self.create_embeddings_for_documents(documents)
        self.save_embeddings(embeddings, documents)
        self.index, self.documents = self.load_data()

    def search_database(self, user_query, k=2):
        """Return up to ``k`` documents nearest to ``user_query``.

        The query is embedded with the same model used for the documents and
        searched in the loaded index.
        """
        # Step 1: embed the query; the index expects a 2-D (1, dim) array.
        user_emb = self.get_embeddings(user_query).reshape(1, -1)

        # Step 2: search the index for the k nearest neighbours.
        _distances, indices = self.index.search(user_emb, k)

        # FAISS pads missing neighbours with -1; skip them so they are not
        # misread as "last document" by Python's negative indexing.
        return [self.documents[i] for i in indices[0] if i >= 0]

In [1]:
!git init


[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /content/.git/


In [2]:
!git branch -M main

In [4]:
!git remote add origin https://github.com/Bhargav-017/Faiss-embedding.git

error: remote origin already exists.


In [5]:
!git add .

In [6]:
!git commit -m "initial commit"

Author identity unknown

*** Please tell me who you are.

Run

  git config --global user.email "you@example.com"
  git config --global user.name "Your Name"

to set your account's default identity.
Omit --global to set the identity only in this repository.

fatal: unable to auto-detect email address (got 'root@62f7f963c8ca.(none)')
