In [None]:
!pip install GitPython -qU

In [4]:
import os

from git import Repo

# Clone only when no checkout is present yet: `Repo.clone_from` fails if the
# target directory already exists and is not empty, which would make this cell
# raise on every re-run of the notebook.
if os.path.isdir(os.path.join("./langserve_repo", ".git")):
    repo = Repo("./langserve_repo")
else:
    repo = Repo.clone_from("https://github.com/langchain-ai/langserve", "./langserve_repo")

# Reference to the currently checked-out branch; handed to GitLoader in a later cell.
branch = repo.head.reference

In [5]:
from langchain.document_loaders import GitLoader

In [6]:
# Point the loader at the local checkout and at the branch captured after cloning.
loader = GitLoader(
    repo_path="./langserve_repo/",
    branch=branch,
)

In [7]:
# Load the repository contents through the GitLoader built above. Note that the
# name `langserve_data` is rebound to the split chunks by a later cell.
langserve_data = loader.load()

In [8]:
# Number of items returned by the loader (before any splitting).
len(langserve_data)

132

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunks are capped at a size of 512 with an overlap of 100 between neighbours.
splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)

# Rebinds `langserve_data`: from here on it holds the split chunks rather than
# the loader output.
langserve_data = splitter.split_documents(langserve_data)

In [10]:
# Number of chunks produced by the splitter.
len(langserve_data)

3842

In [11]:
# URL of the Hugging Face endpoint; passed as `api_url` to the embeddings client below.
hf_embedding_url = "https://l28vfh3hv125bzuu.us-east-1.aws.endpoints.huggingface.cloud"

In [12]:
import getpass
import os

# Prompt for the API token interactively so it never appears in the notebook source.
hf_token = getpass.getpass("Enter your Hugging Face API token: ")

In [13]:
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings

# Embeddings client for the endpoint URL defined earlier, authenticated with the
# token entered at the prompt.
embeddings_model = HuggingFaceInferenceAPIEmbeddings(
    api_url=hf_embedding_url,
    api_key=hf_token,
)

In [14]:
# Embed the chunks in batches of 32, collecting one list of vectors per batch
# (a later cell flattens the batches into a single list).
#
# The range must stop at len(langserve_data), not len(langserve_data) - 1:
# with the "- 1" the final chunk is silently skipped whenever the number of
# chunks is exactly one more than a multiple of the batch size.
batch_size = 32
embeddings = []

for i in range(0, len(langserve_data), batch_size):
    response = embeddings_model.embed_documents(
        [document.page_content for document in langserve_data[i : i + batch_size]]
    )
    embeddings.append(response)

In [15]:
# Number of batches collected by the embedding loop (one entry per request).
len(embeddings)

121

In [16]:
# Flatten the per-batch lists into one list with a single vector per chunk.
# NOTE: this rebinds `embeddings`, so running the cell a second time would
# flatten the vectors themselves; re-run the embedding cell first if needed.
flat_vectors = []
for batch_vectors in embeddings:
    flat_vectors.extend(batch_vectors)
embeddings = flat_vectors

In [17]:
# Number of individual vectors after flattening; should equal the chunk count.
len(embeddings)

3842

In [None]:
!pip install -qU faiss-cpu

In [18]:
from langchain.vectorstores import FAISS

# Fail loudly on a count mismatch: zip() would otherwise silently truncate to
# the shorter input and leave chunks out of the index (or pair texts with whole
# batches if the flattening cell was skipped).
if len(embeddings) != len(langserve_data):
    raise ValueError(
        f"Expected one embedding per chunk, got {len(embeddings)} embeddings "
        f"for {len(langserve_data)} chunks"
    )

langserve_text_data = [document.page_content for document in langserve_data]

# (text, vector) pairs in chunk order, the input shape taken by from_embeddings.
document_embedding_pairs = list(zip(langserve_text_data, embeddings))

# Build the index from the precomputed vectors; `embeddings_model` is still
# passed so the store can embed queries. Each chunk's metadata is forwarded so
# it is stored alongside the text instead of being dropped.
vectorstore = FAISS.from_embeddings(
    document_embedding_pairs,
    embedding=embeddings_model,
    metadatas=[document.metadata for document in langserve_data],
)

In [21]:
# Persist the vector store to the ./langserve_index directory.
vectorstore.save_local("./langserve_index")