In [1]:

REPO_OWNER = "duplocloud"  # Replace with the repository owner
REPO_NAME = "docs"  # Replace with the repository name
DIRECTORY_PATH = "getting-started-1/application-focussed-interface"  # Replace with the directory path
SAVE_DIRECTORY = "./raw_docs"  # Local directory to save files
ACCESS_TOKEN = None  # Optional: Add a GitHub personal access token for private repos


In [2]:
import requests
"""
Fetch all files in a specific directory of a GitHub repository.

:param repo_owner: Owner of the repository (e.g., 'octocat').
:param repo_name: Name of the repository (e.g., 'Hello-World').
:param directory_path: Path to the target directory within the repository.
:param access_token: Personal access token for private repositories (optional).
:return: List of file paths within the specified directory.
"""
base_url = f"https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}/contents/{DIRECTORY_PATH}"
headers = {}
files = []

response = requests.get(base_url, headers=headers)

if response.status_code == 200:
    files = []
    contents = response.json()

    for item in contents:
        if item['type'] == 'file':
            files.append(item['download_url'])

else:
    raise Exception(f"Failed to fetch files: {response.status_code} - {response.text}")

files


['https://raw.githubusercontent.com/duplocloud/docs/main/getting-started-1/application-focussed-interface/README.md',
 'https://raw.githubusercontent.com/duplocloud/docs/main/getting-started-1/application-focussed-interface/app-service-and-cloud-services.md',
 'https://raw.githubusercontent.com/duplocloud/docs/main/getting-started-1/application-focussed-interface/diagnostics.md',
 'https://raw.githubusercontent.com/duplocloud/docs/main/getting-started-1/application-focussed-interface/infrastructure.md',
 'https://raw.githubusercontent.com/duplocloud/docs/main/getting-started-1/application-focussed-interface/plan.md']

In [3]:
from langchain_community.document_loaders import WebBaseLoader

loader = WebBaseLoader(files)
docs = loader.load()
docs

USER_AGENT environment variable not set, consider setting it to identify your requests.


[Document(metadata={'source': 'https://raw.githubusercontent.com/duplocloud/docs/main/getting-started-1/application-focussed-interface/README.md'}, page_content="# Application Focused Interface\n\nThe greatest capability of the DuploCloud platform is the application infrastructure centric abstraction created on top of the cloud provider which enables the user to deploy and operate their applications without knowledge of lower level DevOps nuances. Further, unlike a PAAS such as Heroku, the platform does not get in the way of users consuming cloud services directly from the cloud provider, as in a user directly operating on constructs like S3, DynamoDB, Lambda functions, GCP Redis, Azure SQL etc., while offering greater scale and unlimited flexibility.\n\nSome concepts relating to security (DevSecOps) are hidden from the end user, for example IAM roles, KMS keys, Azure Managed Identities, GCP service accounts etc. However, even those are configurable for the operator and in any case sin

In [4]:
from langchain.text_splitter import CharacterTextSplitter

# Step 2: Split the documents into smaller chunks
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
texts = text_splitter.split_documents(docs)
print(len(texts))
texts


Created a chunk of size 564, which is longer than the specified 500
Created a chunk of size 546, which is longer than the specified 500
Created a chunk of size 744, which is longer than the specified 500
Created a chunk of size 586, which is longer than the specified 500


17


[Document(metadata={'source': 'https://raw.githubusercontent.com/duplocloud/docs/main/getting-started-1/application-focussed-interface/README.md'}, page_content='# Application Focused Interface'),
 Document(metadata={'source': 'https://raw.githubusercontent.com/duplocloud/docs/main/getting-started-1/application-focussed-interface/README.md'}, page_content='The greatest capability of the DuploCloud platform is the application infrastructure centric abstraction created on top of the cloud provider which enables the user to deploy and operate their applications without knowledge of lower level DevOps nuances. Further, unlike a PAAS such as Heroku, the platform does not get in the way of users consuming cloud services directly from the cloud provider, as in a user directly operating on constructs like S3, DynamoDB, Lambda functions, GCP Redis, Azure SQL etc., while offering greater scale and unlimited flexibility.'),
 Document(metadata={'source': 'https://raw.githubusercontent.com/duploclo

In [9]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

# load half the embeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
vectorstore = FAISS.from_documents(texts, embeddings)


In [10]:
vectorstore.save_local('./data/stores/faiss_vector_store')

In [11]:
vectorstore = FAISS.load_local(
    "./data/stores/faiss_vector_store", embeddings, allow_dangerous_deserialization=True
)