In [24]:
import os
from pathlib import Path
import shutil

import git
import magika
import requests

m = magika.Magika()

In [15]:
repo_url = "https://github.com/apache/airflow.git"

In [17]:
local_dir = "Airflow_docs"

In [28]:
repo_dir='./airflow_docs'

In [9]:
from git import Repo

In [19]:
repo = Repo.clone_from(repo_url, local_dir)

In [30]:
def clone_repo(repo_url, repo_dir):
    """Clone a GitHub repository."""

    if os.path.exists(repo_dir):
        shutil.rmtree(repo_dir)
    os.makedirs(repo_dir)
    git.Repo.clone_from(repo_url, repo_dir)

In [32]:
def extract_code(repo_dir):
    """Create an index, extract content of code/text files."""

    code_index = []
    code_text = ""
    for root, _, files in os.walk(repo_dir):
        for file in files:
            file_path = os.path.join(root, file)
            relative_path = os.path.relpath(file_path, repo_dir)
            code_index.append(relative_path)

            file_type = m.identify_path(Path(file_path))
            if file_type.output.group in ("text", "code"):
                try:
                    with open(file_path) as f:
                        code_text += f"----- File: {relative_path} -----\n"
                        code_text += f.read()
                        code_text += "\n-------------------------\n"
                except Exception:
                    pass

    return code_index, code_text

In [34]:
clone_repo(repo_url, repo_dir)

In [36]:
code_index, code_text = extract_code(repo_dir)

In [38]:
code_index

['.asf.yaml',
 '.git-blame-ignore-revs',
 '.codespellignorelines',
 'CODE_OF_CONDUCT.md',
 'INTHEWILD.md',
 'codecov.yml',
 'RELEASE_NOTES.rst',
 'INSTALL',
 'LICENSE',
 'CONTRIBUTING.rst',
 '.pre-commit-config.yaml',
 '.hadolint.yaml',
 'Dockerfile',
 'pyproject.toml',
 'ISSUE_TRIAGE_PROCESS.rst',
 '.readthedocs.yml',
 '.editorconfig',
 'NOTICE',
 'README.md',
 'doap_airflow.rdf',
 '.bash_completion',
 '.dockerignore',
 'hatch_build.py',
 '.gitignore',
 'prod_image_installed_providers.txt',
 '.gitattributes',
 'BREEZE.rst',
 'COMMITTERS.rst',
 '.gitpod.yml',
 '.mailmap',
 'yamllint-config.yml',
 '.markdownlint.yml',
 'Dockerfile.ci',
 '.rat-excludes',
 'PROVIDERS.rst',
 '.cherry_picker.toml',
 'generated/dependency_depth.json',
 'generated/README.md',
 'generated/provider_dependencies.json',
 'generated/devel_deps.txt',
 'generated/PYPI_README.md',
 'generated/provider_metadata.json',
 'generated/dep_tree.txt',
 'docker_tests/test_docker_compose_quick_start.py',
 'docker_tests/conftes

In [49]:
from pinecone import Pinecone, ServerlessSpec
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore

In [52]:
PINECONE_API_KEY = ''
OPENAI_API_KEY = ''

In [56]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-large", api_key=OPENAI_API_KEY)
pinecone_api_key = PINECONE_API_KEY
pc = Pinecone(api_key=pinecone_api_key)

In [59]:
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(
    chunk_size = 10000,
    chunk_overlap  = 200
)
text = text_splitter.create_documents([code_text])

Created a chunk of size 56938, which is longer than the specified 10000
Created a chunk of size 31385, which is longer than the specified 10000
Created a chunk of size 44616, which is longer than the specified 10000
Created a chunk of size 25913, which is longer than the specified 10000
Created a chunk of size 11506, which is longer than the specified 10000
Created a chunk of size 61904, which is longer than the specified 10000
Created a chunk of size 33029, which is longer than the specified 10000
Created a chunk of size 312098, which is longer than the specified 10000
Created a chunk of size 18436, which is longer than the specified 10000
Created a chunk of size 12325, which is longer than the specified 10000
Created a chunk of size 17848, which is longer than the specified 10000
Created a chunk of size 303798, which is longer than the specified 10000
Created a chunk of size 11173, which is longer than the specified 10000
Created a chunk of size 15257, which is longer than the specif

In [62]:
text1 = [chunk for chunk in text if len(chunk.page_content) <= 10000]

In [65]:
len(text1)

5877

In [70]:
pc.create_index(
                name='airflow-github',
                dimension=3072,
                metric="cosine",
                spec=ServerlessSpec(cloud="aws", region="us-east-1"),
            )

In [75]:
index_name = 'airflow-github'

In [78]:
index = pc.Index(index_name)

In [81]:
vector_store = PineconeVectorStore(index=index, embedding=embeddings)

In [None]:
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 3})
context = retriever.invoke("Give me code to setup Airflow")

In [90]:
import time
from typing import List
from langchain.schema import Document
from langchain.vectorstores import VectorStore
from openai import RateLimitError
import backoff

@backoff.on_exception(backoff.expo, RateLimitError, max_tries=8)
def add_documents_with_backoff(vector_store: VectorStore, documents: List[Document], batch_size: int = 100) -> List[str]:
    return vector_store.add_documents(documents[:batch_size])




In [98]:
def add_documents_to_vectorstore(vector_store: VectorStore, documents: List[Document], batch_size: int = 50) -> List[str]:
    all_ids = []
    for i in range(0, len(documents), batch_size):
        batch = documents[i:i+batch_size]
        try:
            ids = add_documents_with_backoff(vector_store, batch, batch_size)
            all_ids.extend(ids)
            time.sleep(60)
        except Exception as e:
            print(f"Error adding batch to vector store: {e}")
    return all_ids

In [100]:
ids = add_documents_to_vectorstore(vector_store, text1, batch_size=50)

In [233]:
from git import Repo
import os

repo_url = ""
clone_directory = "test-github-push"

new_branch_name = "feature/new-branch"


In [237]:
repo = Repo.clone_from(repo_url, clone_directory)

In [243]:
repo.git.checkout('-b', new_branch_name)

''

In [245]:
file_path = os.path.join(clone_directory, "example.txt")


In [247]:
with open(file_path, "w") as file:
    file.write("Testing New file push to Git!\n")

In [251]:
repo.index.add(["/Users/ramkumarrp/test-github-push/example.txt"])

[(100644, 790761d8c903a93052b9cda639efb5bc4259a102, 0, example.txt)]

In [253]:
commit_message = "Checking commit"
repo.index.commit(commit_message)

<git.Commit "b1dfe01ea7c417b8fe6570edac5b18e62ad5700b">

In [255]:
origin = repo.remote(name='origin')
origin.push(refspec=f"{new_branch_name}:{new_branch_name}")

[<git.remote.PushInfo at 0x34d040860>]