In [17]:
import json
import os
import tempfile
from git import Repo
import os
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
import re
import requests
from openai import AzureOpenAI

In [4]:
def get_unique_repo_set(jsonl_path):
    """Return the set of distinct ``repo`` values found in a JSON Lines file.

    Each line is parsed independently. Lines that are not valid JSON, that do
    not decode to a JSON object, or whose ``repo`` field is missing, not a
    string, or blank are skipped.

    Args:
        jsonl_path: Path to a UTF-8 encoded file with one JSON document per line.

    Returns:
        A set of repository names with surrounding whitespace removed.
    """
    repos = set()
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            try:
                sample = json.loads(line)
            except json.JSONDecodeError:
                # Malformed or empty line: ignore it and keep scanning.
                continue

            # A line such as `[1, 2]` or `42` is valid JSON but has no fields.
            if not isinstance(sample, dict):
                continue

            repo = sample.get("repo")
            if not isinstance(repo, str):
                continue

            # Strip before the emptiness check so "   " does not add "".
            repo = repo.strip()
            if repo:
                repos.add(repo)
    return repos

In [5]:
# Distinct `repo` values read from the local JSONL file.
lite_repos = get_unique_repo_set("./data/swe_bench_lite_test.jsonl")

In [6]:
# Number of distinct repositories collected from the JSONL file.
len(lite_repos)

12

In [7]:
# Alphabetically ordered list of the repository names; sorting gives the
# unordered set a stable order for slicing and iteration.
list_repos: list[str] = sorted(lite_repos)

In [8]:
# Display the sorted repository list.
list_repos

['astropy/astropy',
 'django/django',
 'matplotlib/matplotlib',
 'mwaskom/seaborn',
 'pallets/flask',
 'psf/requests',
 'pydata/xarray',
 'pylint-dev/pylint',
 'pytest-dev/pytest',
 'scikit-learn/scikit-learn',
 'sphinx-doc/sphinx',
 'sympy/sympy']

In [9]:
# Prefix joined with an "owner/name" repository string to build its clone URL.
# NOTE(review): the name is misspelled ("GITHIB"); it is left as-is because
# process_repo refers to the constant by this exact name.
GITHIB_BASE_URL = "https://github.com/"

In [10]:
# Embedding model shared by the notebook; embed_repo calls model.encode() on
# every document it builds.
model = SentenceTransformer("BAAI/bge-large-en")

In [None]:
def collect_files(root_dir, extensions={".py"}):
    """Recursively list files under ``root_dir`` whose name ends with an extension.

    Directories named exactly ``.git``, ``tests``, ``test``, ``node_modules``,
    ``.venv`` or ``__pycache__`` are skipped together with everything below
    them. Matching is done on whole directory names, so unrelated paths that
    merely contain one of those words (``_pytest``, ``latest``, ``.github``,
    or a temporary root directory whose random name contains "test") are kept.

    Args:
        root_dir: Directory to walk. The root itself is never excluded.
        extensions: Iterable of filename suffixes to keep. The default set is
            only read, never mutated.

    Returns:
        A list of paths, each built by joining the walked directory path with
        the file name.
    """
    excluded_dirs = {".git", "tests", "test", "node_modules", ".venv", "__pycache__"}
    # str.endswith accepts a tuple of suffixes and checks them all at once.
    suffixes = tuple(extensions)
    collected = []

    for dirpath, dirnames, filenames in os.walk(root_dir):
        # Pruning in place stops os.walk from descending into excluded trees.
        dirnames[:] = [d for d in dirnames if d not in excluded_dirs]
        for fname in filenames:
            if fname.endswith(suffixes):
                collected.append(os.path.join(dirpath, fname))
    return collected

def read_file(file_path):
    """Read a file as UTF-8 text, dropping bytes that cannot be decoded.

    Any exception raised while opening or reading is reported on stdout and
    an empty string is returned instead of propagating the error.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file contents, or "" when the file could not be read.
    """
    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as handle:
            text = handle.read()
    except Exception as err:
        print(f"Error leyendo {file_path}: {err}")
        text = ""
    return text
    
def add_context(file):
    """Ask the local generation endpoint for a plain-language summary of some code.

    The code is embedded in a fixed prompt and posted to
    ``http://localhost:11434/api/generate`` with ``stream`` disabled. Any
    ``<think>...</think>`` section in the reply is removed, blank lines are
    dropped, and the remaining text is wrapped in ``<context>`` ...
    ``</context>`` tags. The wrapped text is also printed.

    Args:
        file: Source code text to describe (the file contents, not a path).

    Returns:
        The summary wrapped in ``<context>`` / ``</context>`` tags.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        KeyError: If the JSON reply has no ``"response"`` field.
    """
    model_endpoint = "http://localhost:11434/api/generate"
    prompt = f"""Read the following code and explain clearly and precisely what it does and how it can be used.
Do not introduce the explanation with phrases like "This Python file", "In this script", or "The code below".
Write in plain, well-written natural language, without markdown, numbers or bullet points. Just describe its purpose and usage clearly and directly for a technical reader:

{file}
"""
    payload = {"model": "deepseek-coder:6.7b", "stream": False, "prompt": prompt}

    res = requests.post(model_endpoint, json=payload)
    # Fail on HTTP errors here rather than with an unrelated KeyError below.
    res.raise_for_status()
    raw_response = res.json()["response"]

    # Some models emit their reasoning inside <think> tags; keep only the answer.
    model_response = re.sub(r"<think>.*?</think>", "", raw_response, flags=re.DOTALL | re.IGNORECASE).strip()

    non_blank_lines = "\n".join(line for line in model_response.splitlines() if line.strip())
    # The closing tag must be "</context>"; the block was previously closed
    # with a second opening tag.
    model_response = "<context>\n" + non_blank_lines + "\n</context>"
    print(model_response)
    return model_response

def embed_repo(repo_path):
    """Build one embedded document for every collected file under ``repo_path``.

    For each file returned by ``collect_files`` whose content is not blank, a
    document is assembled as file name, generated context and raw source
    separated by blank lines, then encoded with the module-level ``model``.

    Args:
        repo_path: Root directory of the checked-out repository.

    Returns:
        A tuple ``(embeddings, metadata, docs)`` of three parallel lists. Each
        metadata entry holds the file name, the fixed extension ``"py"`` and
        the last two "/"-separated components of the file path.
    """
    embeddings, metadata, docs = [], [], []

    for file_path in collect_files(repo_path):
        source = read_file(file_path)
        if source.strip() == "":
            # Empty or whitespace-only files carry nothing worth indexing.
            continue

        summary = add_context(source)

        segments = file_path.split("/")
        file_name = segments[-1]
        document = "\n\n".join((file_name, summary, source))

        embeddings.append(model.encode(document))
        metadata.append(
            {
                "name": file_name,
                "extension": "py",
                "path": "/".join(segments[-2:]),
            }
        )
        docs.append(document)

    return embeddings, metadata, docs

In [12]:
def process_repo(repo):
    """Clone a GitHub repository into a temporary directory and embed its files.

    The clone lives only for the duration of the call; the temporary directory
    is removed when the function returns.

    Args:
        repo: Repository in "owner/name" form; it is appended to
            ``GITHIB_BASE_URL`` to build the clone URL.

    Returns:
        The ``(embeddings, metadata, docs)`` tuple produced by ``embed_repo``.
        If cloning fails the error is printed and three empty lists are
        returned, so the result always has the same shape and can be unpacked.
    """
    repo_url = GITHIB_BASE_URL + repo

    with tempfile.TemporaryDirectory() as tmp_dir:
        print(f"Clonando {repo_url} en {tmp_dir}")
        try:
            # Only the checked-out files are read, so history is not needed.
            Repo.clone_from(repo_url, tmp_dir, depth=1)
        except Exception as e:
            print(f"Error al clonar {repo_url}: {e}")
            return [], [], []

        embeddings, metadata, docs = embed_repo(tmp_dir)

    # Printed after the `with` block so the clone really has been deleted.
    print(f"Repo {repo_url} procesado y eliminado.")
    return embeddings, metadata, docs

In [14]:
# HTTP client for the Chroma server at localhost:8005; later cells use it to
# create collections and to list them.
chroma_client = chromadb.HttpClient(host="localhost", port=8005, settings=Settings())

In [25]:
# Repositories to index in this run (only the first one for now) and the
# number of documents sent to Chroma per upsert call.
current_repos = list_repos[:1]
BATCH_SIZE = 50

# Count from 1 so the progress line reads "(1/1)" rather than "(0/1)".
for idx, repo in enumerate(current_repos, start=1):
    print(f"({idx}/{len(current_repos)}) Repo: {repo} ------------------------ \n")

    # process_repo may yield nothing when the clone fails; unpacking that
    # directly would abort the whole run with a TypeError.
    result = process_repo(repo)
    if not result or not result[2]:
        print(f"Skipping {repo}: nothing to index.")
        continue
    embeddings, metadata, docs = result

    collection_name = "ctx_" + repo.replace("/", "_")
    collection = chroma_client.get_or_create_collection(name=collection_name)

    for i in range(0, len(docs), BATCH_SIZE):
        batch_docs = docs[i:i + BATCH_SIZE]
        batch_embeddings = embeddings[i:i + BATCH_SIZE]
        batch_metadata = metadata[i:i + BATCH_SIZE]
        # NOTE(review): ids are random, so re-running this cell stores a second
        # copy of every document instead of replacing the earlier one.
        batch_ids = [str(uuid.uuid4()) for _ in batch_docs]

        collection.upsert(
            ids=batch_ids,
            documents=batch_docs,
            embeddings=batch_embeddings,
            metadatas=batch_metadata,
        )

(0/1) Repo: astropy/astropy ------------------------ 

Clonando https://github.com/astropy/astropy en /var/folders/c1/rzk6gvfs68l9xj63x3bl24nr0000gn/T/tmpqe43lmvv


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


<context>
This script is a test configuration file used by PyTest to manage different settings in tests related to astropy (a package for performing astronomy calculations). It's licensed under the BSD-style license as indicated at the top of the code. 
Here's what each section does:
1. Import necessary modules: The script imports `os`, `tempfile` from Python's standard library and `Path` from `pathlib` which are used to interact with the operating system, handle temporary files and directories respectively. Also it imports `hypothesis` for managing hypothesis tests in python.
2. Load necessary data: The script tries to load certain modules and their versions. If such a module is not available, empty dictionaries (PYTEST_HEADER_MODULES and TESTED_VERSIONS) are created.
3. Configure pytest: Two functions `pytest_configure` and `pytest_report_header` are defined which configure the header modules to be tested and report their headers respectively. 
4. Register Hypothesis profiles: Hypoth

KeyboardInterrupt: 

In [None]:
# for idx, repo in enumerate(list_repos):
#     collection_name = repo.replace("/", "_")
#     chroma_client.delete_collection(name=collection_name)

In [None]:
# collection = chroma_client.get_collection(name="ctxastropy_astropy")
# results = collection.get(include=["documents", "metadatas"], limit=1000)

In [16]:
# Show what is currently stored on the Chroma server: the number of
# collections, then their names (a deleted collection should not be listed).
collections = chroma_client.list_collections()
print(len(collections))
print([entry.name for entry in collections])

13
['matplotlib_matplotlib', 'pydata_xarray', 'psf_requests', 'scikit-learn_scikit-learn', 'pallets_flask', 'pytest-dev_pytest', 'pylint-dev_pylint', 'sphinx-doc_sphinx', 'django_django', 'sympy_sympy', 'mwaskom_seaborn', 'astropy_astropy', 'C3RetoAI_Testing-Github-Bot']
