In [1]:
import json
import os
import tempfile
from git import Repo
import os
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from collections import defaultdict

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def get_unique_repo_set(jsonl_path):
    repos = set()
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            try:
                sample = json.loads(line)
                repo = sample.get("repo")
                if repo:
                    repos.add(repo.strip())
            except json.JSONDecodeError:
                continue
    return repos

In [7]:
lite_repos = get_unique_repo_set("./data/swe_bench_lite_test.jsonl")

In [2]:
def extract_commits_by_repo(jsonl_path: str) -> dict:
    """
    Extrae un diccionario con repositorios como claves y lista de base_commits únicos como valores.
    
    Args:
        jsonl_path (str): Ruta al archivo .jsonl con entradas que contienen 'repo' y 'base_commit'.

    Returns:
        dict: { repo_name: [base_commit1, base_commit2, ...] }
    """
    commits_by_repo = defaultdict(set)  # set para evitar duplicados

    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                item = json.loads(line)
                repo = item.get("repo")
                base_commit = item.get("base_commit")
                if repo and base_commit:
                    commits_by_repo[repo].add(base_commit)
            except json.JSONDecodeError:
                continue

    # Convertimos los sets en listas
    return {repo: list(commits) for repo, commits in commits_by_repo.items()}

In [5]:
repo_commits = extract_commits_by_repo("./data/swe_bench_lite_test.jsonl")


In [None]:

# Ver ejemplo
print(repo_commits["django/django"])

114


In [None]:
# len(lite_repos)

12

In [8]:
sorted_repo_commits = dict(sorted(repo_commits.items()))

In [11]:
print(sorted_repo_commits.keys())
print(len(sorted_repo_commits.keys()))

dict_keys(['astropy/astropy', 'django/django', 'matplotlib/matplotlib', 'mwaskom/seaborn', 'pallets/flask', 'psf/requests', 'pydata/xarray', 'pylint-dev/pylint', 'pytest-dev/pytest', 'scikit-learn/scikit-learn', 'sphinx-doc/sphinx', 'sympy/sympy'])
12


In [12]:
GITHIB_BASE_URL = "https://github.com/"

In [15]:
model = SentenceTransformer("BAAI/bge-large-en")

In [None]:
def collect_files(root_dir, extensions={".py"}):
    collected = []

    for dirpath, _, filenames in os.walk(root_dir):
        if any(excluded in dirpath for excluded in [".git", "tests", "test", "node_modules", ".venv", "__pycache__"]):
            continue
        for fname in filenames:
            if any(fname.endswith(ext) for ext in extensions):
                collected.append(os.path.join(dirpath, fname))
    return collected

def read_file(file_path):
    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()
    except Exception as e:
        print(f"Error leyendo {file_path}: {e}")
        return ""

def embed_repo(repo_path, base_commit):
    files = collect_files(repo_path)
    embeddings = []
    metadata = []
    docs = []

    for path in files:
        content = read_file(path)
        if not content.strip():
            continue
        
        path_split = path.split("/")
        name = path_split[-1]

        content = f"[CLS] {name}\n" + content
        embedding = model.encode(content)  
        embeddings.append(embedding)
        metadata.append({"name": name, "extension": "py", "path": "/".join(path_split[-2:]), "base_commit": base_commit})
        docs.append(content)

    return embeddings, metadata, docs

In [None]:
def process_repo(repo, base_commit: str):
    # 1. Crear carpeta temporal
    repo_url = GITHIB_BASE_URL + repo
    
    with tempfile.TemporaryDirectory() as tmp_dir:
        print(f"Clonando {repo_url} en {tmp_dir}")
        try:
            repo_obj = Repo.clone_from(repo_url, tmp_dir)
            repo_obj.git.checkout(base_commit)
            print(f"→ Checkout exitoso al commit {base_commit}")
        except Exception as e:
            print(f"❌ Error al procesar {repo_url} @ {base_commit}: {e}")
            return [], [], []
        # _ = input("lll")
        embeddings, metadata, docs = embed_repo(tmp_dir, base_commit)

        # 4. Guardar en vector DB
        # vector_db.add(embeddings, metadata=metadata)

        # 5. tmp_dir se elimina automáticamente
        print(f"Repo {repo_url} procesado y eliminado.")
        return embeddings, metadata, docs

In [14]:
chroma_client = chromadb.HttpClient(host="localhost", port=8005, settings=Settings())

In [20]:
current_repos = list_repos[:1]
BATCH_SIZE = 50

for idx, repo in enumerate(current_repos):
    print(f"({idx}/{len(current_repos)}) Repo: {repo} ------------------------ \n")
    embeddings, metadata, docs = process_repo(repo)
    collection_name = repo.replace("/", "_")
    
    collection = chroma_client.get_or_create_collection(name=collection_name)
    
    for i in range(0, len(docs), BATCH_SIZE):
        batch_docs = docs[i:i+BATCH_SIZE]
        batch_embeddings = embeddings[i:i+BATCH_SIZE]
        batch_metadata = metadata[i:i+BATCH_SIZE]
        batch_ids = [str(uuid.uuid4()) for _ in batch_docs]

        collection.upsert(
            ids=batch_ids,
            documents=batch_docs,
            embeddings=batch_embeddings,
            metadatas=batch_metadata
        )    

(0/1) Repo: astropy/astropy ------------------------ 

Clonando https://github.com/astropy/astropy en /var/folders/c1/rzk6gvfs68l9xj63x3bl24nr0000gn/T/tmplde3ceon


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Repo https://github.com/astropy/astropy procesado y eliminado.


In [None]:
# for idx, repo in enumerate(list_repos):
#     collection_name = repo.replace("/", "_")
#     chroma_client.delete_collection(name=collection_name)

In [16]:
collection = chroma_client.get_collection(name="django_django")
collection.count()

732

In [23]:
collections = chroma_client.list_collections()
print(len(collections))
print([c.name for c in collections])  # should not include the deleted one

13
['matplotlib_matplotlib', 'pydata_xarray', 'psf_requests', 'scikit-learn_scikit-learn', 'pallets_flask', 'pytest-dev_pytest', 'pylint-dev_pylint', 'sphinx-doc_sphinx', 'django_django', 'sympy_sympy', 'mwaskom_seaborn', 'astropy_astropy', 'C3RetoAI_Testing-Github-Bot']
