In [76]:
from chromadb import  chromadb
import random
import json

In [77]:
def get_random_code_samples(collection_name="my_collection", num_samples=10, num_lines=4,
                            host="localhost", port=8005):
    """Build retrieval queries from randomly chosen documents of a Chroma collection.

    Every document (with its metadata) is fetched from the collection, then
    ``num_samples`` draws are made *with replacement*.  For each drawn document,
    up to ``num_lines`` of its non-blank lines are picked at random (in random
    order) and joined with newlines to form a query.

    Args:
        collection_name: Name of the Chroma collection to read.
        num_samples: Number of random draws to attempt.
        num_lines: Maximum number of non-blank lines per query.
        host: Host name of the Chroma HTTP server.
        port: Port of the Chroma HTTP server.

    Returns:
        A list of dicts with the keys ``"collection_name"``, ``"expected_file"``
        (the ``"name"`` metadata entry, or ``"unknown"`` when absent) and
        ``"query"``.  The list may hold fewer than ``num_samples`` items because
        a draw that lands on a document without any non-blank line is skipped;
        it is empty when the collection has no documents.
    """
    client = chromadb.HttpClient(host=host, port=port)
    collection = client.get_collection(name=collection_name)

    all_docs = collection.get(include=["documents", "metadatas"])

    documents = all_docs["documents"]
    if not documents:
        print("No documents found.")
        return []

    # Entries for records stored without metadata come back as None (and the
    # list itself may be missing), so normalise before indexing into it.
    metadatas = all_docs["metadatas"] or [None] * len(documents)

    samples = []

    for _ in range(num_samples):
        idx = random.randrange(len(documents))
        # A record without document text is treated like an empty document
        # instead of crashing on None.splitlines().
        code = documents[idx] or ""
        # Same for metadata: fall back to an empty mapping so .get() works.
        metadata = metadatas[idx] or {}

        code_lines = [line for line in code.splitlines() if line.strip()]
        if not code_lines:
            # Nothing to build a query from; this draw is dropped.
            continue

        selected_lines = random.sample(code_lines, min(num_lines, len(code_lines)))

        samples.append({
            "collection_name": collection_name,
            "expected_file": metadata.get("name", "unknown"),
            "query": "\n".join(selected_lines)
        })
    return samples

In [78]:
def save_samples_to_jsonl(samples, filename="../../output.jsonl"):
    """Write ``samples`` to ``filename`` in JSON Lines format.

    The file is opened in write mode (existing content is replaced) with UTF-8
    encoding.  Each item of ``samples`` is serialised with ``json.dumps`` on
    its own line; non-ASCII characters are written as-is rather than escaped.
    """
    with open(filename, "w", encoding="utf-8") as out_file:
        out_file.writelines(
            json.dumps(record, ensure_ascii=False) + "\n"
            for record in samples
        )

In [79]:
# Repository slugs ("owner/name"), one per collection to sample from.
collections_names = [
    "astropy/astropy",
    "django/django",
    "matplotlib/matplotlib",
    "mwaskom/seaborn",
    "pallets/flask",
    "psf/requests",
    "pydata/xarray",
    "pylint-dev/pylint",
    "pytest-dev/pytest",
    "scikit-learn/scikit-learn",
    "sphinx-doc/sphinx",
    "sympy/sympy",
]

In [80]:
# Gather the samples of every repository into one flat list.
results = []
for collection in collections_names:
    # The collection is looked up under the slug with "/" replaced by "_".
    chroma_name = collection.replace("/", "_")
    results += get_random_code_samples(chroma_name)

In [81]:
# Only write the output file when at least one sample was collected.
if results:
    save_samples_to_jsonl(results)