### Summary + Code

In [None]:
import os
import json

def get_qa_data(path):
    """Load and return the parsed contents of a JSON file.

    Args:
        path: Path to the JSON file to read.

    Returns:
        The deserialized JSON payload (whatever ``json.load`` produces for
        the file: typically a list or dict).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON is UTF-8 by specification; do not rely on the platform default
    # encoding. The context manager closes the file, so no explicit close().
    with open(path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)

# Show the current working directory so the relative path below can be checked.
print(os.getcwd())
# Load the summaries file that sits one directory above the working directory.
qa_data = get_qa_data('../summaries.json')

### Structure Summary Data

In [None]:
import pandas as pd
from datetime import datetime

# Build the working DataFrame from the loaded JSON payload.
qa_df = pd.DataFrame(qa_data)

# Give every row a unique string id of the form "<timestamp>-id-<row number>".
# The timestamp is taken once so that all ids produced by one run share the
# same prefix (calling datetime.now() per row yields a different prefix for
# each row); uniqueness comes from the row number.
run_stamp = str(datetime.now()).replace(' ', '-')
qa_df['ids'] = [run_stamp + "-id-" + str(i) for i in range(len(qa_df))]

# Display the first record as a quick look at the available columns.
qa_df.iloc[0]

### Generation Libraries + Prompt

In [None]:
from huggingface_hub import InferenceClient
from tqdm import tqdm
import torch 
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from tqdm import tqdm

# Inference prompt template.
# Asks the model to produce a question for which the given code snippet would
# be a plausible answer. Contains two named placeholders, {code_snippet} and
# {code_summary}, intended to be filled in with str.format().

QUERY_PROMPT = """### Task
Generate a question that could realistically have the given code snippet as a response.
To be clear, we want to reverse-engineer a question from a given response (code snippet).

### Code Snippet
```
{code_snippet}
```

### Explanation of Code
{code_summary}

### Warnings
Do not make your question too specific. Make your question general yet suitable for the resulting code snippet.

### Potential Question
"""


In [None]:
# # HuggingFace Inference API backend - disabled after running out of tokens

# client = InferenceClient(
#     provider = "hf-inference"
# )

# def query_huggingface(code_snippet, code_summary):
#     messages = [
#         {
#             "role" : "user",
#             "content" : QUERY_PROMPT.format(code_snippet = code_snippet, code_summary = code_summary)
#         }
#     ]
#     return client.chat.completions.create(
#         model = "google/gemma-2-2b-it",
#         messages = messages,
#         max_tokens = 500,
#         stream = False
#     )

In [None]:
# # Local Generation with Transformers    
# model_name = "google/gemma-2-2b-it"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name, device_map = "auto")  # Loads onto available GPU/CPU

# generator = pipeline("text-generation", model = model, tokenizer = tokenizer)

# def query_local(code_snippet, code_summary):
#     return generator(
#         QUERY_PROMPT.format(code_snippet = code_snippet, code_summary = code_summary), 
#         max_length=500, 
#         temperature=0.7, 
#         do_sample=True
#     )[0]["generated_text"]

In [None]:
# Local Generation with Ollama
# This is the backend the question-generation loop calls, so it has to be
# defined for the notebook to survive "Restart Kernel -> Run All".

def query_ollama(code_snippet, code_summary):
    """Ask a local Ollama model for a question matching a code snippet.

    Args:
        code_snippet: Source code inserted into the prompt's code section.
        code_summary: Explanation of the code inserted into the prompt.

    Returns:
        The value returned by ``ollama.generate`` for the formatted prompt
        (the complete, non-streamed response).
    """
    # Imported lazily: Ollama is one of several optional backends, and the
    # cell should still define cleanly when the package is not installed.
    import ollama

    return ollama.generate(
        model="gemma3:1b",
        prompt=QUERY_PROMPT.format(code_snippet=code_snippet, code_summary=code_summary),
        stream=False,
        # NOTE(review): 'keep_alive' looks like a request-level argument of
        # generate() rather than a model option - confirm it has any effect here.
        options={'num_predict': -1, 'keep_alive': 0},
    )

In [None]:
# Generate one question per row (from its code + summary) and add them to the dataframe
questions = []
for code_snippet, code_summary in tqdm(
    zip(qa_df["code"], qa_df["summary"]),
    total=len(qa_df),  # zip() has no length, so tell tqdm how many rows to expect
    desc="Generating questions (Ollama)",
):
    # query_ollama wraps ollama.generate(), whose result carries the generated
    # text under the "response" key; keep only that text so the column holds
    # plain strings that can be displayed and serialized.
    questions.append(query_ollama(code_snippet, code_summary)["response"])
qa_df["questions"] = questions

In [None]:
# Write data to file as backup
# Build plain Python records first (DataFrame.to_json() returns a *string*,
# which cannot be edited entry by entry), and only open the output file once
# the data is ready so a failure here does not leave a truncated backup.
backup_records = []
for record in qa_df.to_dict(orient="records"):
    # Copy the metadata so the DataFrame's own dicts are not mutated.
    # Assumes each "metadata" value is a dict - TODO confirm against summaries.json.
    metadata = dict(record["metadata"])
    code = record.pop("code")
    if isinstance(code, dict):
        # Code stored as a mapping: merge its fields into the metadata.
        metadata.update(code)
    else:
        # Code stored as a plain value (e.g. the snippet text): keep it under "code".
        metadata["code"] = code
    record["metadata"] = metadata
    backup_records.append(record)

# ensure_ascii=False writes non-ASCII characters verbatim, so fix the encoding.
with open("test_dataset.json", 'w', encoding="utf-8") as json_file:
    json.dump(backup_records, json_file, ensure_ascii=False, indent=4)


### Insertion

In [None]:
import chromadb
import chromadb.utils.embedding_functions as embedding_functions

# In-process Chroma client.
chroma_client = chromadb.Client()

# With huggingface embeddings, in case we ever transition to open source implementation
# NOTE(review): no api_key is passed; depending on the installed chromadb
# version this argument may be required - confirm, and read it from an
# environment variable rather than hardcoding it.
huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(
    # api_key="",
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Start from an empty "test" collection. On a fresh client the collection does
# not exist yet and delete_collection raises; the exception type differs
# between chromadb releases, hence the broad catch around this single call.
try:
    chroma_client.delete_collection("test")
except Exception:
    pass  # nothing to delete

collection = chroma_client.get_or_create_collection(
    name="test",
    embedding_function = huggingface_ef,
    metadata={
        "hnsw:space": "cosine"  # use cosine distance for similarity search
    }
)

# Chroma expects plain Python lists, not pandas Series.
collection.add(
    documents=qa_df["summary"].tolist(),
    metadatas=qa_df["metadata"].tolist(),
    ids=qa_df["ids"].tolist()
)