In [None]:
import requests 

# Download the course FAQ export and flatten it into a single list of records.
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
# A timeout keeps the kernel from hanging forever on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail fast on HTTP errors: otherwise an error page would only surface as a
# confusing JSON decoding failure on the next line.
docs_response.raise_for_status()
documents_raw = docs_response.json()

documents = []

# The payload is a list of courses, each carrying its own list of documents;
# copy the course name onto every document so the flat list stays self-describing.
for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [None]:
# Tag every record with its position in the list as a provisional `id`.
n = len(documents)

for i, record in zip(range(n), documents):
    record['id'] = i

# Show one record to eyeball the result.
documents[3]

In [None]:
import hashlib

def generate_document_id(doc):
    """Build a short content-based identifier for a FAQ record.

    The course, the question and the first 10 characters of the answer text
    are joined with dashes, hashed with MD5, and the first 8 hex digits of the
    digest are returned.

    Args:
        doc: Mapping with 'course', 'question' and 'text' entries.

    Returns:
        An 8-character lowercase hexadecimal string.
    """
    fingerprint = '{}-{}-{}'.format(doc['course'], doc['question'], doc['text'][:10])
    digest = hashlib.md5(fingerprint.encode()).hexdigest()
    # Only a prefix of the digest is kept, so the id is short but not collision-proof.
    return digest[:8]

In [None]:
# Overwrite each record's `id` with its content-derived hash.
for doc in documents:
    doc.update(id=generate_document_id(doc))

In [None]:
from collections import defaultdict

In [None]:
# Bucket the records by id so that ids shared by several records become visible.
hashes = defaultdict(list)

for doc in documents:
    hashes[doc['id']].append(doc)

In [None]:
# Number of distinct ids vs. number of records; a smaller first value means
# that some records ended up with the same id.
len(hashes), len(documents)

In [None]:
import json

In [None]:
# Persist the records, including their ids, as pretty-printed JSON.
with open('documents-with-ids.json', 'wt') as out_file:
    out_file.write(json.dumps(documents, indent=2))

In [None]:
# Prompt sent to the model for a single FAQ record. The {section}, {question}
# and {text} placeholders are filled via `prompt_template.format(**doc)`;
# leading and trailing whitespace is stripped from the template.
# NOTE(review): "as fewer words as possible" looks like a typo for "as few
# words as possible". Left unchanged here because editing the prompt changes
# what the model generates.
prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [None]:
from openai import OpenAI

In [None]:
# OpenAI-compatible endpoint served on localhost:11434 (presumably Ollama,
# given the placeholder API key - confirm against your local setup).
client = OpenAI(base_url='http://localhost:11434/v1/', api_key='ollama')

In [None]:
# Try the prompt on a single record before processing the whole collection.
# The record is selected explicitly (the last one, which is what a
# top-to-bottom run used before) instead of depending on whatever `doc` an
# earlier loop happened to leave bound in the kernel.
doc = documents[-1]
prompt = prompt_template.format(**doc)

response = client.chat.completions.create(
    model='phi',
    messages=[{"role": "user", "content": prompt}]
)

# Raw text of the first choice; expected to be a JSON array of questions.
json_response = response.choices[0].message.content

In [None]:
# Check that the model's reply parses as JSON and display the parsed value.
json.loads(json_response)

In [None]:
def generate_questions(doc, model='phi'):
    """Ask the chat model to write student questions for one FAQ record.

    Args:
        doc: Mapping providing at least the `section`, `question` and `text`
            entries referenced by `prompt_template`; other entries are ignored
            by the formatting step.
        model: Name of the model to query. Defaults to 'phi', the value that
            used to be hard-coded.

    Returns:
        The raw text of the first completion choice. It is expected to be a
        JSON array of questions but is neither parsed nor validated here.
    """
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [None]:
from tqdm.auto import tqdm


In [None]:
# Generate questions once per distinct id; records whose id was already
# handled are skipped so the model is not queried twice for the same id.
results = {}
for doc in tqdm(documents):
    doc_id = doc['id']
    if doc_id not in results:
        results[doc_id] = generate_questions(doc)

In [None]:
import pickle

In [None]:
# Load previously generated questions from disk, replacing the in-memory `results`.
# NOTE: unpickling can execute arbitrary code contained in the file - only
# load a results.bin that you produced yourself.
with open('results.bin', 'rb') as results_file:
    results = pickle.load(results_file)

In [None]:
# Hand-written list of five questions; the raw-string prefix on the first and
# last entries keeps the backslashes of the Windows path literal.
json_questions = [
    r"How can I resolve the Docker error 'invalid mode: \Program Files\Git\var\lib\postgresql\data'?",
    "What should I do if I encounter an invalid mode error in Docker on Windows?",
    "What is the correct mounting path to use in Docker for PostgreSQL data on Windows?",
    "Can you provide an example of a correct Docker mounting path for PostgreSQL data?",
    r"How do I correct the mounting path error in Docker for \Program Files\Git\var\lib\postgresql\data'?"
]

In [None]:
# Replace the stored entry for document 58c9f99f with the hand-written
# questions, serialized to a JSON string like the other values in `results`.
# NOTE(review): presumably the generated text for this document did not parse
# as JSON - confirm.
results['58c9f99f'] = json.dumps(json_questions)

In [None]:
# Decode every stored JSON string into a Python list of questions.
parsed_result = {}

# The loop variable is deliberately NOT called `json_questions`: that name
# holds the hand-written question list, and rebinding it to a string here
# would corrupt the manual override if its cell were run again afterwards.
for doc_id, raw_questions in results.items():
    try:
        parsed_result[doc_id] = json.loads(raw_questions)
    except json.JSONDecodeError as err:
        # Name the offending document so the bad entry can be fixed by hand.
        raise ValueError(
            f"questions for document {doc_id!r} are not valid JSON: {err}"
        ) from err

In [None]:
# Lookup table from id to record; when several records share an id the last one wins.
doc_index = dict((record['id'], record) for record in documents)


In [None]:
# Flatten into (question, course, document id) rows, one row per question.
final_results = []

for doc_id, questions in parsed_result.items():
    course = doc_index[doc_id]['course']
    final_results.extend((question, course, doc_id) for question in questions)

In [None]:
import pandas as pd

In [None]:
# Tabulate the rows and write them out as CSV without the index column.
df = pd.DataFrame(
    final_results,
    columns=['question', 'course', 'document'],
)
df.to_csv('ground_truth_data.csv', index=False)