# Ground Truth Data

In [1]:
import requests

## Creating Doc Dataset

In [2]:
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

In [3]:
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

### Attributing IDs to Docs

- Progressive numerical IDs are a bad idea because if we add documents in the middle of the collection the IDs can change
- Randomly generating IDs based on content is a better idea (but not the best because IDs can change when the content changes)

In [4]:
import hashlib

def generate_document_id(doc):
    combined = f"{doc['course']}-{doc['question']}-{doc['text'][:10]}"
    hash_object = hashlib.md5(combined.encode())
    hash_hex = hash_object.hexdigest()
    document_id = hash_hex[:8]
    return document_id

In [5]:
for doc in documents:
    doc['id'] = generate_document_id(doc)

### Taking a Look Into ID Structure

In [6]:
from collections import defaultdict

In [7]:
hashes = defaultdict(list)

for doc in documents:
    doc_id = doc['id']
    hashes[doc_id].append(doc)

In [8]:
len(hashes), len(documents)

(947, 948)

I have less hashes than docs. One of them is probably not unique. 

In [9]:
for k, values in hashes.items():
    if len(values) > 1:
        print(k, len(values))

593f7569 2


In [10]:
hashes['593f7569']

[{'text': "They both do the same, it's just less typing from the script.\nAsked by Andrew Katoch, Added by Edidiong Esu",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'},
 {'text': "They both do the same, it's just less typing from the script.",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'}]

The issue happened because this specific question is duplicate

In [11]:
import json

In [12]:
with open('documents-with-ids.json', 'wt') as f_out:
    json.dump(documents, f_out, indent=2)

## Generating Ground Truth With an LLM

In [13]:
prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()


In [31]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_id = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"

model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
client = pipeline("text-generation", model=model, tokenizer=tokenizer)

model.safetensors:   0%|          | 0.00/4.40G [00:00<?, ?B/s]

: 

: 

In [None]:
def generate_questions(doc):
    
    prompt = prompt_template.format(**doc)

    response = client(prompt)
    
    output = response[0]["generated_text"]

    return response[0]["generated_text"][len(prompt):].strip()


In [None]:
doc = documents[2]
prompt = prompt_template.format(**doc)

generate_questions(doc)

NameError: name 'client' is not defined

In [30]:
from tqdm.auto import tqdm

In [23]:
results = {}

for doc in tqdm(documents): 
    doc_id = doc['id']

    if doc_id in results:
        continue

    questions = generate_questions(doc)
    results[doc_id] = questions

  0%|          | 0/948 [00:00<?, ?it/s]

HfHubHTTPError: 402 Client Error: Payment Required for url: https://router.huggingface.co/novita/v3/openai/chat/completions (Request ID: Root=1-6904ffd7-784638d8522c9f814cbfe837;c9b7e212-f2e8-4b32-806d-3ed19e985025)

You have exceeded your monthly included credits for Inference Providers. Subscribe to PRO to get 20x more monthly included credits.

In [14]:
import pickle


In [16]:
with open('results.bin', 'rb') as f_in:
    results = pickle.load(f_in)

In [18]:
results['7842b56a']

'[\n  "Can I enroll in the course after it starts?",\n  "Is late registration possible?",\n  "Am I eligible to submit homework if I join late?",\n  "Are there deadlines for final projects if I join late?",\n  "Can I submit all assignments at the end of the course if I start late?"\n]'

In [19]:
parsed_results = {}

for doc_id, json_questions in results.items():
    parsed_results[doc_id] = json.loads(json_questions)

JSONDecodeError: Invalid \escape: line 6 column 59 (char 414)

In [21]:
doc_index = {doc['id']: doc for doc in documents}

In [22]:
final_results = []

for doc_id, questions in parsed_results.items():
    course = doc_index[doc_id]['course']
    for q in questions:
        final_results.append((q, doc_id, course))

In [23]:
import pandas as pd

In [27]:
df = pd.DataFrame(final_results, columns=['Question', 'Document ID', 'Course'])

In [28]:
df.to_csv('ground-truth-data.csv', index=False)