In [18]:
import json

with open('documents.json', 'rt') as f:
    docs_raw = json.load(f)

docs_raw

[{'course': 'data-engineering-zoomcamp',
  'documents': [{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
    'section': 'General course-related questions',
    'question': 'Course - When will the course start?'},
   {'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
    'section': 'General course-related questions',
    'question': 'Course - What are the prerequisites for this course?'},
   {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in

In [19]:
# documents = []
# for items in docs_raw:
#     for idx, item in enumerate(items['documents']):
#         item['id'] = idx
#         item['course'] = items['course']
#         documents.append(item)

# documents[0]

documents = []
for items in docs_raw:
    for item in items['documents']:
        item['course'] = items['course']
        documents.append(item)

documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [39]:
import hashlib

def generate_document_id(doc):
    combined = f"{doc['course']}-{doc['question']}-{doc['text'][:10]}"
    hash_object = hashlib.md5(combined.encode())
    hash_hex = hash_object.hexdigest()
    document_id = hash_hex[:8]
    return document_id

for doc in documents:
    doc['id'] = generate_document_id(doc)

In [40]:
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp',
 'id': '1f6520ca'}

In [41]:
from collections import defaultdict

hashes = defaultdict(list)
for doc in documents:
    doc_id = doc['id']
    hashes[doc_id].append(doc)

len(hashes), len(documents)

(947, 948)

In [42]:
for k, values in hashes.items():
    if len(values) > 1:
        print(k, len(values))

593f7569 2


In [44]:
hashes['593f7569']

[{'text': "They both do the same, it's just less typing from the script.\nAsked by Andrew Katoch, Added by Edidiong Esu",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'},
 {'text': "They both do the same, it's just less typing from the script.",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'}]

In [46]:
# save to above document to json

with open('documents_with_id.json', 'wt') as f_out:
    json.dump(documents, f_out, indent=2)

!head documents_with_id.json

[
  {
    "text": "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  \u201cOffice Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon\u2019t forget to register in DataTalks.Club's Slack and join the channel.",
    "section": "General course-related questions",
    "question": "Course - When will the course start?",
    "course": "data-engineering-zoomcamp",
    "id": "c02e79ef"
  },
  {
    "text": "GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites",


In [156]:
prompt_template = """
You emulate a student who's taking our course. \
Formulate 5 questions this student might ask based on a FAQ record. The record \
should contain the answer to the questions , and the questions should be complete and not too short. \
If possible, use as fewer words as possible from the record. The final output contains ONLY \
a LIST of questions DO NOT include any introductory sentences.

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks: 
```
["question1", "question2", ..., "question5"]
```
""".strip()

In [157]:
from groq import Groq
from dotenv import load_dotenv
import os

load_dotenv()
groq_api_key = os.getenv('GROQ_API_KEY')
client = Groq(api_key=groq_api_key)

In [158]:
doc = documents[2]
prompt = prompt_template.format(**doc)

response = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model="llama3-8b-8192",
    )
json_response = response.choices[0].message.content
json_response

'["Can I still join the course despite missing the start date and what are the implications on the final projects deadline if I do decide to join?", "What is the deadline for turning in the final projects once I have joined the course?", "Can I still submit homeworks even if I registered late?", "What are the tasks that I should prioritize if I\'m joining the course late?", "Are there any time constraints that I should be aware of as a latecomer to the course?"]'

In [159]:
json.loads(json_response)

['Can I still join the course despite missing the start date and what are the implications on the final projects deadline if I do decide to join?',
 'What is the deadline for turning in the final projects once I have joined the course?',
 'Can I still submit homeworks even if I registered late?',
 "What are the tasks that I should prioritize if I'm joining the course late?",
 'Are there any time constraints that I should be aware of as a latecomer to the course?']

In [171]:
def generate_question(doc):
    prompt = prompt_template.format(**doc)
    
    response = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model="llama3-8b-8192",
    )
    json_response = response.choices[0].message.content
    return json_response

In [174]:
from tqdm.auto import tqdm

In [176]:
question_list = {}
for doc in tqdm(documents):
    doc_id = doc['id']
    if doc_id in question_list:
        continue
    questions = generate_question(doc)
    question_list[doc_id] = questions

100%|████████████████████████| 948/948 [39:39<00:00,  2.51s/it]


In [178]:
question_list

{'c02e79ef': '["When will the course, which is focused on technical questions, officially begin?", "Can I still register for the course after it has started?", "What platform should I use to stay updated on the course schedule?", "Will the course be available on multiple devices or just on desktop?", "How can I communicate with other students and the instructor during the course?"]',
 '1f6520ca': '[\n"What are the prerequisites for this course?",\n"Where can I find information about the course schedule?",\n"What kind of support does the course offer for students who struggle with certain topics?",\n"How do I access the course materials and resources?",\n"Are there any specific software or tools required for the course?"',
 '7842b56a': '["What can I do if I miss the course start date and still want to participate?", "If I don\'t register for the course, am I still allowed to complete the homeworks?", "What deadlines do I need to be aware of for the final projects?", "Can I still be part

In [180]:
import pickle

with open('question_list_pk.pkl', 'wb') as file:
    pickle.dump(question_list, file)

In [186]:
with open('question_list_pk.pkl', 'rb') as f_in:
    results = pickle.load(f_in)

In [248]:
parsed_results = {}

for doc_id, json_questions in results.items():
    parsed_results[doc_id] = json.loads(json_questions)

JSONDecodeError: Expecting ',' delimiter: line 1 column 27 (char 26)

In [249]:
 print(json_questions)

["How do I fix the error "Error: Makefile:2: *** missing separator.  Stop." when trying to convert tabs in my document to spaces in VS Code?", "Why do I have to convert tabs to spaces in VS Code?", "Is it possible to use both tabs and spaces in VS Code?", "How can I solve the issue of tabs not being recognized as tabs in my document?", "Is there a specific stack or resource I should refer to for information on converting tabs to spaces in VS Code?"]


In [250]:
json_questions =["How do I fix the error "Error: Makefile:2: missing separator.  Stop." when trying to convert tabs in my document to spaces in VS Code?", "Why do I have to convert tabs to spaces in VS Code?", "Is it possible to use both tabs and spaces in VS Code?", "How can I solve the issue of tabs not being recognized as tabs in my document?", "Is there a specific stack or resource I should refer to for information on converting tabs to spaces in VS Code?"]

SyntaxError: invalid syntax. Perhaps you forgot a comma? (1818083251.py, line 1)

In [246]:
json.dumps(json_questions)

'["What are the first steps to troubleshoot issues I\'m encountering in the course?", "Can I search for solutions to my problem using specific keywords?", "What should I do if I\'m unable to resolve an issue after trying the troubleshooting steps?", "How do I properly ask a question on Stackoverflow and other platforms?", "What should I do if I\'m stuck on a problem and can\'t seem to find a solution?"]'

In [247]:
results[doc_id] = json.dumps(json_questions)

In [251]:
doc_index = {d['id']: d for d in documents}

final_results = []
for doc_id, questions in parsed_results.items():
    course=doc_index[doc_id]['course']
    for q in questions:
        final_results.append((q, course, doc_id))

In [252]:
import pandas as pd
df = pd.DataFrame(final_results, columns=['question', 'course', 'document'])
df.to_csv('ground_truth.csv', index=False)

In [253]:
df

Unnamed: 0,question,course,document
0,"When will the course, which is focused on tech...",data-engineering-zoomcamp,c02e79ef
1,Can I still register for the course after it h...,data-engineering-zoomcamp,c02e79ef
2,What platform should I use to stay updated on ...,data-engineering-zoomcamp,c02e79ef
3,Will the course be available on multiple devic...,data-engineering-zoomcamp,c02e79ef
4,How can I communicate with other students and ...,data-engineering-zoomcamp,c02e79ef
...,...,...,...
201,How do I set up a local repository on my compu...,data-engineering-zoomcamp,f2945cd2
202,How do I ignore large database and other unwan...,data-engineering-zoomcamp,f2945cd2
203,What is the best way to store my notes and ver...,data-engineering-zoomcamp,f2945cd2
204,Should I store my passwords and keys in a Git ...,data-engineering-zoomcamp,f2945cd2
