In [3]:
import json 

def load_json_file(file_path):
    """Load a JSON file of course entries and flatten it into a list of documents.

    The file is expected to contain a list of entries. Each entry may carry a
    'course' value and a 'documents' list; every document is returned with the
    parent entry's course stored under the 'course' key.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A list of document dicts in file order. 'course' is None when the
        parent entry has no 'course' key.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the locale default:
    # the document text contains non-ASCII punctuation (curly quotes).
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    return [
        {**doc, 'course': item.get('course')}
        for item in data
        # `or []` also covers an entry whose 'documents' value is null.
        for doc in (item.get('documents') or [])
    ]


# Load the flattened FAQ documents; the bare `len(...)` expression displays the
# record count as the cell output.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs on a machine that has this directory layout.
documents = load_json_file('/home/gwm-279/Documents/DTC_AI/artifacts/documents.json')
len(documents)

948

In [4]:
# Display the first record to inspect the keys each flattened document carries.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [5]:
import hashlib
from tqdm import tqdm

def generate_document_id(doc):
    """Derive a short, deterministic identifier for a document.

    The identifier is the first 8 hex characters of the MD5 digest of
    "<course>-<question>-<first 10 characters of text>".

    Args:
        doc: Mapping with 'course', 'question' and 'text' entries.

    Returns:
        An 8-character lowercase hexadecimal string.
    """
    fingerprint = "{}-{}-{}".format(doc['course'], doc['question'], doc['text'][:10])
    digest = hashlib.md5(fingerprint.encode()).hexdigest()
    # Only the leading 8 hex characters are kept as the ID.
    return digest[:8]


# Attach a generated 'id' to every document in place; tqdm renders a progress
# bar while iterating.
for item in tqdm(documents):
    item['id'] = generate_document_id(item)

# Display the first record to confirm the new 'id' key is present.
documents[0]

100%|██████████| 948/948 [00:00<00:00, 661817.61it/s]


{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

In [7]:
# Persist the ID-tagged documents as indented JSON for later steps.
ids_output_path = '/home/gwm-279/Documents/DTC_AI/artifacts/documents_with_ids.json'
with open(ids_output_path, 'w') as out_file:
    json.dump(documents, out_file, indent=2)

In [10]:
# Prompt sent to the LLM for each FAQ record. The {section}, {question} and
# {text} placeholders are filled with str.format() in generate_questions.
# .strip() removes the leading and trailing newlines of the triple-quoted
# literal. The model is asked to reply with a bare JSON array of 5 strings.
prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip() 

In [12]:
import os
from groq import Groq
from dotenv import load_dotenv
# Load environment variables via python-dotenv before reading the API key.
# Presumably GROQ_API_KEY is defined in a local .env file — confirm.
load_dotenv()

# os.environ.get returns None when GROQ_API_KEY is unset, so a missing key is
# passed to the Groq client as None rather than raising here.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
def generate_questions(document):
    """Request student-style questions for one FAQ document from the chat model.

    The document's 'section', 'question' and 'text' values are substituted
    into the module-level ``prompt_template`` and sent through the
    module-level Groq ``client``.

    Args:
        document: Mapping with 'section', 'question' and 'text' entries.

    Returns:
        The object returned by ``client.chat.completions.create``; callers
        read the reply text from ``response.choices[0].message.content``.
    """
    template_fields = {
        field: document[field] for field in ('section', 'question', 'text')
    }
    user_prompt = prompt_template.format(**template_fields)

    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": user_prompt},
    ]
    completion = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=chat_messages,
        max_tokens=500,
        temperature=0.7,
    )
    return completion



In [24]:
def parse_question_list(question_list_str):
    """Parse a model reply into a list of question strings.

    The reply is expected to be a JSON array of strings. A reply wrapped in a
    Markdown code fence (``` or ```json) is unwrapped before parsing, because
    models sometimes add one even when asked not to.

    Args:
        question_list_str: Raw reply text. May be None when the model
            returned no content.

    Returns:
        The parsed list of strings, or an empty list when the input is not
        valid JSON, is not a list, or contains non-string items.
    """
    payload = question_list_str
    if isinstance(payload, str):
        payload = payload.strip()
        if len(payload) >= 6 and payload.startswith("```") and payload.endswith("```"):
            inner = payload[3:-3]
            # Drop an optional language tag ("json") on the opening fence line.
            tag, newline, rest = inner.partition("\n")
            if newline and (not tag.strip() or tag.strip().isalnum()):
                inner = rest
            payload = inner

    try:
        question_list = json.loads(payload)
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError; TypeError covers non-text
        # input such as None.
        return []

    if isinstance(question_list, list) and all(isinstance(q, str) for q in question_list):
        return question_list
    return []

In [25]:
# Generate questions for the first document only (documents[:1]) and store the
# parsed list on the document under 'question_list'. parse_question_list
# returns [] when the reply cannot be parsed.
for doc in documents[:1]:
    response = generate_questions(doc)
    doc['question_list'] = parse_question_list(response.choices[0].message.content)
    

In [18]:
# Print the keys of the document with ID 'c02e79ef' to check that
# 'question_list' was added to it.
for doc in documents:
    if doc['id']=='c02e79ef':
        print(doc.keys())

dict_keys(['text', 'section', 'question', 'course', 'id', 'question_list'])


In [28]:
import pandas 
# Build the ground-truth table: one row per generated question, linked to its
# source document through `document_id`.
# Rows are collected first and the frame is constructed once, instead of
# growing it row by row with .loc. The earlier throwaway
# `pandas.DataFrame(documents)` assignment was overwritten immediately and is
# removed.
question_rows = [
    (question, doc['course'], doc['id'])
    for doc in documents[:1]  # same slice the question-generation cell processed
    for question in doc['question_list']
]
df = pandas.DataFrame(question_rows, columns=['question', 'course', 'document_id'])

df.head()

Unnamed: 0,question,course,document_id
0,What is the initial event that will mark the b...,data-engineering-zoomcamp,c02e79ef
1,How can I stay updated on the course schedule ...,data-engineering-zoomcamp,c02e79ef
2,What are the necessary steps I need to take be...,data-engineering-zoomcamp,c02e79ef
3,What platforms or tools do I need to join to p...,data-engineering-zoomcamp,c02e79ef
4,Are there any specific registration links or c...,data-engineering-zoomcamp,c02e79ef


In [29]:
# Write the question table to CSV; index=False leaves the row index out of the
# file.
df.to_csv('/home/gwm-279/Documents/DTC_AI/artifacts/ground_truth_questions.csv', index=False)