In [1]:
import pandas as pd

In [2]:
# Load the question/answer dataset from CSV.
df = pd.read_csv('../data/data.csv')
# One plain dict per row, keyed by column name (the records displayed in this notebook
# carry 'id', 'Question' and 'Answer').
documents = df.to_dict(orient='records')

In [3]:
from dotenv import load_dotenv
import os
# Activate the project's virtual environment first (`source ./bin/activate` from the project folder).
load_dotenv()  # Loads the variables from the .env file

# Read the API key from the environment (os.getenv returns None when the variable is not set).
# NOTE(review): api_key is not referenced by any other cell visible here — confirm it is still needed.
api_key = os.getenv("OPENAI_API_KEY")


In [4]:
# Prompt sent to the LLM for each record. {Question} and {Answer} are filled in with str.format();
# the doubled braces around the example JSON are format() escapes that render as literal { and }.
# The trailing .strip() removes the leading/trailing blank lines of the triple-quoted literal.
prompt_template = """
You are an expert and teacher in datascience, machine learning and artificial intelligence.
Your need to widen a set of questions out of a dataset of questions asked in past exams.
So for each question you have you want some other equivalent questions to ask, for which the answer is the same as in the original question.
Below you have a record with Question and Answer, so you have to reformulate the question so the answer is in the answer in the record. It is
mandatory that the reformulated question can be answered with the Answer in the record as the exams will automatically validate the answer as correct.
The number of questions to create are 3 for each record.
The questions should be complete and not too short. Use as fewer words as possible from the record. 


The record:
Question: {Question}
Answer: {Answer}

Just output the the json object without any further comments before or after. The key will be "questions" and the value the list of questions:
    
{{"questions": ["question_1", "question_2", "question_3"]}}



""".strip()

In [55]:
# Sample record used to try the prompt by hand (list index 69 is the record with id 70).
n=69
# Keys of the record fill the template placeholders; keys the template does not use are ignored.
prompt = prompt_template.format(**documents[n])

In [56]:
documents[n]

{'id': 70,
 'Question': "Elaborate on Python's role in enabling data engineers.",
 'Answer': 'Python empowers data engineers by providing robust libraries such as NumPy, pandas, and scipy, which offer efficient tools for data processing, statistical analysis, and data preparation tasks. NumPy enables numerical computations and array operations, pandas facilitates data manipulation and analysis through DataFrame objects, while scipy offers scientific computing functionalities. Leveraging these libraries, data engineers can streamline data workflows, extract meaningful insights, and prepare data for downstream tasks such as machine learning and analytics, enhancing productivity and efficiency in data-driven projects.'}

In [30]:
# Model name passed to the chat-completions calls made in this notebook.
model='llama3.1:8b'

In [31]:
from openai import OpenAI

# Point the OpenAI client at a server running on localhost instead of the hosted API.
# NOTE(review): port 11434 and the 'ollama' key look like a local Ollama server's
# OpenAI-compatible endpoint — confirm.
client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',
)

In [32]:
from openai import OpenAI

#client = OpenAI()
def llm(prompt, model=model):
    """Send `prompt` as a single user message and return the reply text.

    Parameters:
    prompt (str): Text placed in the one user message of the request.
    model (str): Model name for the request; the default is the value the
        notebook-level `model` had when this function was defined.

    Returns:
    The `content` of the first choice's message in the response.
    """
    chat_messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model=model,
        temperature=0.0,
        messages=chat_messages,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [57]:
# Ask the model for reformulated questions for the sample record; the value is the raw reply text from llm().
questions = llm(prompt)

In [58]:
prompt

'You are an expert and teacher in datascience, machine learning and artificial intelligence.\nYour need to widen a set of questions out of a dataset of questions asked in past exams.\nSo for each question you have you want some other equivalent questions to ask, for which the answer is the same as in the original question.\nBelow you have a record with Question and Answer, so you have to reformulate the question so the answer is in the answer in the record. It is\nmandatory that the reformulated question can be answered with the Answer in the record as the exams will automatically validate the answer as correct.\nThe number of questions to create are 3 for each record.\nThe questions should be complete and not too short. Use as fewer words as possible from the record. \n\n\nThe record:\nQuestion: Elaborate on Python\'s role in enabling data engineers.\nAnswer: Python empowers data engineers by providing robust libraries such as NumPy, pandas, and scipy, which offer efficient tools for 

In [59]:
questions

'{"questions": [\n    "What tools enable data engineers to process, analyze, and prepare data efficiently?",\n    "How do libraries like NumPy, pandas, and scipy support data engineering tasks?",\n    "What benefits can data engineers gain from leveraging these scientific computing libraries?"\n]}'

In [74]:
# After many iterations, this is the function that works best when the LLM output is not correctly formatted.
# It also gives more control over which outputs failed: the errors are saved and the program continues even if
# some outputs fail.

import json
import re

def _load_questions_json(text):
    """Parse `text` as JSON and check it has the expected 'questions' payload.

    Parameters:
    text (str): Candidate JSON text.

    Returns:
    tuple: (True, data, None) when `text` parses to a dict holding a non-empty
    list under 'questions'; (False, None, error_message) when it parses but
    has a different shape.

    Raises:
    json.JSONDecodeError: When `text` is not valid JSON.
    """
    data = json.loads(text)
    if not isinstance(data, dict) or 'questions' not in data:
        return (False, None, "Parsed JSON does not contain 'questions' key.")
    if not isinstance(data['questions'], list) or not data['questions']:
        return (False, None, "'questions' key is empty or not a list.")
    return (True, data, None)


def robust_json_loads(s):
    """
    Attempts to parse a JSON string, fixing common errors if parsing fails.

    The input is first parsed as-is. If that fails, markdown code fences and any
    text around the outermost brackets are removed and parsing is retried. Only
    if that also fails are the heuristic repairs applied (single quotes, trailing
    commas, unbalanced brackets, control characters, doubled closing quotes).
    Whichever attempt parses, the result must be a dict with a non-empty list
    under 'questions' to count as a success.

    Parameters:
    s (str): The JSON string to parse.

    Returns:
    tuple: (success (bool), data (dict or None), error_message (str or None))
    """
    # Non-text input (e.g. None) is reported as a failure instead of raising TypeError.
    if isinstance(s, (bytes, bytearray)):
        s = s.decode('utf-8', errors='replace')
    if not isinstance(s, str):
        return (False, None, "Input is not a string.")

    # First, attempt to parse the input string directly
    try:
        return _load_questions_json(s)
    except json.JSONDecodeError:
        pass  # Proceed to cleaning steps if parsing fails

    # Step 1: Strip wrapping backticks and language specifiers
    s = s.strip()
    s = re.sub(r'^```[a-zA-Z]*\s*', '', s)  # Remove starting triple backticks and optional language
    s = re.sub(r'```$', '', s)              # Remove ending triple backticks
    s = s.strip('`')                        # Remove any remaining backticks

    # Step 2: Remove any text before the first '{' or '['
    first_open = re.search(r'[\{\[]', s)
    if not first_open:
        return (False, None, "No JSON object could be detected in the input.")
    s = s[first_open.start():]

    # Step 3: Remove any text after the LAST '}' or ']'.
    # This must not be done with a regex anchored on the first closing bracket:
    # that would cut the payload at the first ']' or '}' it contains.
    end_idx = max(s.rfind('}'), s.rfind(']'))
    if end_idx == -1:
        return (False, None, "No JSON object could be detected in the input.")
    s = s[:end_idx + 1]

    # Step 4: Retry now that the surrounding text is gone, so that valid JSON is
    # never touched by the text-level repairs below (they are not string-aware).
    try:
        return _load_questions_json(s)
    except json.JSONDecodeError:
        pass

    # Step 5: Replace single quotes with double quotes for keys and values
    s = re.sub(
        r'(?<=[:\{\[,])\s*\'([^\']*)\'\s*(?=[:,\}\]])',
        r'"\1"',
        s
    )

    # Step 6: Remove trailing commas before closing braces/brackets
    s = re.sub(r',\s*(\}|\])', r'\1', s)

    # Step 7: Balance brackets and braces if necessary (plain character counts)
    def balance_characters(text, open_char, close_char):
        opens = text.count(open_char)
        closes = text.count(close_char)
        if opens > closes:
            text += close_char * (opens - closes)
        elif closes > opens:
            text = open_char * (closes - opens) + text
        return text

    s = balance_characters(s, '{', '}')
    s = balance_characters(s, '[', ']')

    # Step 8: Remove unescaped control characters
    # Control characters are not allowed in JSON strings
    s = re.sub(r'[\x00-\x1F]+', '', s)

    # Step 9: Remove extra double quotes at the end of strings in arrays
    s = re.sub(r'(".*?")"+(?=\s*[\],}])', r'\1', s)

    # Step 10: Attempt to parse the cleaned string
    try:
        return _load_questions_json(s)
    except json.JSONDecodeError as e:
        return (False, None, f"Error parsing JSON after cleaning: {e}")


In [73]:
# Parse the raw reply; the result is a (success, data, error_message) tuple.
parsed_question = robust_json_loads(questions)
parsed_question

(True,
 {'questions': ['What tools enable data engineers to process, analyze, and prepare data efficiently?',
   'How do libraries like NumPy, pandas, and scipy support data engineering tasks?',
   'What benefits can data engineers gain from leveraging these scientific computing libraries?']},
 None)

In [75]:
# First, run a test on a small slice of the dataset to avoid the cost of processing all 1000+ records.
documents_test = documents[0:5]

In [76]:
documents_test

[{'id': 1,
  'Question': 'What is under-fitting and overfitting in machine learning?',
  'Answer': "Underfitting is when a model is too simple, and overfitting is when it's too complex, making it perform poorly on new data."},
 {'id': 2,
  'Question': 'Can you explain what a false positive and a false negative are?',
  'Answer': "A false positive incorrectly indicates a condition is present when it's not, while a false negative misses detecting a condition that is there."},
 {'id': 3,
  'Question': 'Clarify the concept of Phase IV.',
  'Answer': "Phase IV studies, also known as post-marketing surveillance, are conducted after a drug or medical product is made available to the general public. They aim to monitor the product's safety, efficacy, and long-term effects in a larger and more diverse population, providing valuable insights into real-world usage. Phase IV studies help regulators, healthcare providers, and patients make informed decisions about the product's continued use by ass

In [77]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [78]:
# Accumulator filled by process_documents: doc id -> list of generated questions,
# or an "Error processing document: ..." string when the reply could not be parsed.
results = {}

In [80]:
def generate_questions(doc):
    """Ask the LLM for reformulated questions for one dataset record.

    Parameters:
    doc (dict): Record whose keys fill the placeholders of `prompt_template`
        (the template uses 'Question' and 'Answer'; keys the template does not
        use, such as 'id', are ignored by str.format).

    Returns:
    The raw reply text returned by llm(). It is expected to be JSON but is not
    parsed here.
    """
    prompt = prompt_template.format(**doc)
    # Delegate to llm() instead of repeating the chat-completions request here.
    # `model` is passed explicitly so the current notebook-level value is used,
    # not the default captured when llm() was defined.
    return llm(prompt, model=model)

In [81]:
def process_documents(documents, results, limit=None):
    """Generate questions for each document and store them in `results`.

    Parameters:
    documents (list): Records to process; each must have an 'id' key and is
        passed unchanged to generate_questions().
    results (dict): Updated in place. Maps doc id to the list of generated
        questions, or to an "Error processing document: ..." string when the
        reply could not be parsed. Ids already present are skipped.
    limit (int or None): Only the first `limit` documents are considered;
        None means all of them.

    Returns:
    list: One dict per failed document with the keys 'doc_id',
    'questions_raw' and 'error_message'.
    """
    # Determine the number of documents to process
    num_documents_to_process = len(documents) if limit is None else min(len(documents), limit)

    # List to store problematic inputs and their error messages
    failed_records = []

    # Loop through only the required number of documents
    for doc in tqdm(documents[:num_documents_to_process]):
        doc_id = doc['id']

        # Check the live dict rather than a snapshot taken before the loop, so an id
        # that appears twice in this batch is only sent to the LLM once.
        if doc_id in results:
            continue

        # Generate questions and process
        questions_raw = generate_questions(doc)
        success, data, error_message = robust_json_loads(questions_raw)

        # Do not trust `success` alone: a payload that parsed but is not a dict with a
        # non-empty 'questions' list would otherwise crash the loop below.
        if success and not (
            isinstance(data, dict)
            and isinstance(data.get('questions'), list)
            and data['questions']
        ):
            success = False
            error_message = "Parsed JSON does not contain a non-empty 'questions' list."

        if success:
            # Add results to the dictionary
            results[doc_id] = data['questions']
        else:
            # Save the problematic input and error message
            failed_records.append({
                'doc_id': doc_id,
                'questions_raw': questions_raw,
                'error_message': error_message
            })
            # Keep a marker in results so the failure is visible and can be cleaned up later
            results[doc_id] = f"Error processing document: {error_message}"

    # Return the failed records for further inspection
    return failed_records


In [None]:
################ CALL THE LLM FOR ALL OR PART OF THE DOCUMENTS ################

In [138]:
# The LLM sometimes fails to return a valid structure. robust_json_loads() manages to parse most
# replies (only 2 errors in 74 documents; many more without it). Records that still fail get a dummy
# error message in `results` and are collected in a DataFrame for debugging.
#
# UPDATE: with the latest robust_json_loads() version, model llama3.1 8b and temperature=0, the run
# had 100% success (not a single failure), and the outputs look sensible and related to the original
# question/answer pairs.

# Process the documents; `limit` restricts the run to the first N documents
failed_records = process_documents(documents, results, limit=200)

# Convert failed_records to a DataFrame for easier inspection

failed_df = pd.DataFrame(failed_records)

# Save the failed records to a CSV file for further analysis
failed_df.to_csv('failed_records.csv', index=False)


100%|█████████████████████████████████████████| 200/200 [02:39<00:00,  1.25it/s]


In [139]:
print(failed_df)

Empty DataFrame
Columns: []
Index: []


In [140]:
# Alternative calls: process a limited number of documents for testing purposes, or all of them
#process_documents(documents, results, limit=100)
#process_documents(documents, results)  # Will process all documents


In [141]:
# Now that the robust parser works for almost 100% of the replies, remove any error or empty entries
# from results so that those documents can be reprocessed.
# NOTE(review): the original note also mentioned reordering results by id; no reordering happens in this cell.

# Identify failed doc_ids: entries holding the error marker string, or an empty list
failed_doc_ids = [
    doc_id for doc_id, value in results.items()
    if (isinstance(value, str) and value.startswith('Error processing document'))
    or (isinstance(value, list) and not value)  # Empty list
]
print(failed_doc_ids)

# Remove failed doc_ids from results
for doc_id in failed_doc_ids:
    results.pop(doc_id, None)  # Remove the failed doc_id from results


[]


In [142]:
# Reprocess so that doc_ids previously removed because of errors are generated again; documents whose
# id is already in results are skipped by process_documents.
# NOTE(review): the stated intent was to re-run over the entire documents list, but limit=1 only
# considers the first document — confirm the intended limit.
failed_records = process_documents(documents, results, limit=1)

# Convert failed_records to a DataFrame for easier inspection
#import pandas as pd
failed_df = pd.DataFrame(failed_records)

# Save the failed records to a CSV file for further analysis
# (same file name as the earlier run in this notebook, so that file is overwritten)
failed_df.to_csv('failed_records.csv', index=False)

100%|██████████████████████████████████████████| 1/1 [00:00<00:00, 22919.69it/s]


In [143]:
len(results)


200

In [146]:
# Preview every generated question next to the id of its source document.
# The loop variable is deliberately not named `questions`: that notebook-level name holds the raw
# LLM reply for the sample record, and overwriting it would break re-running the parsing cell.
for doc_id, doc_questions in results.items():
    for q in doc_questions:
        print(doc_id, q)

1 What happens when a machine learning model is too simple?
1 How does overfitting affect a model's performance on new data?
1 Can a model be too complex, and if so, what are the consequences?
2 What are the implications of a test result that incorrectly suggests a condition's presence?
2 How does a false indication of a condition's absence impact diagnosis and treatment?
2 Can you describe a situation where a medical test fails to detect an actual condition?
3 What type of studies are conducted after a drug or medical product is made available to the general public?
3 What is the primary goal of Phase IV studies in terms of monitoring a product's safety and efficacy?
3 How do Phase IV studies contribute to informed decision-making about a product's continued use?
4 What is semi-supervised learning?
4 How does semi-supervised learning improve model performance?
4 What are the benefits of using unlabeled data in machine learning?
5 How can gradient boosting models take advantage of mode

In [147]:
# Flatten results into (doc_id, question) pairs, one per generated question.
# - Only list values are expanded: an "Error processing document: ..." string left in results
#   would otherwise be split into single characters and written out as questions.
# - A comprehension keeps its loop variables local, so the notebook-level `questions`
#   (raw LLM reply for the sample record) is not overwritten.
final_results = [
    (doc_id, question)
    for doc_id, doc_questions in results.items()
    if isinstance(doc_questions, list)
    for question in doc_questions
]

In [148]:
final_results[0]

(1, 'What happens when a machine learning model is too simple?')

In [149]:
# Build the ground-truth table: one row per (document id, generated question) pair.
df_results = pd.DataFrame(final_results, columns=['id', 'question'])

In [150]:
df_results.head()

Unnamed: 0,id,question
0,1,What happens when a machine learning model is ...
1,1,How does overfitting affect a model's performa...
2,1,"Can a model be too complex, and if so, what ar..."
3,2,What are the implications of a test result tha...
4,2,How does a false indication of a condition's a...


In [151]:
# Write the (id, question) pairs to CSV without the DataFrame index column.
df_results.to_csv('../data/ground-truth-retrieval.csv', index=False)

In [152]:
!head ../data/ground-truth-retrieval.csv

id,question
1,What happens when a machine learning model is too simple?
1,How does overfitting affect a model's performance on new data?
1,"Can a model be too complex, and if so, what are the consequences?"
2,What are the implications of a test result that incorrectly suggests a condition's presence?
2,How does a false indication of a condition's absence impact diagnosis and treatment?
2,Can you describe a situation where a medical test fails to detect an actual condition?
3,What type of studies are conducted after a drug or medical product is made available to the general public?
3,What is the primary goal of Phase IV studies in terms of monitoring a product's safety and efficacy?
3,How do Phase IV studies contribute to informed decision-making about a product's continued use?
