In [None]:
import os
import json
from datasets import load_dataset
from huggingface_hub import login

def recreate_dataset():
    """Download the documentation dataset and cache it locally as JSON.

    Reads the ``HF_API_TOKEN`` environment variable and logs in to the
    Hugging Face Hub with it, loads the ``train`` split of
    ``m-ric/huggingface_doc``, writes every record to ``path_hf_doc_dataset``
    (creating the parent directory when needed) and returns the records.

    Returns:
        list: One entry per dataset row, in dataset order.

    Raises:
        EnvironmentError: If ``HF_API_TOKEN`` is unset or empty.
    """
    # Authentication happens first, so a missing token fails before any download
    token = os.getenv("HF_API_TOKEN")
    if not token:
        raise EnvironmentError("HF_API_TOKEN is not set in the environment.")
    login(token)

    # Materialise the dataset rows into a plain list so they can be JSON-serialised
    records = list(load_dataset("m-ric/huggingface_doc", split="train"))

    # The cache lives in a sub-directory that may not exist yet
    cache_dir = os.path.dirname(path_hf_doc_dataset)
    os.makedirs(cache_dir, exist_ok=True)

    with open(path_hf_doc_dataset, "w", encoding="utf-8") as cache_file:
        json.dump(records, cache_file, ensure_ascii=False, indent=4)

    print(f"Dataset saved to {path_hf_doc_dataset}")
    return records

# Location of the locally cached copy of the dataset
path_hf_doc_dataset = "./hf_docs_dataset/huggingface_doc_dataset.json"

# A missing or zero-byte cache file means the dataset has to be rebuilt
if not os.path.exists(path_hf_doc_dataset) or os.path.getsize(path_hf_doc_dataset) <= 0:
    data_list = recreate_dataset()
else:
    try:
        with open(path_hf_doc_dataset, "r", encoding="utf-8") as f:
            data_list = json.loads(f.read())
    except (json.JSONDecodeError, OSError) as e:
        # Unreadable or corrupt cache: report it and rebuild from scratch
        print(f"Error loading JSON file: {e}. Recreating dataset.")
        data_list = recreate_dataset()


Dataset saved to ./hf_docs_dataset/huggingface_doc_dataset.json


In [None]:
import random

N_GENERATIONS = 25 # Number of contexts sampled for QA generation; kept small for cost and time considerations

# JSON file where the generated QA couples are saved and later reloaded from
QA_initial_couples_path = './generated_QAs/initial_generated_qa_couples.json'


def _parse_qa_couple(raw_output, max_answer_chars=500):
    """Extract a ``(question, answer)`` pair from one raw LLM response.

    The question is the text after the last ``"Factoid question: "`` marker
    and before the next ``"Answer: "`` marker; the answer is the text after
    the last ``"Answer: "`` marker. Both are stripped of surrounding
    whitespace.

    Args:
        raw_output (str): The text returned by the LLM.
        max_answer_chars (int): Answers of this length or longer are rejected.

    Returns:
        tuple[str, str]: ``(question, answer)``.

    Raises:
        ValueError: If the ``"Answer: "`` marker is missing, if either part is
            empty, or if the answer is too long.
    """
    # Without the marker, both splits would return the whole response and the
    # "question" and "answer" would be identical garbage.
    if "Answer: " not in raw_output:
        raise ValueError(f"Missing 'Answer: ' marker in LLM output: {raw_output!r}")

    question = raw_output.split("Factoid question: ")[-1].split("Answer: ")[0].strip()
    answer = raw_output.split("Answer: ")[-1].strip()

    if not question or not answer:
        raise ValueError(f"Empty question or answer in LLM output: {raw_output!r}")
    # Explicit check instead of `assert`, which is removed under `python -O`
    if len(answer) >= max_answer_chars:
        raise ValueError(f"Answer is too long: {answer}")
    return question, answer


def generate_initial_QA():
    """Generate QA couples from randomly sampled contexts and save them as JSON.

    Samples up to ``N_GENERATIONS`` items from ``docs_processed``, asks the LLM
    (via ``call_llm`` with ``QA_generation_prompt``) for one QA couple per
    context, and keeps every response that parses cleanly. Contexts whose
    generation or parsing fails are logged and skipped, so the result may
    contain fewer than ``N_GENERATIONS`` entries. The kept couples are written
    to ``QA_initial_couples_path``.

    Returns:
        list[dict]: One dict per kept couple with the keys ``context``,
        ``question``, ``answer`` and ``source_doc``.
    """
    # random.sample raises ValueError when asked for more items than exist
    n_samples = min(N_GENERATIONS, len(docs_processed))
    print(f"Generating {n_samples} QA couples...")

    QA_initial_outputs = []
    for sampled_context in tqdm(random.sample(docs_processed, n_samples)):
        try:
            # Generate QA couple
            output_QA_couple = call_llm(
                llm_client, QA_generation_prompt.format(context=sampled_context.page_content)
            )
            try:
                question, answer = _parse_qa_couple(output_QA_couple)
            except ValueError as parse_error:
                # Malformed response: skip this context but keep processing the rest
                logging.error(
                    f"Skipping QA couple: {parse_error}. Context: {sampled_context.page_content}"
                )
                continue
            QA_initial_outputs.append(
                {
                    "context": sampled_context.page_content,
                    "question": question,
                    "answer": answer,
                    "source_doc": sampled_context.metadata["source"],
                }
            )
        except ValueError as ve:
            logging.error(f"ValueError in call_llm: {ve}")
        except Exception as e:
            # Best effort: one failing context must not abort the whole run
            logging.error(f"Unexpected error during loop iteration: {e}. Context: {sampled_context.page_content}")

    # Ensure the directory exists
    os.makedirs(os.path.dirname(QA_initial_couples_path), exist_ok=True)

    with open(QA_initial_couples_path, 'w', encoding='utf-8') as json_file:
        json.dump(QA_initial_outputs, json_file, ensure_ascii=False, indent=4)

    print(f"Generated QA couples saved to {QA_initial_couples_path}")

    return QA_initial_outputs



# Regenerate the QA couples unless a non-empty cache file is already on disk
if not os.path.exists(QA_initial_couples_path) or os.path.getsize(QA_initial_couples_path) <= 0:
    QA_initial_outputs = generate_initial_QA()
else:
    try:
        with open(QA_initial_couples_path, "r", encoding="utf-8") as f:
            QA_initial_outputs = json.loads(f.read())
        print(f"Successfully LOADED {len(QA_initial_outputs)} QA couples")
    except (json.JSONDecodeError, OSError) as e:
        # Unreadable or corrupt cache: report it and generate again
        print(f"Error loading JSON file: {e}. Recreating dataset.")
        QA_initial_outputs = generate_initial_QA()


# Show a single row as a quick sanity check of the record format
print(f"Below is the first QA couple generated:")
display(pd.DataFrame(QA_initial_outputs).head(1))


Successfully LOADED 17 QA couples
Below is the first QA couple generated:


Unnamed: 0,context,question,answer,source_doc
0,"`stage3_max_live_parameters` is the upper limit on how many full parameters you want to keep on the GPU at any given\ntime. ""reuse distance"" is a metric we are using to figure out when will a parameter be used again in the future, and we\nuse the `stage3_max_reuse_distance` to decide whether to throw away the parameter or to keep it. If a parameter is\ngoing to be used again in near future (less than `stage3_max_reuse_distance`) then we keep it to reduce communication\noverhead. This is super helpful when you have activation checkpointing enabled, where we do a forward recompute and\nbackward passes a single layer granularity and want to keep the parameter in the forward recompute till the backward\n\nThe following configuration values depend on the model's hidden size:\n\n- `reduce_bucket_size`: `hidden_size*hidden_size`\n- `stage3_prefetch_bucket_size`: `0.9 * hidden_size * hidden_size`\n- `stage3_param_persistence_threshold`: `10 * hidden_size`\n\ntherefore set these values to `auto` and the [`Trainer`] will automatically assign the recommended\nvalues. But, of course, feel free to set these explicitly as well.\n\n`stage3_gather_16bit_weights_on_model_save` enables model fp16 weights consolidation when model gets saved. With large\nmodels and multiple GPUs this is an expensive operation both in terms of memory and speed. It's currently required if\nyou plan to resume the training. Watch out for future updates that will remove this limitation and make things more\nflexible.\n\nIf you're migrating from ZeRO-2 configuration note that `allgather_partitions`, `allgather_bucket_size` and\n`reduce_scatter` configuration parameters are not used in ZeRO-3. 
If you keep these in the config file they will just\nbe ignored.\n\n- `sub_group_size`: `1e9`",What is the purpose of `stage3_max_live_parameters`?\n,`stage3_max_live_parameters` is the upper limit on how many full parameters you want to keep on the GPU at any given time.,huggingface/transformers/blob/main/docs/source/en/main_classes/deepspeed.md


In [None]:
import re # importing the regex library

# JSON file where the critiqued QA couples are saved and later reloaded from
QA_critiqued_couples_path = './generated_QAs/critiqued_qa_couples.json'

# This is a helper function to parse the evaluation response
def parse_evaluation(evaluation):
    """Extract the numeric rating and the rationale from a critique response.

    The rationale is the text following ``Evaluation:`` up to (but not
    including) a later ``Total rating:`` marker, or up to the end of the
    response when no such marker follows. The rating is the first integer
    after ``Total rating:``.

    Args:
        evaluation (str): Raw critique text returned by the LLM.

    Returns:
        tuple[int, str]: ``(score, eval_text)`` with ``eval_text`` stripped of
        surrounding whitespace.

    Raises:
        ValueError: If ``evaluation`` is not a string, or if the rationale or
            the rating cannot be found.
    """
    if not isinstance(evaluation, str):
        raise ValueError(
            f"Error parsing evaluation: expected a string, got {type(evaluation).__name__}"
        )

    # Lazy match plus lookahead: stop before a trailing "Total rating:" so the
    # rating line does not leak into the rationale, yet still capture through
    # the end of the text when the rating appears before the evaluation.
    eval_match = re.search(
        r"Evaluation:\s*(.*?)(?=\s*Total rating:|\Z)", evaluation, re.DOTALL
    )
    score_match = re.search(r"Total rating:\s*(\d+)", evaluation)

    eval_text = eval_match.group(1).strip() if eval_match else ""

    # Ensure both parts were found
    if not eval_text or not score_match:
        raise ValueError(
            "Error parsing evaluation: Missing 'Evaluation' or 'Total rating' in response"
        )

    return int(score_match.group(1)), eval_text

# This is the main function to generate critiques for each QA couple
def generate_critiqued_QA(QA_initial_outputs):
    """Critique every QA couple on three criteria and save the result as JSON.

    For each couple, the LLM is asked (via ``call_llm``) to judge
    groundedness, relevance and standalone quality. Each response is parsed
    with ``parse_evaluation`` and stored under ``<criterion>_score`` and
    ``<criterion>_eval``. A couple whose LLM calls fail gets no critique
    fields; a criterion whose response cannot be parsed is skipped. The
    critiqued couples are written to ``QA_critiqued_couples_path``.

    Args:
        QA_initial_outputs (list[dict]): QA couples providing at least the
            ``context`` and ``question`` keys. Left unmodified.

    Returns:
        list[dict]: Copies of the input couples, extended with the critique
        fields that could be produced.
    """
    # Copy each record so the critique fields land on new dicts. Aliasing the
    # input would silently mutate the caller's list, making its contents
    # depend on whether this function ran or the cache was loaded.
    QA_critiqued_outputs = [dict(qa_couple) for qa_couple in QA_initial_outputs]

    print("Generating critique for each QA couple...")

    for output in tqdm(QA_critiqued_outputs):
        try:
            evaluations = {
                "groundedness": call_llm(
                    llm_client,
                    question_groundedness_critique_prompt.format(
                        context=output["context"], question=output["question"]
                    ),
                ),
                "relevance": call_llm(
                    llm_client,
                    question_relevance_critique_prompt.format(question=output["question"]),
                ),
                "standalone": call_llm(
                    llm_client,
                    question_standalone_critique_prompt.format(question=output["question"]),
                ),
            }
            for criterion, evaluation in evaluations.items():
                try:
                    score, eval_text = parse_evaluation(evaluation)
                    output.update(
                        {
                            f"{criterion}_score": score,
                            f"{criterion}_eval": eval_text,
                        }
                    )
                except ValueError as ve:
                    print(f"Error parsing score or evaluation for criterion '{criterion}': {ve}")
        except Exception as e:
            # Best effort: report the failure and move on to the next couple
            print(f"Unexpected error for output {output}: {e}")

    # Ensure the directory exists
    os.makedirs(os.path.dirname(QA_critiqued_couples_path), exist_ok=True)

    with open(QA_critiqued_couples_path, 'w', encoding='utf-8') as json_file:
        json.dump(QA_critiqued_outputs, json_file, ensure_ascii=False, indent=4)

    print(f"Generated QA couples saved to {QA_critiqued_couples_path}")

    return QA_critiqued_outputs


# Run the critique step unless a non-empty cache file is already on disk
if not os.path.exists(QA_critiqued_couples_path) or os.path.getsize(QA_critiqued_couples_path) <= 0:
    QA_critiqued_outputs = generate_critiqued_QA(QA_initial_outputs)
else:
    try:
        with open(QA_critiqued_couples_path, "r", encoding="utf-8") as f:
            QA_critiqued_outputs = json.loads(f.read())
        print(f"Successfully LOADED {len(QA_critiqued_outputs)} QA Critiques")
    except (json.JSONDecodeError, OSError) as e:
        # Unreadable or corrupt cache: report it and critique again
        print(f"Error loading JSON file: {e}. Recreating dataset.")
        QA_critiqued_outputs = generate_critiqued_QA(QA_initial_outputs)


# Show a single row as a quick sanity check of the record format
print(f"Below is the CRITIQUED QA couple:")
display(pd.DataFrame(QA_critiqued_outputs).head(1))
    

