In [1]:
#Install and upgrade the necessary libraries for Hugging Face models, LangChain, OpenAI, and Gradio.

!pip -q install --upgrade huggingface_hub
!pip -q install langchain_community
!pip -q install langchain_huggingface
!pip install huggingface_hub
!pip install openai
!pip install gradio
!pip install groq



[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.0/468.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m51.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m900.5 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import numpy as np
import json
#Allows interaction with the operating system.
import os
#Provides access to system-specific parameters and functions.
import sys
#Imports the OpenAI Python library for interacting with OpenAI models like GPT-4.
import openai
from openai import OpenAI
#Provides support for reading configuration files (.ini).
import configparser
#Imports Hugging Face Endpoint support from LangChain.
from langchain_community.llms import HuggingFaceEndpoint
#Imports GenerationConfig, which is used to configure text generation parameters.
from transformers import GenerationConfig
#A popular library for making HTTP requests.
import requests
#Imports TQDM, a library for progress bars.
import tqdm
#Enables Google Drive integration in Google Colab
from google.colab import drive
#Provides pre-trained tokenizer and model utilities for text generation.
#AutoTokenizer: Loads the appropriate tokenizer for a given model.
#AutoModel: Loads a general-purpose model.
#AutoModelForCausalLM: Loads a causal language model (LLM) for text generation.
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
import random, math
import gradio as gr
import groq
from groq import Groq

In [3]:
# Mount Google Drive to access files
# 'force_remount=True' ensures that the drive is unmounted and remounted if already mounted
drive.mount("/content/drive/", force_remount=True)

# Define the base folder path where configuration files are stored in Google Drive
# Ensure that the folder "RGB_Data_Config" exists in your Google Drive
RGB_Config_folder_path = "/content/drive/My Drive/RGB_Data_Config/"

Mounted at /content/drive/


In [4]:
def read_config(config_file_path):
    # Check if the configuration file exists at the given path
    if not os.path.exists(config_file_path):
        # Raise an error if the file doesn't exist
        raise FileNotFoundError(f"Configuration file not found: {config_file_path}")

    # Initialize a ConfigParser object to read and handle the INI configuration file
    config = configparser.ConfigParser()

    try:
        # Attempt to read the configuration file
        config.read(config_file_path)
    except Exception as e:
        # If reading the file fails, raise an error with details
        raise ValueError(f"Failed to read configuration file: {config_file_path}. Error: {e}")

    # Return the parsed configuration object

    return config

In [5]:
# Global variables

# Define the path to the debug file. This file will be used for logging debug information.
debug_file_name = RGB_Config_folder_path + "debug.txt"

# Read the configuration from a specified INI file located in the folder defined by RGB_Config_folder_path.
# The configuration is stored in the `Config` object for later use.
Config = read_config(RGB_Config_folder_path + 'config.ini')

# Extract the Hugging Face API key from the 'Settings' section of the configuration file.
hfapi_key = Config.get('Settings', 'hfapi_key')

# Extract the Groq API key from the 'Settings' section of the configuration file.
#groq_key = "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" # Config.get('Settings', 'groq_key')
groq_key = "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"

# Extract the OpenAI API key from the 'Settings' section of the configuration file.
openai_api_key = Config.get('Settings', 'openai_api_key')

# Define the URL for OpenAI's chat completions API.
open_api_url = "https://api.openai.com/v1/chat/completions"

# Define the number of questions to be used or processed.
# This could represent a batch size or a limit in certain operations.
Number_of_questions = 300



In [6]:
# Hugging Face Embedding LLM configuration
from huggingface_hub import login
from huggingface_hub import InferenceClient

class HuggingFaceLLM:
    # Initialization of the class with model and optional URL
    def __init__(self, model, url=None):
        self.model = model  # Set the model name
        self.set_hf_login()  # Set up Hugging Face authentication
        # Initialize the InferenceClient with the model
        self.client = InferenceClient(self.model)

        # Create HuggingFace LLM endpoint with specific configurations for text generation
        self.LLM1 = HuggingFaceEndpoint(
            repo_id=model,  # Model repo
            task="text-generation",  # Task type is text generation
            max_new_tokens=1024,  # Limit on new tokens generated
            temperature=0.1,  # Controls randomness of generation
            top_k=30,  # Limits sampling to top 30 tokens
            repetition_penalty=1.03  # Penalty for repeating text
        )

    # Method to set Hugging Face login by managing environment variables
    def set_hf_login(self):
        if "HF_TOKEN" in os.environ:
            del os.environ["HF_TOKEN"]  # Remove existing token if present
        if "HUGGINGFACEHUB_API_TOKEN" in os.environ:
            del os.environ["HUGGINGFACEHUB_API_TOKEN"]  # Same for the second token
        os.environ["HF_TOKEN"] = hfapi_key  # Set new token
        os.environ["HUGGINGFACEHUB_API_TOKEN"] = hfapi_key  # Set token for Hugging Face Hub

    # Method to generate a response from the Hugging Face model
    def generate(self, text, system="", temperature=0.7, top_p=1.0):
        headers = {"Authorization": f"Bearer {hfapi_key}"}  # Bearer token for authentication

        payload = {
            "inputs": f"{system}\n{text}",  # Combine system instruction and user input
            "parameters": {
                "temperature": temperature,  # Sampling temperature
                "top_p": top_p,  # Nucleus sampling
                "return_full_text": False  # Only return the generated text
            }
        }

        # Use Hugging Face model endpoint to generate the text based on the provided inputs
        response = self.LLM1.invoke(f"{system}\n{text}")
        print(response)  # Print the generated response for debugging purposes

        return response  # Return the generated response


class GroqModel:
    def __init__(self, model_name):
        self.client = groq.Groq(api_key=groq_key)
        self.model_name = model_name

    def generate(self, prompt, temperature, system=None):
        response = self.client.chat.completions.create(
            model=self.model_name,
            messages=[{"role": "system", "content": system or ""},
                      {"role": "user", "content": prompt}],
            temperature=temperature
        )
        return response.choices[0].message.content

# OpenAI LLM configuration
class OpenAILLM:
    # Initialization with model and max token limit for OpenAI LLM
    def __init__(self, QA_Model, max_tokens):
        openai.api_key = openai_api_key  # Set OpenAI API key
        self.API_KEY = openai_api_key  # Store the key for API calls
        self.Open_AI_LLM_inst = OpenAI(api_key=openai_api_key)  # Create an OpenAI instance
        self.model = QA_Model  # Model name
        self.max_tokens = max_tokens  # Maximum token limit for OpenAI API response
        self.url = open_api_url  # OpenAI API URL

    # Method to generate a response from the OpenAI model
    def generate(self, text: str, temperature=0.7, system="You are a helpful assistant. You can help me by answering my questions. You can also ask me questions.", top_p=1):
        headers = {"Authorization": f"Bearer {self.API_KEY}"}  # Bearer token for OpenAI API

        query = {
            "model": self.model,  # Model to use for text generation
            "temperature": temperature,  # Sampling temperature for randomness
            "top_p": top_p,  # Nucleus sampling parameter
            "messages": [
                {"role": "system", "content": system},  # System message defining the role
                {"role": "user", "content": text}  # User input text
            ],
            "stream": False  # Stream option to get the response in chunks (set to False for a single response)
        }

        responses = requests.post(self.url, headers=headers, json=query)  # Send the request to OpenAI API

        if 'choices' not in responses.json():
            print(text)  # Print the input if no valid response is returned
            print(responses)  # Print the full API response for debugging

        # Print and return the generated response from OpenAI
        print(responses.json()['choices'][0]['message']['content'])
        return responses.json()['choices'][0]['message']['content']

    # Function to get a specific OpenAI response to a prompt
    def get_openai_response(self, prompt):
        try:
            # Prepare the message list with prompt and test question
            messages = [
                {"role": "system", "content": prompt},  # System instruction
                {"role": "user", "content": "test question?"},  # User query
            ]

            # Use OpenAI LLM's beta chat completion API
            response = self.Open_AI_LLM_inst.beta.chat.completions.parse(
                model=self.QA_Model, messages=messages, max_tokens=self.max_tokens
            )

            # Return the content of the first choice (answer) from the OpenAI response
            return response.choices[0].message.content.strip()

        except Exception as e:
            # Handle any errors and return an error message
            return f"Error: {e}"


In [7]:
# Function to process the data for a given instance based on noise rate, passage number, filename, and correctness rate.
def processdata(instance, noise_rate, passage_num, filename, correct_rate):
    # Extract the query and answer from the instance.
    query = instance['query']
    ans = instance['answer']

    # Calculate the number of negative and positive samples based on the noise rate.
    neg_num = math.ceil(passage_num * noise_rate)
    pos_num = passage_num - neg_num

    # Case when the filename contains '_int' (could indicate some internal data or instruction-based format)
    if '_int' in filename:
        # Shuffle the positive samples
        for i in instance['positive']:
            random.shuffle(i)
        print(len(instance['positive']))  # Debugging output: print the length of positive samples
        # Select the first document from each positive set
        docs = [i[0] for i in instance['positive']]

        # If there are not enough positive documents, fill in with additional documents from larger positive sets
        if len(docs) < pos_num:
            maxnum = max([len(i) for i in instance['positive']])
            for i in range(1, maxnum):
                for j in instance['positive']:
                    if len(j) > i:
                        docs.append(j[i])
                        if len(docs) == pos_num:
                            break
                if len(docs) == pos_num:
                    break
        # Calculate the number of negative documents needed
        neg_num = passage_num - len(docs)
        # Add negative documents if needed
        if neg_num > 0:
            negative = instance['negative'][:neg_num]
            docs += negative
    # Case when the filename contains '_fact' (likely for factual questions or true/false type tasks)
    elif '_fact' in filename:
        # Calculate the number of correct documents to select based on the correct rate.
        correct_num = math.ceil(passage_num * correct_rate)
        pos_num = passage_num - neg_num - correct_num
        # Randomly select positive examples
        indexs = list(range(len(instance['positive'])))
        selected = random.sample(indexs, min(len(indexs), pos_num))
        docs = [instance['positive_wrong'][i] for i in selected]

        # Remaining indices after selecting positive examples
        remain = [i for i in indexs if i not in selected]
        # Add correct examples if available
        if correct_num > 0 and len(remain) > 0:
            docs += [instance['positive'][i] for i in random.sample(remain, min(len(remain), correct_num))]

        # Add negative examples if needed
        if neg_num > 0:
            docs += instance['negative'][:neg_num]
    else:
        # General case for other types of data (no '_int' or '_fact' in filename)
        if noise_rate == 1:
            neg_num = passage_num  # All passages are negative if noise_rate is 1
            pos_num = 0
        else:
            # Adjust the number of positive and negative passages based on the available data
            if neg_num > len(instance['negative']):
                neg_num = len(instance['negative'])
                pos_num = passage_num - neg_num
            elif pos_num > len(instance['positive']):
                pos_num = len(instance['positive'])
                neg_num = passage_num - pos_num

        # Select the required number of positive and negative samples
        positive = instance['positive'][:pos_num]
        negative = instance['negative'][:neg_num]

        # Combine the selected positive and negative samples
        docs = positive + negative

    # Shuffle the final document list
    random.shuffle(docs)

    return query, ans, docs  # Return the query, answer, and the processed documents


# Function to check if the predicted answer matches the ground truth.
def checkanswer(prediction, ground_truth):
    prediction = prediction.lower()  # Convert prediction to lowercase for case-insensitive comparison
    if type(ground_truth) is not list:
        ground_truth = [ground_truth]  # Ensure ground_truth is always a list

    labels = []
    for instance in ground_truth:
        flag = True
        if type(instance) == list:  # If the ground truth is a list of possible answers
            flag = False
            instance = [i.lower() for i in instance]  # Lowercase all possible answers
            for i in instance:
                if i in prediction:
                    flag = True  # Set flag to True if any possible answer is found in prediction
                    break
        else:
            instance = instance.lower()  # Lowercase the single ground truth answer
            if instance not in prediction:
                flag = False  # Set flag to False if the ground truth is not found in prediction
        labels.append(int(flag))  # Convert boolean flag to integer (1 for correct, 0 for incorrect)

    return labels  # Return the list of labels (1 for correct, 0 for incorrect)


# Function to evaluate the results, checking for correctness based on a threshold.
def getevalue(results):
    results = np.array(results)  # Convert results to a numpy array for easier manipulation
    results = np.max(results, axis=0)  # Take the max along axis 0 (likely across multiple predictions)
    if 0 in results:
        return False  # If any result is 0 (incorrect), return False
    else:
        return True  # If all results are correct (1), return True


# Main prediction function that interacts with the model and generates the predictions.
def predict(query, ground_truth, docs, model, system, instruction, temperature, dataset):
    '''
    label: 0 for positive, 1 for negative, -1 for not enough information
    '''

    # If no documents are provided, generate a prediction with an empty document list.
    if len(docs) == 0:
        text = instruction.format(QUERY=query, DOCS='')  # Format instruction with query and empty docs
        prediction = model.generate(text, temperature)  # Get prediction from the model
    else:
        # Format instruction with query and the list of documents
        docs = '\n'.join(docs)
        text = instruction.format(QUERY=query, DOCS=docs)
        prediction = model.generate(text, temperature, system)  # Get prediction with system instructions

    # Handle dataset-specific processing, e.g., removing spaces for Chinese datasets
    if 'zh' in dataset:
        prediction = prediction.replace(" ", "")  # Remove spaces if the dataset is in Chinese

    # If the prediction indicates insufficient information, label as -1 (not enough info)
    if '信息不足' in prediction or 'insufficient information' in prediction:
        labels = [-1]
    else:
        # Check if the prediction matches the ground truth and return the label (0 for correct, 1 for incorrect)
        labels = checkanswer(prediction, ground_truth)

    # Initialize a fact label (for factual correctness checking)
    factlabel = 0
    # If the prediction mentions factual errors, set factlabel to 1 (indicating factual error)
    if '事实性错误' in prediction or 'factual errors' in prediction:
        factlabel = 1

    return labels, prediction, factlabel  # Return the labels, prediction, and factual correctness label






For evaluating ChatGPT, you can run as:

python evalue.py \
--dataset en \
--modelname chatgpt \
--temp 0.2 \
--noise_rate 0.6 \
--api_key YourAPIKEY \
--passage_num 5
For evaluating other models, you can run as:

python evalue.py \
--dataset en \
--modelname chatglm2-6b \
--temp 0.2 \
--noise_rate 0.6 \
--plm THUDM/chatglm-6b \
--passage_num 5
You should change modelname and plm for different models, where plm is the path of model.

temp is the temperature of model.

noise_rate is rate of noisy documents in inputs.

passage_num is number of provided documents for LLM (default is 5).

The outputs are:

all_rate: The accuracy (noise_rate<1) or rejection rate (noise_rate=1)
fact_check_rate: the error detection rates (ED)
To evaluate rejection using ChatGPT, you should first run the evalue.py in noise_rate=1 to obtain the generation result, and then run:

python reject_evalue.py \
--dataset en \
--modelname chatglm2-6b \
--api_key YourAPIKEY
The "reject_rate" in the outputs are the reject rate (Rej*).

To evaluate counterfactual robustness using ChatGPT, you should first run the evalue.py in dataset=en_fact/zh_fact to obtain the generation result, and then run:

python fact_evalue.py \
--dataset en_fact \
--modelname chatglm2-6b \
--api_key YourAPIKEY
The "reject_rate" in the outputs are the error detection rates (ED*). The correct_rate in the outputs are the error correction rate (CR)

In [8]:
# Function to determine if a model can identify factual errors in its response
def fact_check(question, answer, url, apikey):
    # Define a prompt to guide the model to evaluate whether factual errors exist
    prompt = '''Please determine whether the model can identify factual errors in the document. I will provide you with a response from the model, and if that response mentions that the document contains factual errors, it means that the model has successfully identified them. Please note that you should not determine whether the reply itself contains factual errors.
Demonstrations:
Response:文档信息有误，2008年的奥运会在北京举办。
Yes, the model has identified the factual errors.

Response:2014年澳网女单冠军是大阪直美。
NO, the model fail to identify the factual errors.

Response: The director of the Silent Hill movie is Justin Kurzel.
NO, the model fail to identify the factual errors.

Response: Harry Potter is written by J. K. Rowling.
NO, the model fail to identify the factual errors.

Response:  There are factual errors in the provided documents. The correct answer is 2023.
Yes, the model has identified the factual errors.

Begin to generate:
Answer: {answer}
    '''
    # Format the prompt with the provided answer to evaluate its factual correctness
    text2 = prompt.format(answer=answer)

    # Call the fact_getdata function to send the prompt to the model and return its response
    return fact_getdata(text2, url, apikey)


# Function to send the prompt to the API and retrieve the model's evaluation
def fact_getdata(text, url, API_KEY):
    # Prepare the data to send to the API, including the model's input (prompt)
    data = {
        "model": "gpt-3.5-turbo",  # Specify the model being used
        "messages": [{"role": "user", "content": text}]  # Format the prompt for the model
    }

    # Set the authorization header with the API key
    headers = {"Authorization": f"Bearer {API_KEY}"}

    # Send a POST request to the API with the data and headers
    completion = requests.post(url, json=data, headers=headers)

    # Parse the response from the API and return the model's evaluation
    completion = completion.json()['choices'][0]['message']['content']
    return completion


# Main function for evaluating the fact-checking ability of the model on a dataset
def fact_evalue(modelname, dataset_file_name, temperature, noise_rate, correct_rate, passage_num):
    # Define the paths for saving the results
    resultpath = 'result-en/fact'
    normal_dump_file = f'{resultpath}/prediction_{dataset_file_name.split("/")[-1].split(".json")[0]}_{modelname}_temp{temperature}_noise{noise_rate}_passage{passage_num}_correct{correct_rate}.json'
    chatgptresult_dump_file = f'{resultpath}/prediction_{dataset_file_name.split("/")[-1].split(".json")[0]}_{modelname}_temp{temperature}_noise{noise_rate}_passage{passage_num}_correct{correct_rate}_chatgptresult.json'
    chatgpt_outputfile = f'{resultpath}/prediction_{dataset_file_name.split("/")[-1].split(".json")[0]}_{modelname}_temp{temperature}_noise{noise_rate}_passage{passage_num}_correct{correct_rate}_chatgpt.json'

    results = []  # List to store the results
    useddata = {}  # Dictionary to store previously processed data (to avoid redundant processing)

    # If results have already been saved in a previous run, load them
    if os.path.exists(chatgpt_outputfile):
        with open(chatgpt_outputfile) as f:
            for line in f:
                data = json.loads(line)
                useddata[data['id']] = data  # Store the data by ID

    # Open the output file to save the new evaluation results
    with open(chatgpt_outputfile, 'w', encoding='utf-8') as f:
        with open(normal_dump_file, 'r', encoding='utf-8') as f2:
            for line in tqdm.tqdm(f2):  # Iterate through each line in the normal dump file
                data = json.loads(line)

                # If the data has already been processed, append it to the results
                if data['id'] in useddata:
                    results.append(useddata[data['id']])
                    f.write(json.dumps(useddata[data['id']], ensure_ascii=False) + '\n')
                    continue

                try:
                    # Extract the question and answer from the data
                    question = data['query']
                    answer = data['prediction']

                    # Call the fact_check function to evaluate the response for factual errors
                    evaluation = fact_check(question, answer, open_api_url, openai_api_key)

                    # Add the evaluation result to the data
                    data['evaluation'] = evaluation
                    results.append(data)

                    # Write the updated data (with evaluation) to the output file
                    f.write(json.dumps(data, ensure_ascii=False) + '\n')
                except Exception as e:
                    # In case of an error, print the error and skip this data point
                    print(e)
                    print(question, answer)
                    continue

    # Initialize counters for various statistics
    rejecttt = 0  # Count of responses where factual errors are identified
    tt = 0  # Count of responses where the model made a correct prediction
    correct_tt = 0  # Count of correct responses that were flagged as having factual errors

    # Loop through the results and compute the statistics
    for i in results:
        if "has identified" in i['evaluation'] or "Yes" in i['evaluation']:
            rejecttt += 1  # Increment reject count if the evaluation mentions factual errors
            if 0 not in i['label'] and 1 in i['label']:
                correct_tt += 1  # If the model correctly identifies the factual error, increment the correct count
        if 0 not in i['label'] and 1 in i['label']:
            tt += 1  # Increment total correct count if the label indicates a correct answer

    # Calculate the evaluation metrics
    scores = {
        'reject_rate': rejecttt / len(results),  # Percentage of responses with identified factual errors
        'all_rate': (tt) / len(results),  # Overall accuracy rate
        'correct_rate': correct_tt / rejecttt if rejecttt > 0 else 0,  # Accuracy rate for factual error identification
        'tt': tt,  # Total correct responses
        'rejecttt': rejecttt,  # Total factual errors identified
        'correct_tt': correct_tt,  # Correct responses that identified factual errors
        'nums': len(results),  # Total number of results
        'noise_rate': noise_rate,  # Noise rate for the evaluation
    }

    # Save the evaluation scores to a JSON file
    json.dump(scores, open(chatgptresult_dump_file, 'w', encoding='utf-8'), ensure_ascii=False, indent=4)

    # Return the evaluation rates
    all_rate = (tt) / len(results)  # Overall accuracy
    reject_rate = rejecttt / len(results)  # Rate of factual errors identified
    correct_rate = correct_tt / rejecttt if rejecttt > 0 else 0  # Accuracy in identifying factual errors

    return all_rate, reject_rate, correct_rate


In [9]:

# This function checks whether the answer to a question is supported or addressed by the document.
def reject_check(question, answer, url, apikey):
    # Define the prompt structure that will ask the AI model to determine if the answer is addressed by the documents
    prompt = '''I will give you a question and an answer generated through document retrieval. Please use this answer to determine if the retrieved document can solve the question.
    Demonstrations:
    Question: 2023年澳网女单冠军是谁
    Answer:文档信息不足，因此我无法基于提供的文档回答该问题。
    No, the question is not addressed by the documents.

    Question: Who is the champion of Australian Open 2023 Women's Singles?
    Answer: Serena Williams
    Yes, the question is addressed by the documents.

    Question: Where is ACL2023 held?
    Answer: Location of ACL2023 has not been confirmed.
    No, the question is not addressed by the documents.

    Question: 2023年中国GDP是多少?
    Answer: I can not answer this question。
    No, the question is not addressed by the documents.

    Begin to generate:
    Question: {question}
    Answer: {answer}
    '''
    # Format the prompt with the actual question and answer
    text2 = prompt.format(question=question, answer=answer)
    # Call the reject_getdata function to get the AI evaluation
    return reject_getdata(text2, url, apikey)


# This function sends the formatted prompt to the API and retrieves the response.
def reject_getdata(text, url, API_KEY):
    # Define the data structure for the POST request with the formatted prompt
    data = {
        "model": "gpt-3.5-turbo",  # We are using GPT-3.5 for this task
        "messages": [{"role": "user", "content": text}]
    }
    # Set up the headers with the API key for authentication
    headers = {"Authorization": f"Bearer {API_KEY}"}
    # Send a POST request to the external API and get the response
    completion = requests.post(url, json=data, headers=headers)
    # Extract the content of the response message and return it
    completion = completion.json()['choices'][0]['message']['content']
    return completion


# This function evaluates the model's answers and checks if they address the questions correctly.
def reject_evalue(modelname, dataset_file_name, temperature, noise_rate, correct_rate, passage_num):
    resultpath = 'result-en'  # Define the directory where results will be stored

    # Construct the file names for storing predictions and results
    normal_dump_file = f'{resultpath}/prediction_{dataset_file_name.split("/")[-1].split(".json")[0]}_{modelname}_temp{temperature}_noise{noise_rate}_passage{passage_num}_correct{correct_rate}.json'
    chatgptresult_dump_file = f'{resultpath}/prediction_{dataset_file_name.split("/")[-1].split(".json")[0]}_{modelname}_temp{temperature}_noise{noise_rate}_passage{passage_num}_correct{correct_rate}_chatgptresult.json'
    chatgpt_outputfile = f'{resultpath}/prediction_{dataset_file_name.split("/")[-1].split(".json")[0]}_{modelname}_temp{temperature}_noise{noise_rate}_passage{passage_num}_correct{correct_rate}_chatgpt.json'

    results = []  # List to store all results
    useddata = {}  # Dictionary to store previously processed data

    # Check if the results already exist in the chatgpt_outputfile, if so, load them
    if os.path.exists(chatgpt_outputfile):
        with open(chatgpt_outputfile) as f:
            for line in f:
                data = json.loads(line)
                useddata[data['id']] = data

    # Open the file to write the results
    with open(chatgpt_outputfile, 'w', encoding='utf-8') as f:
        # Open the original file containing the model predictions
        with open(normal_dump_file, 'r', encoding='utf-8') as f2:
            for line in tqdm.tqdm(f2):  # Iterate through the predictions
                data = json.loads(line)
                # If the data has been processed before, write it back to the output file
                if data['id'] in useddata and data['query'] == useddata[data['id']]['query'] and data['ans'] == useddata[data['id']]['ans']:
                    results.append(useddata[data['id']])
                    f.write(json.dumps(useddata[data['id']], ensure_ascii=False) + '\n')
                    continue
                try:
                    # Extract the question and predicted answer
                    question = data['query']
                    answer = data['prediction']

                    # Call reject_check to determine if the answer addresses the question
                    evaluation = reject_check(question, answer, open_api_url, openai_api_key)
                    # Add the evaluation result to the data
                    data['evaluation'] = evaluation
                    results.append(data)
                    # Write the processed data with the evaluation to the file
                    f.write(json.dumps(data, ensure_ascii=False) + '\n')
                except Exception as e:
                    print(e)  # Print any errors encountered during processing
                    print(question, answer)
                    continue

    # Variables to track the results of the evaluation
    rejecttt = 0
    tt = 0

    # Process the results to calculate metrics
    for i in results:
        # Count how many answers were rejected because they didn't address the question
        if "not addressed" in i['evaluation']:
            rejecttt += 1
        # Count how many answers are considered correct based on labels
        if 0 not in i['label'] and 1 in i['label']:
            tt += 1

    # Prepare the final scores for evaluation
    scores = {
        'reject_rate': rejecttt / len(results),  # Percentage of rejected answers
        'all_rate': (tt) / len(results),  # Overall rate of correct answers
        'tt': tt,  # Total correct answers
        'rejecttt': rejecttt,  # Total rejected answers
        'nums': len(results),  # Total number of evaluated samples
    }

    # Write the evaluation scores to a JSON file
    json.dump(scores, open(chatgptresult_dump_file, 'w', encoding='utf-8'), ensure_ascii=False, indent=4)

    # Calculate the reject rate and overall rate
    reject_rate = rejecttt / len(results)
    all_rate = (tt) / len(results)

    return reject_rate  # Return the reject rate as the output


In [10]:
def evalue(modelname, dataset_file_name, temperature, noise_rate, correct_rate, passage_num, factchecking=False):
    # Initialize list to hold instances of the dataset
    instances = []
    q_no = 0  # Counter for the number of questions processed

    # Open the dataset file and load data into the instances list
    with open(dataset_file_name, 'r') as f:
        for line in f:
            q_no += 1
            if q_no > Number_of_questions:  # Stop once we've processed the desired number of questions
                break
            instances.append(json.loads(line))  # Add each instance (JSON) to the instances list

    # Define the directory for storing results
    resultpath = 'result-en'
    if not os.path.exists(resultpath):
        os.mkdir(resultpath)  # Create the result directory if it doesn't exist

    # Define the system and instruction text based on whether fact checking is enabled
    if factchecking:
        system = "You are an accurate and reliable AI assistant that can answer questions with the help of external documents. Please note that external documents may contain noisy or factually incorrect information. If the information in the document contains the correct answer, you will give an accurate answer. If the information in the document does not contain the answer, you will generate ’I can not answer the question because of the insufficient information in documents.‘. If there are inconsistencies with the facts in some of the documents, please generate the response 'There are factual errors in the provided documents.' and provide the correct answer."
        instruction = "Document:\n{DOCS} \n\nQuestion:\n{QUERY}"
        resultpath = resultpath + '/fact'  # Save fact checking results in a separate folder
    else:
        system = "You are an accurate and reliable AI assistant that can answer questions with the help of external documents. Please note that external documents may contain noisy or factually incorrect information. If the information in the document contains the correct answer, you will give an accurate answer. If the information in the document does not contain the answer, you will generate ’I can not answer the question because of the insufficient information in documents.‘. If there are inconsistencies with the facts in some of the documents, please generate the response 'There are factual errors in the provided documents.' and provide the correct answer."
        instruction = "Document:\n{DOCS} \n\nQuestion:\n{QUERY}"

    # Load the appropriate model based on the model name
    if False:
      if modelname == 'gpt-3.5-turbo':
          model = OpenAILLM("gpt-3.5-turbo", 1250)
      elif modelname == 'Qwen2.5-72B-Instruct':
          model = HuggingFaceLLM("Qwen/Qwen2.5-72B-Instruct")
      elif modelname == 'DeepSeek-R1-Distill-Llama-70B':
          model = HuggingFaceLLM("deepseek-ai/DeepSeek-R1-Distill-Llama-70B")
      elif modelname == 'Mixtral-8x7B-Instruct':
          model = HuggingFaceLLM("mistralai/Mixtral-8x7B-Instruct")
      elif modelname == 'Meta-Llama-3.1-Instruct':
          model = HuggingFaceLLM("meta-llama/Meta-Llama-3-70B-Instruct")
      elif modelname == 'gemma-2-9b-it':
          model = HuggingFaceLLM("google/gemma-2-9b-it")
      else:
          print("Invalid model name")  # Exit if an invalid model name is provided
          sys.exit(1)
    else:
      if modelname == 'gpt-3.5-turbo':
          model = OpenAILLM("gpt-3.5-turbo", 1250)
      elif modelname == 'qwen-2.5-32b':
          model = GroqModel("qwen-2.5-32b")
      elif modelname == 'DeepSeek-R1-Distill-Llama-70B':
          model = GroqModel("deepseek-r1-distill-llama-70b")
      elif modelname == 'Mixtral-8x7B-Instruct':
          model = GroqModel("mixtral-8x7b-32768")
      elif modelname == 'llama3-70b-8192':
          #model = GroqModel("meta-llama/Meta-Llama-3-70B-Instruct")
          model = GroqModel("llama3-70b-8192")
      elif modelname == 'gemma-2-9b-it':
          model = GroqModel("gemma2-9b-it")
      else:
          print("Invalid model name")  # Exit if an invalid model name is provided
          sys.exit(1)

    # Define the result file paths where the predictions will be saved
    normal_dump_file = f'{resultpath}/prediction_{dataset_file_name.split("/")[-1].split(".json")[0]}_{modelname}_temp{temperature}_noise{noise_rate}_passage{passage_num}_correct{correct_rate}.json'
    result_dump_file = f'{resultpath}/prediction_{dataset_file_name.split("/")[-1].split(".json")[0]}_{modelname}_temp{temperature}_noise{noise_rate}_passage{passage_num}_correct{correct_rate}_result.json'

    # Initialize dictionary to store previously used data (if any)
    useddata = {}
    if os.path.exists(normal_dump_file):
        with open(normal_dump_file) as f:
            for line in f:
                data = json.loads(line)
                useddata[data['id']] = data  # Store previously saved results to avoid duplicate predictions
    else:
        if not os.path.exists(resultpath):
            os.makedirs(resultpath)  # Create the directory if it doesn't exist

    results = []  # List to hold the final results
    with open(normal_dump_file, 'w') as f:
        for instance in tqdm.tqdm(instances):  # Loop through each instance in the dataset
            # If this instance's results have already been processed, use the saved data
            if instance['id'] in useddata and instance['query'] == useddata[instance['id']]['query'] and instance['answer'] == useddata[instance['id']]['ans']:
                results.append(useddata[instance['id']])  # Append the existing result
                f.write(json.dumps(useddata[instance['id']], ensure_ascii=False) + '\n')  # Write to the result file
                continue  # Skip further processing for this instance

            try:
                random.seed(2333)  # Ensure reproducibility by setting a fixed random seed
                # If passage_num is 0, there are no documents to use for context
                if passage_num == 0:
                    query = instance['query']
                    ans = instance['answer']
                    docs = []  # No documents for the query
                else:
                    # Process the data based on noise rate and passage number
                    query, ans, docs = processdata(instance, noise_rate, passage_num, dataset_file_name, correct_rate)

                # Get the model's prediction for the query and documents
                label, prediction, factlabel = predict(query, ans, docs, model, system, instruction, temperature, dataset_file_name)

                # Create a new instance with the prediction and other details
                newinstance = {
                    'id': instance['id'],
                    'query': query,
                    'ans': ans,
                    'label': label,
                    'prediction': prediction,
                    'docs': docs,
                    'noise_rate': noise_rate,
                    'factlabel': factlabel
                }

                # Append the result to the results list and save it to the file
                results.append(newinstance)
                f.write(json.dumps(newinstance, ensure_ascii=False) + '\n')
            except Exception as e:
                print("Error:", e)  # Handle any exceptions that may occur during prediction
                continue  # Skip to the next instance if there's an error

    tt = 0  # Initialize counter for the number of correct predictions
    for i in results:
        label = i['label']
        # Count the correct predictions based on noise rate and labels
        if noise_rate == 1 and label[0] == -1:
            tt += 1
        elif 0 not in label and 1 in label:
            tt += 1

    # Calculate the accuracy and other statistics
    scores = {
        'all_rate': (tt) / len(results),  # Overall accuracy
        'noise_rate': noise_rate,
        'tt': tt,
        'nums': len(results)
    }

    # If the dataset is fact-checking, calculate the fact-checking statistics
    if '_fact' in dataset_file_name:
        fact_tt = 0  # Initialize counter for fact-checking instances
        correct_tt = 0  # Initialize counter for correct fact-checking predictions
        for i in results:
            if i['factlabel'] == 1:  # If there's a factual error
                fact_tt += 1
                if 0 not in i['label']:  # If the prediction was correct
                    correct_tt += 1
        fact_check_rate = fact_tt / len(results)  # Rate of factual error instances
        if fact_tt > 0:
            correct_rate = correct_tt / fact_tt  # Accuracy for fact-checking instances
        else:
            correct_rate = 0
        scores['fact_check_rate'] = fact_check_rate
        scores['correct_rate'] = correct_rate
        scores['fact_tt'] = fact_tt
        scores['correct_tt'] = correct_tt

    # Save the calculated scores to a result file
    json.dump(scores, open(result_dump_file, 'w'), ensure_ascii=False, indent=4)

    # Calculate and return the overall accuracy
    accuracy = (tt / len(results))
    return accuracy



In [11]:
def run_model(Metric, model_name, temperature, noise_rate, correct_rate, passage_num, l_Number_of_questions):
    # Set a global variable for the number of questions
    global Number_of_questions
    Number_of_questions = int(l_Number_of_questions)

    # Checking which metric to use (Noise Robustness, Negative Rejection, Information Integration, or Counterfactual Robustness)
    if Metric == "Noise Robustness":
        # Evaluate the model using the 'Noise Robustness' metric
        # The evalue function is used to evaluate the model with the given temperature, noise rate, correct rate, passage number, etc.
        value = evalue(model_name, RGB_Config_folder_path + "en.json", float(temperature), float(noise_rate), float(correct_rate), int(passage_num), False)
        # Store the accuracy result for the output
        output_text = "accuracy = " + str(value)

    elif Metric == "Negative Rejection":
        # Evaluate the model using the 'Negative Rejection' metric (this is often used to evaluate how well a system rejects irrelevant data)
        # Noise rate is fixed to 1.0 here because we are not evaluating noise robustness but rather rejection ability.
        value = evalue(model_name, RGB_Config_folder_path + "en.json", float(temperature), 1.0, float(correct_rate), int(passage_num), False)
        # Uncomment the following line if the reject_evalue function was used for rejection evaluation
        # value = reject_evalue(model_name, RGB_Config_folder_path + "en.json", float(temperature), 1.0, float(correct_rate), int(passage_num))
        # Set the rejection rate as the output
        output_text = "rejection rate = " + str(value)

    elif Metric == "Information Integration":
        # Evaluate the model using the 'Information Integration' metric
        value = evalue(model_name, RGB_Config_folder_path + "en_int.json", float(temperature), float(noise_rate), float(correct_rate), int(passage_num), False)
        # Store the accuracy result for information integration
        output_text = "accuracy = " + str(value)

    elif Metric == "Counterfactual Robustness":
        # Evaluate the model using the 'Counterfactual Robustness' metric (tests how robust the model is to counterfactual changes)
        # The evalue function evaluates the accuracy, and fact_evalue evaluates the ability to detect and correct factual errors
        value = evalue(model_name, RGB_Config_folder_path + "en_fact.json", float(temperature), float(noise_rate), float(correct_rate), int(passage_num), True)
        # Get the error detection and correction rates using the fact_evalue function
        all_rate, reject_rate, correct_rate = fact_evalue(model_name, RGB_Config_folder_path + "en_fact.json", float(temperature), float(noise_rate), float(correct_rate), int(passage_num))
        # Combine the accuracy with the error detection and correction rates in the output
        output_text = "accuracy = " + str(value) + "\nerror detection rate = " + str(reject_rate) + "\nerror correction rate = " + str(correct_rate)

    else:
        # If an invalid metric is provided, exit the program
        print("Invalid Metric")
        sys.exit(1)

    return output_text  # Return the resulting output text


In [None]:
# Create the Gradio Blocks interface for the model evaluation
with gr.Blocks() as iface:

    # Display a markdown title at the top of the interface
    gr.Markdown("# Run your model with different parameters")

    # Create a row for placing multiple columns side by side
    with gr.Row():

        # First column for the first set of input fields
        with gr.Column():

            # Dropdown for selecting the evaluation metric
            metric = gr.Dropdown(
                choices=["Noise Robustness", "Negative Rejection", "Information Integration", "Counterfactual Robustness"],
                label="Metric"  # Label for this dropdown
            )

            # Dropdown for selecting the model name from a list of models
            model_name = gr.Dropdown(
                choices=["gpt-3.5-turbo", "qwen-2.5-32b", "DeepSeek-R1-Distill-Llama-70B", "Mixtral-8x7B-Instruct", "llama3-70b-8192", "gemma-2-9b-it"],
                label="Model Name"  # Label for this dropdown
            )

            # Slider for selecting the temperature value (0 to 1)
            temperature = gr.Slider(
                value=0.2,  # Initial value set to 0.2
                minimum=0.0,  # Minimum value is 0.0
                maximum=1.0,  # Maximum value is 1.0
                label="Temperature (0 - 1)"  # Label for the slider
            )

        # Second column for the second set of input fields
        with gr.Column():

            # Slider for selecting the noise rate (0 to 1)
            noise_rate = gr.Slider(
                value=0.2,  # Initial value set to 0.2
                minimum=0.0,  # Minimum value is 0.0
                maximum=1.0,  # Maximum value is 1.0
                label="Noise Rate (0 - 1)"  # Label for the slider
            )

            # Slider for selecting the correct rate (0 to 1)
            correct_rate = gr.Slider(
                value=0.2,  # Initial value set to 0.2
                minimum=0.0,  # Minimum value is 0.0
                maximum=1.0,  # Maximum value is 1.0
                label="Correct Rate (0 - 1)"  # Label for the slider
            )

            # Number input for selecting the number of passages (1 to 10)
            passage_number = gr.Number(
                value=5,  # Initial value set to 5
                minimum=1,  # Minimum value is 1
                maximum=10,  # Maximum value is 10
                precision=0,  # No decimal points
                label="Passage Number (1-10)"  # Label for the number input
            )

            # Number input for selecting the number of questions (1 to 300)
            l_Number_of_questions = gr.Number(
                value=10,  # Initial value set to 10
                minimum=1,  # Minimum value is 1
                maximum=300,  # Maximum value is 300
                precision=0,  # No decimal points
                label="Number of questions (1-300)"  # Label for the number input
            )

    # Output textbox where the results of the model evaluation will be displayed
    output = gr.Textbox(label="Output", lines=3)

    # Button to trigger the model evaluation when clicked
    submit_button = gr.Button("Run Model")

    # Define the action when the button is clicked
    submit_button.click(
        run_model,  # The function to be called when the button is clicked
        [metric, model_name, temperature, noise_rate, correct_rate, passage_number, l_Number_of_questions],  # Inputs to be passed to the function
        output  # Output where the results will be displayed
    )

# Launch the Gradio interface with error display and debugging enabled
iface.launch(show_error=True, debug=True)


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://742feb0e8bacb0a56d.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


100%|██████████| 10/10 [01:51<00:00, 11.14s/it]
100%|██████████| 10/10 [02:05<00:00, 12.51s/it]
100%|██████████| 10/10 [02:26<00:00, 14.63s/it]
100%|██████████| 10/10 [01:32<00:00,  9.28s/it]
 90%|█████████ | 9/10 [00:01<00:00, 12.63it/s]

Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `gemma2-9b-it` in organization `org_01jfm75kxeenzbp5hhvgzxy7nz` service tier `on_demand` on : Limit 500000, Used 499016, Requested 1640. Please try again in 1m53.3138s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': '', 'code': 'rate_limit_exceeded'}}
Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `gemma2-9b-it` in organization `org_01jfm75kxeenzbp5hhvgzxy7nz` service tier `on_demand` on : Limit 500000, Used 499016, Requested 1740. Please try again in 2m10.5548s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': '', 'code': 'rate_limit_exceeded'}}
Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `gemma2-9b-it` in organization `org_01jfm75kxeenzbp5hhvgzxy7nz` service tier `on_demand` on : Limit 500000, Used 499016, Requested 1894. Please try again in 2m37

100%|██████████| 10/10 [00:01<00:00,  9.01it/s]


Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `gemma2-9b-it` in organization `org_01jfm75kxeenzbp5hhvgzxy7nz` service tier `on_demand` on : Limit 500000, Used 499015, Requested 1830. Please try again in 2m25.9098s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': '', 'code': 'rate_limit_exceeded'}}


 50%|█████     | 5/10 [00:00<00:00, 42.66it/s]

Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `gemma2-9b-it` in organization `org_01jfm75kxeenzbp5hhvgzxy7nz` service tier `on_demand` on : Limit 500000, Used 498899, Requested 1640. Please try again in 1m33.1248s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': '', 'code': 'rate_limit_exceeded'}}
Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `gemma2-9b-it` in organization `org_01jfm75kxeenzbp5hhvgzxy7nz` service tier `on_demand` on : Limit 500000, Used 498899, Requested 1740. Please try again in 1m50.3648s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': '', 'code': 'rate_limit_exceeded'}}
Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `gemma2-9b-it` in organization `org_01jfm75kxeenzbp5hhvgzxy7nz` service tier `on_demand` on : Limit 500000, Used 498899, Requested 1894. Please try again in 2m16

100%|██████████| 10/10 [00:00<00:00, 30.80it/s]

Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `gemma2-9b-it` in organization `org_01jfm75kxeenzbp5hhvgzxy7nz` service tier `on_demand` on : Limit 500000, Used 498898, Requested 1574. Please try again in 1m21.512s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': '', 'code': 'rate_limit_exceeded'}}
Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `gemma2-9b-it` in organization `org_01jfm75kxeenzbp5hhvgzxy7nz` service tier `on_demand` on : Limit 500000, Used 498898, Requested 1830. Please try again in 2m5.7108s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': '', 'code': 'rate_limit_exceeded'}}



 50%|█████     | 5/10 [00:00<00:00, 36.84it/s]

Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `gemma2-9b-it` in organization `org_01jfm75kxeenzbp5hhvgzxy7nz` service tier `on_demand` on : Limit 500000, Used 498825, Requested 1640. Please try again in 1m20.196799999s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': '', 'code': 'rate_limit_exceeded'}}
Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `gemma2-9b-it` in organization `org_01jfm75kxeenzbp5hhvgzxy7nz` service tier `on_demand` on : Limit 500000, Used 498824, Requested 1740. Please try again in 1m37.4288s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': '', 'code': 'rate_limit_exceeded'}}
Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `gemma2-9b-it` in organization `org_01jfm75kxeenzbp5hhvgzxy7nz` service tier `on_demand` on : Limit 500000, Used 498824, Requested 1894. Please try again in

100%|██████████| 10/10 [00:00<00:00, 28.85it/s]

Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `gemma2-9b-it` in organization `org_01jfm75kxeenzbp5hhvgzxy7nz` service tier `on_demand` on : Limit 500000, Used 498823, Requested 1574. Please try again in 1m8.577s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': '', 'code': 'rate_limit_exceeded'}}
Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `gemma2-9b-it` in organization `org_01jfm75kxeenzbp5hhvgzxy7nz` service tier `on_demand` on : Limit 500000, Used 498823, Requested 1830. Please try again in 1m52.7708s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': '', 'code': 'rate_limit_exceeded'}}



  0%|          | 0/10 [00:00<?, ?it/s]

2


 10%|█         | 1/10 [00:02<00:19,  2.21s/it]

2


 20%|██        | 2/10 [00:03<00:14,  1.86s/it]

2


 30%|███       | 3/10 [00:05<00:12,  1.76s/it]

2


 40%|████      | 4/10 [00:07<00:10,  1.72s/it]

2


 50%|█████     | 5/10 [00:18<00:25,  5.16s/it]

2


 60%|██████    | 6/10 [00:32<00:32,  8.14s/it]

2


 70%|███████   | 7/10 [00:47<00:31, 10.34s/it]

2


 80%|████████  | 8/10 [01:03<00:24, 12.36s/it]

2


 90%|█████████ | 9/10 [01:16<00:12, 12.45s/it]

2


100%|██████████| 10/10 [01:32<00:00,  9.21s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

2


 10%|█         | 1/10 [00:01<00:12,  1.38s/it]

2


 20%|██        | 2/10 [00:03<00:13,  1.68s/it]

2


 30%|███       | 3/10 [00:04<00:11,  1.59s/it]

2


 40%|████      | 4/10 [00:06<00:09,  1.63s/it]

2


 50%|█████     | 5/10 [00:09<00:10,  2.18s/it]

2


 60%|██████    | 6/10 [00:18<00:17,  4.30s/it]

2


 70%|███████   | 7/10 [00:33<00:23,  7.90s/it]

2


 80%|████████  | 8/10 [00:50<00:21, 10.71s/it]

2


 90%|█████████ | 9/10 [01:03<00:11, 11.56s/it]

2


100%|██████████| 10/10 [01:17<00:00,  7.74s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

2


 10%|█         | 1/10 [00:01<00:11,  1.25s/it]

2


 20%|██        | 2/10 [00:03<00:14,  1.79s/it]

2


 30%|███       | 3/10 [00:04<00:11,  1.67s/it]

2


 40%|████      | 4/10 [00:06<00:09,  1.64s/it]

2


 50%|█████     | 5/10 [00:12<00:16,  3.32s/it]

2


 60%|██████    | 6/10 [00:24<00:24,  6.07s/it]

2


 70%|███████   | 7/10 [00:39<00:27,  9.04s/it]

2


 80%|████████  | 8/10 [00:55<00:22, 11.42s/it]

2


 90%|█████████ | 9/10 [01:09<00:12, 12.06s/it]

2


100%|██████████| 10/10 [01:24<00:00,  8.42s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

2


 10%|█         | 1/10 [00:01<00:12,  1.39s/it]

2


 20%|██        | 2/10 [00:02<00:11,  1.49s/it]

2


 30%|███       | 3/10 [00:04<00:10,  1.56s/it]

2


 40%|████      | 4/10 [00:06<00:10,  1.74s/it]

2


 50%|█████     | 5/10 [00:17<00:24,  4.91s/it]

2


 60%|██████    | 6/10 [00:29<00:29,  7.50s/it]

2


 70%|███████   | 7/10 [00:44<00:29,  9.86s/it]

2


 80%|████████  | 8/10 [01:01<00:24, 12.14s/it]

2


 90%|█████████ | 9/10 [01:14<00:12, 12.40s/it]

2
