In [1]:
# Mount Google Drive at /content/drive (Colab-only) so later cells can read and
# write files stored under that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Install Necessary Packages

In [2]:
!git clone https://github.com/FlagOpen/FlagEmbedding.git
!pip install -e .

Cloning into 'FlagEmbedding'...
remote: Enumerating objects: 1631, done.[K
remote: Counting objects: 100% (514/514), done.[K
remote: Compressing objects: 100% (229/229), done.[K
remote: Total 1631 (delta 333), reused 430 (delta 285), pack-reused 1117[K
Receiving objects: 100% (1631/1631), 5.80 MiB | 23.11 MiB/s, done.
Resolving deltas: 100% (961/961), done.
Obtaining file:///content
[31mERROR: file:///content does not appear to be a Python project: neither 'setup.py' nor 'pyproject.toml' found.[0m[31m
[0m

In [None]:
# Install the runtime dependencies used by this notebook.
# accelerate, peft, trl and guardrail-ml are pinned to exact versions; the other
# packages install whatever release is current at run time.
!pip install huggingface_hub
!pip install datasets
!pip install transformers
!pip install loguru -qU
!pip install tokenizers
!pip install langchain -qU
!pip install bitsandbytes -qU
!pip install accelerate==0.21.0
!pip install peft==0.4.0
!pip install trl==0.4.7
!pip install guardrail-ml==0.0.12
!pip install flash-attn --no-build-isolation
!pip install -U FlagEmbedding
!pip install bert-score

In [4]:
class PromptTemplate:
    """Accumulate a user/model conversation and render it as a Llama-2-style prompt.

    The rendered prompt has the shape::

        <s>[INST] <<SYS>>\\n{system}\\n<</SYS>>{user_0} [/INST] {reply_0} </s>[INST] {user_1} [/INST]

    The ``<<SYS>>`` block is omitted when no system prompt is set. Every user
    turn is wrapped in exactly one ``[INST] ... [/INST]`` pair.
    """

    # Class-level default; each instance overrides it in __init__.
    system_prompt = None

    def __init__(self, system_prompt=None):
        """Start an empty conversation with an optional system prompt."""
        self.system_prompt = system_prompt
        self.user_messages = []
        self.model_replies = []

    def add_user_message(self, message: str, return_prompt=True):
        """Append a user message; return the full prompt when ``return_prompt`` is true."""
        self.user_messages.append(message)
        if return_prompt:
            return self.build_prompt()

    def add_model_reply(self, reply: str, includes_history=True, return_reply=True):
        """Record the model's reply to the pending user message.

        Args:
            reply: Text produced by the model.
            includes_history: When true, ``reply`` is assumed to echo the prompt,
                and the current prompt is removed from it before storing.
            return_reply: When true, return the stored (stripped) reply.

        Raises:
            ValueError: If there is not exactly one unanswered user message.
        """
        # Validate BEFORE mutating, so a rejected call leaves the history intact.
        if len(self.user_messages) != len(self.model_replies) + 1:
            raise ValueError(
                "Number of user messages does not equal number of system replies."
            )
        reply_ = reply.replace(self.build_prompt(), "") if includes_history else reply
        self.model_replies.append(reply_)
        if return_reply:
            return reply_

    def get_user_messages(self, strip=True):
        """Return the user messages, whitespace-stripped unless ``strip`` is false."""
        return [x.strip() for x in self.user_messages] if strip else self.user_messages

    def get_model_replies(self, strip=True):
        """Return the model replies, whitespace-stripped unless ``strip`` is false."""
        return [x.strip() for x in self.model_replies] if strip else self.model_replies

    def build_prompt(self):
        """Render the conversation, ending with the unanswered user message.

        Raises:
            ValueError: If there is not exactly one more user message than replies.
        """
        if len(self.user_messages) != len(self.model_replies) + 1:
            raise ValueError(
                "Error: Expected len(user_messages) = len(model_replies) + 1. Add a new user message!"
            )

        # The first turn always opens with a single [INST]; the system block, when
        # present, sits inside that first instruction.
        head = "<s>[INST] "
        if self.system_prompt is not None:
            head += f"<<SYS>>\n{self.system_prompt}\n<</SYS>>"

        # Completed (user, reply) exchanges; the opening [INST] of the first one
        # is already part of `head`.
        turns = []
        answered = zip(self.user_messages[:-1], self.model_replies)
        for position, (user_message, model_reply) in enumerate(answered):
            turn = f"{user_message} [/INST] {model_reply} </s>"
            if position != 0:
                turn = "[INST] " + turn
            turns.append(turn)

        # The pending message needs its own [INST] only when it is not the first turn.
        pending = f"{self.user_messages[-1]} [/INST]"
        if turns:
            pending = "[INST] " + pending

        return head + "".join(turns) + pending

# Import Necessary Packages

In [5]:
import os
from glob import glob
import pandas as pd
import json
import time
import requests
import random
from loguru import logger
import re
import numpy as np
#from huggingface_hub import HfApi, HfFolder

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np


In [7]:
from transformers import(AutoTokenizer,
                         AutoModelForMultipleChoice,
                         AutoModelForCausalLM,
                         AutoTokenizer,

                         GenerationConfig,
                         BitsAndBytesConfig,

                         pipeline,
                         Conversation,
                         logging,
                         )
from datasets import load_dataset
from tokenizers import Tokenizer

import warnings
warnings.filterwarnings("ignore")


### HELPER FUNCTION for Multi-Agents Debate ###

In [8]:
# this part is used to gen_mmlu

def construct_message(agents, question, idx):
    """Build the user message that asks an agent to revise its answer.

    Args:
        agents: Chat histories (lists of message dicts) of the OTHER agents.
        question: The original question text, repeated at the end of the message.
        idx: Index into each history of the message whose "content" is quoted.

    Returns:
        dict: A ``{"role": "user", "content": ...}`` message. With no other
        agents, a generic "double check your answer" request is returned.
    """
    if len(agents) == 0:
        return {"role": "user", "content": "Can you double check that your answer is correct. Put your final answer in the form (X) at the end of your response."}

    other_solutions = "".join(
        "\n\n One agent solution: ```{}```".format(agent[idx]["content"])
        for agent in agents
    )

    # The original text used the literal "/n/n" here; real newlines were intended.
    closing = "\n\n Using the reasoning from other agents as additional advice, can you give an updated answer? Examine your solution and that other agents step by step. \n\n Here is the original question: {}. ".format(question)

    content = "These are the solutions to the problem from other agents: " + other_solutions + closing
    return {"role": "user", "content": content}


def construct_assistant_message(completion):
    """Wrap generated text as an assistant-role chat message dict."""
    return dict(role="assistant", content=completion)


def generate_answer(answer_context, max_retries=3, retry_delay=20):
    """
    Generate the model's reply to a chat history.

    Uses the notebook-level `tokenizer` and `base_model`.

    Args:
        answer_context: list of {"role": ..., "content": ...} message dicts.
        max_retries: how many times to retry after a failed attempt.
        retry_delay: seconds to sleep between attempts.

    Returns:
        str: the newly generated text with leftover bracketed/braced markup removed.

    Raises:
        Exception: the error from the last attempt, once retries are exhausted.
    """
    for attempt in range(max(0, max_retries) + 1):
        try:
            # Build the prompt tensor and place it on the same device as the model.
            tokenized_chat = tokenizer.apply_chat_template(
                answer_context,
                tokenize=True,
                add_generation_prompt=True,
                return_tensors="pt",
            ).to(base_model.device)

            outputs = base_model.generate(
                tokenized_chat,
                max_new_tokens=400,
                max_time=90,  # cap wall-clock generation time
                do_sample=True,
                top_k=50,  # top_k and top_p together restrict the sampling pool
                top_p=0.9,
                temperature=0.1,
                repetition_penalty=1.5,
            )

            # Decode only the tokens produced after the prompt. This replaces the
            # earlier str.find-based stripping, which cut at the wrong place
            # whenever the first message was not found verbatim in the output.
            prompt_length = tokenized_chat.shape[-1]
            generated_text = tokenizer.decode(
                outputs[0][prompt_length:], skip_special_tokens=True
            ).strip()

            # Remove leftover [..] / {..} markup. Parenthesised spans are kept on
            # purpose: the prompts ask for the answer as "(X)" and the evaluation
            # code parses exactly that pattern.
            return re.sub(r"\[.*?\]|\{.*?\}", "", generated_text)
        except Exception:
            # Bounded retry; KeyboardInterrupt is not caught, so the cell stays
            # interruptible.
            if attempt >= max_retries:
                raise
            print("retrying due to an error......")
            time.sleep(retry_delay)


def parse_question_answer(df, ix):
    """Format row `ix` of an MMLU-style frame as a multiple-choice prompt.

    Columns are read by position: 0 = question, 1-4 = choices A-D, 5 = answer.

    Returns:
        tuple: (prompt string, answer value from column 5).
    """
    question = df.iloc[ix, 0]
    a = df.iloc[ix, 1]
    b = df.iloc[ix, 2]
    c = df.iloc[ix, 3]
    d = df.iloc[ix, 4]

    # The original template used the literal "/n"; real newlines were intended.
    question = "Can you answer the following question as accurately as possible? {}: \n\n A) {}, \n B) {}, \n C) {}, \n D) {}. \n Explain your answer, putting the answer in the form (X) at the end of your response.".format(question, a, b, c, d)

    answer = df.iloc[ix, 5]

    return question, answer



# an alternative way to implement generating the text answer
def sample_model(prompt):
    """Generate one reply to `prompt` through a 'conversational' pipeline.

    Uses the notebook-level `base_model` and `tokenizer`. A new pipeline is
    built on every call.

    Returns:
        str: the last generated response of the conversation.
    """
    conversation_pipeline = pipeline(
        'conversational',
        # The notebook only defines `base_model`; the bare name `model` that was
        # used here is not defined in any visible cell.
        model=base_model,
        tokenizer=tokenizer,
        max_new_tokens=300,
        max_time=90,  # cap wall-clock generation time
        do_sample=True,
        top_k=75,  # top_k and top_p together restrict the sampling pool
        top_p=0.9,
        temperature=0.9,
        repetition_penalty=1.2,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,  # EOS reused as the padding id
        bos_token_id=tokenizer.bos_token_id,  # was eos_token_id, i.e. the wrong token
    )
    conversation = Conversation(prompt)
    conversation_pipeline([conversation])
    return conversation.generated_responses[-1]

#-------------------------------------------------------- RAG HELPER FUNCTION -----------------------------------------------------------------------#

# Helper function for printing docs

def pretty_print_docs(docs):
    """Print each document's page_content under a 'Document N:' header, separated by dashed rules."""
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for number, doc in enumerate(docs, start=1):
        sections.append("Document {}:\n\n".format(number) + doc.page_content)
    print(divider.join(sections))

def reciprocal_rank_fusion(document_scores, k=60, higher_is_better=True):
    """Convert raw relevance scores into reciprocal-rank scores.

    Documents are ordered by score, and the document at 1-based position `rank`
    receives ``1 / (k + rank)``. Only the ordering of the scores matters, so
    zero and negative scores (such as raw reranker logits) are handled.

    Args:
        document_scores: mapping of document -> relevance score.
        k: smoothing constant added to every rank.
        higher_is_better: set to False when a lower score means a better match.

    Returns:
        dict: document -> fused score, ordered from best to worst.
    """
    ordered = sorted(
        document_scores.items(),
        key=lambda item: item[1],
        reverse=higher_is_better,
    )
    return {doc: 1.0 / (k + rank) for rank, (doc, _) in enumerate(ordered, start=1)}




# 1. Set up the model

In [9]:

# Model to load. The commented-out line is an alternative checkpoint path on Drive.
#base_model_path = '/content/drive/MyDrive/Hallucination/Llama2_7b_base/Llama2_7b_Finance_FT_3/Llama2-7b_Finance_FT_3_with_lora'
base_model_path = 'NousResearch/Llama-2-7b-hf'


# Load the tokenizer
# NOTE(review): the repo id is repeated here instead of reusing base_model_path,
# so switching base_model_path does not switch the tokenizer.
tokenizer = AutoTokenizer.from_pretrained('NousResearch/Llama-2-7b-hf',
                                          use_fast=True,
                                          #chat_template = set_template
                                          )


# Load the trained model
# Loaded with 8-bit weights and device_map="auto" (placement chosen automatically).
base_model = AutoModelForCausalLM.from_pretrained(base_model_path,
                                             #quantization_config=bnb_config,
                                             trust_remote_code=True,
                                             load_in_8bit=True,
                                             device_map="auto",
                                             #use_flash_attention_2=True,
                                             )

# Disable the cache flag on the model config; the author's stated reason is that
# only single-turn performance is evaluated.
base_model.config.use_cache = False

#model.push_to_hub("Llama2-7b_Finance_lora_3")

# If you're using a GPU, move the model to GPU
# NOTE(review): `device` is computed here but not referenced by any visible cell.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

#base_llm = model#.to(device) # int8,int can not put into .to()


Downloading tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

In [10]:
# Print the BOS and EOS tokens
# Show the tokenizer's special tokens; sep/mask are not set for this tokenizer
# (see the warnings and the `None` values in the cell output).
print("BOS Token:", tokenizer.bos_token)
print("EOS Token:", tokenizer.eos_token)
print("PAD Token:", tokenizer.pad_token)
print("SEP Token:", tokenizer.sep_token)
print("MASK Token:", tokenizer.mask_token)

Using sep_token, but it is not set yet.
Using mask_token, but it is not set yet.


BOS Token: <s>
EOS Token: </s>
PAD Token: <unk>
SEP Token: None
MASK Token: None


In [11]:
# Show the integer ids of the same special tokens.
print("BOS Token id:", tokenizer.bos_token_id)
print("EOS Token id:", tokenizer.eos_token_id)
print("PAD Token id:", tokenizer.pad_token_id)
print("SEP Token id:", tokenizer.sep_token_id)
print("MASK Token id:", tokenizer.mask_token_id)

BOS Token id: 1
EOS Token id: 2
PAD Token id: 0
SEP Token id: None
MASK Token id: None


In [12]:
# Inspect the tokenizer's default chat template string.
tokenizer.default_chat_template

"{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\\n\\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don\\'t know the answer to a question, please don\\'t share false information.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must 

# 2. Retrieval-Augmented Generation (RAG)

In [None]:
# Extra packages for the retrieval section (Yahoo Finance news tool, Wikipedia
# retriever, FAISS vector store).
!pip install yfinance
!pip install wikipedia
!pip install faiss-GPU

In [14]:
from langchain.agents import AgentType, initialize_agent


#tools
from langchain.tools.yahoo_finance_news import YahooFinanceNewsTool

#retrievers
from langchain.retrievers import WikipediaRetriever

from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

from langchain.retrievers.document_compressors import EmbeddingsFilter

from FlagEmbedding import (FlagReranker, FlagModel, LLMEmbedder)



# 2.1 Tools \[YahooFinanceNewsTool\]

In [15]:
# YahooFinanceNewsTool is only intended for the finance-related tests.
tools = [YahooFinanceNewsTool()]

# 2.2 Wikipedia retriever as an external document source

In [16]:
# MMLU is a multiple-choice (discriminative) evaluation, so Wikipedia is used as
# the external knowledge source for retrieval.
wiki_retriever = WikipediaRetriever()

# 2.3 Embedding Model, Reranker, ContextualCompressionRetriever

##### 2.3.1 Reranker

In [20]:
# Reranker used by RAG() to score (query, passage) pairs.
reranker = FlagReranker('BAAI/bge-reranker-large', use_fp16= True) # Setting use_fp16 to True speeds up computation with a slight performance degradation

Downloading tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

-1.529296875
[-5.60546875, 5.76171875]


##### 2.3.2 Embedding Model

In [21]:
from langchain.embeddings import HuggingFaceBgeEmbeddings
# Embedding model used for the FAISS index and the embeddings filter in RAG().
model_name = "BAAI/bge-large-en-v1.5"
# NOTE(review): the device is hard-coded to 'cuda', so this cell needs a GPU runtime.
model_kwargs = {'device': 'cuda'}
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
embed_model = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
    query_instruction="Generate a representation for this sentence to retrieve relevant articles: "
)
# Re-assigns the same instruction string that was already passed to the constructor.
embed_model.query_instruction = "Generate a representation for this sentence to retrieve relevant articles: "

Downloading .gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading 1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

Downloading README.md:   0%|          | 0.00/90.3k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [23]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.vectorstores import FAISS
from langchain.retrievers.document_compressors import EmbeddingsFilter

# 2.5 Build up the RAG part

In [60]:


# This module retrieves up-to-date external knowledge (e.g. the latest news and information) for a query.
def RAG(answer_context, embed_model, reranker):
    """
    Retrieve background documents for the first message of a chat history.

    Uses the notebook-level `wiki_retriever` as the document source.

    Args:
        answer_context: list of {"role", "content"} message dicts; the first
            message's content is the query. A list of such histories is also
            accepted, in which case the first history is used.
        embed_model: embeddings object used for the FAISS index and the filter.
        reranker: object whose compute_score([query, passage]) scores a pair.

    Returns:
        str: "Document i: ..." entries joined by blank lines, best first;
        an empty string when nothing is retrieved.
    """
    first_message = answer_context[0]
    if isinstance(first_message, list):
        # A list of per-agent histories was passed; use the first history.
        first_message = first_message[0]
    query = first_message['content']

    # Raw documents from the search source. Any retriever that exposes
    # get_relevant_documents could be substituted here.
    raw_documents = wiki_retriever.get_relevant_documents(query = query)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 100)
    texts = text_splitter.split_documents(raw_documents)
    if not texts:
        # Nothing to index; building a FAISS store from an empty list fails.
        return ""
    base_retriever = FAISS.from_documents(texts, embed_model).as_retriever()

    # Keep only the chunks whose embedding similarity to the query passes the threshold.
    embeddings_filter = EmbeddingsFilter(embeddings = embed_model, similarity_threshold = 0.6)
    compression_retriever = ContextualCompressionRetriever(base_compressor = embeddings_filter, base_retriever = base_retriever)
    compressed_docs = compression_retriever.get_relevant_documents(query)

    # Score every surviving chunk against the query.
    document_scores = {}
    for doc in compressed_docs:
        score = reranker.compute_score([query, doc.page_content])
        if isinstance(score, (list, tuple)):
            # Some reranker versions return a one-element list for a single pair.
            score = score[0]
        document_scores[doc.page_content] = score

    reranked_docs = reciprocal_rank_fusion(document_scores)  # ordered best first

    return "\n\n".join(f"Document {i}: {doc}" for i, doc in enumerate(reranked_docs))

# RAG TEST

In [61]:
# Single-message chat history used to smoke-test RAG(); RAG() uses the first
# message's content as the retrieval query.
query_2 = [{
            'role': 'user',
            'content': "<s> can you tell me who is kobe bryant? </s>"
}]

In [62]:
# Run the retrieval pipeline on the test query.
tem = RAG(query_2, embed_model, reranker)

In [63]:
# Display the retrieved context string.
tem

"Document 0: === 1997–1998 ===\nBryant was assigned by the Lakers to play in the 1997 NBA Summer League to improve as a team player and learn where to send the ball when he drew double teams. The following season, he was voted as a starter in the 1998 All-Star Game though he was a reserve on the Lakers. The team struggled after the All-Star break, losing seven of their first twelve games, and Bryant had a stretch where he made only 30 of 100 shots. O'Neal wanted a championship immediately, and he did not want to wait for Bryant to mature as a player. Harris thought the NBA and its television broadcaster, NBC, were overexposing Bryant and that he became more of a one-on-one player after the break. Bryant's playing time became reduced. The Lakers were eliminated in the 1998 playoffs in the conference finals after they were swept by the Jazz, 4–0."

# Generation on MMLU data

In [43]:
from bert_score import score as bs

In [47]:
# Sanity check of BERTScore on one candidate/reference pair.
candidates = ["This is a test sentence for BERTscore."]

# Your reference sentence (e.g., a human translation)
references = ["This sentence is a test for BERTscore."]

# Calculating the BERTscore
P, R, F1 = bs(candidates, references, lang='en')

# P, R, and F1 hold the Precision, Recall, and F1 scores; they print as tensors
# with one entry per candidate/reference pair (see the cell output).
print(f"Precisions: {P}")
print(f"Recalls: {R}")
print(f"F1 scores: {F1}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Precisions: tensor([0.9614])
Recalls: tensor([0.9690])
F1 scores: tensor([0.9652])


In [48]:
# Multi-agent debate over randomly sampled MMLU questions. When the two agents'
# final answers disagree (low BERTScore F1), both are replaced by one answer
# generated with retrieved background documents.
agents = 2
rounds = 2

tasks = glob("/content/drive/MyDrive/Hallucination/Parse_Data/data/test/*.csv")

dfs = [pd.read_csv(task) for task in tasks]

random.seed(123)
response_dict = {}

for _ in range(100):
    df = random.choice(dfs)
    ix = len(df)
    idx = random.randint(0, ix-1)

    question, answer = parse_question_answer(df, idx)

    # One independent chat history per agent, each starting from the question.
    agent_contexts = [[{"role": "user", "content": question}] for _agent in range(agents)]

    # Debate rounds. Distinct loop names avoid reusing the outer loop variable
    # and shadowing the builtin `round`.
    for round_idx in range(rounds):
        for agent_idx, agent_context in enumerate(agent_contexts):

            if round_idx != 0:
                # Show this agent the other agents' answers from the previous round.
                agent_contexts_other = agent_contexts[:agent_idx] + agent_contexts[agent_idx+1:]
                message = construct_message(agent_contexts_other, question, 2 * round_idx - 1)
                agent_context.append(message)

            completion = generate_answer(agent_context)

            assistant_message = construct_assistant_message(completion)
            agent_context.append(assistant_message)
            print(completion)

    # BERTScore between the two agents' final answers decides whether they agree.
    # NOTE: this comparison assumes at least two agents.
    main_expert_sentence = [agent_contexts[0][-1]['content']]
    auxiliary_expert_sentence = [agent_contexts[1][-1]['content']]

    # bert_score requires `lang` or `model_type`; 'en' matches the demo cell above.
    P, R, F1 = bs(auxiliary_expert_sentence, main_expert_sentence, lang='en')

    if F1.item() <= 0.8:
        # Not consistent: fetch background documents and answer once more.
        # RAG() expects a single chat history, so pass the first agent's.
        background_context = RAG(agent_contexts[0], embed_model, reranker)

        # "{ }" is not a valid format field (KeyError) and "/n" is not a newline.
        new_template = "{} \n\n these are extra CONTEXT DOCUMENTS provided to help you reason question below: \n {}".format(background_context, question)

        new_chat_template = [{
            'role' : "user",
            'content' : new_template,
        }]

        completion_rag = generate_answer(new_chat_template)

        assistant_message_rag = construct_assistant_message(completion_rag)

        agent_contexts[0][-1] = assistant_message_rag
        agent_contexts[1][-1] = assistant_message_rag

    response_dict[question] = (agent_contexts, answer)


# Write the results and close the file handle deterministically.
with open("mmlu_{}_{}.json".format(agents, rounds), "w") as results_file:
    json.dump(response_dict, results_file)

#wandb.finish()

Question 12-06543679 - You're looking through an old set...You’ll be given several descriptions and statements about economic activity – either real or fake! Choose from one fo…
The best answers show that they know what all those terms mean and have studied this material well enough so one can be confident he knows something about Economics or will never forget it even if doesn't seem necessary now - which means doing really good work here :) This isn’s just any old test either because most companies paying their own employees would expect people like us who write exams under these circumstances every day…but there aren”t too many places left where someone else could get some nice things without feeling guilty after having done nothing meaningful themselves! It takes time & effort--or maybe not...you need patience instead 🙂 But then again sometimes we do learn from failure right away ;-p So yes please go ahead try out my services by clicking on ____________, I promise no regrets afterw

In [63]:
from google.colab import files

# `file_path` was never defined in the notebook. Point it at the results file
# written by the generation cell (same name pattern, built from agents/rounds).
file_path = "mmlu_{}_{}.json".format(agents, rounds)
files.download(file_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Evaluation on MMLU test data

In [48]:

def parse_bullets(sentence):
    """Split text into lines and trim each line to start at its first letter.

    Lines that contain no alphabetic character are dropped.
    """
    bullets = []
    for line in sentence.split("\n"):
        first_letter = next((ch for ch in line if ch.isalpha()), None)
        if first_letter is None:
            continue
        trimmed = line[line.find(first_letter):]
        if trimmed:
            bullets.append(trimmed)
    return bullets


def parse_yes_no(string):
    """
    Parses a string containing "yes" or "no" and returns a boolean value.

    Matching is case-insensitive and on whole words, so words that merely
    contain the letters (e.g. "know", "eyes") do not count. "yes" takes
    precedence when both words appear.

    Args:
        string (str): The string to parse.

    Returns:
        bool or None: True if the string contains "yes", False if it contains
        "no", None if it contains neither.
    """
    lowered = string.lower()
    if re.search(r"\byes\b", lowered):
        return True
    if re.search(r"\bno\b", lowered):
        return False
    return None


def solve_math_problems(input_str):
    """Return the last number-like token in `input_str` as a string, or None if there is none."""
    numbers = re.findall(r"\d+\.?\d*", input_str)
    return numbers[-1] if numbers else None

def parse_answer(input_str):
    """Return the last single-character "(X)" answer in `input_str`, upper-cased, or None."""
    found = re.findall(r'\((\w)\)', input_str)
    if not found:
        return None
    # Every match is exactly one word character, so the last match is the answer.
    return found[-1].upper()


def compute_accuracy(gt, pred_solutions):
    """Score predicted solution text(s) against the ground-truth answer.

    Each solution is parsed with parse_answer, falling back to
    solve_math_problems. For a list of solutions the most frequent parsed
    answer is used; solutions that cannot be parsed are ignored.

    Args:
        gt: ground-truth answer.
        pred_solutions: one solution string, or a list of them.

    Returns:
        int: 1 if the chosen answer equals `gt`, else 0 (also 0 when nothing
        could be parsed).
    """
    def _extract(solution_text):
        # Prefer the "(X)" form; fall back to the last number in the text.
        parsed = parse_answer(solution_text)
        if parsed is None:
            parsed = solve_math_problems(solution_text)
        return parsed

    if isinstance(pred_solutions, list):
        pred_answers = [ans for ans in map(_extract, pred_solutions) if ans is not None]

        # Check the collected answers, not just the last one: an unparseable
        # final solution must not discard the others, and an empty list must
        # not raise.
        if not pred_answers:
            return 0
        pred_answer = most_frequent(pred_answers)
    else:
        pred_answer = _extract(pred_solutions)

    return 1 if gt == pred_answer else 0


def most_frequent(List):
    """Return the element that occurs most often; ties go to the earliest element.

    Raises IndexError for an empty sequence.
    """
    List[0]  # empty input fails here with IndexError
    # max() keeps the first element among equal counts.
    return max(List, key=List.count)



In [56]:
# Load the saved debate results and report a running accuracy with its standard error.
with open("/content/drive/MyDrive/Hallucination/Generated_data/mmlu/mmlu_2_2.json", "r") as results_file:
    response_dict = json.load(results_file)
questions = list(response_dict.keys())

accuracies = []

for question in questions:
    responses, gt = response_dict[question]

    # Final message of every agent's history is that agent's answer.
    pred_solutions = [response[-1]['content'] for response in responses]

    accurate = compute_accuracy(gt, pred_solutions)

    if accurate is not None:
        accuracies.append(float(accurate))
    else:
        # Report the unscored item instead of stopping in an interactive
        # debugger, which would hang an unattended notebook run.
        print(gt)

    print("accuracies:", np.mean(accuracies), np.std(accuracies) / (len(accuracies) ** 0.5))

accuracies: 0.0 0.0
accuracies: 0.0 0.0
accuracies: 0.0 0.0
accuracies: 0.0 0.0
accuracies: 0.0 0.0
