# Drive Mount

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so that files
# stored on Drive can be reached through ordinary paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Go to the FastChat directory

In [2]:
import os

# Make the FastChat folder on Drive the working directory; the relative paths
# used later in this notebook (checkpoints, ./data/...) are resolved from here.
fastchat_directory = '/content/drive/My Drive/FastChat'
os.chdir(fastchat_directory)

**See what is in here**

In [3]:
# List the contents of the current working directory.
!ls

13boutput.txt			    model_worker_1844f2a9.log  model_worker_ba16a0e7.log
all_output.txt			    model_worker_223b1e60.log  model_worker_bba35bd9.log
assets				    model_worker_24ede12c.log  model_worker_c1208ce0.log
blue_output.txt			    model_worker_26e5d0f1.log  model_worker_d243a295.log
checkpoints13b			    model_worker_2b2a28a0.log  model_worker_df129024.log
checkpoints13bmembank		    model_worker_2dc32bf9.log  model_worker_e0ef14c3.log
checkpoints13bnewdata		    model_worker_302c1cbb.log  model_worker_e2454bb6.log
checkpoints_ep5lama		    model_worker_32f01e9c.log  model_worker_e4a4bd3a.log
checkpoints_ep5qlorab16e16	    model_worker_34b002eb.log  model_worker_f26a2425.log
checkpoints_epcolab		    model_worker_35428829.log  model_worker_f4cd6e42.log
checkpointsllama213bmembank	    model_worker_39c2d7ae.log  model_worker_fdff2a02.log
checkpointsllama2_7bmembank	    model_worker_407df188.log  new13boutput.txt
checkpoints_vicuna_7b_16k_membank   model_worker_44547119.log  new_chat_open

# Install the dependencies

In [4]:
!pip3 install -e ".[model_worker,webui]"
!pip3 install accelerate
!pip3 install deepspeed==0.13.1
!pip3 install peft
!pip3 install bitsandbytes
!pip3 install flash-attn
!pip3 install wandb
!pip3 install transformers
!pip3 install openai
!pip3 install sacrebleu
!pip3 install bert_score
!pip3 install sentence_transformers

Obtaining file:///content/drive/My%20Drive/FastChat
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Collecting fastapi (from fschat==0.2.36)
  Downloading fastapi-0.111.0-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx (from fschat==0.2.36)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting markdown2[all] (from fschat==0.2.36)
  Downloading markdown2-2.4.13-py2.py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.3/41.3 kB[0m [31m6.1 

**Cross-check: list the directory again after installation**

In [6]:
# List the contents of the current working directory again.
!ls

13boutput.txt			    model_worker_1844f2a9.log  model_worker_ba16a0e7.log
all_output.txt			    model_worker_223b1e60.log  model_worker_bba35bd9.log
assets				    model_worker_24ede12c.log  model_worker_c1208ce0.log
blue_output.txt			    model_worker_26e5d0f1.log  model_worker_d243a295.log
checkpoints13b			    model_worker_2b2a28a0.log  model_worker_df129024.log
checkpoints13bmembank		    model_worker_2dc32bf9.log  model_worker_e0ef14c3.log
checkpoints13bnewdata		    model_worker_302c1cbb.log  model_worker_e2454bb6.log
checkpoints_ep5lama		    model_worker_32f01e9c.log  model_worker_e4a4bd3a.log
checkpoints_ep5qlorab16e16	    model_worker_34b002eb.log  model_worker_f26a2425.log
checkpoints_epcolab		    model_worker_35428829.log  model_worker_f4cd6e42.log
checkpointsllama213bmembank	    model_worker_39c2d7ae.log  model_worker_fdff2a02.log
checkpointsllama2_7bmembank	    model_worker_407df188.log  new13boutput.txt
checkpoints_vicuna_7b_16k_membank   model_worker_44547119.log  new_chat_open

# Starting the server processes for evaluation

In [7]:
import subprocess
import threading

import os

# Using 127.0.0.1 because localhost does not work properly in Colab

def run_controller(host="127.0.0.1"):
    """Run the FastChat controller as a child process and wait for it to exit.

    Args:
        host: Address passed to the controller's ``--host`` option.
    """
    import sys  # local import: only needed to locate the kernel's interpreter

    # sys.executable makes the child use the same Python interpreter (and so
    # the same installed packages) as this notebook kernel.
    command = [sys.executable, "-m", "fastchat.serve.controller", "--host", host]
    subprocess.run(command)

def run_model_worker(
    model_path="./checkpoints_vicuna_7b_16k_membank/checkpoint-2245",
    host="127.0.0.1",
    controller_address="http://127.0.0.1:21001",
):
    """Run a FastChat model worker as a child process and wait for it to exit.

    Args:
        model_path: Checkpoint directory passed to ``--model-path``.
        host: Address passed to the worker's ``--host`` option.
        controller_address: URL passed to ``--controller-address``.
    """
    import sys  # local import: only needed to locate the kernel's interpreter

    # sys.executable makes the child use the same Python interpreter (and so
    # the same installed packages) as this notebook kernel.
    command = [
        sys.executable, "-m", "fastchat.serve.model_worker",
        "--host", host,
        "--controller-address", controller_address,
        "--model-path", model_path,
    ]
    subprocess.run(command)

def run_api_server(host="127.0.0.1", port=8000, controller_address="http://127.0.0.1:21001"):
    """Run the FastChat OpenAI-compatible API server as a child process and wait for it to exit.

    Args:
        host: Address passed to the server's ``--host`` option.
        port: Port passed to ``--port``.
        controller_address: URL passed to ``--controller-address``.
    """
    import sys  # local import: only needed to locate the kernel's interpreter

    # sys.executable makes the child use the same Python interpreter (and so
    # the same installed packages) as this notebook kernel.
    command = [
        sys.executable, "-m", "fastchat.serve.openai_api_server",
        "--host", host,
        "--controller-address", controller_address,
        "--port", str(port),
    ]
    subprocess.run(command)

In [8]:
# run_controller() blocks on its child process, so it is started in a
# background thread to keep the notebook usable.
controller_thread = threading.Thread(target=run_controller)
controller_thread.start()

In [9]:
# Start the model worker in its own background thread.
# NOTE(review): the worker is given the controller's address, so presumably the
# controller must already be listening when this cell runs — confirm.
model_worker_thread = threading.Thread(target=run_model_worker)
model_worker_thread.start()

In [10]:
# Start the OpenAI-compatible API server (port 8000) in its own background thread.
api_server_thread = threading.Thread(target=run_api_server)
api_server_thread.start()

**Cross-check that the model responds**

In [13]:
# Smoke test: POST a single chat-completion request to the local API server on
# port 8000 and print the raw JSON reply.
!curl http://127.0.0.1:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{ \
    "model": "checkpoint-2245", \
    "messages": [{"role": "user", "content": "Hello, can you tell me a joke for me?"}], \
    "temperature": 0.5 \
  }'

{"id":"chatcmpl-r3NDdSSVryjqiihrmiRfpV","object":"chat.completion","created":1718378921,"model":"checkpoint-2245","choices":[{"index":0,"message":{"role":"assistant","content":"Of course! Here's a joke for you:\n\nWhy did the math book look so sad?\n\nBecause it had too many problems!"},"finish_reason":"stop"}],"usage":{"prompt_tokens":50,"total_tokens":83,"completion_tokens":33}}

**Necessary Imports**

In [14]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import json
import openai
import torch
from sacrebleu.metrics import BLEU
from bert_score import score as bert_score
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import AutoTokenizer
import matplotlib.pyplot as plt
import numpy as np
from torch.nn import CrossEntropyLoss

  from tqdm.autonotebook import tqdm, trange


**Embedding Models**

In [15]:
# Sentence-embedding model; find_most_similar_model_output() uses it to match a
# model output against the ground-truth turns.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

**GPT-2 model (for perplexity) and OpenAI client configuration**

In [16]:
# GPT-2 language model and tokenizer used by calculate_perplexity(). The model
# is moved to the GPU when one is available, otherwise it stays on the CPU.
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2").to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Point the OpenAI client at the local API server. "EMPTY" is a placeholder
# value, not a real credential. 127.0.0.1 is used instead of localhost to match
# the address the servers in this notebook are started with.
openai.api_key = "EMPTY"
openai.base_url = "http://127.0.0.1:8000/v1/"

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

**Most similar ground-truth conversation turn**

In [17]:
def find_most_similar_model_output(input_string, ground_truth_conversations, similarity_threshold=0.7):
    """Find the ground-truth 'gpt' turn that is most similar to a model output.

    The input and every candidate are compared on the text following their last
    ``</mem>`` marker, using cosine similarity between ``embedding_model``
    sentence embeddings.

    Args:
        input_string: Model output to match.
        ground_truth_conversations: List of turn dicts with 'from' and 'value' keys.
        similarity_threshold: Minimum cosine similarity for the best candidate
            to count as a match.

    Returns:
        A ``(index, text)`` tuple. ``text`` is the full (unstripped) value of
        the best-matching 'gpt' turn and ``index`` is that turn's position in
        ``ground_truth_conversations``. ``index`` is the string
        ``"not mentioned"`` when the best similarity is below the threshold.
        ``("not mentioned", "")`` is returned when there is no 'gpt' turn.
    """
    # Keep each 'gpt' turn's real position rather than deriving it as 2k + 1,
    # which is only right when turns strictly alternate human/gpt.
    gpt_turns = [
        (position, turn['value'])
        for position, turn in enumerate(ground_truth_conversations)
        if turn['from'] == 'gpt'
    ]
    if not gpt_turns:
        # Nothing to compare against; cosine_similarity would fail on an empty set.
        return "not mentioned", ""

    query_text = input_string.split("</mem>")[-1]
    candidate_texts = [text.split("</mem>")[-1] for _, text in gpt_turns]

    embeddings = embedding_model.encode([query_text] + candidate_texts)
    cosine_similarities = cosine_similarity([embeddings[0]], embeddings[1:]).flatten()

    best_match = int(cosine_similarities.argmax())
    position, best_text = gpt_turns[best_match]

    if cosine_similarities[best_match] < similarity_threshold:
        return "not mentioned", best_text

    return position, best_text

**Calculating Perplexity**

In [19]:
def calculate_perplexity(text, device='cuda'):
    """Compute the perplexity of ``text`` under ``gpt2_model``.

    Perplexity is ``exp`` of the mean next-token cross-entropy over the text's
    tokens.

    Args:
        text: String to score.
        device: Device the input tensors are created on; it has to be the
            device ``gpt2_model`` was moved to.

    Returns:
        The perplexity as a Python float. NaN is returned when ``text``
        tokenizes to fewer than two tokens, because there is then no
        next-token prediction to score.
    """
    # Truncate to the model's context window; longer inputs cannot be run
    # through GPT-2 in a single forward pass.
    token_ids = gpt2_tokenizer.encode(
        text, truncation=True, max_length=gpt2_model.config.n_positions
    )
    if len(token_ids) < 2:
        return float("nan")

    input_ids = torch.tensor([token_ids], dtype=torch.long, device=device)
    attention_mask = torch.ones_like(input_ids)

    with torch.no_grad():
        logits = gpt2_model(input_ids=input_ids, attention_mask=attention_mask).logits

    # Position t predicts token t + 1, so drop the last logit and the first label.
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = input_ids[..., 1:].contiguous()

    # There is no padding (single sequence), so the plain mean over all
    # positions is the mean negative log-likelihood.
    mean_nll = CrossEntropyLoss()(
        shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
    )

    return torch.exp(mean_nll).item()

**Loading Eval Data**

In [20]:
def load_eval_data(filepath):
    """Load evaluation data from a JSON file.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The decoded JSON content.
    """
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default text encoding.
    with open(filepath, 'r', encoding='utf-8') as file:
        return json.load(file)

# Evaluating the Conversations

**Our Model**

In [21]:
# Model name sent with every chat-completion request in evaluate_conversations().
# NOTE(review): it equals the last component of the worker's --model-path;
# presumably the two must be kept in sync — confirm how the server names models.
model = "checkpoint-2245"

In [22]:
def evaluate_conversations(eval_data):
  bleu_scorer = BLEU()
  all_bleu_scores, all_bert_scores, all_perplexities = [], [], []
  all_bleu_scores_real, all_bert_scores_real, all_perplexities_real = [], [], []
  turn_counts, results = {}, []

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  my_count = 0
  for convo in eval_data:
    # sake of debugging
    my_count = my_count+1
    print("conversation no", my_count)

    messages = [{"role": "user", "content": convo['conversations'][0]['value']}]
    extracted_variable_count = convo['conversations'][0]['value'].count(':')
    turn_counts.setdefault(extracted_variable_count, [])
    conversation_end, turn_of_conversation = False, 0
    false_negative, false_positive = 0, 0

    curr_bleu_scores, curr_bert_scores, curr_perplexities = [], [], []
    curr_bleu_scores_real, curr_bert_scores_real, curr_perplexities_real = [], [], []

    asked_questions = set()
    ground_truth_questions = set(i for i, message in enumerate(convo['conversations']) if message['from'] == 'gpt')

    while not conversation_end:
      turn_of_conversation += 1

      response = openai.chat.completions.create(
                model=model,
                messages=messages
      )

      model_output = response.choices[0].message.content

      if turn_of_conversation > 11:
        model_output = model_output + "<Finish>"

      messages.append({"role": "assistant", "content": model_output})

      if '<Finish>' in model_output:
        conversation_end = True
      if '<finish>' in model_output:
        conversation_end = True
      if '<FINISH>' in model_output:
        conversation_end = True

      idx, similar_str = find_most_similar_model_output(model_output, convo['conversations'])

      if idx == "not mentioned":
        false_positive += 1
      else:
        asked_questions.add(idx)

      if not conversation_end:
        if idx == "not mentioned":
          human_response = "not mentioned"
        else:
          human_response = convo['conversations'][idx + 1]['value'] if idx + 1 < len(convo['conversations']) else "not mentioned"
          messages.append({"role": "user", "content": human_response})

      bleu_score1 = bleu_scorer.sentence_score(model_output, [similar_str]).score
      _, _, bert_f1 = bert_score([model_output], [similar_str], lang="en")
      perplexity = calculate_perplexity(model_output, device=device)

      curr_bleu_scores.append(bleu_score1)
      curr_bert_scores.append(bert_f1.item())
      curr_perplexities.append(perplexity)

      if idx != "not mentioned":
        bleu_score1 = bleu_scorer.sentence_score(model_output, [similar_str]).score
        _, _, bert_f1 = bert_score([model_output], [similar_str], lang="en")
        perplexity = calculate_perplexity(model_output, device=device)

        curr_bleu_scores_real.append(bleu_score1)
        curr_bert_scores_real.append(bert_f1.item())
        curr_perplexities_real.append(perplexity)

    result_entry = convo.copy()
    result_entry['conversations'] = messages

    result_entry['bleu_score_with_false_positive'] = sum(curr_bleu_scores) / len(curr_bleu_scores)
    result_entry['bert_score_with_false_positive'] = sum(curr_bert_scores) / len(curr_bert_scores)
    result_entry['perplexity_score_with_false_positive'] = sum(curr_perplexities) / len(curr_perplexities)

    result_entry['bleu_score_without_false_positive'] = 0
    result_entry['bert_score_without_false_positive'] = 0
    result_entry['perplexity_score_without_false_positive'] = 0

    if len(curr_bleu_scores_real) > 0:
            result_entry['bleu_score_without_false_positive'] = sum(curr_bleu_scores_real) / len(curr_bleu_scores_real)
    if len(curr_bert_scores_real) > 0:
            result_entry['bert_score_without_false_positive'] = sum(curr_bert_scores_real) / len(curr_bert_scores_real)
    if len(curr_perplexities_real) > 0:
            result_entry['perplexity_score_without_false_positive'] = sum(curr_perplexities_real) / len(curr_perplexities_real)

    result_entry['false_negative'] = len(ground_truth_questions - asked_questions)
    result_entry['false_positive'] = false_positive

    result_entry['turn_of_conversation'] = turn_of_conversation
    result_entry['extracted_variable_count'] = extracted_variable_count
    results.append(result_entry)

    turn_counts[extracted_variable_count].append(turn_of_conversation)

    all_bleu_scores.append(result_entry['bleu_score_with_false_positive'])
    all_bleu_scores_real.append(result_entry['bleu_score_without_false_positive'])

    all_bert_scores.append(result_entry['bert_score_with_false_positive'])
    all_bert_scores_real.append(result_entry['bert_score_without_false_positive'])

    all_perplexities.append(result_entry['perplexity_score_with_false_positive'])
    all_perplexities_real.append(result_entry['perplexity_score_without_false_positive'])

  avg_bleu_with_false_positive = sum(all_bleu_scores) / len(all_bleu_scores)
  avg_bleu_without_false_positive = sum(all_bleu_scores_real) / len(all_bleu_scores_real)

  avg_bert_with_false_positive = sum(all_bert_scores) / len(all_bert_scores)
  avg_bert_without_false_positive = sum(all_bert_scores_real) / len(all_bert_scores_real)

  avg_perplexity_with_false_positive = sum(all_perplexities) / len(all_perplexities)
  avg_perplexity_without_false_positive = sum(all_perplexities_real) / len(all_perplexities_real)

  return results, avg_bleu_with_false_positive, avg_bleu_without_false_positive, avg_bert_with_false_positive, avg_bert_without_false_positive, avg_perplexity_with_false_positive, avg_perplexity_without_false_positive, turn_counts













**Running The Evaluation**

In [25]:
# Evaluate every conversation in the evaluation file and unpack the
# per-conversation results, the six corpus-level averages and the turn counts.
path = './data/tempo10.json'
(
    results,
    avg_bleu_with_false_positive,
    avg_bleu_without_false_positive,
    avg_bert_with_false_positive,
    avg_bert_without_false_positive,
    avg_perplexity_with_false_positive,
    avg_perplexity_without_false_positive,
    turn_counts,
) = evaluate_conversations(load_eval_data(path))

conversation no 1


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

conversation no 2


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


conversation no 3


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

conversation no 4


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


conversation no 5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

conversation no 6


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

conversation no 7


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


conversation no 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


conversation no 9


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

**Output**

In [26]:
# Persist the per-conversation results as indented JSON.
output_file = "./data/tempo10_output.json"
with open(output_file, 'w') as file:
    json.dump(results, file, indent=4)

# Report the corpus-level averages, one metric per line.
print(
    f"Average BLEU Score with false positive: {avg_bleu_with_false_positive}",
    f"Average BLEU Score without false positive: {avg_bleu_without_false_positive}",
    sep="\n",
)
print(
    f"Average BERT Score with false positive: {avg_bert_with_false_positive}",
    f"Average BERT Score without false positive: {avg_bert_without_false_positive}",
    sep="\n",
)
print(
    f"Average Perplexity with false positive: {avg_perplexity_with_false_positive}",
    f"Average Perplexity without false positive: {avg_perplexity_without_false_positive}",
    sep="\n",
)

Average BLEU Score with false positive: 48.89599607693921
Average BLEU Score without false positive: 56.66994559738361
Average BERT Score with false positive: 0.9406311103591213
Average BERT Score without false positive: 0.9519618120458391
Average Perplexity with false positive: 168.82659269262246
Average Perplexity without false positive: 233.7648220062256
