<a href="https://colab.research.google.com/github/REELICIT/reqbrain_rep_package/blob/3344cfbf610656025f7c0cfa9ae7a313bfdcd0c6/evaluation_scripts/evaluation_two_metric.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Common imports

In [None]:
!pip install evaluate

In [None]:
import evaluate
import datasets
import numpy as np

# Loading the Instruct Dataset

In [None]:
# Load the datasets that were saved to disk after prediction; each one is
# expected to hold the reference completions plus one column of predictions.
llama_dataset = datasets.load_from_disk(
    '/home/st/st_us-051500/st_st180358/llama_training/my_zepyra_after_prediction_01052024'
)
print(llama_dataset)

zephyr_dataset = datasets.load_from_disk(
    '/pfs/data5/home/st/st_us-051500/st_st180358/zephyr_training/my_zepyra_after_prediction_17040204'
)
print(zephyr_dataset)

llama3_dataset = datasets.load_from_disk(
    '/home/st/st_us-051500/st_st180358/llama3_training/my_llama3_after_prediction_08052024'
)
gemma_dataset = datasets.load_from_disk(
    '/home/st/st_us-051500/st_st180358/my_gemma_after_prediction_23052024'
)

In [None]:
# Pull the human-written requirements (the 'completion' column) out of every
# dataset; these serve as the references for all metrics.
(
    llama_references,
    zephyr_references,
    llama3_references,
    gemma_dataset_references,
) = (
    ds['completion']
    for ds in (llama_dataset, zephyr_dataset, llama3_dataset, gemma_dataset)
)

# Putting All Metrics Together

In [None]:
def _word_level_exact_match(exactmatch_metric, predictions, references):
    """Return the mean positional word-level exact-match score over all sentence pairs.

    Each prediction/reference pair is split on whitespace, the shorter word
    list is padded with the token "pad" so both have equal length, and the
    loaded exact-match metric is computed over the aligned word positions.
    """
    per_sentence_scores = []
    for prediction, reference in zip(predictions, references):
        prediction_words = prediction.split()
        reference_words = reference.split()
        max_length = max(len(prediction_words), len(reference_words))

        if max_length == 0:
            # Two empty sentences are identical; score them as a perfect match
            # instead of handing empty lists to the metric.
            per_sentence_scores.append(1.0)
            continue

        # Pad the shorter word list with "pad" so both lists have the same length.
        prediction_words = prediction_words + ["pad"] * (max_length - len(prediction_words))
        reference_words = reference_words + ["pad"] * (max_length - len(reference_words))

        per_sentence_scores.append(
            exactmatch_metric.compute(predictions=prediction_words, references=reference_words)['exact_match']
        )
    return sum(per_sentence_scores) / len(per_sentence_scores)


def evaluate_model(references, predictions):
    """Score model predictions against references with six metrics.

    Args:
        references: Sequence of reference sentences (strings).
        predictions: Sequence of predicted sentences (strings), aligned
            index-by-index with ``references``.

    Returns:
        dict with the keys 'bert_score', 'frugal_score', 'ter_score',
        'bleu_score' and 'rouge_score' (each holding whatever the matching
        ``evaluate`` metric returns) and 'exact_match_score' (the mean
        word-level exact match over all sentence pairs).

    Raises:
        ValueError: If the inputs are empty or differ in length. This is
            checked before any metric is loaded so a bad call fails fast.
    """
    if len(predictions) != len(references):
        raise ValueError(
            f"predictions and references must have the same length, "
            f"got {len(predictions)} and {len(references)}"
        )
    if len(predictions) == 0:
        raise ValueError("predictions and references must not be empty")

    # bert score
    bertscore = evaluate.load('bertscore')
    bertscore_results = bertscore.compute(predictions = predictions, references = references, model_type = "xlm-mlm-en-2048", lang = 'en')
    # frugal score
    # NOTE(review): device is fixed to "gpu" — presumably this needs a GPU to be available; confirm.
    frugalscore = evaluate.load("frugalscore", "moussaKam/frugalscore_medium_roberta_bert-score")
    frugalscore_results = frugalscore.compute(predictions=predictions, references=references, batch_size = 16, max_length = 512, device = "gpu")
    # TER score
    terscore = evaluate.load('ter')
    terscore_results = terscore.compute(predictions=predictions, references=references, case_sensitive=True)
    # BLEU score
    bleuscore = evaluate.load("bleu")
    bleuscore_results = bleuscore.compute(predictions=predictions, references=references)
    # ROUGE score
    rougescore = evaluate.load("rouge")
    rougescore_results = rougescore.compute(predictions=predictions, references=references)
    # Exact match score (word level, averaged over sentence pairs)
    exactmatchscore = evaluate.load("exact_match")
    exactmatchscore_results = _word_level_exact_match(exactmatchscore, predictions, references)

    return {'bert_score': bertscore_results, 'frugal_score': frugalscore_results, 'ter_score': terscore_results, 'bleu_score': bleuscore_results, 'rouge_score': rougescore_results, 'exact_match_score': exactmatchscore_results}
    

# Evaluating Trained Models using NLP Human Correlation Metrics


## Evaluating

In [None]:
!pip install bert_score

In [None]:
!pip install -U sacremoses

In [None]:
!pip install sacrebleu

In [None]:
!pip install nltk
!pip install rouge_score

In [None]:
# Score each model's prediction column against its human-written references.

# Zephyr
zephyr_results = evaluate_model(
    references=zephyr_references,
    predictions=zephyr_dataset['zephyr_7b_beta_preds'],
)
# Llama 2
llama_results = evaluate_model(
    references=llama_references,
    predictions=llama_dataset['llama2_7b_chat_hf_preds'],
)
# Llama 3
llama3_results = evaluate_model(
    references=llama3_references,
    predictions=llama3_dataset['llama3_8B_Instruct_preds'],
)
# Gemma
gemma_results = evaluate_model(
    references=gemma_dataset_references,
    predictions=gemma_dataset['gemma_preds'],
)

In [None]:
# Show the full metric dictionary computed for the Gemma predictions.
print(gemma_results)

In [None]:
# Inspect the Gemma metric dictionary (the Zephyr variant is left commented out).
# print(zephyr_results)
print(gemma_results)

In [None]:
import json

# Destination JSON files, one per model, for the metric dictionaries that are
# written at the end of this cell.
file_path_zephyr = '/pfs/data5/home/st/st_us-051500/st_st180358/zephyr_training/zephyr_evaluation_results_24052024.json'
file_path_llama = '/pfs/data5/home/st/st_us-051500/st_st180358/llama_training/llama_evaluation_results_24052024.json'
file_path_llama3 = '/pfs/data5/home/st/st_us-051500/st_st180358/llama3_training/llama3_evaluation_results_24052024.json'
file_path_gemma = '/pfs/data5/home/st/st_us-051500/st_st180358/gemma_training/gemma_evaluation_results_24052024.json'
# print(type(zephyr_results))

def convert_float32_to_float(d):
    """Recursively replace NumPy values with built-in Python values for JSON export.

    ``json.dump`` cannot serialize NumPy scalars such as ``np.float32`` or
    ``np.int64`` or NumPy arrays, so this walks ``d`` and converts them.

    Args:
        d: A dict, list, tuple, NumPy array, NumPy scalar, or any other value.

    Returns:
        The converted value. Dicts and lists are updated in place and the same
        object is returned; tuples and arrays come back as new tuples/lists;
        NumPy floating, integer and boolean scalars become ``float``, ``int``
        and ``bool``; everything else is returned unchanged.
    """
    if isinstance(d, dict):
        # Only existing keys are reassigned, so mutating while iterating is safe.
        for key, value in d.items():
            d[key] = convert_float32_to_float(value)
        return d
    if isinstance(d, list):
        for i, item in enumerate(d):
            d[i] = convert_float32_to_float(item)
        return d
    if isinstance(d, tuple):
        return tuple(convert_float32_to_float(item) for item in d)
    if isinstance(d, np.ndarray):
        # tolist() yields nested Python containers; recurse in case of object arrays.
        return convert_float32_to_float(d.tolist())
    if isinstance(d, np.floating):
        return float(d)
    if isinstance(d, np.integer):
        return int(d)
    if isinstance(d, np.bool_):
        return bool(d)
    return d

# Turn NumPy float32 values inside every result dictionary into plain floats
# so that json.dump can serialize them.
llama_results = convert_float32_to_float(llama_results)
zephyr_results = convert_float32_to_float(zephyr_results)
llama3_results = convert_float32_to_float(llama3_results)
gemma_results = convert_float32_to_float(gemma_results)

# Save the results to disk: one indented JSON file per model.
for _target_path, _model_results in (
    (file_path_zephyr, zephyr_results),
    (file_path_gemma, gemma_results),
    (file_path_llama, llama_results),
    (file_path_llama3, llama3_results),
):
    with open(_target_path, 'w') as json_file:
        json.dump(_model_results, json_file, indent=4)


In [None]:
# Print the Gemma results once more, after the float conversion and JSON export above.
print(gemma_results)

In [None]:
# Generate two formatted columns (per-sample FRUGAL scores and BERT recall) for
# the Zephyr predictions, to be used for the SPIDER chart in the paper.
# The values come from zephyr_results / zephyr_dataset computed earlier in this
# notebook, so the cell also runs on a fresh kernel.

zephyr_frugal_score = zephyr_results['frugal_score']['scores']
zephyr_bert_score = zephyr_results['bert_score']['recall']
dataset_for_spider_chart = zephyr_dataset.add_column('zephyr_frugal_score', zephyr_frugal_score)
dataset_for_spider_chart = dataset_for_spider_chart.add_column('zephyr_bert_score', zephyr_bert_score)
dataset_for_spider_chart.save_to_disk('/home/st/st_us-051500/st_st180358/zephyr_training/my_zepyra_result_for_spider_chart_17040204')

In [None]:
# Print corpus-level averages of the per-sample BERT and FRUGAL scores.
# NOTE(review): reads zephyr_results, assuming the Zephyr model was intended
# (the spider-chart columns are Zephyr-specific) — confirm, or swap in
# llama_results / llama3_results / gemma_results to summarize another model.

# The last key of the BERT score dictionary is skipped; presumably it is the
# non-numeric 'hashcode' entry — verify against the metric's output.
for metric in list(zephyr_results['bert_score'].keys())[:-1]:
    pairwise_metric = zephyr_results['bert_score'][metric]
    averaged_metric = np.mean(pairwise_metric)
    print(f'\033[1m {metric}:\033[0m \t', averaged_metric)
print('.' * 150)

pairwise_frugal_score = zephyr_results['frugal_score']['scores']
averaged_frugal_score = np.mean(pairwise_frugal_score)
print('\033[1m FRUGAL Score:\033[0m \t', averaged_frugal_score)
print('.' * 150)