# Metrics for different CEFR levels

Compute the following metrics for each CEFR level (A, B, C):
- BERTScore
  - Precision
  - Recall
  - F1
- Edit distance (normalized by input length)
- GLEU
- BLEU
- ROUGE

## Download required modules

In [51]:
!pip install editdistance pandas evaluate absl-py nltk rouge_score



## Import required packages

In [None]:
import json
import editdistance
import pandas as pd
from evaluate import load as eval_load

## Helper Functions

In [None]:
def get_bertscore_by_level(data_file_path, metrics_file_path):
    """Group per-sample BERTScore values by level and average each group.

    Args:
        data_file_path: Path to a JSON file containing a ``'cefr'`` list with
            one entry per sample. Only the first element of each entry is
            used as the grouping key (presumably the letter of a label such
            as ``"A1"`` -- confirm against the data files).
        metrics_file_path: Path to a JSON file whose ``'bertscore'`` entry
            holds ``'precision'``, ``'recall'`` and ``'f1'`` lists indexed in
            the same order as ``'cefr'``.

    Returns:
        A dict mapping each level key to a dict with the raw ``'precision'``,
        ``'recall'`` and ``'f1'`` lists for that level, followed by their
        means under ``'avg_precision'``, ``'avg_recall'`` and ``'avg_f1'``.

    Raises:
        KeyError: If a required key is missing from either file.
        IndexError: If a score list is shorter than the ``'cefr'`` list.
    """
    # Context managers close both files promptly instead of leaking handles.
    with open(data_file_path) as data_file:
        data = json.load(data_file)
    with open(metrics_file_path, "r") as metrics_file:
        metrics = json.load(metrics_file)

    score_names = ('precision', 'recall', 'f1')
    bertscores = metrics['bertscore']
    # Look every score list up front so a missing key fails immediately.
    scores = {name: bertscores[name] for name in score_names}

    bertscores_by_level = {}
    for i, label in enumerate(data['cefr']):
        level = label[0]

        if level not in bertscores_by_level:
            bertscores_by_level[level] = {name: [] for name in score_names}

        for name in score_names:
            bertscores_by_level[level][name].append(scores[name][i])

    # Every group has at least one sample, so the divisions below are safe.
    for per_level in bertscores_by_level.values():
        for name in score_names:
            per_level['avg_' + name] = sum(per_level[name]) / len(per_level[name])

    return bertscores_by_level

In [None]:
def get_edit_dist_by_level(data_file_path):
    """Compute length-normalized edit distances and group them by level.

    Args:
        data_file_path: Path to a JSON file with ``'cefr'``, ``'inputs'`` and
            ``'predictions'`` lists indexed in the same order. Only the first
            element of each ``'cefr'`` entry is used as the grouping key.

    Returns:
        A dict mapping each level key to ``{'ed': [...], 'avg_ed': float}``,
        where every ``'ed'`` value is
        ``editdistance.eval(input, prediction) / len(input)``.

    Raises:
        KeyError: If a required key is missing from the file.
        IndexError: If ``'inputs'`` or ``'predictions'`` is shorter than
            ``'cefr'``.
        ZeroDivisionError: If an input is empty, because the distance is
            normalized by the input length.
    """
    # A context manager closes the file promptly instead of leaking the handle.
    with open(data_file_path) as data_file:
        data = json.load(data_file)

    inputs, predictions = data['inputs'], data['predictions']

    edit_dist_by_level = {}
    for i, label in enumerate(data['cefr']):
        level = label[0]

        if level not in edit_dist_by_level:
            edit_dist_by_level[level] = {'ed': []}

        input_text, prediction = inputs[i], predictions[i]
        # Edit distance normalized by input length
        ed = editdistance.eval(input_text, prediction) / len(input_text)

        edit_dist_by_level[level]['ed'].append(ed)

    # Every group has at least one sample, so the division below is safe.
    for per_level in edit_dist_by_level.values():
        per_level['avg_ed'] = sum(per_level['ed']) / len(per_level['ed'])

    return edit_dist_by_level

In [None]:
def get_gleu_by_level(data_file_path, gleu_file_path):
    """Group precomputed sentence-level GLEU scores by level and average them.

    Args:
        data_file_path: Path to a JSON file containing a ``'cefr'`` list with
            one entry per sample. Only the first element of each entry is
            used as the grouping key.
        gleu_file_path: Path to a header-less CSV file whose first column
            holds one score per sample, in the same order as ``'cefr'``.

    Returns:
        A dict mapping each level key to ``{'gleu': [...], 'avg_gleu': float}``.

    Raises:
        KeyError: If ``'cefr'`` is missing from the data file.
        IndexError: If the CSV has fewer rows than there are ``'cefr'`` entries.
    """
    # A context manager closes the file promptly instead of leaking the handle.
    with open(data_file_path) as data_file:
        data = json.load(data_file)

    gleu_df = pd.read_csv(gleu_file_path, header=None)
    # tolist() yields native Python numbers, so the result is always
    # JSON-serializable regardless of the column's NumPy dtype.
    gleu_scores = gleu_df[0].tolist()

    gleu_by_level = {}
    for i, label in enumerate(data['cefr']):
        level = label[0]

        if level not in gleu_by_level:
            gleu_by_level[level] = {"gleu": []}

        gleu_by_level[level]['gleu'].append(gleu_scores[i])

    # Every group has at least one sample, so the division below is safe.
    for per_level in gleu_by_level.values():
        per_level['avg_gleu'] = sum(per_level['gleu']) / len(per_level['gleu'])

    return gleu_by_level

In [None]:
# Load the "bleu" metric once at module level through evaluate.load;
# get_bleu_by_level reuses this shared instance on every call.
bleu = eval_load("bleu")
def get_bleu_by_level(data_file_path):
    """Compute one corpus-level BLEU score per level.

    Args:
        data_file_path: Path to a JSON file with ``'cefr'``, ``'predictions'``
            and ``'references'`` lists indexed in the same order. Only the
            first element of each ``'cefr'`` entry is used as the grouping key.

    Returns:
        A dict mapping each level key to the ``"bleu"`` value returned by the
        module-level ``bleu`` metric for that level's samples.

    Raises:
        KeyError: If a required key is missing from the file.
        IndexError: If ``'predictions'`` or ``'references'`` is shorter than
            ``'cefr'``.
    """
    # A context manager closes the file promptly instead of leaking the handle.
    with open(data_file_path) as data_file:
        data = json.load(data_file)

    predictions, references = data['predictions'], data['references']

    pred_and_ref = {}
    for i, label in enumerate(data['cefr']):
        level = label[0]

        if level not in pred_and_ref:
            pred_and_ref[level] = {"pred": [], "ref": []}

        pred_and_ref[level]['pred'].append(predictions[i])
        # BLEU expects a list of nested references
        pred_and_ref[level]['ref'].append([references[i]])

    return {
        level: bleu.compute(
            predictions=grouped['pred'],
            references=grouped['ref'],
        )["bleu"]
        for level, grouped in pred_and_ref.items()
    }

In [None]:
# Load the "rouge" metric once at module level through evaluate.load;
# get_rouge_by_level reuses this shared instance on every call.
rouge = eval_load("rouge")
def get_rouge_by_level(data_file_path):
    """Compute the ROUGE scores per level.

    Args:
        data_file_path: Path to a JSON file with ``'cefr'``, ``'predictions'``
            and ``'references'`` lists indexed in the same order. Only the
            first element of each ``'cefr'`` entry is used as the grouping key.

    Returns:
        A dict mapping each level key to the full result returned by the
        module-level ``rouge`` metric for that level's samples.

    Raises:
        KeyError: If a required key is missing from the file.
        IndexError: If ``'predictions'`` or ``'references'`` is shorter than
            ``'cefr'``.
    """
    # A context manager closes the file promptly instead of leaking the handle.
    with open(data_file_path) as data_file:
        data = json.load(data_file)

    predictions, references = data['predictions'], data['references']

    pred_and_ref = {}
    for i, label in enumerate(data['cefr']):
        level = label[0]

        if level not in pred_and_ref:
            pred_and_ref[level] = {"pred": [], "ref": []}

        pred_and_ref[level]['pred'].append(predictions[i])
        pred_and_ref[level]['ref'].append(references[i])

    return {
        level: rouge.compute(
            predictions=grouped['pred'],
            references=grouped['ref'],
        )
        for level, grouped in pred_and_ref.items()
    }

In [None]:
def get_num_samples(data_file_path):
    """Count how many samples fall into each level.

    Args:
        data_file_path: Path to a JSON file containing a ``'cefr'`` list with
            one entry per sample. Only the first element of each entry is
            used as the grouping key.

    Returns:
        A dict mapping each level key to its sample count, in order of first
        appearance.

    Raises:
        KeyError: If ``'cefr'`` is missing from the file.
    """
    # A context manager closes the file promptly instead of leaking the handle.
    with open(data_file_path) as data_file:
        data = json.load(data_file)

    num_samples = {}
    for label in data['cefr']:
        level = label[0]
        num_samples[level] = num_samples.get(level, 0) + 1

    return num_samples

In [None]:
def get_all_metrics_by_level(data_file_path, metrics_file_path, gleu_file_path):
    """Gather every per-level metric for one run into a single dict.

    Args:
        data_file_path: Path to the JSON data file passed to every helper.
        metrics_file_path: Path to the JSON metrics file read by
            ``get_bertscore_by_level``.
        gleu_file_path: Path to the CSV file read by ``get_gleu_by_level``.

    Returns:
        A dict with the keys ``'bertscore'``, ``'edit_distance'``, ``'gleu'``,
        ``'bleu'``, ``'rouge'`` and ``'num_samples'``, each holding the result
        of the corresponding helper.
    """
    # Dict values are evaluated top to bottom, so the helpers still run in
    # the same order as before.
    return {
        'bertscore': get_bertscore_by_level(data_file_path, metrics_file_path),
        'edit_distance': get_edit_dist_by_level(data_file_path),
        'gleu': get_gleu_by_level(data_file_path, gleu_file_path),
        'bleu': get_bleu_by_level(data_file_path),
        'rouge': get_rouge_by_level(data_file_path),
        'num_samples': get_num_samples(data_file_path),
    }

## Metrics by levels

### Phi-2

In [None]:
# Compute the per-level metrics for the Phi-2 run and save them as JSON.
base_phi2_metrics_by_level = get_all_metrics_by_level(
    "./finetuned/base_phi2_data.json",
    "./finetuned/base_phi2_metrics.txt",
    "./finetuned/base_phi2_data_sentence_level_gleu.csv"
)
# The context manager guarantees the output is flushed and the file closed.
with open("./finetuned/base_phi2_metrics_by_level.json", "w") as out_file:
    json.dump(base_phi2_metrics_by_level, out_file)

### StableLM without DPO

In [None]:
# Compute the per-level metrics for the StableLM run without DPO and save
# them as JSON.
base_stablelm_metrics_by_level = get_all_metrics_by_level(
    "./finetuned/base_stablelm_data.json",
    "./finetuned/base_stablelm_metrics.txt",
    "./finetuned/base_stablelm_data_sentence_level_gleu.csv"
)
# The context manager guarantees the output is flushed and the file closed.
with open("./finetuned/base_stablelm_metrics_by_level.json", "w") as out_file:
    json.dump(base_stablelm_metrics_by_level, out_file)

### StableLM with DPO

#### DPO with Backtranslation

In [None]:
# Compute the per-level metrics for the StableLM DPO (backtranslation) run
# and save them as JSON.
stablelm_dpo_bt_metrics_by_level = get_all_metrics_by_level(
    "./dpo/stablelm_dpo_backtranslation_data.json",
    "./dpo/stablelm_dpo_backtranslation_metrics.txt",
    "./dpo/stablelm_dpo_backtranslation_data_sentence_level_gleu.csv"
)
# The context manager guarantees the output is flushed and the file closed.
with open("./dpo/stablelm_dpo_bt_metrics_by_level.json", "w") as out_file:
    json.dump(stablelm_dpo_bt_metrics_by_level, out_file)

#### DPO with Ultrafeedback

In [None]:
# Compute the per-level metrics for the StableLM DPO (Ultrafeedback) run
# and save them as JSON.
stablelm_dpo_uf_metrics_by_level = get_all_metrics_by_level(
    "./dpo/stablelm_dpo_data.json",
    "./dpo/stablelm_dpo_metrics.txt",
    "./dpo/stablelm_dpo_data_sentence_level_gleu.csv"
)
# The context manager guarantees the output is flushed and the file closed.
with open("./dpo/stablelm_dpo_uf_metrics_by_level.json", "w") as out_file:
    json.dump(stablelm_dpo_uf_metrics_by_level, out_file)