In [10]:
import json
with open('./test.json', 'r') as f:
    test_data = json.load(f)

In [11]:
import random
test_data = random.sample(test_data, 2000)

In [7]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model = AutoModelForSeq2SeqLM.from_pretrained("./results/v5/model")
tokenizer = AutoTokenizer.from_pretrained("./results/v5/tokenizer")

In [8]:
def correct(text):
    inputs = tokenizer(text, return_tensors="pt")
    max_length = 200  # Adjust as needed
    generated = model.generate(inputs["input_ids"], max_length=max_length, num_return_sequences=1)
    corrected_text = tokenizer.decode(generated[0], skip_special_tokens=True)
    return corrected_text

In [12]:
reference = [i['input_text'] for i in test_data]

In [13]:
candidate = [correct(i['input_text']) for i in test_data]

In [15]:
import nltk
from nltk.metrics import edit_distance

In [16]:
def count_linguistic_errors(original, corrected):
    original_tokens = nltk.word_tokenize(original)
    corrected_tokens = nltk.word_tokenize(corrected)
    distance = edit_distance(original_tokens, corrected_tokens)
    return distance

In [17]:
def calculate_ler_for_corpus(original_corpus, corrected_corpus):
    total_errors = 0
    total_tokens = 0

    for original, corrected in zip(original_corpus, corrected_corpus):
        error_count = count_linguistic_errors(original, corrected)
        total_errors += error_count
        total_tokens += len(nltk.word_tokenize(original))

    ler = total_errors / total_tokens
    return ler

In [22]:
corpus_ler = calculate_ler_for_corpus(reference, candidate)

In [36]:
print(f"For v4 : {corpus_ler}")

For v4 : 0.0590315405235841


In [18]:
corpus_ler_v5 = calculate_ler_for_corpus(reference, candidate)

In [19]:
print(f"For v5 : {corpus_ler_v5}")

For v5 : 0.05171591234375497


In [20]:
from nltk.translate.gleu_score import corpus_gleu

In [21]:
corpus_gleu([[i.split()] for i in reference], [i.split() for i in candidate])

0.735216837295136

In [107]:
from nltk.translate.bleu_score import corpus_bleu

In [108]:
corpus_bleu([[i.split()] for i in reference], [i.split() for i in candidate])

0.7388339313827006

In [22]:
def calculate_precision_recall(reference, candidate):
    true_positives = 0
    false_positives = 0
    false_negatives = 0

    for ref_sent, cand_sent in zip(reference, candidate):
        # Convert the lists of tokens to sets for efficient comparison
        ref_set = set(ref_sent)
        cand_set = set(cand_sent)

        # Calculate true positives
        true_positives += len(ref_set.intersection(cand_set))

        # Calculate false positives
        false_positives += len(cand_set - ref_set)

        # Calculate false negatives
        false_negatives += len(ref_set - cand_set)

    precision = true_positives / (true_positives + false_positives) if true_positives + false_positives > 0 else 0.0
    recall = true_positives / (true_positives + false_negatives) if true_positives + false_negatives > 0 else 0.0

    return precision, recall

precision, recall = calculate_precision_recall([i.split() for i in reference], [i.split() for i in candidate])
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")


Precision: 0.8766
Recall: 0.8137


In [23]:
def calculate_f0_5_score(precision, recall, beta=0.5):
    if precision == 0 and recall == 0:
        return 0.0
    return (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)


In [24]:
f0_5_score = calculate_f0_5_score(precision, recall)

In [25]:
print(f"F0.5 Score = {f0_5_score}")

F0.5 Score = 0.863267962575421
