# Libraries

In [1]:
!pip install --upgrade --quiet rouge-score
!pip install --upgrade --quiet nltk

In [2]:
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
import pandas as pd
from tqdm.auto import tqdm

## ROUGE

We have decided to use unigrams (ROUGE-1), bigrams (ROUGE-2) and the longest common subsequence (ROUGE-L).
We take the F1-score, which combines precision and recall, as the evaluation metric.

In [3]:
# Built once and reused: the scorer's configuration is the same for every
# evaluated pair, so there is no need to construct a new scorer per call.
_ROUGE_TYPES = ['rouge1', 'rouge2', 'rougeL']
_ROUGE_SCORER = rouge_scorer.RougeScorer(_ROUGE_TYPES, use_stemmer=True)


def evaluate_rouge(golden, generated):
    """Compute the ROUGE-1, ROUGE-2 and ROUGE-L F1 scores for one text pair.

    Args:
        golden: Reference text, passed to the scorer as the target.
        generated: Text under evaluation, passed to the scorer as the prediction.

    Returns:
        list: ``[rouge1_f1, rouge2_f1, rougeL_f1]`` -- the ``fmeasure`` of each
        ROUGE variant, in that order.
    """
    scores_dict = _ROUGE_SCORER.score(golden, generated)
    return [scores_dict[rouge_type].fmeasure for rouge_type in _ROUGE_TYPES]

## BLEU

We have decided to use method5 as our smoothing function, based on the results we got when comparing it to the other options and after reading 'A Systematic Comparison of Smoothing Techniques for Sentence-Level BLEU' by Boxing Chen and Colin Cherry.
We've chosen it since it is quite intuitive and performs well for our purposes - emphasizing meaning and recognizing the similarity between phrases even if there are slight variations or shifts in wording.

In [4]:
# Shared NLTK smoothing helper; its method5 is applied to every BLEU score computed below.
chencherry = SmoothingFunction()

def get_bleu_score(ref, candidate):
    """Return the smoothed sentence-level BLEU score of ``candidate`` against ``ref``.

    Both texts are tokenized by splitting on whitespace, and ``ref`` is used
    as the single reference for the candidate.

    Args:
        ref: Reference text.
        candidate: Text under evaluation.

    Returns:
        The value returned by ``sentence_bleu`` with ``chencherry.method5`` smoothing.
    """
    reference_tokens = ref.split()
    candidate_tokens = candidate.split()
    return sentence_bleu(
        [reference_tokens],
        candidate_tokens,
        smoothing_function=chencherry.method5,
    )

## Evaluate generated docstrings

In [5]:

# Load the generated docstrings; the T5 column name is too long, so shorten it to 'T5'.
generated_docstrings_data = pd.read_csv("data_full_docstrings_generated.csv").rename(
    columns={'T5 BaseLine docstring generation': 'T5'}
)

# Each entry is the name of the column holding that model's generated docstrings.
models = ['T5', 'Gemini-1.0-pro', 'GPT-3.5 Turbo', 'Claude-instant-1']

# Registers `progress_apply` on pandas objects so the evaluation shows a progress bar.
tqdm.pandas()

# Create the four metric columns of every model, initially filled with None.
for model in models:
    for metric in ('ROUGE-1 f-score', 'ROUGE-2 f-score', 'ROUGE-L f-score', 'BLEU score'):
        generated_docstrings_data[f'{metric} {model}'] = None

# A function to apply the evaluation functions
def evaluate_row(row, model):
    """Score one model's generated docstring against the row's golden docstring.

    Args:
        row: A row of the docstrings DataFrame. It must contain the
            'Golden Docstring' entry and an entry named after ``model``.
        model: Name of the column holding the generated docstring to score.

    Returns:
        pd.Series: Four values keyed by the metric column names of ``model``
        ('ROUGE-1 f-score', 'ROUGE-2 f-score', 'ROUGE-L f-score', 'BLEU score',
        each followed by the model name).
    """
    def _as_text(value):
        # pandas.read_csv loads empty cells as NaN (a float). get_bleu_score
        # calls .split() on its inputs, which would fail on a float, so a
        # missing text is treated as an empty string instead of aborting the run.
        return "" if pd.isna(value) else str(value)

    golden = _as_text(row['Golden Docstring'])
    generated = _as_text(row[model])
    rouge_scores = evaluate_rouge(golden, generated)
    bleu_score = get_bleu_score(golden, generated)
    return pd.Series({
        f'ROUGE-1 f-score {model}': rouge_scores[0],
        f'ROUGE-2 f-score {model}': rouge_scores[1],
        f'ROUGE-L f-score {model}': rouge_scores[2],
        f'BLEU score {model}': bleu_score
    })

# For every model, evaluate the rows that still lack at least one metric value
# and write the resulting scores back into that model's metric columns.
for model in models:
    metric_columns = [
        f'ROUGE-1 f-score {model}',
        f'ROUGE-2 f-score {model}',
        f'ROUGE-L f-score {model}',
        f'BLEU score {model}',
    ]
    mask = generated_docstrings_data[metric_columns].isna().any(axis=1)
    pending_rows = generated_docstrings_data.loc[mask]
    computed_scores = pending_rows.progress_apply(lambda row: evaluate_row(row, model), axis=1)
    generated_docstrings_data.loc[mask, metric_columns] = computed_scores

# Persist the original data together with the new metric columns.
generated_docstrings_data.to_csv("data_full_docs_gen_eval_metrics.csv", index=False)

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]