### Libraries

In [2]:
import ast
import os
import re

import pandas as pd
from langchain.callbacks import get_openai_callback
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from tqdm.auto import tqdm

In [3]:
%pip install --upgrade --quiet  langchain-google-genai pillow

!pip install --upgrade --quiet rouge-score
!pip install --upgrade --quiet nltk

Note: you may need to restart the kernel to use updated packages.


### API Keys

In [5]:
# NOTE: this overwrites any keys already exported in the environment.
# "hidden" is presumably a redacted placeholder — substitute real keys before running.
os.environ.update({
    "OPENAI_API_KEY": "hidden",
    "ANTHROPIC_API_KEY": "hidden",
    "GOOGLE_API_KEY": "hidden",
})

In [6]:
# Read the keys back from the environment; each is None when its variable is unset.
OPENAI_API_KEY, ANTHROPIC_API_KEY, GOOGLE_API_KEY = (
    os.environ.get("OPENAI_API_KEY"),
    os.environ.get("ANTHROPIC_API_KEY"),
    os.environ.get("GOOGLE_API_KEY"),
)

### Prompts for evaluation

In [8]:
# System message: sets the judge's role (Python knowledge, Google docstring conventions).
system_prompt = """
You have vast knowledge of python functions and classes.
You know and follow the google docstring conventions.
You can evaluate the quality of a docstring when given a code snippet and a corresponding docstring.
"""
# User message template. The {code} and {generated_docstring} placeholders are filled by the
# dict passed to `chain.invoke` in `evaluate_docstring`. The model is asked to answer with a
# bare Python list of five 0-100 scores, which `evaluate_docstring` then parses.
# NOTE(review): "shoulde" below is a typo; it is left as-is because editing the text changes
# the prompt sent to the model and therefore the reproducibility of recorded scores.
my_prompt = """
Given the following code: {code}
and the following generated docstring: {generated_docstring}
Assess the quality of the generated docstring based on the following criteria:
For each evaluation metric, rate the quality of the generated docstring on a scale of 0 to 100.
The evaluation metrics are:
1. Accuracy - How well does the docstring describe the code?
2. Completeness - Does the docstring contain all the necessary information about the code?
3. Relevance - The ability of the docstring to stick to the point and not include irrelevant information.
4. Understandability - How easy is it for a reader to understand the docstring?
5. Readability - How well is the docstring formatted and structured?
Your response shoulde be only a python list of evaluation metrics [<Accuracy score>, <Completeness score>, <Relevance score>, <Understandability score>, <Readability score>],
do not include any other information.
"""

In [9]:
# Two-message chat prompt: the role description goes in as the system message,
# the scoring instructions (with their placeholders) as the user message.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("user", my_prompt)]
)

# GPT-3.5 Turbo judge chain: prompt -> model -> plain-string output.
llm_gpt_turbo = ChatOpenAI(
    model="gpt-3.5-turbo",
    openai_api_key=OPENAI_API_KEY,
)
output_parser = StrOutputParser()
chain_solution_gpt_turbo = prompt | llm_gpt_turbo | output_parser

  warn_deprecated(


In [10]:
def evaluate_docstring(code, generated_docstring, chain_solution, n_metrics=5):
    """Ask an LLM chain to score a generated docstring and parse the scores.

    Args:
        code: Source code the docstring describes.
        generated_docstring: Docstring to be judged.
        chain_solution: Object exposing ``invoke(dict) -> str``; it receives the
            keys ``"code"`` and ``"generated_docstring"``.
        n_metrics: Number of scores the reply must contain. Pass ``None`` to
            skip the length check.

    Returns:
        list: The numeric scores, in the order the model listed them.

    Raises:
        ValueError: If no list literal can be found in the reply, if it cannot be
            parsed, or if it does not hold exactly ``n_metrics`` numbers.
    """
    evaluation_response = chain_solution.invoke({"code": code, "generated_docstring": generated_docstring})
    if not isinstance(evaluation_response, str):
        raise ValueError("The model's response string is not a valid list representation.")

    # Models frequently wrap the list in a ``` fence or add a short sentence around it,
    # so pull out the first flat [...] span instead of parsing the whole reply.
    list_match = re.search(r"\[[^\[\]]*\]", evaluation_response)
    if list_match is None:
        raise ValueError("The model's response string is not a valid list representation.")

    try:
        scores = ast.literal_eval(list_match.group(0))
    except (ValueError, SyntaxError) as err:
        raise ValueError("The model's response string is not a valid list representation.") from err

    # Reject malformed score lists here, where the raw reply is still at hand, rather than
    # letting the caller fail later on unpacking.
    all_numeric = all(
        isinstance(score, (int, float)) and not isinstance(score, bool) for score in scores
    )
    if not all_numeric or (n_metrics is not None and len(scores) != n_metrics):
        raise ValueError(
            f"Expected a list of {n_metrics} numeric scores, got: {evaluation_response!r}"
        )
    return scores

Single-docstring evaluation with GPT-3.5 Turbo (a sanity check before using GPT-4)

In [None]:
data = pd.read_csv("data_full_docstrings_generated.csv")
# Row 10 serves as the sample: its source code plus the docstring GPT-3.5 Turbo generated for it.
function_10_code, gpt_turbo_docstring_function_10 = data.loc[10, ['Function', 'GPT-3.5 Turbo']]

In [None]:
# Score the sample row's GPT-3.5 Turbo docstring using the GPT-3.5 Turbo judge chain.
generated_evaluation_list = evaluate_docstring(function_10_code, gpt_turbo_docstring_function_10, chain_solution_gpt_turbo)
print(generated_evaluation_list)

[70, 80, 90, 75, 85]


Single-docstring evaluation with Gemini (a sanity check before using GPT-4)

In [None]:
# For Gemini the system and user texts are concatenated into one single-message template.
prompt_for_gemini = ChatPromptTemplate.from_template(system_prompt + my_prompt)

# Docstring that Gemini generated for the sample row.
gemini_docstring_function_10 = data.loc[10, 'Gemini-1.0-pro']

# Gemini judge chain: prompt -> model -> plain-string output.
llm_google = ChatGoogleGenerativeAI(
    model="gemini-1.0-pro",
    google_api_key=GOOGLE_API_KEY,
)
chain_solution_gemini = prompt_for_gemini | llm_google | output_parser

generated_evaluation_list = evaluate_docstring(
    function_10_code,
    gemini_docstring_function_10,
    chain_solution_gemini,
)
print(generated_evaluation_list)

[80, 90, 95, 85, 90]


## ROUGE

We use unigram (ROUGE-1), bigram (ROUGE-2) and longest-common-subsequence (ROUGE-L) overlap.
As the evaluation metric we take the F1-score, which combines precision and recall.

In [11]:
def evaluate_rouge(golden, generated):
    """Score a generated docstring against the golden one with ROUGE.

    Args:
        golden: Reference docstring text.
        generated: Candidate docstring text.

    Returns:
        list: F-measures in the order [ROUGE-1, ROUGE-2, ROUGE-L], computed with stemming enabled.
    """
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
    per_type_scores = scorer.score(golden, generated)
    return [per_type_scores[rouge_type].fmeasure for rouge_type in rouge_types]

## BLEU

We chose method5 as our smoothing function, based on the results we got when comparing it with the other options and after reading 'A Systematic Comparison of Smoothing Techniques for Sentence-Level
BLEU' by Boxing Chen and Colin Cherry.
We chose it because it is quite intuitive and performs well for our purposes: emphasizing meaning and recognizing the similarity between phrases even when there are slight variations or shifts in wording.

In [12]:
# Shared smoothing helper; its `method5` is used for every BLEU computation below.
chencherry = SmoothingFunction()

def get_bleu_score(ref, candidate):
    """Return the smoothed sentence-level BLEU of `candidate` against `ref`.

    Args:
        ref: Reference text; it is the only reference supplied.
        candidate: Candidate text to score.

    Returns:
        The value returned by `sentence_bleu` with `chencherry.method5` smoothing.
    """
    # Both texts are tokenised by plain whitespace splitting.
    reference_tokens = ref.split()
    candidate_tokens = candidate.split()
    return sentence_bleu(
        [reference_tokens],
        candidate_tokens,
        smoothing_function=chencherry.method5,
    )

# Evaluation full pipeline

In [13]:
"""
prompt_for_gemini = ChatPromptTemplate.from_template(
    system_prompt + my_prompt
)

llm_google = ChatGoogleGenerativeAI(google_api_key=GOOGLE_API_KEY, model="gemini-1.0-pro")
chain_solution_gemini = prompt_for_gemini | llm_google | output_parser
"""

# GPT-4 judge chain used for the LLM-based metrics: prompt -> model -> plain-string output.
# NOTE(review): OpenAI documents this model id in lowercase ("gpt-4-1106-preview") —
# confirm the capitalised spelling is accepted by the API.
llm_gpt4 = ChatOpenAI(
    model="gpt-4-1106-Preview",
    openai_api_key=OPENAI_API_KEY,
)
output_parser = StrOutputParser()
chain_gpt4 = prompt | llm_gpt4 | output_parser

# Load the generated docstrings and shorten the overly long T5 column name.
generated_docstrings_data = pd.read_csv("data_full_docstrings_generated.csv").rename(
    columns={'T5 BaseLine docstring generation': 'T5'}
)

# Columns holding each model's generated docstrings.
models = ['T5', 'Gemini-1.0-pro', 'GPT-3.5 Turbo', 'Claude-instant-1']

# Registers `progress_apply` on pandas objects.
tqdm.pandas()

# Create one empty (None) column per metric and per model; they are filled by the evaluation run.
metric_labels = (
    'ROUGE-1 f-score',
    'ROUGE-2 f-score',
    'ROUGE-L f-score',
    'BLEU score',
    'Accuracy',
    'Completeness',
    'Relevance',
    'Understandability',
    'Readability',
)
for model in models:
    for metric_label in metric_labels:
        generated_docstrings_data[f'{metric_label} {model}'] = None

# Evaluation functions
def evaluate_row(row, model):
    """Compute all metrics for one model's docstring in a single row.

    Args:
        row: Row providing 'Golden Docstring', 'Function' and the `model` column.
        model: Name of the column that holds the generated docstring.

    Returns:
        pd.Series: Nine values keyed '<metric> <model>' — the three ROUGE F-scores,
        BLEU, and the five scores returned by the GPT-4 judge chain.
    """
    reference = row['Golden Docstring']
    candidate = row[model]

    # Overlap metrics run first, so a failure there happens before the LLM call is made.
    rouge1_f1, rouge2_f1, rouge_l_f1 = evaluate_rouge(reference, candidate)
    bleu = get_bleu_score(reference, candidate)
    accuracy, completeness, relevance, understandability, readability = evaluate_docstring(
        row['Function'], candidate, chain_gpt4
    )

    metric_names = [
        f'ROUGE-1 f-score {model}',
        f'ROUGE-2 f-score {model}',
        f'ROUGE-L f-score {model}',
        f'BLEU score {model}',
        f'Accuracy {model}',
        f'Completeness {model}',
        f'Relevance {model}',
        f'Understandability {model}',
        f'Readability {model}',
    ]
    metric_values = [
        rouge1_f1,
        rouge2_f1,
        rouge_l_f1,
        bleu,
        accuracy,
        completeness,
        relevance,
        understandability,
        readability,
    ]
    return pd.Series(dict(zip(metric_names, metric_values)))

# For every model, evaluate the rows that still miss at least one of its metrics
# and write the results into that model's metric columns.
for model in models:
    metric_columns = [
        f'{metric_label} {model}'
        for metric_label in (
            'ROUGE-1 f-score',
            'ROUGE-2 f-score',
            'ROUGE-L f-score',
            'BLEU score',
            'Accuracy',
            'Completeness',
            'Relevance',
            'Understandability',
            'Readability',
        )
    ]
    mask = generated_docstrings_data[metric_columns].isna().any(axis=1)
    generated_docstrings_data.loc[mask, metric_columns] = (
        generated_docstrings_data.loc[mask].progress_apply(lambda row: evaluate_row(row, model), axis=1)
    )

# Persist the table only once, after all models have been scored.
generated_docstrings_data.to_csv("data_full_eval_metrics.csv", index=False)

  0%|          | 0/50 [00:00<?, ?it/s]



  0%|          | 0/50 [00:00<?, ?it/s]



  0%|          | 0/50 [00:00<?, ?it/s]



  0%|          | 0/50 [00:00<?, ?it/s]



  Unnamed: 0                                           Function  \
0  Recursion  def sum_even(lst): \r\n    total = 0\r\n    fo...   
1        NaN  def find_num_changes(n, lst): \n    if n == 0:...   
2        NaN  def sum_nested(lst): \n    if len(lst) == 0:\n...   
3        NaN  def str_decomp(target, word_bank): \n    if ta...   
4        NaN  def n_choose_k(n, k): \n    if k < 0 or k > n:...   

                                    Golden Docstring  \
0  Recursively compute the sum of all the element...   
1  Recursively compute the number of ways to repr...   
2  Recursively compute the absolute sum of all th...   
3  Recursively compute the number of ways to deco...   
4  Compute the number of options to choose k samp...   

                                          Unit Tests  \
0  print(sum_even([1, [2, 3, [4, 5]]]) == 7)  \np...   
1  print(find_num_changes(4, [1, 2, 3]) == 4)\npr...   
2  print(sum_nested([1, 2, [3, 4], [5, [6, 7], 8]...   
3  print(str_decomp("abcdef", ["ab",