In [82]:
# libraries
import pandas as pd
import os
import ast
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain.callbacks import get_openai_callback
from tqdm.auto import tqdm

In [52]:
# Installs/upgrades the packages langchain-google-genai and pillow into the kernel's environment.
# NOTE(review): no versions are pinned, so a later re-run may pull different releases — consider pinning.
%pip install --upgrade --quiet  langchain-google-genai pillow

Note: you may need to restart the kernel to use updated packages.


In [29]:
# Provide the placeholder value only for keys that are not already set.
# A plain os.environ[...] = "hidden" assignment would overwrite real keys
# exported in the shell with the placeholder and break every API call.
# Never paste real keys into this cell; export them in the environment instead.
for key_name in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY", "GOOGLE_API_KEY"):
    os.environ.setdefault(key_name, "hidden")

In [28]:
# Read the three provider keys from the environment in one pass.
# A variable that is not set yields None.
OPENAI_API_KEY, ANTHROPIC_API_KEY, GOOGLE_API_KEY = (
    os.environ.get(variable_name)
    for variable_name in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY", "GOOGLE_API_KEY")
)

In [91]:
# System message text: describes the evaluator persona (Python knowledge,
# PEP 257 conventions, ability to judge a docstring against its code).
system_prompt = """
You have vast knowledge of python coding and libraries.
You know the PEP257 docstring conventions.
You can evaluate the quality of a docstring when given a code snippet and a corresponding docstring.
"""
# User message template. Placeholders filled in when the chain is invoked:
#   {code}                - the code snippet the docstring belongs to
#   {generated_docstring} - the docstring to be scored
# The model is asked to answer with only a Python list of five 0-100 scores, in the
# order Accuracy, Completeness, Relevance, Understandability, Readability.
# NOTE(review): "shoulde" in the template is a typo for "should"; it is left as-is
# here because editing the text changes what is sent to the model.
my_prompt = """
Given the following code: {code}
and the following generated docstring: {generated_docstring}
Assess the quality of the generated docstring based on the following criteria:
For each evaluation metric, rate the quality of the generated docstring on a scale of 0 to 100.
The evaluation metrics are:
1. Accuracy - How well does the docstring describe the code?
2. Completeness - Does the docstring contain all the necessary information about the code?
3. Relevance - The ability of the docstring to stick to the point and not include irrelevant information.
4. Understandability - How easy is it for a reader to understand the docstring?
5. Readability - How well is the docstring formatted and structured?
Your response shoulde be only a python list of evaluation metrics [<Accuracy score>, <Completeness score>, <Relevance score>, <Understandability score>, <Readability score>],
do not include any other information.
"""

In [92]:
# Shared building blocks: a string output parser and the two-message
# (system + user) prompt used for the OpenAI chat models.
output_parser = StrOutputParser()
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("user", my_prompt)]
)

# GPT-3.5 Turbo evaluator chain: prompt -> model -> parser.
llm_gpt_turbo = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=OPENAI_API_KEY)
chain_solution_gpt_turbo = prompt | llm_gpt_turbo | output_parser

In [93]:
def evaluate_docstring(code, generated_docstring, chain_solution, expected_scores=5):
    """Ask an LLM chain to score a docstring and return the scores as a list.

    Args:
        code: Source code that the docstring describes.
        generated_docstring: Docstring to be evaluated.
        chain_solution: Object exposing ``invoke(dict)`` that returns the
            model's reply as a string.
        expected_scores: Number of scores the reply must contain. Pass
            ``None`` to skip the length check.

    Returns:
        list: The numeric scores parsed from the model's reply.

    Raises:
        ValueError: If the reply does not contain a sequence of numbers, or
            the sequence does not have ``expected_scores`` entries.
    """
    evaluation_response = chain_solution.invoke({"code": code, "generated_docstring": generated_docstring})

    # Chat models frequently wrap the list in a Markdown code fence or a short
    # sentence; parse only the outermost [...] span when one is present.
    reply = evaluation_response.strip()
    start, end = reply.find("["), reply.rfind("]")
    candidate = reply[start:end + 1] if 0 <= start < end else reply

    try:
        scores = ast.literal_eval(candidate)
    except (ValueError, SyntaxError, TypeError) as error:
        raise ValueError("The model's response string is not a valid list representation.") from error

    # literal_eval also accepts dicts, strings, numbers, ...; only a flat
    # sequence of numbers is a usable set of scores.
    if not isinstance(scores, (list, tuple)):
        raise ValueError("The model's response string is not a valid list representation.")
    scores = list(scores)
    if not all(isinstance(score, (int, float)) and not isinstance(score, bool) for score in scores):
        raise ValueError("The model's response list contains non-numeric entries.")
    if expected_scores is not None and len(scores) != expected_scores:
        raise ValueError(
            f"Expected {expected_scores} scores in the model's response, got {len(scores)}."
        )
    return scores

# Evaluating a single docstring (row 10) with Gemini

In [85]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Gemini chain: the system and user texts are concatenated into one template.
prompt_for_gemini = ChatPromptTemplate.from_template(system_prompt + my_prompt)
llm_google = ChatGoogleGenerativeAI(model="gemini-1.0-pro", google_api_key=GOOGLE_API_KEY)
chain_solution_gemini = prompt_for_gemini | llm_google | output_parser

# Row 10 supplies both the function source and the Gemini docstring for it.
data = pd.read_csv("function 10 all 3 models.csv")
function_10_code = data.loc[10, 'Function']
gemini_docstring_function_10 = data.loc[10, 'Gemini']

generated_evaluation_list = evaluate_docstring(
    function_10_code,
    gemini_docstring_function_10,
    chain_solution_gemini,
)

# Store the five scores on row 10 and write the frame to its own CSV.
columns_to_add = [
    'Accuracy Gemini',
    'Completeness Gemini',
    'Relevance Gemini',
    'Understandability Gemini',
    'Readability Gemini',
]
data.loc[10, columns_to_add] = generated_evaluation_list
data.to_csv("gemini evaluation on gemini docstring.csv", index=False)

# Evaluating a single docstring (row 10) with GPT 3.5 Turbo

In [94]:
# Row 10 supplies both the function source and the GPT-3.5 Turbo docstring for it.
data = pd.read_csv("function 10 all 3 models.csv")
function_10_code = data.loc[10, 'Function']
gpt_turbo_docstring_function_10 = data.loc[10, 'GPT-3.5 Turbo']

generated_evaluation_list = evaluate_docstring(
    function_10_code,
    gpt_turbo_docstring_function_10,
    chain_solution_gpt_turbo,
)

# Store the five scores on row 10 and write the frame to its own CSV.
columns_to_add = [
    'Accuracy Gpt 3.5 Turbo',
    'Completeness Gpt 3.5 Turbo',
    'Relevance Gpt 3.5 Turbo',
    'Understandability Gpt 3.5 Turbo',
    'Readability Gpt 3.5 Turbo',
]
data.loc[10, columns_to_add] = generated_evaluation_list
data.to_csv("gpt 3.5 turbo evaluation on gpt 3.5 turbo docstring.csv", index=False)

[80, 80, 100, 90, 90]


# Docstring evaluation for all models using GPT-4

In [None]:
# Load the data once the docstrings of all models have been generated.
data = pd.read_csv("function 10 all 3 models.csv")

# OpenAI model ids are lowercase ("gpt-4-1106-preview"); the mixed-case
# spelling is not a listed model id.
llm_gpt4 = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model="gpt-4-1106-preview")
chain_solution_gpt4 = prompt | llm_gpt4 | output_parser

# Each entry must be the name of the CSV column holding that model's docstrings.
# NOTE(review): an earlier cell reads the Gemini docstring from a column named
# 'Gemini', not 'Gemini-1.0-pro' — confirm these names against the CSV header.
models = ['T5 Baseline', 'GPT-3.5 Turbo', 'Claude-instant-1', 'Gemini-1.0-pro']
metrics = ['Accuracy', 'Completeness', 'Relevance', 'Understandability', 'Readability']

# Fail before spending any API calls if a docstring column is missing.
missing_docstring_columns = [model for model in models if model not in data.columns]
if missing_docstring_columns:
    raise KeyError(f"Docstring columns not found in the CSV: {missing_docstring_columns}")

tqdm.pandas()

with get_openai_callback() as cb:
    for model in models:
        # Score columns for this model, e.g. 'Accuracy GPT-3.5 Turbo'.
        columns_to_add = [f"{metric} {model}" for metric in metrics]

        # Create the columns only when absent so that scores already present
        # (e.g. from a previous partial run) are kept instead of being wiped.
        for column in columns_to_add:
            if column not in data.columns:
                data[column] = None

        # One boolean per ROW: True when any score of this model is missing.
        # (.isna() alone is a 2-D frame and cannot be used as a .loc row mask.)
        missing_values_mask = data[columns_to_add].isna().any(axis=1)
        print(f"Evaluating docstrings and filling columns {columns_to_add}...")

        if not missing_values_mask.any():
            continue

        def score_row(row, model=model, columns_to_add=columns_to_add):
            """Return the scores for one row as a Series labelled with the target columns."""
            empty_scores = pd.Series([None] * len(columns_to_add), index=columns_to_add)
            try:
                scores = evaluate_docstring(row['Function'], row[model], chain_solution_gpt4)
                if not isinstance(scores, (list, tuple)) or len(scores) != len(columns_to_add):
                    raise ValueError(f"expected {len(columns_to_add)} scores, got {scores!r}")
            except ValueError as error:
                # Keep the run alive; the row stays empty and is retried on the next run.
                print(f"Row {row.name}: no usable scores for {model} ({error}); left empty.")
                return empty_scores
            # Labelling with the target column names is required: .loc assignment
            # aligns on labels, and default 0..4 labels would write only NaN.
            return pd.Series(list(scores), index=columns_to_add)

        new_scores = data.loc[missing_values_mask].progress_apply(score_row, axis=1)
        data.loc[missing_values_mask, columns_to_add] = new_scores

        # Show the rows that were just filled.
        print(data.loc[missing_values_mask, columns_to_add])

print(cb)