In [1]:
import os
import csv
from typing import List, Type
from pydantic import BaseModel
from tqdm import tqdm
from openai import OpenAI
import code_bert_score
"""import torch
from transformers import AutoModel, AutoTokenizer
import torch.nn.functional as F"""

'import torch\nfrom transformers import AutoModel, AutoTokenizer\nimport torch.nn.functional as F'

In [2]:
# ------------------------------------------------------------------------------
# Configuration: Adjust these paths as necessary for your environment
# ------------------------------------------------------------------------------
BASE_PATH = "v3/"
PROBLEM_STATEMENTS_PATH = os.path.join(BASE_PATH, "problem_statements_v3")
CORRECT_SOLUTIONS_PATH = os.path.join(BASE_PATH, "correct_solutions_v3")
INCORRECT_SOLUTIONS_PATH = os.path.join(BASE_PATH, "incorrect_solutions_v3")
RUBRICS_PATH = os.path.join(BASE_PATH, "rubrics_v3")
RESULTS_CSV = os.path.join(BASE_PATH, "our_results_new_withhandout_name.csv")
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model = AutoModel.from_pretrained("microsoft/codebert-base")

NameError: name 'AutoTokenizer' is not defined

In [None]:
# ------------------------------------------------------------------------------
# Ensure the environment variable is set for OpenAI
# ------------------------------------------------------------------------------
if "OPENAI_API_KEY" not in os.environ:
    raise EnvironmentError(
        f"Please set your OpenAI API key in the environment variable 'OPENAI_API_KEY'."
    )

In [None]:
# Initialize the OpenAI client
client = OpenAI()

------------------------------------------------------------------------------
Define Pydantic Classes for Each Prompt's Response
------------------------------------------------------------------------------

In [None]:
class Prompt1Response(BaseModel):
    score: int
    feedback: str

In [3]:
class Prompt2Response(BaseModel):
    score: int
    feedback: str

In [4]:
class Prompt3Response(BaseModel):
    score: int
    feedback: str

In [5]:
class Prompt4Response(BaseModel):
    modified_score: int
    reasoning: str

------------------------------------------------------------------------------
Helper Functions
------------------------------------------------------------------------------

In [6]:
def read_file(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        return f.read()

In [7]:
def get_files_in_folder(folder_path):
    return sorted(
        [
            os.path.join(folder_path, fname)
            for fname in os.listdir(folder_path)
            if os.path.isfile(os.path.join(folder_path, fname))
        ]
    )

In [8]:
def filter_matching_solutions(problem_base, solution_files):
    """
    Filters solution files to only include those matching the problem's base name.
    This ensures that only solution files associated with the specific problem are included.
    """
    return [
        sf for sf in solution_files
        if os.path.basename(sf).startswith(problem_base + "-") or os.path.basename(sf).startswith(problem_base + "_solution")
    ]

In [9]:
def evaluate_prompt(prompt: str, response_model: Type[BaseModel]):
    completion = client.beta.chat.completions.parse(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        response_format=response_model
    )
    return completion.choices[0].message.parsed

In [10]:
def save_results_to_csv(rows):
    file_exists = os.path.isfile(RESULTS_CSV)
    with open(RESULTS_CSV, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if not file_exists:
            writer.writerow([
                "Problem",
                "Submitted Code",
                "Score"
            ])
        writer.writerows(rows)

------------------------------------------------------------------------------
Prompt Definitions
------------------------------------------------------------------------------

In [11]:
def prompt1(handout, sample_solution, submitted_code):
    return f"""
Evaluate the student's code based on the problem description and the provided example solution.

Problem Description:
{handout}

Model (Correct) Solution:
{sample_solution}

Student Code:
{submitted_code}

Provide a score (1-10) and feedback in structured JSON format.
"""

In [12]:
def prompt2(handout, sample_solution, submitted_code):
    return f"""
You are an experienced computer science professor who has taught Python programming for over a decade.
Your teaching style emphasizes clarity, functionality, and efficiency in code. You are committed to helping 
students understand programming concepts while fostering their ability to write maintainable and error-free code.

Below is a problem description, an example solution, and a student's code.
Evaluate the student's code, provide detailed functional suggestions to fix the issues, and assign a score (1-10).
The score must be an integer in the range of 1 to 10.

Problem Description:
{handout}

Model (Correct) Solution:
{sample_solution}

Student Code:
{submitted_code}

Provide your response in structured JSON format.
"""

In [13]:
def prompt3(handout, sample_solution, submitted_code, rubric):
    return f"""
You are a computer science professor teaching introductory programming using Python.

Below is a problem description, an example solution, and a student's code. 
Use the rubric provided to evaluate the code and provide a score (1-10) and feedback. 
The score must be an integer in the range of 1 to 10.

Problem Description:
{handout}

Model (Correct) Solution:
{sample_solution}

Student Code:
{submitted_code}

Rubric:
{rubric}

Provide the response in structured JSON format.
"""

In [14]:
def prompt5(handout, sample_solution, submitted_code, rubric, previous_feedback, previous_score):
    return f"""
You are a computer science professor teaching introductory programming using Python.

Below is the original problem description, an example solution, and the student's code. 
You also have the rubric, the previously assigned feedback, and the previous score. 
Reevaluate the student's score based on that feedback and the rubric. 
Decide if the score should remain the same or change. Justify your decision. 
The modified score must be an integer in the range of 1 to 10.

Problem Description:
{handout}

Model (Correct) Solution:
{sample_solution}

Student Code:
{submitted_code}

Rubric:
{rubric}

Previous Feedback:
{previous_feedback}

Previous Score: {previous_score}

Provide the response in structured JSON format.
"""

------------------------------------------------------------------------------
Main Script
------------------------------------------------------------------------------

In [15]:
"""def tokenize_code(code):
    inputs = tokenizer(code, return_tensors='pt', padding=True, truncation=True)
    return inputs"""

"def tokenize_code(code):\n    inputs = tokenizer(code, return_tensors='pt', padding=True, truncation=True)\n    return inputs"

In [16]:
"""def get_code_embeddings(tokenized_code):
    # Get embeddings for the code using CodeBERT model
    with torch.no_grad():
        outputs = model(**tokenized_code)
        embeddings = outputs.last_hidden_state.mean(dim=1)  # Get the mean of token embeddings
    return embeddings"""

'def get_code_embeddings(tokenized_code):\n    # Get embeddings for the code using CodeBERT model\n    with torch.no_grad():\n        outputs = model(**tokenized_code)\n        embeddings = outputs.last_hidden_state.mean(dim=1)  # Get the mean of token embeddings\n    return embeddings'

In [17]:
"""def cosine_similarity(embedding1, embedding2):
    # Compute cosine similarity between two embeddings
    return F.cosine_similarity(embedding1, embedding2)"""

'def cosine_similarity(embedding1, embedding2):\n    # Compute cosine similarity between two embeddings\n    return F.cosine_similarity(embedding1, embedding2)'

In [18]:
def get_expected_score(input_string):
    # Check if input contains "solution"
    if "solution" in input_string:
        return 1
    else:
        # Find the position of "pt" in the input string
        pos = input_string.find("pt")
        if pos != -1 and pos > 0:
            # Get the character that precedes "pt" and convert it to an integer
            preceding_char = input_string[pos - 1]
            if preceding_char.isdigit():
                preceding_int = int(preceding_char)
                return preceding_int / 10
        # If conditions are not met, you can return a default value (e.g., 0)
        return 0

In [21]:
def main():
    problem_files = get_files_in_folder(PROBLEM_STATEMENTS_PATH)
    correct_solution_files = get_files_in_folder(CORRECT_SOLUTIONS_PATH)
    incorrect_solution_files = get_files_in_folder(INCORRECT_SOLUTIONS_PATH)

    for problem_file in tqdm(problem_files, desc="Evaluating problem statements"):
        problem_base = os.path.splitext(os.path.basename(problem_file))[0]
        main_name = problem_base.split("_statement")[0]

        handout = read_file(problem_file)
        rubric_path = os.path.join(RUBRICS_PATH, f"{main_name}_rubric.txt")
        solution_path = os.path.join(CORRECT_SOLUTIONS_PATH, f"{main_name}_solution.txt")

        if not os.path.isfile(rubric_path):
            print(f"Rubric file not found for {main_name}: {rubric_path}")
            continue
        if not os.path.isfile(solution_path):
            print(f"Solution file not found for {main_name}: {solution_path}")
            continue

        rubric = read_file(rubric_path)
        sample_solution = read_file(solution_path)

        # Filter solution files to match only the current problem
        all_solution_files = filter_matching_solutions(
            main_name,
            correct_solution_files + incorrect_solution_files
        )

        if not all_solution_files:
            print(f"No matching solution files found for problem '{main_name}'")
            continue

        for solution_file in tqdm(all_solution_files, desc=f"Evaluating solutions for {main_name}", leave=False):
            # ------------------------------------------------------------------------------
            # handout is the specfic problem statement
            # sample_solution is the model code/ complete correct solution
            # submitted_code is the code sent in for submission
            # place your function calls here
            # all vars will be strings that can be passes as inputs to your function
            # ------------------------------------------------------------------------------
            submitted_code = read_file(solution_file)

            code = [submitted_code]
            reference_code = [sample_solution]
            pred_results = code_bert_score.score(cands = code, refs = reference_code, lang = "python", sources = [handout])
            f3_value = (pred_results[3]).item()
        

            """# Tokenize both the code and the reference
            tokenized_code = tokenize_code(submitted_code)
            tokenized_reference = tokenize_code(sample_solution)

            # Get embeddings for both code and reference code
            code_embedding = get_code_embeddings(tokenized_code)
            reference_embedding = get_code_embeddings(tokenized_reference)

            similarity_score = cosine_similarity(code_embedding, reference_embedding)"""
            
            

            # ------------------------------------------------------------------------------
            # your output values can be added to the row column here in this file
            # ------------------------------------------------------------------------------
            row = [
                problem_base,
                get_expected_score(os.path.basename(solution_file)),
                f3_value
            ]

            save_results_to_csv([row])

------------------------------------------------------------------------------
Entrypoint
------------------------------------------------------------------------------

In [22]:
if __name__ == "__main__":
    main()

Evaluating problem statements:   0%|                                                                                                                                               | 0/10 [00:00<?, ?it/s]
Evaluating solutions for problem10:   0%|                                                                                                                                           | 0/6 [00:00<?, ?it/s][A
Evaluating problem statements:   0%|                                                                                                                                               | 0/10 [00:02<?, ?it/s][A


KeyboardInterrupt: 