In [0]:
pip install -q -U google-generativeai

## Code answers evaluation

In [0]:
import os
import pandas as pd
from consts import DATA_PATH, GEMINI_SIMULATION_DATA_PATH

# Reference solutions: only the question text and its solution are needed.
code_questions_solutions = pd.read_csv(
    os.path.join(DATA_PATH, 'top_code_questions.csv')
).loc[:, ['question', 'solution']]

# Simulated user answers to the code questions.
code_questions_answers = pd.read_csv(
    os.path.join(GEMINI_SIMULATION_DATA_PATH, 'code_questions_answers.csv')
)

# Attach the reference solution to every answer (left join keeps answers whose
# question has no known solution), drop exact duplicate rows, and start with
# an empty 'evaluation' column to be filled in by the evaluation loop.
code_answers_evaluation = (
    pd.merge(
        code_questions_answers.loc[:, ['question', 'answer']],
        code_questions_solutions,
        on='question',
        how='left',
    )
    .drop_duplicates()
    .assign(evaluation='')
)

In [0]:
import google.generativeai as genai
import os
import time
import pandas as pd
from api_keys import API_KEYS
from consts import DATA_PATH

def evaluate_code_answer(question, answer, solution):
    """Ask the Gemini model to score a submitted code answer.

    Builds a grading prompt from the question, the user's code and the
    reference Java solution (when one is available), sends it to the
    module-level ``model`` and returns the stripped response text.

    Args:
        question: The programming question text. NaN or blank skips the call.
        answer: The user's submitted code, interpolated into the prompt as-is.
        solution: Reference Java solution. None, NaN or blank text is reported
            to the model as 'Not provided'.

    Returns:
        The model's evaluation text (the prompt requests an 'x/100' score line
        followed by feedback), or '' when the question is missing or the
        request raises.
    """
    if pd.isna(question) or question.strip() == '':
        return ''

    # A missing solution arrives as NaN (e.g. from a left merge), and NaN is
    # truthy, so a plain truthiness test would put "nan" into the prompt.
    if solution is None or pd.isna(solution) or not str(solution).strip():
        solution_text = 'Not provided'
    else:
        solution_text = solution

    prompt = f"""
        The user has submitted a code solution to a programming question. Your task is to analyze and evaluate the code, considering both correctness and the quality of the approach. Even if the code has syntax errors or does not compile, partial credit should be awarded if the logic or idea is sound. 

        Evaluation Criteria:
        - Correctness – Does the code produce the expected output for all cases? Is it written in the required programming language (if specified)?
        - Functionality – Does the code correctly solve the problem?
        - Efficiency – If there are complexity constraints, is the code optimized in terms of time and space complexity?
        - Readability & Best Practices – Is the code well-structured, using meaningful variable names and following coding standards?
        - Edge Case Handling – Does the solution consider different edge cases, such as empty inputs, extreme values, or invalid data?
        - Logical Soundness (For Non-Compiling Code/Pseudocode) – Even if the code contains syntax errors or is in pseudocode, does it show a clear and correct approach to solving the problem?

        Scoring Guidelines:
        Assign a score between 0 and 100 based on the above criteria. 0 = An entirely incorrect or non-functional solution with no meaningful approach. 100 = A fully correct, efficient (if needed), and well-structured solution. Provide the feedback in the following format: The score in a 'x/100' format, where 'x' is a number between 0 and 100 that represents the score, then a newline, then the feedback message.

        Example Evaluations:
        Example 1: Fully Correct Code.
        Question: Write a Python function that returns the factorial of a number.
        User’s Code: 
            def factorial(n):
                if not isinstance(n, int) or n < 0:
                    raise ValueError('Input must be a non-negative integer')
                result = 1
                for i in range(2, n + 1):
                    result *= i
                return result
        Output: 100/100\nCorrect, handles all cases including invalid input. Efficient (iterative approach avoids recursion depth issues). Readable and follows best practices.

        Now, evaluate the following coding solution:

        Question: {question}
        User's Code: 
        {answer}

        Java Solution: {solution_text}

        If the Java solution is provided, check if the user’s solution is correct by comparing it to the Java solution. Provide feedback based on correctness, efficiency, readability, edge cases, and logical soundness. Assign a score accordingly, giving partial credit for a good idea even if the code does not compile.
    """

    try:
        # `model` is the module-level Gemini client configured by the caller.
        response = model.generate_content(prompt)
        evaluation = response.text.strip()
        print("response:", evaluation)
        return evaluation if evaluation else ''
    except Exception as e:
        # Best effort: a failed request leaves the row unevaluated so the
        # caller can retry it later (for example with another API key).
        print(f"Error: {e}")
        return ''


start_time = time.time()
running_time = 0
max_time = 900  # total time budget for this evaluation run, in seconds

# Only rows with real text in both 'question' and 'answer' can ever be scored.
# NaN compares unequal to '', so a bare `str.strip() != ''` test lets missing
# answers through (they would reach the prompt as "nan"), and
# evaluate_code_answer returns '' for a missing question, which would keep such
# rows pending until the timeout. Neither column changes inside the loop, so the
# mask is computed once.
code_rows_evaluable = (
    code_answers_evaluation['answer'].fillna('').astype(str).str.strip().ne('')
    & code_answers_evaluation['question'].fillna('').astype(str).str.strip().ne('')
)
empty_evaluation_rows = code_answers_evaluation[code_rows_evaluable]

while (running_time < max_time) and (empty_evaluation_rows.shape[0] > 0):
    # Rotate through the API keys; each key handles one batch per pass.
    for api_key in API_KEYS.values():
        # Configure Gemini API
        os.environ['GOOGLE_API_KEY'] = api_key
        genai.configure(api_key=os.environ['GOOGLE_API_KEY'])
        model = genai.GenerativeModel("gemini-2.0-flash-exp")

        # Evaluable rows whose "evaluation" is still empty (never evaluated, or a failed request)
        empty_evaluation_rows = code_answers_evaluation[
            (code_answers_evaluation['evaluation'] == '') & code_rows_evaluable
        ]
        if (empty_evaluation_rows.shape[0] == 0) or (time.time() - start_time > max_time):
            break

        # Get the indices of the first 15 rows with empty "evaluation"
        indices_to_update = empty_evaluation_rows.index[:15]

        # Apply the function with all required columns
        code_answers_evaluation.loc[indices_to_update, 'evaluation'] = code_answers_evaluation.loc[indices_to_update].apply(
            lambda row: evaluate_code_answer(row['question'], row['answer'], row['solution']), axis=1
        )
    running_time = time.time() - start_time

In [0]:
# Persist the code-answer evaluations (row index included) next to the simulation inputs.
code_answers_evaluation.to_csv(
    path_or_buf=os.path.join(GEMINI_SIMULATION_DATA_PATH, 'code_answers_evaluation.csv')
)

## Open answers evaluation

In [0]:
import os
import pandas as pd
from consts import DATA_PATH

# Folder holding the Gemini simulation inputs and outputs.
GEMINI_SIMULATION_DATA_PATH = os.path.join(DATA_PATH, 'gemini_simulation/')
open_questions_answers = pd.read_csv(os.path.join(GEMINI_SIMULATION_DATA_PATH, 'open_questions_answers.csv'))

# Take an explicit copy of the two columns: adding a column to a bare selection
# of another frame triggers SettingWithCopyWarning on pandas without copy-on-write.
open_answers_evaluation = open_questions_answers[['question', 'answer']].copy()

# Empty string marks a row as "not evaluated yet" for the evaluation loop.
open_answers_evaluation['evaluation'] = ''

In [0]:
import google.generativeai as genai
import os
import time
import pandas as pd
from api_keys import API_KEYS
from consts import DATA_PATH

def evaluate_open_answer(question, answer):
    """Ask the Gemini model to score an answer to an open-ended question.

    Args:
        question: The open-ended question text. NaN or blank skips the call.
        answer: The user's answer, interpolated into the prompt as-is.

    Returns:
        The stripped response text from the module-level ``model`` (the prompt
        requests an 'x/100' score line followed by feedback), or '' when the
        question is missing or the request raises.
    """
    question_missing = pd.isna(question) or question.strip() == ''
    if question_missing:
        return ''

    grading_prompt = f"""
    The user has provided an answer to an open-ended question. Your task is to evaluate their response based on the following criteria:
    - Relevance & Completeness – Does the answer directly address the question and cover all key aspects?
    - Clarity & Coherence – Is the response well-structured, logically presented, and easy to understand?
    - Accuracy (for factual questions) or Depth & Justification (for subjective questions). If the question is factual, does the answer provide correct and well-supported information?, If the question is subjective, does the response demonstrate thoughtful reasoning, realistic insights, and a well-supported argument?
    - Balance & Perspective (for subjective questions) – Does the response consider different viewpoints or provide a well-rounded perspective?.
    
    Scoring: Assign a score between 0 and 100 based on the criteria above. 0 means the answer is entirely incorrect, off-topic, or lacks coherence. 100 means the response is fully relevant, well-articulated, and appropriately detailed.
    Provide the feedback in the following format: The score in a 'x/100' format, where 'x' is a number between 0 and 100 that represents the score, then a newline, then the feedback message.
    
    Examples:
    Example 1: Factual Question.
    Question: What are the key benefits of cloud computing?.
    User’s Answer: Cloud computing makes things faster and better.
    Ideal Answer: Cloud computing provides scalability, cost efficiency, remote accessibility, and security improvements. Businesses benefit from reduced infrastructure costs and improved collaboration.
    Feedback:
    50/100
    Strengths: The answer hints at benefits but lacks specifics.
    Improvements: More details on specific advantages like scalability and cost savings would improve clarity.
    
    Example 2: Subjective Question.
    Question: Where do you see yourself in 5 years professionally?.
    User’s Answer: I want to have a good job and be successful.
    Ideal Answer: In five years, I aim to become a project manager in the tech industry, leading cross-functional teams. To prepare, I plan to gain experience in team management, obtain a PMP certification, and refine my leadership skills.
    Feedback:
    60/100
    Strengths: The answer shows ambition.
    Improvements: It lacks specificity regarding career path and steps to achieve success.
    
    Now, evaluate the following response:
    Question:
    {question}

    User's Answer:
    {answer}
    
    Provide constructive feedback and assign a score based on clarity, completeness, reasoning, and relevance.
    """

    try:
        # `model` is the module-level Gemini client configured by the caller.
        reply = model.generate_content(grading_prompt)
        feedback = reply.text.strip()
        print("response:", feedback)
    except Exception as err:
        # Best effort: report the failure and leave the row unevaluated.
        print(f"Error: {err}")
        return ''

    return feedback or ''


start_time = time.time()
running_time = 0
max_time = 900  # total time budget for this evaluation run, in seconds

# Only rows with real text in both 'question' and 'answer' can ever be scored.
# NaN compares unequal to '', so a bare `str.strip() != ''` test lets missing
# answers through (they would reach the prompt as "nan"), and
# evaluate_open_answer returns '' for a missing question, which would keep such
# rows pending until the timeout. Neither column changes inside the loop, so the
# mask is computed once.
open_rows_evaluable = (
    open_answers_evaluation['answer'].fillna('').astype(str).str.strip().ne('')
    & open_answers_evaluation['question'].fillna('').astype(str).str.strip().ne('')
)
empty_evaluation_rows = open_answers_evaluation[open_rows_evaluable]

while (running_time < max_time) and (empty_evaluation_rows.shape[0] > 0):
    # Rotate through the API keys; each key handles one batch per pass.
    for api_key in API_KEYS.values():
        # Configure Gemini API
        os.environ['GOOGLE_API_KEY'] = api_key
        genai.configure(api_key=os.environ['GOOGLE_API_KEY'])
        model = genai.GenerativeModel("gemini-2.0-flash-exp")

        # Evaluable rows whose "evaluation" is still empty (never evaluated, or a failed request)
        empty_evaluation_rows = open_answers_evaluation[
            (open_answers_evaluation['evaluation'] == '') & open_rows_evaluable
        ]
        if (empty_evaluation_rows.shape[0] == 0) or (time.time() - start_time > max_time):
            break

        # Get the indices of the first 15 rows with empty "evaluation"
        indices_to_update = empty_evaluation_rows.index[:15]

        # Apply the function with all required columns
        open_answers_evaluation.loc[indices_to_update, 'evaluation'] = open_answers_evaluation.loc[indices_to_update].apply(
            lambda row: evaluate_open_answer(row['question'], row['answer']), axis=1
        )
    running_time = time.time() - start_time


In [0]:
# Persist the open-answer evaluations (row index included) next to the simulation inputs.
open_answers_evaluation.to_csv(
    path_or_buf=os.path.join(GEMINI_SIMULATION_DATA_PATH, 'open_answers_evaluation.csv')
)