In [None]:
import os
from dotenv import load_dotenv
import google.generativeai as genai
import sys
sys.path.append(os.path.abspath("src"))
from chat import process_user_input
from IPython.display import Markdown, display
load_dotenv()
GEMINI_API_KEY = os.getenv("API_KEY")
genai.configure(api_key=GEMINI_API_KEY)


  from .autonotebook import tqdm as notebook_tqdm


In [17]:
def evaluate_answer_with_gemini(question, ideal_answer, bot_answer):
    prompt = f"""
You are a strict and detailed evaluator of machine learning answers.

You will be given:
- A question,
- The ideal answer from the official book,
- A user's answer generated by a bot.

Your task is to evaluate how relevant and complete the user's answer is compared to the ideal answer, focusing only on correctness and relevance.

Give a score from 0 to 10 based ONLY on:

- How well the user's answer covers the key points from the ideal answer.
- Whether the answer is factually correct.
- Do NOT consider style, grammar, or creativity.

Scoring guidelines:

- 9–10: The answer covers almost all key points accurately.
- 7–8: The answer is mostly correct but misses some less critical details.
- 4–6: The answer misses major points or has some factual errors.
- 1–3: The answer is mostly incorrect or incomplete.
- 0: The answer is completely wrong or unrelated.

In your reasoning, explicitly:

- List any key points from the ideal answer that are missing or under-explained.
- Explain why you gave the score you did, mentioning any errors or omissions.
- Mention if the answer is too brief and missing important details.

Format your response exactly like this:

Score: X/10  
Reasoning: [detailed explanation mentioning missing or incorrect points]

Question:  
{question}

Ideal Answer:  
{ideal_answer}

User Answer:  
{bot_answer}

"""

    model = genai.GenerativeModel("gemini-2.0-flash")
    response = model.generate_content(prompt)
    return response.text.strip()


In [31]:
def display_answer_and_evaluation(bot_answer, evaluation):
    print("Bot Answer:")
    display(Markdown(bot_answer))
    print("Evaluation:")
    display(Markdown(evaluation))

In [32]:
question_1 = "What is the difference between overfitting and underfitting?"
ideal_answer_1 = """
Overfitting happens when a model performs well on the training data but generalizes poorly to new, unseen data. 
It learns not just the underlying patterns but also the noise in the training set. 
Underfitting occurs when a model is too simple to capture the underlying structure of the data, 
resulting in poor performance on both the training and validation sets.
"""

bot_answer_1 = process_user_input(question_1)
evaluation_1 = evaluate_answer_with_gemini(question_1, ideal_answer_1, bot_answer_1)


display_answer_and_evaluation(bot_answer_1, evaluation_1)


Bot Answer:


Overfitting occurs when your model fits the training data too well, but does not generalize well to new examples. Underfitting is the opposite and occurs when your model is too simple to learn the underlying structure of the data.

Evaluation:


Score: 9/10
Reasoning:The user's answer is almost perfect. It correctly defines both overfitting and underfitting. The only minor missing detail is that underfitting leads to poor performance on both training and validation sets, not just unseen data. Otherwise, it's a very good answer.

In [33]:
question_2 = "Why is feature scaling important in machine learning?"
ideal_answer_2 = """
Feature scaling is important because many machine learning algorithms, such as gradient descent and k-nearest neighbors, 
perform better or converge faster when features are on a similar scale. 
Without scaling, features with larger ranges may dominate those with smaller ranges, leading to suboptimal models.
"""

bot_answer_2 = process_user_input(question_2)
evaluation_2 = evaluate_answer_with_gemini(question_2, bot_answer_2, ideal_answer_2)

display_answer_and_evaluation(bot_answer_2, evaluation_2)


Bot Answer:


Machine learning algorithms don’t perform well when the input numerical attributes have very different scales. Without any scaling, most models will be biased.

Evaluation:


Score: 9/10
Reasoning: The user's answer covers the key point that feature scaling is important because algorithms perform better with similar scales. It also mentions specific algorithms like gradient descent and k-NN which is a bonus. The answer is only very slightly more detailed and contains no incorrect information.

In [34]:
question_3 = "What is the purpose of using a validation set?"
ideal_answer_3 = """
A validation set is used to evaluate a model’s performance during training to tune hyperparameters and select the best model. 
It helps estimate how well the model is likely to perform on unseen data, 
and is separate from the training and test sets to avoid information leakage.
"""

bot_answer_3 = process_user_input(question_3)
evaluation_3 = evaluate_answer_with_gemini(question_3, bot_answer_3, ideal_answer_3)

display_answer_and_evaluation(bot_answer_3, evaluation_3)

Bot Answer:


The purpose of using a validation set is to evaluate several candidate models and select the best one.

Evaluation:


Score: 9/10
Reasoning:The user answer is highly relevant and accurate. It correctly identifies the purpose of a validation set in model selection and hyperparameter tuning, and also mentions its role in estimating performance on unseen data while avoiding information leakage. The response provides more context than the ideal answer, but could have been shorter and more direct. However, all key points are covered.

In [47]:
question_4 = "What is the bias-variance tradeoff in machine learning?"
ideal_answer_4 = """
The bias-variance tradeoff describes the balance between two sources of error that affect the generalization ability of a model.

- Bias is error due to overly simplistic assumptions in the model, causing it to miss relevant relations (underfitting).
- Variance is error from sensitivity to small fluctuations in the training set, causing the model to capture noise (overfitting).

A good model balances bias and variance to minimize total generalization error.
"""

bot_answer_4 = process_user_input(question_4)
evaluation_4 = evaluate_answer_with_gemini(question_4, bot_answer_4, ideal_answer_4)

display_answer_and_evaluation(bot_answer_4, evaluation_4)

Bot Answer:


The bias-variance trade-off is the balance between a model's complexity and its ability to generalize. Increasing a model’s complexity will typically increase its variance and reduce its bias. Conversely, reducing a model’s complexity increases its bias and reduces its variance. A high-bias model is most likely to underfit the training data, while a model with high variance is likely to overfit the training data.

Evaluation:


Score: 9/10
Reasoning:The user's answer is nearly complete and correct. The ideal answer mentions that "Increasing a model’s complexity will typically increase its variance and reduce its bias. Conversely, reducing a model’s complexity increases its bias and reduces its variance." While the user's answer does not explicitly state this relationship between model complexity and bias/variance, the core concepts are well explained.

In [48]:
question_5 = "Explain the difference between batch gradient descent and stochastic gradient descent."
ideal_answer_5 = """
Batch gradient descent computes the gradient of the cost function using the entire training dataset before each update, which can be slow but provides a stable gradient estimate.

Stochastic gradient descent (SGD) updates the model parameters using the gradient computed from a single randomly chosen training example at each iteration, which makes updates noisier but often allows faster convergence and can escape local minima.

Mini-batch gradient descent is a compromise, using small batches of data to balance efficiency and stability.
"""

bot_answer_5 = process_user_input(question_5)
evaluation_5 = evaluate_answer_with_gemini(question_5, bot_answer_5, ideal_answer_5)

display_answer_and_evaluation(bot_answer_5, evaluation_5)

Bot Answer:


Batch gradient descent computes the gradients of the cost function using the whole training set at every step, making it slow for large training sets. Stochastic gradient descent, on the other hand, picks a random instance in the training set at every step and computes the gradients based only on that single instance, making it much faster per iteration and allowing it to train on huge datasets. However, stochastic gradient descent is much less regular, with the cost function bouncing up and down, and it may not settle at the minimum, while batch gradient descent gently decreases until it reaches the minimum. Stochastic gradient descent has a better chance of escaping local minima.

Evaluation:


Score: 8/10
Reasoning: The user's answer is mostly correct and covers the key differences between Batch and Stochastic Gradient Descent. It accurately describes how each method computes the gradient and the trade-offs in speed and stability. However, the user's answer is missing the point that batch gradient descent is slow for *large* training sets. The ideal answer also notes that stochastic gradient descent "may not settle at the minimum," which the user's answer did not address. The user answer also included information on mini-batch gradient descent which was not asked for.

In [37]:
question_6 = "What are the advantages of using the ReLU activation function in neural networks?"
ideal_answer_6 = """
The Rectified Linear Unit (ReLU) activation function is popular because:

- It is computationally efficient (simple thresholding at zero).
- It helps mitigate the vanishing gradient problem, allowing deeper networks to train faster.
- It introduces non-linearity, enabling the network to learn complex functions.
- It tends to produce sparse activations, which can improve model efficiency.

However, ReLU can suffer from dying neurons when inputs become negative.
"""

bot_answer_6 = process_user_input(question_6)
evaluation_6 = evaluate_answer_with_gemini(question_6, bot_answer_6, ideal_answer_6)

display_answer_and_evaluation(bot_answer_6, evaluation_6)

Bot Answer:


The ReLU activation function has the advantage of being fast to compute and does not saturate for positive values. Also, the fact that it does not have a maximum output value helps reduce some issues during gradient descent.

Evaluation:


Score: 9/10
Reasoning: The user answer is very good and almost completely covers the ideal answer. The ideal answer mentions "fast to compute" and "does not saturate for positive values", which the user answer accurately captures with "computationally efficient" and "helps mitigate the vanishing gradient problem". The ideal answer also mentions "does not have a maximum output value helps reduce some issues during gradient descent", which is partially captured by "helps mitigate the vanishing gradient problem". The user answer also provides additional relevant advantages, such as "introduces non-linearity", "sparse activations", and potential problems like "dying neurons". The answer is comprehensive and factually correct.

In [38]:
question_7 = "What is early stopping in training neural networks and why is it useful?"
ideal_answer_7 = """
Early stopping is a regularization technique where training is stopped as soon as the validation error starts to increase, even if the training error is still decreasing. 
This prevents the model from overfitting to the training data by stopping training before the model starts to memorize noise or irrelevant patterns.
"""

bot_answer_7 = process_user_input(question_7)
evaluation_7 = evaluate_answer_with_gemini(question_7, bot_answer_7, ideal_answer_7)
display_answer_and_evaluation(bot_answer_7, evaluation_7)

Bot Answer:


Early stopping is one of the best regularization techniques and can prevent overfitting when training deep neural networks.

Evaluation:


Score: 9/10
Reasoning:The user answer is almost perfect. It accurately explains what early stopping is and why it's useful for preventing overfitting. The answer is slightly brief but contains the key information.

In [39]:
question_8 = "How does the dropout technique help prevent overfitting in neural networks?"
ideal_answer_8 = """
Dropout is a regularization technique that randomly sets a fraction of neuron outputs to zero during training. 
This prevents neurons from co-adapting too much and forces the network to learn more robust features that generalize better.
"""

bot_answer_8 = process_user_input(question_8)
evaluation_8 = evaluate_answer_with_gemini(question_8, bot_answer_8, ideal_answer_8)
display_answer_and_evaluation(bot_answer_8, evaluation_8)

Bot Answer:


Neurons trained with dropout cannot co-adapt with their neighboring neurons; they have to be as useful as possible on their own. They also cannot rely excessively on just a few input neurons; they must pay attention to each of their input neurons. They end up being less sensitive to slight changes in the inputs, resulting in a more robust network that generalizes better.

Evaluation:


Score: 8/10
Reasoning: The user's answer captures the core idea of dropout preventing co-adaptation and promoting more robust features. However, it misses the point about neurons not being able to rely excessively on just a few input neurons and the resulting insensitivity to slight input changes, which are crucial for understanding dropout's generalization benefits. The answer is a bit brief but correct.

In [40]:
question_9 = "What is the difference between batch gradient descent and stochastic gradient descent?"
ideal_answer_9 = """
Batch gradient descent computes the gradient of the cost function using the entire training dataset, which can be slow for large datasets. 
Stochastic gradient descent (SGD) updates the model parameters using the gradient from a single training example, making updates noisier but faster and allowing the model to potentially escape local minima.
"""

bot_answer_9 = process_user_input(question_9)
evaluation_9 = evaluate_answer_with_gemini(question_9, bot_answer_9, ideal_answer_9)
display_answer_and_evaluation(bot_answer_9, evaluation_9)

Bot Answer:


The main problem with batch gradient descent is that it uses the whole training set to compute the gradients at every step, making it very slow when the training set is large. Stochastic gradient descent picks a random instance in the training set at every step and computes the gradients based only on that single instance, making it much faster. However, stochastic gradient descent is much less regular than batch gradient descent, and the cost function will bounce up and down, decreasing only on average.

Evaluation:


Score: 8/10
Reasoning: The user's answer is mostly correct and covers the main points. However, it doesn't mention that stochastic gradient descent is much less regular than batch gradient descent, which is an important detail. The answer also does not explicitly state that the cost function in SGD decreases only on average. While it does mention the noisy updates, it does not directly relate this to the average decrease.

In [42]:
question_10 = "What role does the learning rate play in training a neural network?"
ideal_answer_10 = """
The learning rate controls the size of the steps taken during gradient descent when updating model parameters. 
A learning rate that is too high can cause the model to overshoot minima and fail to converge, while a learning rate that is too low can make training very slow and get stuck in local minima. 
Choosing an appropriate learning rate is crucial for efficient and effective training.
"""

bot_answer_10 = process_user_input(question_10)
evaluation_10 = evaluate_answer_with_gemini(question_10, bot_answer_10, ideal_answer_10)
display_answer_and_evaluation(bot_answer_10, evaluation_10)


Bot Answer:


Finding a good learning rate is very important. If you set it much too high, training may diverge. If you set it too low, training will eventually converge to the optimum, but it will take a very long time. If you set it slightly too high, it will make progress very quickly at first, but it will end up dancing around the optimum and never really settling down. The learning rate is arguably the most important hyperparameter. In general, the optimal learning rate is about half of the maximum learning rate (i.e., the learning rate above which the training algorithm diverges.

Evaluation:


Score: 8/10
Reasoning: The user's answer accurately describes the role of the learning rate, mentioning the consequences of setting it too high (overshooting, failure to converge) and too low (slow training, getting stuck in local minima). However, it misses the nuance about "dancing around the optimum" when slightly too high and does not mention that the optimal learning rate is about half of the maximum learning rate. Also, the statement that the learning rate is arguably the most important hyperparameter is not present.

In [50]:
import re

def extract_score(evaluation_text):
    # Use regex to find "Score: X/10"
    match = re.search(r"Score:\s*(\d+)/10", evaluation_text)
    if match:
        return int(match.group(1))
    else:
        return None  # Or raise an error

def average_scores(evaluation_texts):
    scores = [extract_score(text) for text in evaluation_texts]
    # Filter out any None values just in case
    scores = [s for s in scores if s is not None]
    if not scores:
        return None
    return sum(scores) / len(scores)


evaluations = [evaluation_1, evaluation_2, evaluation_3, evaluation_4, evaluation_5,
               evaluation_6, evaluation_7, evaluation_8, evaluation_9, evaluation_10]

avg_score = average_scores(evaluations)
print(f"Average Score: {avg_score:.2f}/10")


if avg_score >= 8.5:
    print("Final Opinion: Excellent performance! The chatbot consistently provides highly accurate and relevant answers.")
elif avg_score >= 7:
    print("Final Opinion: Good performance. The chatbot generally gives accurate answers but there’s room for some improvement.")
elif avg_score >= 5.5:
    print("Final Opinion: Moderate performance. The chatbot answers are somewhat useful but sometimes lack key details or relevance.")
else:
    print("Final Opinion: Poor performance. The chatbot’s responses often miss the mark and require significant improvement.")



Average Score: 8.60/10
Final Opinion: Excellent performance! The chatbot consistently provides highly accurate and relevant answers.
