In [None]:
!pip install nltk rouge_score sacrebleu

In [2]:
import json
import csv
from difflib import SequenceMatcher

# Conversation transcript (12_conversation.json), parsed from JSON.
with open('./USERS/12_conversation.json', 'r', encoding='utf-8') as file:
    user_history = json.loads(file.read())

# Reference FAQ table: one dict per CSV row, keyed by the header names.
with open('faq_data.csv', mode='r', encoding='utf-8') as file:
    faq_data = list(csv.DictReader(file))

# Score how alike two strings are (used for the accuracy check)
def similarity(str1, str2):
    """Return difflib's similarity ratio of the two strings.

    The ratio is a float in [0.0, 1.0]; identical strings give 1.0.
    """
    matcher = SequenceMatcher(isjunk=None, a=str1, b=str2)
    return matcher.ratio()

# Function to evaluate the chatbot's response
def evaluate_response(user_question, bot_response):
    """Score one bot reply against the FAQ answer for the user's question.

    The reference answer comes from the first ``faq_data`` entry whose
    question text (trimmed, case-insensitive) occurs inside
    ``user_question``.

    Args:
        user_question: Text the user sent.
        bot_response: Text the bot replied with.

    Returns:
        A dict with the keys "accuracy", "relevance" and "satisfaction".
        Accuracy is the ``similarity`` ratio scaled to 0-5 and rounded to
        two decimals; relevance and satisfaction are each 5 or 3.  When no
        non-empty reference answer is found, all three scores are 1.
    """
    question_text = user_question.lower()

    predefined_answer = None
    for entry in faq_data:
        # csv.DictReader yields None for fields missing from a short row.
        faq_question = (entry['question'] or '').strip().lower()
        # An empty string is a substring of every string, so a blank FAQ
        # question must be skipped or it would match every user question.
        if faq_question and faq_question in question_text:
            predefined_answer = entry['answer']
            break

    if not predefined_answer:
        # No usable reference answer: report the lowest score on every axis.
        return {
            "accuracy": 1,
            "relevance": 1,
            "satisfaction": 1
        }

    # 1. Accuracy: similarity to the reference, scaled to 5 for easier reading
    accuracy_score = round(similarity(predefined_answer, bot_response) * 5, 2)

    # 2. Relevance: full marks only when the reply contains the reference
    #    answer verbatim (ignoring case); otherwise a middling score.
    if predefined_answer.lower() in bot_response.lower():
        relevance_score = 5
    else:
        relevance_score = 3

    # 3. User Satisfaction: simple heuristic that treats a reply of more
    #    than three lines as clearer than a brief one.
    if len(bot_response.split('\n')) > 3:
        satisfaction_score = 5
    else:
        satisfaction_score = 3

    return {
        "accuracy": accuracy_score,
        "relevance": relevance_score,
        "satisfaction": satisfaction_score
    }

# Evaluate the chatbot's performance
def evaluate_chatbot_performance(user_history):
    """Evaluate every (question, response) pair in a conversation.

    Messages are paired purely by position: each even index is read as the
    user's question and the following odd index as the bot's reply.  A
    trailing message without a partner is ignored.

    Returns:
        A list of dicts with "user_question", "bot_response" and the
        "evaluation" produced by ``evaluate_response``.
    """
    question_messages = user_history[0::2]
    reply_messages = user_history[1::2]

    evaluations = []
    for question_msg, reply_msg in zip(question_messages, reply_messages):
        user_question = question_msg['content']
        bot_response = reply_msg['content']
        evaluations.append({
            "user_question": user_question,
            "bot_response": bot_response,
            "evaluation": evaluate_response(user_question, bot_response),
        })
    return evaluations

# Score the loaded conversation
evaluations = evaluate_chatbot_performance(user_history)

# Report each question/response pair with its three scores
for evaluation in evaluations:
    scores = evaluation['evaluation']
    print(
        f"User Question: {evaluation['user_question']}",
        f"Bot Response: {evaluation['bot_response']}",
        f"Accuracy: {scores['accuracy']} / 5",
        f"Relevance: {scores['relevance']} / 5",
        f"Satisfaction: {scores['satisfaction']} / 5",
        "-" * 50,
        sep="\n",
    )


User Question: What is the return policy for items purchased at our store?
Bot Response: Our store offers a comprehensive return policy designed to make your shopping experience hassle-free:

1. **General Return Policy**: 
   - You can return most items within 30 days of purchase for a full refund or exchange.
   - Items must be in their original condition, with all tags and packaging intact.

2. **Non-returnable Items**:
   - Certain items, such as clearance merchandise, perishable goods, and personal care items, are non-returnable. Please check the product description for specific details.

3. **Refund Process**:
   - Refunds will be issued to the original form of payment.
   - Credit card refunds will be credited to your card, while cash or check payments will receive a cash refund.

If you have any further questions or need assistance with a return, feel free to let me know!
Accuracy: 1.7 / 5
Relevance: 3 / 5
Satisfaction: 5 / 5
--------------------------------------------------
Us

In [13]:
import csv
import json
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
import sacrebleu

# Function to calculate BLEU score
def calculate_bleu(reference, hypothesis):
    """Return the sentence-level BLEU score of ``hypothesis`` vs ``reference``.

    sacrebleu takes the hypothesis first and a list of references second,
    so the single reference is wrapped in a one-item list.
    """
    references = [reference]
    return sacrebleu.sentence_bleu(hypothesis, references).score

# Function to calculate ROUGE score
def calculate_rouge(reference, hypothesis):
    """Return ROUGE-1, ROUGE-2 and ROUGE-L F-measures, rounded to 4 places.

    The result is a dict keyed by 'rouge1', 'rouge2' and 'rougeL'.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)

    f_measures = {}
    for name, score in scorer.score(reference, hypothesis).items():
        f_measures[name] = round(score.fmeasure, 4)
    return f_measures

# Function to calculate METEOR score
def calculate_meteor(reference, hypothesis):
    """Return the METEOR score of ``hypothesis`` against ``reference``.

    Args:
        reference: Reference (expected) answer as a string.
        hypothesis: Candidate (bot) answer as a string.

    Returns:
        The score rounded to 4 decimals, or None when either argument is
        not a string or the NLTK call raises (the problem is printed).
    """
    # Validate inputs
    if not isinstance(reference, str):
        print(f"Invalid reference for METEOR: {reference}")
        return None
    if not isinstance(hypothesis, str):
        print(f"Invalid hypothesis for METEOR: {hypothesis}")
        return None

    try:
        # Current NLTK releases require pre-tokenized input (iterables of
        # tokens) and raise TypeError for raw strings, which previously made
        # this function always return None. Split on whitespace first.
        reference_tokens = reference.split()
        hypothesis_tokens = hypothesis.split()
        return round(meteor_score([reference_tokens], hypothesis_tokens), 4)
    except Exception as e:
        # Best-effort metric: report the failure and let the caller carry on.
        print(f"Error calculating METEOR score: {e}")
        return None

# Load chatbot history from JSON file
def load_chatbot_history(json_file_path):
    """Return the parsed contents of the conversation JSON file.

    The file is read as UTF-8 explicitly so the result does not depend on
    the platform's default encoding.
    """
    with open(json_file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Load predefined answers from CSV file
def load_predefined_answers(csv_file_path):
    """Load the question -> answer mapping from a two-column CSV file.

    The first row is treated as a header and skipped.  Questions and
    answers are stripped of surrounding whitespace.  Blank lines are
    ignored and an empty file yields an empty dict.

    Args:
        csv_file_path: Path to a CSV file whose rows are ``question, answer``.

    Returns:
        A dict mapping each question to its answer.

    Raises:
        ValueError: If a non-blank row does not have exactly two columns.
    """
    predefined_answers = {}
    # newline='' lets the csv module handle line breaks inside quoted fields;
    # the explicit encoding avoids depending on the platform default.
    with open(csv_file_path, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip the header row (tolerates an empty file)
        for row in reader:
            if not row:
                # csv.reader yields an empty list for a blank line.
                continue
            question, answer = row
            predefined_answers[question.strip()] = answer.strip()
    return predefined_answers

# Evaluate chatbot responses
def evaluate_chatbot(chatbot_history, predefined_answers):
    """Score each user question's reply with BLEU, ROUGE and METEOR.

    Every message whose role is 'user' is paired with the message right
    after it, provided that message is not itself a user message.  The reply
    is compared with the predefined answer stored under the exact question
    text.  Progress and scores are printed as they are produced.

    Args:
        chatbot_history: List of message dicts with 'role' and 'content'.
        predefined_answers: Dict mapping question text to reference answer.

    Returns:
        A list with one dict per user message containing "question",
        "bot_response", "predefined_answer", "bleu_score", "rouge_scores"
        and "meteor_score".  The three scores are None when the reference
        answer or the reply is missing, or when scoring raised.
    """
    results = []
    for position, interaction in enumerate(chatbot_history):
        if interaction['role'] != 'user':
            continue

        question = interaction['content']
        predefined_answer = predefined_answers.get(question, "")

        # Pair by the message's own position. list.index() returns the first
        # equal entry, so a repeated question used to be scored against the
        # reply to its first occurrence, and a trailing question with no
        # reply raised IndexError.
        bot_response = ""
        if position + 1 < len(chatbot_history):
            following = chatbot_history[position + 1]
            # Two user messages in a row: the first one simply has no reply.
            if following['role'] != 'user':
                bot_response = following['content']

        print(f"\nUser Question: {question}")
        print(f"Bot Response: {bot_response}")
        print(f"Predefined Answer: {predefined_answer}")

        # Skip invalid or empty responses
        if not predefined_answer or not bot_response:
            print("Either predefined answer or bot response is missing. Skipping evaluation.")
            results.append({
                "question": question,
                "bot_response": bot_response,
                "predefined_answer": predefined_answer,
                "bleu_score": None,
                "rouge_scores": None,
                "meteor_score": None
            })
            continue

        # Calculate evaluation metrics; a failing metric library must not
        # abort the whole run, so the scores fall back to None.
        try:
            bleu_score = calculate_bleu(predefined_answer, bot_response)
            rouge_scores = calculate_rouge(predefined_answer, bot_response)
            meteor = calculate_meteor(predefined_answer, bot_response)
        except Exception as e:
            print(f"Error during evaluation: {e}")
            bleu_score = rouge_scores = meteor = None

        # Store results
        results.append({
            "question": question,
            "bot_response": bot_response,
            "predefined_answer": predefined_answer,
            "bleu_score": bleu_score,
            "rouge_scores": rouge_scores,
            "meteor_score": meteor
        })

        # Display results
        print(f"BLEU Score: {bleu_score}")
        print(f"ROUGE Scores: {rouge_scores}")
        print(f"METEOR Score: {meteor}")
    return results

# Input locations
chatbot_history_path = './USERS/12_conversation.json'  # chatbot conversation JSON file
predefined_answers_path = 'faq_data.csv'  # predefined answers CSV file

# Script entry point: load the inputs, score the conversation, save the scores
if __name__ == "__main__":
    chatbot_history = load_chatbot_history(chatbot_history_path)
    predefined_answers = load_predefined_answers(predefined_answers_path)

    results = evaluate_chatbot(chatbot_history, predefined_answers)

    # Write the scores as indented JSON
    with open('evaluation_results.json', 'w') as outfile:
        outfile.write(json.dumps(results, indent=4))



User Question: What is the return policy for items purchased at our store?
Bot Response: Our store offers a comprehensive return policy designed to make your shopping experience hassle-free:

1. **General Return Policy**: 
   - You can return most items within 30 days of purchase for a full refund or exchange.
   - Items must be in their original condition, with all tags and packaging intact.

2. **Non-returnable Items**:
   - Certain items, such as clearance merchandise, perishable goods, and personal care items, are non-returnable. Please check the product description for specific details.

3. **Refund Process**:
   - Refunds will be issued to the original form of payment.
   - Credit card refunds will be credited to your card, while cash or check payments will receive a cash refund.

If you have any further questions or need assistance with a return, feel free to let me know!
Predefined Answer: You can return most items within 30 days of purchase for a full refund or exchange. Item

1. This approach only evaluates the chatbot's performance against user interactions and predefined questions and answers, which is not the best way to determine the chatbot's accuracy and performance.
2. Manual Evaluation (Human Review): A team of human evaluators can assess chatbot responses based on criteria such as clarity, relevance, and helpfulness. This method ensures that context and nuance, which automated approaches can miss, are considered. From my own perspective as the human chatting with the bot, I think the bot is good and its responses are accurate.
3. Performance Metrics with Natural Language Processing (NLP):
I evaluate the chatbot’s responses using more advanced NLP techniques, such as semantic similarity, to assess the quality of the responses based on meaning rather than exact wording.
Steps:

I utilize metrics such as BLEU (Bilingual Evaluation Understudy), ROUGE (Recall-Oriented Understudy for Gisting Evaluation), and METEOR (Metric for Evaluation of Translation with Explicit ORdering) for more sophisticated text evaluation.
