# Looking at the Errors

We'll look at the errors in the best and worst models.

In [1]:
from ast import literal_eval


In [2]:
from datasets import Dataset

sampled_dataset = Dataset.from_csv('/kaggle/input/domain-detection-on-500-sample/domain_detection_on_500_sample.csv')
sampled_dataset

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['conversation_idx', 'dialogue_id', 'turn_idx', 'turn_context', 'messages', 'gemma_3-1b', 'gemma_3-4b', 'qwen2.5-0.5b', 'qwen2.5-1.5b', 'qwen2.5-3b', 'gemini_flash2.0', 'domains'],
    num_rows: 500
})

# Remember the Results

In [3]:
target_domains = [literal_eval(domain) for domain in sampled_dataset['domains']]
VALID_DOMAINS = set([domain for domains in target_domains for domain in domains])
VALID_DOMAINS

{'attraction', 'hotel', 'restaurant', 'taxi', 'train'}

In [4]:
def evaluate_domain_predictions(targets, predictions, with_heuristics=False, print_error=True):
    """
    Evaluates domain prediction performance, rounding outputs to 3 decimal places.

    Args:
        targets: A list of lists, where each inner list contains the target domains for a sample.
        predictions: A list of lists, where each inner list contains the predicted domains for a sample.

    Returns:
        A dictionary containing evaluation metrics (rounded to 3dp):
            - 'exact_match_accuracy': The percentage of samples where all predicted domains exactly match the target domains.
            - 'partial_match_accuracy': The percentage of samples where at least one predicted domain matches a target domain.
            - 'precision': The average precision across all samples.
            - 'recall': The average recall across all samples.
            - 'f1': The average F1-score across all samples.
    """

    if len(targets) != len(predictions):
        raise ValueError("Targets and predictions lists must have the same length.")

    num_samples = len(targets)
    exact_match_count = 0
    partial_match_count = 0
    total_precision = 0.0
    total_recall = 0.0
    total_f1 = 0.0

    for i in range(num_samples):
        target_set = set(targets[i])

        try:
            if with_heuristics:
                prediction_set = set([pred for pred in predictions[i] if pred in VALID_DOMAINS])
            else:
                prediction_set = set(predictions[i])
        except Exception as e:
            prediction_set = set()
            if print_error:
                print(f"Something's wrong here {e}. Predicted domain {predictions[i]}. Correct domain {target_set}")
        
        if target_set == prediction_set:
            exact_match_count += 1

        if len(target_set.intersection(prediction_set)) > 0:
            partial_match_count += 1

        if len(prediction_set) > 0:
            precision = len(target_set.intersection(prediction_set)) / len(prediction_set)
        else:
            precision = 0.0

        if len(target_set) > 0:
            recall = len(target_set.intersection(prediction_set)) / len(target_set)
        else:
            recall = 0.0

        if precision + recall > 0:
            f1 = 2 * (precision * recall) / (precision + recall)
        else:
            f1 = 0.0

        total_precision += precision
        total_recall += recall
        total_f1 += f1

    exact_match_accuracy = round(exact_match_count / num_samples, 3)
    partial_match_accuracy = round(partial_match_count / num_samples, 3)
    average_precision = round(total_precision / num_samples, 3)
    average_recall = round(total_recall / num_samples, 3)
    average_f1 = round(total_f1 / num_samples, 3)

    return {
        'exact_match_accuracy': exact_match_accuracy,
        'partial_match_accuracy': partial_match_accuracy,
        'precision': average_precision,
        'recall': average_recall,
        'f1': average_f1,
    }

# Example Usage
targets = [["domain1", "domain2"], ["domain3"], ["domain1", "domain4", "domain5"]]
predictions = [["domain1", "domain2"], ["domain3", "domain6"], ["domain1", "domain4"]]

results = evaluate_domain_predictions(targets, predictions)
print(results)

{'exact_match_accuracy': 0.333, 'partial_match_accuracy': 1.0, 'precision': 0.833, 'recall': 0.889, 'f1': 0.822}


In [5]:
models_used = list(sampled_dataset.features)[5:11]
models_used

['gemma_3-1b',
 'gemma_3-4b',
 'qwen2.5-0.5b',
 'qwen2.5-1.5b',
 'qwen2.5-3b',
 'gemini_flash2.0']

In [6]:
for model_name in models_used:
    print(f'------------- {model_name.upper()} ----------')
    predictions = [literal_eval(pred) for pred in sampled_dataset[model_name]]
    print(evaluate_domain_predictions(target_domains, predictions, with_heuristics=True))

------------- GEMMA_3-1B ----------
{'exact_match_accuracy': 0.61, 'partial_match_accuracy': 0.656, 'precision': 0.638, 'recall': 0.61, 'f1': 0.613}
------------- GEMMA_3-4B ----------
{'exact_match_accuracy': 0.448, 'partial_match_accuracy': 0.772, 'precision': 0.634, 'recall': 0.743, 'f1': 0.661}
------------- QWEN2.5-0.5B ----------
{'exact_match_accuracy': 0.416, 'partial_match_accuracy': 0.576, 'precision': 0.526, 'recall': 0.534, 'f1': 0.517}
------------- QWEN2.5-1.5B ----------
{'exact_match_accuracy': 0.354, 'partial_match_accuracy': 0.45, 'precision': 0.439, 'recall': 0.405, 'f1': 0.413}
------------- QWEN2.5-3B ----------
{'exact_match_accuracy': 0.44, 'partial_match_accuracy': 0.728, 'precision': 0.614, 'recall': 0.694, 'f1': 0.63}
------------- GEMINI_FLASH2.0 ----------
{'exact_match_accuracy': 0.714, 'partial_match_accuracy': 0.774, 'precision': 0.724, 'recall': 0.735, 'f1': 0.714}


# Errors in Qwen2.5-0.5B

In [60]:
model_name = 'qwen2.5-0.5b'

count = 0
mistakes = []

for row in sampled_dataset:
    predicted = set(literal_eval(row[model_name]))
    target = set(literal_eval(row['domains']))
    if predicted != target:
        mistakes.append({'id': count, 'context': row['turn_context'], 'predictions': literal_eval(row[model_name]), 'target': literal_eval(row['domains'])})
        count += 1
    if count and count < 10:
        print(mistakes[-1])
        print('-----------')
        
print(f"Of the 500 samples, qwen2.5-0.5b made mistakes in %d of them."%len(mistakes))

{'id': 0, 'context': 'Customer: Please help me find a train departing Stansted Airport and going to Cambridge. Thanks.\nAgent: Certainly. What day and time will you be traveling?\nCustomer: I would like to leave on Monday after 17:15.\nAgent: TR4096 leaves at 17:24 and arrives in Cambridge at 17:52. How many tickets do you need?\nCustomer: I just need to know the price please.\nAgent: They are 10.10 pounds. Is there anything else I can do to help you?\nCustomer: Can you tell me where the scott polar museum is located.\n', 'predictions': ['train', 'museum'], 'target': ['attraction']}
-----------
{'id': 0, 'context': 'Customer: Please help me find a train departing Stansted Airport and going to Cambridge. Thanks.\nAgent: Certainly. What day and time will you be traveling?\nCustomer: I would like to leave on Monday after 17:15.\nAgent: TR4096 leaves at 17:24 and arrives in Cambridge at 17:52. How many tickets do you need?\nCustomer: I just need to know the price please.\nAgent: They are 1

Looking at the errors, we can see that although the model makes a majority of the mistakes, the annotations are not perfect either. I categorised the errors into 'model_errors' and 'annotation_errors' and prompted **Gemini 2.5 pro** to categorise 50 mistake samples into one of each category. 


**Model Errors typically occurred when:**

- Predicting domains for non-domain utterances: The model incorrectly assigned domains (commonly 'hotel', 'attraction', 'train') to closing statements ("Thank you", "That's all", etc.), confirmations ("I suppose that works"), or other utterances that didn't contain an active request. The target for these should generally be empty ([]). (Examples: IDs 1, 3, 5, 10, 13, 14, 16, 17, 18, 20, 24)
- Hallucinating or carrying over inactive domains: The model predicted one or more domains that were not mentioned in the final utterance, sometimes alongside a correctly predicted domain. This often involved predicting 'hotel' or 'attraction' inappropriately. (Examples: IDs 0, 2, 8, 15, 21)
- Misclassifying the correct domain: The model predicted a completely wrong domain compared to the user's explicit request in the final utterance. (Examples: IDs 4, 9, 19)
- Failing to predict all mentioned domains: The user's utterance clearly mentioned multiple domains, but the model only predicted one. (Example: ID 12)
- Incorrect level of granularity: The model predicted a specific type (e.g., 'museum', 'park') when the target annotation required the more general category ('attraction'). (Examples: IDs 6, 22)
- Predicting specific entities as domains: The model included a proper name (e.g., 'chiquito restaurant') as if it were a domain category. (Example: ID 7)

**Annotation Errors typically occurred when**:

- Target contained unmentioned domains: The target label included a domain that was not requested or implied in the final customer utterance. (Example: ID 11, where 'restaurant' was in the target but only 'attraction' was asked for).
- Target potentially carried over inactive domains: In some cases (like ID 2 and ID 23), the target label included domains relevant earlier in the conversation but not active in the final specific utterance being evaluated.


**Note**
It's possible that I've missed something important about annotation instructions.

In [40]:
llm_judgements = [
  {'id': 0, 'judgement': 'model_error'},
  {'id': 1, 'judgement': 'model_error'},
  {'id': 2, 'judgement': 'model_error'},
  {'id': 3, 'judgement': 'model_error'},
  {'id': 4, 'judgement': 'model_error'},
  {'id': 5, 'judgement': 'model_error'},
  {'id': 6, 'judgement': 'model_error'},
  {'id': 7, 'judgement': 'model_error'},
  {'id': 8, 'judgement': 'model_error'},
  {'id': 9, 'judgement': 'model_error'},
  {'id': 10, 'judgement': 'model_error'},
  {'id': 11, 'judgement': 'annotation_error'},
  {'id': 12, 'judgement': 'model_error'},
  {'id': 13, 'judgement': 'model_error'},
  {'id': 14, 'judgement': 'model_error'},
  {'id': 15, 'judgement': 'model_error'},
  {'id': 16, 'judgement': 'model_error'},
  {'id': 17, 'judgement': 'model_error'},
  {'id': 18, 'judgement': 'model_error'},
  {'id': 19, 'judgement': 'model_error'},
  {'id': 20, 'judgement': 'model_error'},
  {'id': 21, 'judgement': 'model_error'},
  {'id': 22, 'judgement': 'model_error'},
  {'id': 23, 'judgement': 'model_error'},
  {'id': 24, 'judgement': 'model_error'},
  {'id': 25, 'judgement': 'annotation_error'},
  {'id': 26, 'judgement': 'model_error'},
  {'id': 27, 'judgement': 'annotation_error'},
  {'id': 28, 'judgement': 'model_error'},
  {'id': 29, 'judgement': 'model_error'},
  {'id': 30, 'judgement': 'model_error'},
  {'id': 31, 'judgement': 'model_error'},
  {'id': 32, 'judgement': 'annotation_error'},
  {'id': 33, 'judgement': 'model_error'},
  {'id': 34, 'judgement': 'model_error'},
  {'id': 35, 'judgement': 'model_error'},
  {'id': 36, 'judgement': 'model_error'},
  {'id': 37, 'judgement': 'model_error'},
  {'id': 38, 'judgement': 'model_error'},
  {'id': 39, 'judgement': 'model_error'},
  {'id': 40, 'judgement': 'annotation_error'},
  {'id': 41, 'judgement': 'model_error'},
  {'id': 42, 'judgement': 'model_error'},
  {'id': 43, 'judgement': 'model_error'},
  {'id': 44, 'judgement': 'model_error'},
  {'id': 45, 'judgement': 'model_error'},
  {'id': 46, 'judgement': 'model_error'},
  {'id': 47, 'judgement': 'model_error'},
  {'id': 48, 'judgement': 'model_error'},
  {'id': 49, 'judgement': 'model_error'}
]

In [45]:
model_errors = [item['judgement'] for item in llm_judgements if item['judgement'] == 'model_error']
percentage_model_errors = len(model_errors)/len(llm_judgement) * 100
print("%0.1f of the errors came from the model"%percentage_model_errors)

90.0 of the errors came from the model


# See Errors in Gemini Flash 2.0


In [56]:
model_name = 'gemini_flash2.0'
count = 0
mistakes = []

for row in sampled_dataset:
    predicted = set(literal_eval(row[model_name]))
    target = set(literal_eval(row['domains']))
    if predicted != target:
        mistakes.append({'id': count, 'context': row['turn_context'], 'predictions': literal_eval(row[model_name]), 'target': literal_eval(row['domains'])})
        count += 1
        
print(f"Of the 500 samples, %s made mistakes in %d of them."%(model_name, len(mistakes)))

Of the 500 samples, gemini_flash2.0 made mistakes in 143 of them.


Again, 50 of the mistake made by gemini_2.0_flash were analysed using Gemini 2.5 Pro. 

**Model Errors typically occurred when**:

- Incorrectly predicting domains for closing remarks (e.g., id 33, 42, 47).
- Failing to drop a domain when the conversation explicitly shifted away from it (e.g., id 0, 3, 7, 10, 13, 22, 23, 26, 28, 29, 35, 38, 43, 44, 45, 46, 49).
- Failing to predict a domain when the final utterance clearly requested information or action within that domain (e.g., id 14, 18, 21, 27, 31, 34, 37, 41).

**Annotation Errors typically occurred when**:

- The target included domains that were not mentioned or implied in the final customer utterance, even if they were relevant earlier in the conversation (e.g., id 1, 11, 12, 15, 20, 25, 30, 32, 39, 40, 48)
- The target included domains completely unrelated to the final utterance (e.g., id 5).
- The target included a domain when the final utterance was just a closing remark (e.g., id 1, 19, 36).

In [54]:
llm_judgements = [
    {"id": 0, "judgement": "model_error"},
    {"id": 1, "judgement": "annotation_error"},
    {"id": 2, "judgement": "model_error"},
    {"id": 3, "judgement": "model_error"},
    {"id": 4, "judgement": "model_error"},
    {"id": 5, "judgement": "annotation_error"},
    {"id": 6, "judgement": "model_error"},
    {"id": 7, "judgement": "model_error"},
    {"id": 8, "judgement": "model_error"},
    {"id": 9, "judgement": "model_error"},
    {"id": 10, "judgement": "model_error"},
    {"id": 11, "judgement": "annotation_error"},
    {"id": 12, "judgement": "annotation_error"},
    {"id": 13, "judgement": "model_error"},
    {"id": 14, "judgement": "model_error"},
    {"id": 15, "judgement": "annotation_error"},
    {"id": 16, "judgement": "model_error"},
    {"id": 17, "judgement": "model_error"},
    {"id": 18, "judgement": "model_error"},
    {"id": 19, "judgement": "annotation_error"},
    {"id": 20, "judgement": "annotation_error"},
    {"id": 21, "judgement": "model_error"},
    {"id": 22, "judgement": "model_error"},
    {"id": 23, "judgement": "model_error"},
    {"id": 24, "judgement": "model_error"},
    {"id": 25, "judgement": "annotation_error"},
    {"id": 26, "judgement": "model_error"},
    {"id": 27, "judgement": "model_error"},
    {"id": 28, "judgement": "model_error"},
    {"id": 29, "judgement": "model_error"},
    {"id": 30, "judgement": "annotation_error"},
    {"id": 31, "judgement": "model_error"},
    {"id": 32, "judgement": "annotation_error"},
    {"id": 33, "judgement": "model_error"},
    {"id": 34, "judgement": "model_error"},
    {"id": 35, "judgement": "model_error"},
    {"id": 36, "judgement": "annotation_error"},
    {"id": 37, "judgement": "model_error"},
    {"id": 38, "judgement": "model_error"},
    {"id": 39, "judgement": "annotation_error"},
    {"id": 40, "judgement": "annotation_error"},
    {"id": 41, "judgement": "model_error"},
    {"id": 42, "judgement": "model_error"},
    {"id": 43, "judgement": "model_error"},
    {"id": 44, "judgement": "model_error"},
    {"id": 45, "judgement": "model_error"},
    {"id": 46, "judgement": "model_error"},
    {"id": 47, "judgement": "model_error"},
    {"id": 48, "judgement": "annotation_error"},
    {"id": 49, "judgement": "model_error"},
]

In [55]:
model_errors = [item['judgement'] for item in llm_judgements if item['judgement'] == 'model_error']
percentage_model_errors = len(model_errors)/len(llm_judgement) * 100
print("%0.1f of the errors came from the model"%percentage_model_errors)

72.0 of the errors came from the model
