In [37]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0 when the denominator is zero."""
    return numerator / denominator if denominator else 0


def _f1_score(precision, recall):
    """Return the harmonic mean of precision and recall, or 0 when both are zero."""
    return _safe_ratio(2 * precision * recall, precision + recall)


def _mean(values):
    """Return the arithmetic mean of ``values``, or 0 for an empty sequence."""
    return _safe_ratio(sum(values), len(values))


def _count_matches(predictions, ground_truth):
    """Greedily match predictions to ground truth for a single instance.

    Each prediction is compared against the ground-truth tuples in order and
    claims the first not-yet-matched one whose aspect term (index 0,
    case-insensitive) and category (index 3, exact) are equal. Sentiment
    (index 1) is only compared for tuples whose aspect already matched.

    Returns:
        (aspect_hits, sentiment_hits) as integers.

    Raises:
        ValueError: if a tuple cannot be indexed or its aspect term has no
            ``lower()`` method.
    """
    matched_truth_indices = set()
    aspect_hits = 0
    sentiment_hits = 0

    for pred in predictions:
        for index, truth in enumerate(ground_truth):
            if index in matched_truth_indices:
                continue  # Each ground-truth tuple may be matched at most once.

            try:
                same_aspect = pred[0].lower() == truth[0].lower() and pred[3] == truth[3]
                same_sentiment = same_aspect and pred[1] == truth[1]
            except (AttributeError, IndexError, KeyError, TypeError) as exc:
                # Fail loudly instead of printing and returning None, which
                # would leave the caller with no metrics and no traceback.
                raise ValueError(
                    "Malformed tuple while matching prediction {!r} against "
                    "ground truth {!r}: {}".format(pred, truth, exc)
                ) from exc

            if same_aspect:
                matched_truth_indices.add(index)
                aspect_hits += 1
                if same_sentiment:
                    sentiment_hits += 1
                break  # Move on to the next prediction.

    return aspect_hits, sentiment_hits


def evaluate_aspect_and_sentiment_micro_macro(predictions_list, ground_truth_list):
    """Compute micro- and macro-averaged aspect and sentiment metrics.

    Args:
        predictions_list: One entry per instance; each entry is a sequence of
            tuples indexed as (aspect, polarity, opinion, category). Index 2
            (opinion) is never compared.
        ground_truth_list: Same structure as ``predictions_list``. Instances
            are paired positionally with ``zip``, so surplus entries in the
            longer list are ignored.

    Returns:
        A dict ``{"Micro": {...}, "Macro": {...}}`` where each value maps
        ``"Aspect"`` and ``"Sentiment"`` to a dict with the keys
        ``"Precision"``, ``"Recall"``, ``"F1"`` and ``"Accuracy"``.
        Every metric is 0 when its denominator is zero, including the case of
        empty input lists.

        Notes on the definitions used:
        * Aspect: TP = matched tuples, FP = unmatched predictions,
          FN = unmatched ground truth.
        * Sentiment: TP = aspect matches whose polarity also agrees,
          FP = aspect matches with a different polarity (so precision is
          conditional on the aspect having matched), FN = ground-truth tuples
          without a sentiment match.
        * Accuracy uses an estimated TN: ``len(predictions) +
          len(ground_truth)`` is taken as the number of cases and TN is that
          total minus TP + FP + FN, floored at 0.
        * Macro values are unweighted means of the per-instance metrics.

    Raises:
        ValueError: if a prediction or ground-truth tuple is malformed.
    """
    micro_aspect_tp = micro_aspect_fp = micro_aspect_fn = 0
    micro_sentiment_tp = micro_sentiment_fp = micro_sentiment_fn = 0
    total_cases = 0  # Denominator for the TN / accuracy estimate.

    # Per-instance metrics, averaged at the end for the macro figures.
    aspect_precisions, aspect_recalls, aspect_f1s, aspect_accuracies = [], [], [], []
    sentiment_precisions, sentiment_recalls, sentiment_f1s, sentiment_accuracies = [], [], [], []

    for predictions, ground_truth in zip(predictions_list, ground_truth_list):
        n_pred = len(predictions)
        n_truth = len(ground_truth)
        n_cases = n_pred + n_truth
        n_aspect, n_sentiment = _count_matches(predictions, ground_truth)

        # Micro counters.
        micro_aspect_tp += n_aspect
        micro_aspect_fp += n_pred - n_aspect
        micro_aspect_fn += n_truth - n_aspect

        micro_sentiment_tp += n_sentiment
        micro_sentiment_fp += n_aspect - n_sentiment
        micro_sentiment_fn += n_truth - n_sentiment

        total_cases += n_cases

        # Per-instance aspect metrics.
        aspect_precision = _safe_ratio(n_aspect, n_pred)
        aspect_recall = _safe_ratio(n_aspect, n_truth)
        aspect_precisions.append(aspect_precision)
        aspect_recalls.append(aspect_recall)
        aspect_f1s.append(_f1_score(aspect_precision, aspect_recall))

        # Per-instance sentiment metrics (precision is relative to aspect matches).
        sentiment_precision = _safe_ratio(n_sentiment, n_aspect)
        sentiment_recall = _safe_ratio(n_sentiment, n_truth)
        sentiment_precisions.append(sentiment_precision)
        sentiment_recalls.append(sentiment_recall)
        sentiment_f1s.append(_f1_score(sentiment_precision, sentiment_recall))

        # Per-instance TN estimate: cases minus (TP + FP + FN), floored at 0.
        aspect_tn = max(0, n_cases - (n_aspect + (n_pred - n_aspect) + (n_truth - n_aspect)))
        sentiment_tn = max(0, n_cases - (n_sentiment + (n_aspect - n_sentiment) + (n_truth - n_sentiment)))
        aspect_accuracies.append(_safe_ratio(n_aspect + aspect_tn, n_cases))
        sentiment_accuracies.append(_safe_ratio(n_sentiment + sentiment_tn, n_cases))

    # Corpus-level TN estimate, same rule as per instance.
    micro_aspect_tn = max(0, total_cases - (micro_aspect_tp + micro_aspect_fp + micro_aspect_fn))
    micro_sentiment_tn = max(0, total_cases - (micro_sentiment_tp + micro_sentiment_fp + micro_sentiment_fn))

    micro_aspect_precision = _safe_ratio(micro_aspect_tp, micro_aspect_tp + micro_aspect_fp)
    micro_aspect_recall = _safe_ratio(micro_aspect_tp, micro_aspect_tp + micro_aspect_fn)
    micro_sentiment_precision = _safe_ratio(micro_sentiment_tp, micro_sentiment_tp + micro_sentiment_fp)
    micro_sentiment_recall = _safe_ratio(micro_sentiment_tp, micro_sentiment_tp + micro_sentiment_fn)

    return {
        "Micro": {
            "Aspect": {
                "Precision": micro_aspect_precision,
                "Recall": micro_aspect_recall,
                "F1": _f1_score(micro_aspect_precision, micro_aspect_recall),
                "Accuracy": _safe_ratio(micro_aspect_tp + micro_aspect_tn, total_cases),
            },
            "Sentiment": {
                "Precision": micro_sentiment_precision,
                "Recall": micro_sentiment_recall,
                "F1": _f1_score(micro_sentiment_precision, micro_sentiment_recall),
                "Accuracy": _safe_ratio(micro_sentiment_tp + micro_sentiment_tn, total_cases),
            },
        },
        "Macro": {
            "Aspect": {
                "Precision": _mean(aspect_precisions),
                "Recall": _mean(aspect_recalls),
                "F1": _mean(aspect_f1s),
                "Accuracy": _mean(aspect_accuracies),
            },
            "Sentiment": {
                "Precision": _mean(sentiment_precisions),
                "Recall": _mean(sentiment_recalls),
                "F1": _mean(sentiment_f1s),
                "Accuracy": _mean(sentiment_accuracies),
            },
        },
    }

# Sanity check on a hand-written example: two instances whose aspect,
# polarity and category agree between prediction and ground truth. Only the
# opinion text (index 2) differs, and the evaluator never compares it.
predictions_list = [
    [
        ("food", "positive", "good", "FOOD#QUALITY"),
        ("service", "negative", "bad", "SERVICE#GENERAL"),
    ],
    [
        ("food", "positive", "nice", "FOOD#QUALITY"),
    ],
]
ground_truth_list = [
    [
        ("food", "positive", "delicious", "FOOD#QUALITY"),
        ("service", "negative", "poor", "SERVICE#GENERAL"),
    ],
    [
        ("food", "positive", "tasty", "FOOD#QUALITY"),
    ],
]

results = evaluate_aspect_and_sentiment_micro_macro(
    predictions_list,
    ground_truth_list,
)
print(results)

{'Micro': {'Aspect': {'Precision': 1.0, 'Recall': 1.0, 'F1': 1.0, 'Accuracy': 1.0}, 'Sentiment': {'Precision': 1.0, 'Recall': 1.0, 'F1': 1.0, 'Accuracy': 1.0}}, 'Macro': {'Aspect': {'Precision': 1.0, 'Recall': 1.0, 'F1': 1.0, 'Accuracy': 1.0}, 'Sentiment': {'Precision': 1.0, 'Recall': 1.0, 'F1': 1.0, 'Accuracy': 1.0}}}


In [45]:
import json

# Function to read data from JSONL file
def read_jsonl(file_path):
    """Read a JSON Lines file into a list of decoded objects.

    Args:
        file_path: Path of a UTF-8 encoded file holding one JSON document per line.

    Returns:
        A list with the decoded value of every non-blank line, in file order.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            # Blank or whitespace-only lines (e.g. trailing padding) carry no
            # record; json.loads would raise on them, so skip them.
            if not stripped:
                continue
            records.append(json.loads(stripped))
    return records

# Convert data to format required for evaluation
def convert_data_for_evaluation(data, name='labels'):
    """Turn decoded records into per-record lists of 4-tuples.

    Args:
        data: Iterable of dict-like records.
        name: Key under which each record stores its list of label dicts;
            a record without that key contributes an empty list.

    Returns:
        A list with one entry per record, each entry being a list of
        ``(aspect, polarity, opinion, category)`` tuples.

    Raises:
        KeyError: if a label dict lacks one of the four fields.
    """
    field_order = ('aspect', 'polarity', 'opinion', 'category')
    converted = []
    for record in data:
        tuples_for_record = []
        for label in record.get(name, []):
            tuples_for_record.append(tuple(label[field] for field in field_order))
        converted.append(tuples_for_record)
    return converted

# Paths to output JSONL files containing predictions
# NOTE(review): this path lives under ../datasets — confirm it holds model
# output rather than gold annotations.
output_files = ["../datasets/laptop_quad_test.tsv.jsonl"]
# Paths to ground truth JSONL files
ground_truth_files = ["example_truth_test_data.tsv.jsonl"]

# Read predictions from output files
# Each file contributes ONE entry to predictions_list: the per-record list of
# (aspect, polarity, opinion, category) tuples for that file. The default
# field name 'labels' is used, i.e. the same key as for the ground truth below.
predictions_list = []
for output_file in output_files:
    predictions_data = read_jsonl(output_file)
    predictions_list.append(convert_data_for_evaluation(predictions_data))

# Read ground truth from ground truth files
# Same layout as predictions_list: one entry per file.
ground_truth_list = []
for ground_truth_file in ground_truth_files:
    ground_truth_data = read_jsonl(ground_truth_file)
    ground_truth_list.append(convert_data_for_evaluation(ground_truth_data, name='labels'))
    
# Only the first prediction file is scored against the first ground-truth
# file; any further entries in the two lists are not evaluated here.
print("#####Testing Metrics#####")
results = evaluate_aspect_and_sentiment_micro_macro(predictions_list[0], ground_truth_list[0])
print(results)

#####Testing Metrics#####
{'Micro': {'Aspect': {'Precision': 1.0, 'Recall': 1.0, 'F1': 1.0, 'Accuracy': 1.0}, 'Sentiment': {'Precision': 0.9991386735572783, 'Recall': 0.9991386735572783, 'F1': 0.9991386735572783, 'Accuracy': 0.9991386735572783}}, 'Macro': {'Aspect': {'Precision': 1.0, 'Recall': 1.0, 'F1': 1.0, 'Accuracy': 1.0}, 'Sentiment': {'Precision': 0.9987745098039216, 'Recall': 0.9987745098039216, 'F1': 0.9987745098039216, 'Accuracy': 0.9987745098039216}}}
