In [3]:
import json

def load_data():
    """
    Loads test data and results data from two JSONL files.
    Skips lines that cannot be parsed as valid JSON.
    """
    test_file_path = "../datasets/laptop_quad_test.tsv.jsonl"
    results_file_path = "../datasets/clean_full_results2.jsonl"
    
    test_data = []
    results_data = []
    
    # Load test data
    with open(test_file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Skip empty lines
                continue
            try:
                data_item = json.loads(line)
                test_data.append(data_item)
            except json.JSONDecodeError:
                # Skip lines that aren't valid JSON
                continue
            
    # Load results data
    with open(results_file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Skip empty lines
                continue
            try:
                data_item = json.loads(line)
                results_data.append(data_item)
            except json.JSONDecodeError:
                # Skip lines that aren't valid JSON
                continue
            
    return test_data, results_data

def evaluate_aspect_and_sentiment(test_data, results_data):
    """
    Evaluates aspect-sentiment pairs for precision, recall, and F1 score.
    Skips any entries that are not dictionaries or do not have a valid 'labels' list.
    
    Debug statements print the test_label_set and result_label_set for every comparison.
    """
    total_test_labels = 0
    total_result_labels = 0
    total_correct = 0

    # Loop over both datasets in parallel
    for test_item, result_item in zip(test_data, results_data):
        # Ensure both items are dictionaries
        if not isinstance(test_item, dict) or not isinstance(result_item, dict):
            continue
        
        # Extract 'labels' as lists (skip if not a list)
        test_labels = test_item.get("labels", [])
        result_labels = result_item.get("labels", [])
        if not isinstance(test_labels, list) or not isinstance(result_labels, list):
            continue
        
        # Build sets of (aspect, polarity) from test data
        test_label_set = set()
        for label in test_labels:
            if isinstance(label, dict):
                aspect = label.get("aspect")
                polarity = label.get("polarity")
                test_label_set.add((aspect, polarity))
        
        # Build sets of (aspect, polarity) from results data
        result_label_set = set()
        for label in result_labels:
            if isinstance(label, dict):
                aspect = label.get("aspect")
                polarity = label.get("polarity")
                result_label_set.add((aspect, polarity))

        # Tally up counts
        total_test_labels += len(test_label_set)
        total_result_labels += len(result_label_set)
        total_correct += len(test_label_set.intersection(result_label_set))
    
    # Compute precision, recall, F1
    precision = total_correct / total_result_labels if total_result_labels else 0.0
    recall = total_correct / total_test_labels if total_test_labels else 0.0
    f1 = (
        (2 * precision * recall) / (precision + recall)
        if (precision + recall) > 0
        else 0.0
    )

    metrics = {
        "aspect_sentiment": {
            "precision": precision,
            "recall": recall,
            "f1": f1
        }
    }
    return metrics

def run_evaluation():
    # Load data
    test_data, results_data = load_data()
    
    # Evaluate
    metrics = evaluate_aspect_and_sentiment(test_data, results_data)
    
    # Print final results
    print("\nEvaluation Metrics:")
    print(json.dumps(metrics, indent=4))

if __name__ == "__main__":
    run_evaluation()



Evaluation Metrics:
{
    "aspect_sentiment": {
        "precision": 0.6485097636176773,
        "recall": 0.6272365805168986,
        "f1": 0.6376958059626074
    }
}


In [4]:
# Sentiment evaluation is performed considering all predicted sentiments regardless of the wrong/correct aspects 

def evaluate_aspect_and_sentiment(predictions_list, ground_truth_list):
    """
    Evaluates aspect and sentiment analysis results on a per-review basis, then averages the metrics.

    Args:
        predictions_list: A list of lists, where each inner list contains tuples of (aspect, sentiment, ...).
                            Each inner list represents the predictions for a single review.
        ground_truth_list: A list of lists, where each inner list contains tuples of (aspect, sentiment, ...).
                            Each inner list represents the ground truth for a single review.

    Returns:
        A dictionary containing the averaged aspect and sentiment evaluation metrics.

    Functionality:
        1.  Iterates through each review's predictions and ground truth.
        2.  Calculates aspect metrics (precision, recall, F1, accuracy) for each review.
        3.  Calculates sentiment metrics (precision, recall, F1, accuracy) for each review,
            considering all predicted sentiments against all ground truth sentiments.
        4.  Stores the per-review metrics in a list.
        5.  Averages the per-review metrics to obtain the final, overall metrics.

    Note:
        -   Aspect matching is case-insensitive.
        -   Sentiment evaluation is performed considering all predicted sentiments versus all ground truth sentiments.
        -   This function is designed to provide a more granular evaluation by treating each review
            as an independent unit and then averaging the performance.
        -   **Handling Different Number of Aspects:**
            -   **Aspect Evaluation:** The use of sets (`predicted_aspects`, `actual_aspects`) and set operations (intersection)
                                ensures fair comparison regardless of differing numbers of aspects. Precision and recall are calculated
                                to penalize extra or missing aspects, respectively.
            -   **Example:** If a prediction review has 2 aspects and the ground truth has 3, the aspect recall will be affected,
                                Similarly, if a prediction has extra aspects, the aspect precision will be affected
    """
    review_metrics = []  # List to store metrics for each review

    # Iterate through each review's predictions and ground truth
    for predictions, ground_truth in zip(predictions_list, ground_truth_list):
        # Aspect Evaluation
        predicted_aspects = {p[0].lower() for p in predictions}  # Set of predicted aspects (lowercase)
        actual_aspects = {gt[0].lower() for gt in ground_truth}    # Set of actual aspects (lowercase)

        # Calculate aspect metrics for the current review
        aspect_correct = len(predicted_aspects.intersection(actual_aspects))  # Number of correctly predicted aspects, correctly identifies the aspects that are common to both the prediction and the ground truth
        aspect_predicted = len(predicted_aspects)  # Total number of predicted aspects
        aspect_actual = len(actual_aspects)      # Total number of actual aspects

        # Calculate precision, recall, F1, and accuracy for aspects
        aspect_precision = aspect_correct / aspect_predicted if aspect_predicted > 0 else 0  # penalizes the model for predicting extra aspects that are not in the ground truth.
        aspect_recall = aspect_correct / aspect_actual if aspect_actual > 0 else 0  # penalizes the model for missing aspects that are present in the ground truth.
        aspect_f1 = 2 * (aspect_precision * aspect_recall) / (aspect_precision + aspect_recall) if (aspect_precision + aspect_recall) > 0 else 0
        aspect_accuracy = aspect_correct / aspect_actual if aspect_actual > 0 else 0

        # Sentiment Evaluation (considering all sentiments)
        sentiment_correct = 0
        sentiment_predicted = len(predictions)  # Total number of predicted sentiments.
        sentiment_actual = len(ground_truth)    # Total number of actual sentiments.

        # Calculate correctly predicted sentiments
        for pred in predictions:
            for truth in ground_truth:
                if pred[0].lower() == truth[0].lower() and pred[1] == truth[1]:
                    sentiment_correct += 1

        # Calculate precision, recall, F1, and accuracy for sentiments
        sentiment_precision = sentiment_correct / sentiment_predicted if sentiment_predicted > 0 else 0
        sentiment_recall = sentiment_correct / sentiment_actual if sentiment_actual > 0 else 0
        sentiment_f1 = 2 * (sentiment_precision * sentiment_recall) / (sentiment_precision + sentiment_recall) if (sentiment_precision + sentiment_recall) > 0 else 0
        sentiment_accuracy = sentiment_correct / sentiment_actual if sentiment_actual > 0 else 0

        # Store metrics for the current review
        review_metrics.append({
            "Aspect": {
                "Precision": aspect_precision,
                "Recall": aspect_recall,
                "F1": aspect_f1,
                "Accuracy": aspect_accuracy,
            },
            "Sentiment": {
                "Precision": sentiment_precision,
                "Recall": sentiment_recall,
                "F1": sentiment_f1,
                "Accuracy": sentiment_accuracy,
            },
        })

    # Average metrics
    avg_metrics = {
        "Aspect": {"Precision": 0, "Recall": 0, "F1": 0, "Accuracy": 0},
        "Sentiment": {"Precision": 0, "Recall": 0, "F1": 0, "Accuracy": 0},
    }

    num_reviews = len(review_metrics)  # Number of reviews
    if num_reviews == 0:
        return "No review metrics provided."

    # Sum up metrics from all reviews
    for review in review_metrics:
        for metric_type in ["Aspect", "Sentiment"]:
            for metric in ["Precision", "Recall", "F1", "Accuracy"]:
                avg_metrics[metric_type][metric] += review[metric_type][metric]

    # Calculate average metrics
    for metric_type in ["Aspect", "Sentiment"]:
        for metric in ["Precision", "Recall", "F1", "Accuracy"]:
            avg_metrics[metric_type][metric] /= num_reviews

    # Format the output for better readability
    output = ""
    for metric_type in ["Aspect", "Sentiment"]:
        output += f"{metric_type}: (\n"
        for metric, value in avg_metrics[metric_type].items():
            output += f"   '{metric}': {value:.4f}\n"
        output += ")\n"
    return output


import json

# Function to read data from JSONL file while skipping errors
def read_jsonl(file_path):
    """Reads data from a JSONL file, skipping invalid lines."""
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # Skip empty lines
            try:
                item = json.loads(line)
                if isinstance(item, dict):
                    data.append(item)
                else:
                    print(f"Warning: Skipping non-dictionary entry in {file_path}: {line}")
            except json.JSONDecodeError as e:
                print(f"Warning: Skipping invalid JSON line in {file_path}: {line} - Error: {e}")
    return data

def convert_data_for_evaluation(data, name='labels'):
    """
    Converts data to a format suitable for aspect and sentiment evaluation.
    Handles cases where 'category' might be missing and fills it with None.

    Args:
        data: A list of dictionaries, where each dictionary contains a 'labels' key (or other specified key).
        name: The key to access the list of predictions within each dictionary.

    Returns:
        A list of lists, where each inner list contains tuples of 
        (aspect, polarity, opinion, category).
    """
    aspect_sentiment_pairs = []
    for item in data:
        predictions = item.get(name, [])
        if not isinstance(predictions, list):
            print(f"Warning: Skipping item with invalid predictions: {item}")
            continue

        converted_predictions = []
        for pred in predictions:
            if not isinstance(pred, dict):
                print(f"Warning: Skipping non-dictionary prediction: {pred}")
                continue

            aspect = pred.get('aspect', None)
            polarity = pred.get('polarity', None)
            opinion = pred.get('opinion', None)
            category = pred.get('category', None)

            # Skip predictions with missing aspect or polarity or opinion
            if aspect is not None and polarity is not None and opinion is not None:
                converted_predictions.append((aspect, polarity, opinion, category))
            else:
                print(f"Warning: Skipping prediction with missing data: {pred}")

        aspect_sentiment_pairs.append(converted_predictions)
    return aspect_sentiment_pairs

# Paths to output JSONL files containing predictions
# output_files = ["../datasets/clean_full_results.jsonl"]
output_files = ["../datasets/clean_full_results2.jsonl"]
# output_files = ["../datasets/deepseek_r1_results.jsonl"]

# Paths to ground truth JSONL files
ground_truth_files = ["../datasets/laptop_quad_test.tsv.jsonl"]

# Read predictions from output files
predictions_list = []
for output_file in output_files:
    predictions_data = read_jsonl(output_file)
    predictions_list.append(convert_data_for_evaluation(predictions_data))

# Read ground truth from ground truth files
ground_truth_list = []
for ground_truth_file in ground_truth_files:
    ground_truth_data = read_jsonl(ground_truth_file)
    ground_truth_list.append(convert_data_for_evaluation(ground_truth_data, name='labels'))

print("#####Testing Metrics#####")
results = evaluate_aspect_and_sentiment(predictions_list[0], ground_truth_list[0])
print(results)


# # Test cases
# predictions_list = [
#     [("battery", "positive"), ("screen", "positive"), ("performance", "positive")],

# ]
# ground_truth_list = [
#     [("battery", "positive"), ("screen", "positive")],
# ]

# print("TEST TEST TEST")
# results = per_review_evaluate_aspect_and_sentiment(predictions_list, ground_truth_list)
# print(results)

#####Testing Metrics#####
Aspect: (
   'Precision': 0.7493
   'Recall': 0.7343
   'F1': 0.7329
   'Accuracy': 0.7343
)
Sentiment: (
   'Precision': 0.7860
   'Recall': 0.8280
   'F1': 0.7767
   'Accuracy': 0.8280
)

