In [16]:
# Sentiment evaluation is performed considering all predicted sentiments regardless of the wrong/correct aspects 

def evaluate_aspect_and_sentiment(predictions_list, ground_truth_list):
    """
    Evaluates aspect and sentiment analysis results on a per-review basis, then averages the metrics.

    Args:
        predictions_list: A list of lists, where each inner list contains tuples of (aspect, sentiment, ...).
                            Each inner list represents the predictions for a single review.
        ground_truth_list: A list of lists, where each inner list contains tuples of (aspect, sentiment, ...).
                            Each inner list represents the ground truth for a single review.

    Returns:
        A dictionary containing the averaged aspect and sentiment evaluation metrics.

    Functionality:
        1.  Iterates through each review's predictions and ground truth.
        2.  Calculates aspect metrics (precision, recall, F1, accuracy) for each review.
        3.  Calculates sentiment metrics (precision, recall, F1, accuracy) for each review,
            considering all predicted sentiments against all ground truth sentiments.
        4.  Stores the per-review metrics in a list.
        5.  Averages the per-review metrics to obtain the final, overall metrics.

    Note:
        -   Aspect matching is case-insensitive.
        -   Sentiment evaluation is performed considering all predicted sentiments versus all ground truth sentiments.
        -   This function is designed to provide a more granular evaluation by treating each review
            as an independent unit and then averaging the performance.
        -   **Handling Different Number of Aspects:**
            -   **Aspect Evaluation:** The use of sets (`predicted_aspects`, `actual_aspects`) and set operations (intersection)
                                ensures fair comparison regardless of differing numbers of aspects. Precision and recall are calculated
                                to penalize extra or missing aspects, respectively.
            -   **Example:** If a prediction review has 2 aspects and the ground truth has 3, the aspect recall will be affected,
                                Similarly, if a prediction has extra aspects, the aspect precision will be affected
    """
    review_metrics = []  # List to store metrics for each review

    # Iterate through each review's predictions and ground truth
    for predictions, ground_truth in zip(predictions_list, ground_truth_list):
        # Aspect Evaluation
        predicted_aspects = {p[0].lower() for p in predictions}  # Set of predicted aspects (lowercase)
        actual_aspects = {gt[0].lower() for gt in ground_truth}    # Set of actual aspects (lowercase)

        # Calculate aspect metrics for the current review
        aspect_correct = len(predicted_aspects.intersection(actual_aspects))  # Number of correctly predicted aspects, correctly identifies the aspects that are common to both the prediction and the ground truth
        aspect_predicted = len(predicted_aspects)  # Total number of predicted aspects
        aspect_actual = len(actual_aspects)      # Total number of actual aspects

        # Calculate precision, recall, F1, and accuracy for aspects
        aspect_precision = aspect_correct / aspect_predicted if aspect_predicted > 0 else 0  # penalizes the model for predicting extra aspects that are not in the ground truth.
        aspect_recall = aspect_correct / aspect_actual if aspect_actual > 0 else 0  # penalizes the model for missing aspects that are present in the ground truth.
        aspect_f1 = 2 * (aspect_precision * aspect_recall) / (aspect_precision + aspect_recall) if (aspect_precision + aspect_recall) > 0 else 0
        aspect_accuracy = aspect_correct / aspect_actual if aspect_actual > 0 else 0

        # Sentiment Evaluation (considering all sentiments)
        sentiment_correct = 0
        sentiment_predicted = len(predictions)  # Total number of predicted sentiments.
        sentiment_actual = len(ground_truth)    # Total number of actual sentiments.

        # Calculate correctly predicted sentiments
        for pred in predictions:
            for truth in ground_truth:
                if pred[0].lower() == truth[0].lower() and pred[1] == truth[1]:
                    sentiment_correct += 1

        # Calculate precision, recall, F1, and accuracy for sentiments
        sentiment_precision = sentiment_correct / sentiment_predicted if sentiment_predicted > 0 else 0
        sentiment_recall = sentiment_correct / sentiment_actual if sentiment_actual > 0 else 0
        sentiment_f1 = 2 * (sentiment_precision * sentiment_recall) / (sentiment_precision + sentiment_recall) if (sentiment_precision + sentiment_recall) > 0 else 0
        sentiment_accuracy = sentiment_correct / sentiment_actual if sentiment_actual > 0 else 0

        # Store metrics for the current review
        review_metrics.append({
            "Aspect": {
                "Precision": aspect_precision,
                "Recall": aspect_recall,
                "F1": aspect_f1,
                "Accuracy": aspect_accuracy,
            },
            "Sentiment": {
                "Precision": sentiment_precision,
                "Recall": sentiment_recall,
                "F1": sentiment_f1,
                "Accuracy": sentiment_accuracy,
            },
        })

    # Average metrics
    avg_metrics = {
        "Aspect": {"Precision": 0, "Recall": 0, "F1": 0, "Accuracy": 0},
        "Sentiment": {"Precision": 0, "Recall": 0, "F1": 0, "Accuracy": 0},
    }

    num_reviews = len(review_metrics)  # Number of reviews
    if num_reviews == 0:
        return "No review metrics provided."

    # Sum up metrics from all reviews
    for review in review_metrics:
        for metric_type in ["Aspect", "Sentiment"]:
            for metric in ["Precision", "Recall", "F1", "Accuracy"]:
                avg_metrics[metric_type][metric] += review[metric_type][metric]

    # Calculate average metrics
    for metric_type in ["Aspect", "Sentiment"]:
        for metric in ["Precision", "Recall", "F1", "Accuracy"]:
            avg_metrics[metric_type][metric] /= num_reviews

    # Format the output for better readability
    output = ""
    for metric_type in ["Aspect", "Sentiment"]:
        output += f"{metric_type}: (\n"
        for metric, value in avg_metrics[metric_type].items():
            output += f"   '{metric}': {value:.4f}\n"
        output += ")\n"
    return output


import json

def read_jsonl(file_path):
    """
    Reads data from a JSONL file (one JSON object per line)
    or a single JSON array file.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        first_char = f.read(1)
        f.seek(0) # Go back to the beginning of the file

        # Assume JSON format
        if first_char == '[': # It's likely a single JSON array
            try:
                data = json.load(f) # Load the entire file as one JSON structure
                if not isinstance(data, list):
                    print(f"Warning: File {file_path} started with '[' but is not a JSON array. Skipping.")
                    data = []
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON array from {file_path}: {e}")
                data = []
        else: # Assume JSONL format
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    item = json.loads(line)
                    if isinstance(item, dict):
                        data.append(item)
                    else:
                        print(f"Warning: Skipping non-dictionary entry in {file_path}: {line}")
                except json.JSONDecodeError as e:
                    print(f"Warning: Skipping invalid JSON line in {file_path}: {line} - Error: {e}")
    return data

def convert_data_for_evaluation(data, name='labels'):
    """
    Converts data to a format suitable for aspect and sentiment evaluation.
    Handles cases where 'category' might be missing and fills it with None.

    Args:
        data: A list of dictionaries, where each dictionary contains a 'labels' key (or other specified key).
        name: The key to access the list of predictions within each dictionary.

    Returns:
        A list of lists, where each inner list contains tuples of 
        (aspect, polarity, opinion, category).
    """
    aspect_sentiment_pairs = []
    for item in data:
        predictions = item.get(name, [])
        if not isinstance(predictions, list):
            print(f"Warning: Skipping item with invalid predictions: {item}")
            continue

        converted_predictions = []
        for pred in predictions:
            if not isinstance(pred, dict):
                print(f"Warning: Skipping non-dictionary prediction: {pred}")
                continue

            aspect = pred.get('aspect', None)
            polarity = pred.get('polarity', None)
            opinion = pred.get('opinion', None)
            category = pred.get('category', None)

            # Skip predictions with missing aspect or polarity or opinion
            if aspect is not None and polarity is not None and opinion is not None:
                converted_predictions.append((aspect, polarity, opinion, category))
            else:
                print(f"Warning: Skipping prediction with missing data: {pred}")

        aspect_sentiment_pairs.append(converted_predictions)
    return aspect_sentiment_pairs

# Define a function to handle the evaluation for a single model
def run_model_evaluation(model_name, prediction_file_path, ground_truth_file_path):
    """
    Reads prediction and ground truth data, converts it, and prints evaluation metrics.

    Args:
        model_name (str): The name of the model being evaluated (for printing).
        prediction_file_path (str): Path to the prediction JSONL/JSON file.
        ground_truth_file_path (str): Path to the ground truth JSONL file.
    """
    print(f"##### Testing Metrics - {model_name} #####")

    # Read predictions
    predictions_data = read_jsonl(prediction_file_path) # Use read_json_flexible if your JSON files are arrays
    model_predictions = convert_data_for_evaluation(predictions_data)

    # Read ground truth (only need to read once if it's the same for all models)
    # However, keeping it inside the function makes it self-contained for each call
    ground_truth_data = read_jsonl(ground_truth_file_path) # Use read_json_flexible if your JSON files are arrays
    actual_ground_truth = convert_data_for_evaluation(ground_truth_data, name='labels')

    # Ensure both lists have the same number of reviews before evaluating
    if len(model_predictions) != len(actual_ground_truth):
        print(f"Warning: Number of predictions ({len(model_predictions)}) does not match "
              f"number of ground truth reviews ({len(actual_ground_truth)}) for {model_name}. "
              "Evaluation might be skewed or incomplete.")
        # You might choose to truncate, pad, or skip evaluation here based on your policy.
        # For simplicity, we'll proceed, but be aware of this potential issue.

    results = evaluate_aspect_and_sentiment(model_predictions, actual_ground_truth)
    print(results)
    print("-" * 50) # Separator for readability

# Paths to ground truth (remains constant)
GROUND_TRUTH_FILE = "../datasets/laptop_quad_test.tsv.jsonl"

# Dictionary of models to evaluate
models_to_evaluate = {
    "Previous Best - DeepSeek-R1-Distill-Qwen": "../datasets/clean_full_results2.jsonl",
    # "DeepSeek Base": "../datasets/DeepSeek-7B-Base_predictions.json",
    # "Qwen Base": "../datasets/Qwen3-8B-Base_predictions.json",
    # "DeepSeek-R1-Distill-Qwen": "../datasets/DeepSeek-R1-Distill-Qwen-7B_predictions.json",
    # Add more models here as needed:
    # "Another Model": "../datasets/another_model_predictions.jsonl",
}

# Run evaluation for each model
for model_name, prediction_path in models_to_evaluate.items():
    run_model_evaluation(model_name, prediction_path, GROUND_TRUTH_FILE)


##### Testing Metrics - Previous Best - DeepSeek-R1-Distill-Qwen #####
Aspect: (
   'Precision': 0.7493
   'Recall': 0.7343
   'F1': 0.7329
   'Accuracy': 0.7343
)
Sentiment: (
   'Precision': 0.7860
   'Recall': 0.8280
   'F1': 0.7767
   'Accuracy': 0.8280
)

--------------------------------------------------


In [None]:
def evaluate_aspect_and_sentiment(predictions_list, ground_truth_list, review_texts=None):
    """
    Evaluates aspect and sentiment analysis results on a per-review basis.
    - Aspect: Uses intersection/max approach to calculate accuracy
    - Sentiment: Uses two-stage approach - only evaluates sentiment on matched aspects

    Args:
        predictions_list: A list of lists, where each inner list contains tuples of (aspect, polarity, opinion, category).
                          Each inner list represents the predictions for a single review.
        ground_truth_list: A list of lists, where each inner list contains tuples of (aspect, polarity, opinion, category).
                          Each inner list represents the ground truth for a single review.
        review_texts: (Optional) A list of review texts corresponding to each review.
                      The length of each text will be used.

    Returns:
        final_accuracy: A dictionary with the averaged aspect and sentiment accuracy.
                        Format: {'Aspect': <avg_aspect_accuracy>, 'Sentiment': <avg_sentiment_accuracy>}
        per_review_output: A list of arrays for each review in the format:
                           [review_text_length, aspect_accuracy, sentiment_accuracy]
    """
    per_review_output = []  # Store per-review metrics
    sum_aspect_accuracy = 0
    sum_sentiment_accuracy = 0
    num_reviews = 0

    # Process each review's predictions and ground truth
    for idx, (predictions, ground_truth) in enumerate(zip(predictions_list, ground_truth_list)):
        # Aspect Evaluation - using sets to find unique aspects
        predicted_aspects = {p[0].lower() for p in predictions}
        actual_aspects = {gt[0].lower() for gt in ground_truth}

        # Intersection = correctly predicted aspects
        matched_aspects = predicted_aspects.intersection(actual_aspects)
        aspect_correct = len(matched_aspects)
        
        # Calculate aspect accuracy using intersection/max approach
        if len(actual_aspects) == 0 and len(predicted_aspects) == 0:
            aspect_accuracy = 1.0  # Perfect accuracy when no aspects exist and none predicted
        elif len(actual_aspects) == 0 or len(predicted_aspects) == 0:
            aspect_accuracy = 0.0  # Zero accuracy when aspects exist but none predicted (or vice versa)
        else:
            # Formula: intersection divided by max of either set
            aspect_accuracy = aspect_correct / max(len(actual_aspects), len(predicted_aspects))

        # TWO-STAGE SENTIMENT EVALUATION
        # 1. First identify matched aspects (already done above)
        # 2. For matched aspects, calculate sentiment accuracy
        
        # Create mappings from aspect to sentiment
        true_aspect_to_sentiment = {}
        pred_aspect_to_sentiment = {}
        
        # Build ground truth mapping
        for gt in ground_truth:
            aspect = gt[0].lower()
            # If multiple sentiments exist for the same aspect, keep the first one
            if aspect not in true_aspect_to_sentiment:
                true_aspect_to_sentiment[aspect] = gt[1]
        
        # Build prediction mapping
        for pred in predictions:
            aspect = pred[0].lower()
            # If multiple sentiments exist for the same aspect, keep the first one
            if aspect not in pred_aspect_to_sentiment:
                pred_aspect_to_sentiment[aspect] = pred[1]
        
        # Count sentiment matches ONLY for matched aspects
        sentiment_correct = 0
        for aspect in matched_aspects:
            if aspect in pred_aspect_to_sentiment and aspect in true_aspect_to_sentiment:
                if pred_aspect_to_sentiment[aspect] == true_aspect_to_sentiment[aspect]:
                    sentiment_correct += 1
        
        # Calculate sentiment accuracy only on matched aspects
        if len(matched_aspects) == 0:
            sentiment_accuracy = 0.0  # No matched aspects to evaluate sentiment on
        else:
            sentiment_accuracy = sentiment_correct / len(matched_aspects)

        # Determine review text length (if provided, else 0)
        if review_texts is not None and idx < len(review_texts):
            text_length = len(review_texts[idx])
        else:
            text_length = 0

        per_review_output.append([text_length, aspect_accuracy, sentiment_accuracy])
        sum_aspect_accuracy += aspect_accuracy
        sum_sentiment_accuracy += sentiment_accuracy
        num_reviews += 1

    # Final averaged accuracy metrics
    if num_reviews > 0:
        final_accuracy = {
            "Aspect": sum_aspect_accuracy / num_reviews,
            "Sentiment": sum_sentiment_accuracy / num_reviews,
        }
    else:
        final_accuracy = {"Aspect": 0, "Sentiment": 0}

    return final_accuracy, per_review_output


def enhanced_evaluate_aspect_and_sentiment(predictions_list, ground_truth_list, review_texts=None, 
                                           ignore_case=True, include_standard_metrics=False):
    """
    Enhanced evaluation with two-stage sentiment approach and detailed error analysis.
    
    Args:
        predictions_list: List of prediction tuples per review
        ground_truth_list: List of ground truth tuples per review
        review_texts: Optional list of review texts
        ignore_case: Whether to perform case-insensitive comparison of aspects
        include_standard_metrics: Whether to include precision, recall, and F1 metrics
        
    Returns:
        final_metrics: Dict with accuracy metrics
        per_review_output: List of detailed metrics for each review
    """
    per_review_output = []
    sum_aspect_accuracy = 0
    sum_sentiment_accuracy = 0
    
    # If including standard metrics
    if include_standard_metrics:
        sum_aspect_precision = 0
        sum_aspect_recall = 0
        sum_aspect_f1 = 0
        sum_sentiment_precision = 0
        sum_sentiment_recall = 0
        sum_sentiment_f1 = 0
    
    num_reviews = 0
    
    # Process each review
    for idx, (predictions, ground_truth) in enumerate(zip(predictions_list, ground_truth_list)):
        review_metrics = {}
        
        # Get review text length if available
        if review_texts is not None and idx < len(review_texts):
            review_metrics['text_length'] = len(review_texts[idx])
        else:
            review_metrics['text_length'] = 0
        
        # Convert aspects based on case sensitivity setting
        if ignore_case:
            # Case-insensitive comparison (convert all to lowercase)
            pred_aspects = [p[0].lower() for p in predictions]
            true_aspects = [gt[0].lower() for gt in ground_truth]
        else:
            # Case-sensitive comparison
            pred_aspects = [p[0] for p in predictions]
            true_aspects = [gt[0] for gt in ground_truth]
        
        # Get unique aspects
        unique_pred_aspects = set(pred_aspects)
        unique_true_aspects = set(true_aspects)
        
        # Get matched aspects (intersection)
        matched_aspects = unique_pred_aspects.intersection(unique_true_aspects)
        
        # Aspect accuracy using intersection/max formula
        if len(unique_true_aspects) == 0 and len(unique_pred_aspects) == 0:
            aspect_accuracy = 1.0
        elif len(unique_true_aspects) == 0 or len(unique_pred_aspects) == 0:
            aspect_accuracy = 0.0
        else:
            aspect_accuracy = len(matched_aspects) / max(len(unique_pred_aspects), len(unique_true_aspects))
        
        # Create mappings from aspect to sentiment
        true_aspect_to_sentiment = {}
        pred_aspect_to_sentiment = {}
        
        # Build mappings (keeping first occurrence of each aspect)
        for i, aspect in enumerate(true_aspects):
            if aspect not in true_aspect_to_sentiment:
                true_aspect_to_sentiment[aspect] = ground_truth[i][1]
                
        for i, aspect in enumerate(pred_aspects):
            if aspect not in pred_aspect_to_sentiment:
                pred_aspect_to_sentiment[aspect] = predictions[i][1]
        
        # Two-stage sentiment evaluation
        sentiment_correct = 0
        for aspect in matched_aspects:
            if pred_aspect_to_sentiment.get(aspect) == true_aspect_to_sentiment.get(aspect):
                sentiment_correct += 1
                
        # Calculate sentiment accuracy on matched aspects
        if len(matched_aspects) == 0:
            sentiment_accuracy = 0.0
        else:
            sentiment_accuracy = sentiment_correct / len(matched_aspects)
        
        # Store primary metrics
        review_metrics['aspect_accuracy'] = aspect_accuracy
        review_metrics['sentiment_accuracy'] = sentiment_accuracy
        
        # Store detailed metrics
        review_metrics['details'] = {
            'matched_aspects': len(matched_aspects),
            'total_pred_aspects': len(unique_pred_aspects),
            'total_true_aspects': len(unique_true_aspects),
            'sentiment_correct': sentiment_correct
        }
        
        # Calculate standard metrics if requested
        if include_standard_metrics:
            # Precision = correct / predicted
            aspect_precision = len(matched_aspects) / len(unique_pred_aspects) if len(unique_pred_aspects) > 0 else 0
            # Recall = correct / actual
            aspect_recall = len(matched_aspects) / len(unique_true_aspects) if len(unique_true_aspects) > 0 else 0
            # F1 = 2 * precision * recall / (precision + recall)
            aspect_f1 = 0
            if aspect_precision + aspect_recall > 0:
                aspect_f1 = 2 * aspect_precision * aspect_recall / (aspect_precision + aspect_recall)
            
            # Store standard metrics
            review_metrics['aspect_standard'] = {
                'precision': aspect_precision,
                'recall': aspect_recall,
                'f1': aspect_f1
            }
            
            # Update sums for standard metrics
            sum_aspect_precision += aspect_precision
            sum_aspect_recall += aspect_recall
            sum_aspect_f1 += aspect_f1
        
        # Error analysis
        false_positive_aspects = unique_pred_aspects - unique_true_aspects
        false_negative_aspects = unique_true_aspects - unique_pred_aspects
        
        # Find aspects with correct identification but wrong sentiment
        correct_aspect_wrong_sentiment = []
        for aspect in matched_aspects:
            if pred_aspect_to_sentiment.get(aspect) != true_aspect_to_sentiment.get(aspect):
                correct_aspect_wrong_sentiment.append({
                    'aspect': aspect,
                    'predicted_sentiment': pred_aspect_to_sentiment.get(aspect),
                    'true_sentiment': true_aspect_to_sentiment.get(aspect)
                })
        
        review_metrics['error_analysis'] = {
            'false_positive_aspects': list(false_positive_aspects),
            'false_negative_aspects': list(false_negative_aspects),
            'correct_aspect_wrong_sentiment': correct_aspect_wrong_sentiment
        }
        
        per_review_output.append(review_metrics)
        sum_aspect_accuracy += aspect_accuracy
        sum_sentiment_accuracy += sentiment_accuracy
        num_reviews += 1
    
    # Calculate average metrics
    final_metrics = {
        'Aspect': sum_aspect_accuracy / num_reviews if num_reviews > 0 else 0,
        'Sentiment': sum_sentiment_accuracy / num_reviews if num_reviews > 0 else 0
    }
    
    # Add standard metrics if requested
    if include_standard_metrics and num_reviews > 0:
        final_metrics['Aspect_Standard'] = {
            'precision': sum_aspect_precision / num_reviews,
            'recall': sum_aspect_recall / num_reviews,
            'f1': sum_aspect_f1 / num_reviews
        }
    
    return final_metrics, per_review_output


# Example usage
if __name__ == "__main__":
    # First use the basic evaluation function (direct replacement for original code)
    final_accuracy, per_review_array = evaluate_aspect_and_sentiment(
        predictions_list[0], ground_truth_list[0], review_texts
    )
    
    print("Final Accuracy Metrics:")
    print(final_accuracy)
    print("\nChecking for invalid accuracy values (outside [0.0, 1.0]):")
    invalid_found = False
    for i in range(len(per_review_array)):
        aspect_accuracy = per_review_array[i][1]
        sentiment_accuracy = per_review_array[i][2]
        
        if aspect_accuracy > 1.0 or aspect_accuracy < 0.0 or sentiment_accuracy > 1.0 or sentiment_accuracy < 0.0:
            print(f"Review {i+1}: {per_review_array[i]} - Invalid accuracy detected!")
            invalid_found = True

    if not invalid_found:
        print("All accuracy values are within valid range [0.0, 1.0]")
    
    # Then use the enhanced version with more detailed output
    final_metrics, detailed_metrics = enhanced_evaluate_aspect_and_sentiment(
        predictions_list[0], ground_truth_list[0], review_texts,
        ignore_case=True, include_standard_metrics=True
    )
    
    print("\nEnhanced Evaluation Results:")
    print("Accuracy metrics:")
    print(f"  Aspect (intersection/max): {final_metrics['Aspect']:.4f}")
    print(f"  Sentiment (on matched aspects): {final_metrics['Sentiment']:.4f}")
    
    if 'Aspect_Standard' in final_metrics:
        print("\nStandard metrics for comparison:")
        print(f"  Aspect precision: {final_metrics['Aspect_Standard']['precision']:.4f}")
        print(f"  Aspect recall: {final_metrics['Aspect_Standard']['recall']:.4f}")
        print(f"  Aspect F1: {final_metrics['Aspect_Standard']['f1']:.4f}")
    
    # Show detailed error analysis for one review
    if len(detailed_metrics) > 0:
        print("\nDetailed Error Analysis for first review:")
        errors = detailed_metrics[0]['error_analysis']
        print(f"  False positive aspects: {errors['false_positive_aspects']}")
        print(f"  False negative aspects: {errors['false_negative_aspects']}")
        print(f"  Correct aspect wrong sentiment: {errors['correct_aspect_wrong_sentiment']}")



Final Accuracy Metrics:
{'Aspect': 0.7163398692810458, 'Sentiment': 0.7006740196078431}

Checking for invalid accuracy values (outside [0.0, 1.0]):
All accuracy values are within valid range [0.0, 1.0]

Enhanced Evaluation Results:
Accuracy metrics:
  Aspect (intersection/max): 0.7163
  Sentiment (on matched aspects): 0.7007

Standard metrics for comparison:
  Aspect precision: 0.7493
  Aspect recall: 0.7343
  Aspect F1: 0.7329

Detailed Error Analysis for first review:
  False positive aspects: ['unit cost']
  False negative aspects: ['unit']
  Correct aspect wrong sentiment: []


In [15]:
import csv

def write_metrics_to_csv(per_review_array, output_filepath):
    """
    Writes review metrics (text length, aspect accuracy, sentiment accuracy) to a CSV file.
    
    Args:
        per_review_array: List of arrays, where each array contains 
                          [review_text_length, aspect_accuracy, sentiment_accuracy]
        output_filepath: Path where the CSV file will be saved
    """
    try:
        with open(output_filepath, 'w', newline='') as csvfile:
            csvwriter = csv.writer(csvfile)
            
            # Write header
            csvwriter.writerow(['Text_Length', 'Aspect_Accuracy', 'Sentiment_Accuracy'])
            
            # Write data for each review
            for metrics in per_review_array:
                text_length = metrics[0]
                aspect_accuracy = metrics[1]
                sentiment_accuracy = metrics[2]
                
                csvwriter.writerow([text_length, aspect_accuracy, sentiment_accuracy])
                
        print(f"Metrics successfully written to {output_filepath}")
        
    except Exception as e:
        print(f"Error writing to CSV: {e}")


# Example usage
if __name__ == "__main__":
    # After running evaluation
    final_accuracy, per_review_array = evaluate_aspect_and_sentiment(
        predictions_list[0], ground_truth_list[0], review_texts
    )
    
    # Write metrics to CSV
    write_metrics_to_csv(per_review_array, "review_metrics.csv")

Metrics successfully written to review_metrics.csv
