# Intent evaluator: Fine-tuned vs. Default

In [None]:
import torch
import numpy as np
import pandas as pd
from typing import Dict, List, Tuple
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_curve,
    auc,
    precision_recall_curve,
    average_precision_score
)
from transformers import (
    Pipeline,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TextClassificationPipeline
)
from datasets import Dataset
import scipy.stats as stats

In [None]:
class ModelEvaluator:
    """Compare a baseline ("default") classifier against a fine-tuned one.

    Both models are loaded as sequence-classification models, wrapped in
    text-classification pipelines and scored on the same test set.  The
    default model's ``label2id`` / ``id2label`` mappings are used as the
    reference label space for metrics and plots.
    """

    def __init__(
        self,
        default_model_name: str,
        finetuned_model_path: str,
        task_type: str = "sentiment",
        device: str = "cuda" if torch.cuda.is_available() else "cpu"
    ):
        """Initialize model evaluator.

        Args:
            default_model_name: HuggingFace model name/path for default model
            finetuned_model_path: Path to saved fine-tuned model
            task_type: One of ["sentiment", "intent", "topic"]
            device: Device to run models on
        """
        self.task_type = task_type
        self.device = device

        # Load both models
        self.default_tokenizer = AutoTokenizer.from_pretrained(
            default_model_name)
        self.default_model = AutoModelForSequenceClassification.from_pretrained(
            default_model_name
        ).to(device)

        self.finetuned_tokenizer = AutoTokenizer.from_pretrained(
            finetuned_model_path)
        self.finetuned_model = AutoModelForSequenceClassification.from_pretrained(
            finetuned_model_path
        ).to(device)

        # Create pipelines
        self.default_pipeline = TextClassificationPipeline(
            model=self.default_model,
            tokenizer=self.default_tokenizer,
            device=device
        )

        self.finetuned_pipeline = TextClassificationPipeline(
            model=self.finetuned_model,
            tokenizer=self.finetuned_tokenizer,
            device=device
        )

        # Reference label space (taken from the default model).
        self.label2id = self.default_model.config.label2id
        self.id2label = self.default_model.config.id2label

    def get_predictions(
        self,
        texts: List[str],
        pipeline: "Pipeline"
    ) -> Tuple[List[int], List[float]]:
        """Get predictions and confidence scores from a pipeline.

        String labels are resolved through the reference ``self.label2id``
        first.  If the name is unknown there (e.g. the fine-tuned model was
        saved with different label names), the mapping of the model behind
        ``pipeline`` is tried, and only then does the prediction become -1.

        Args:
            texts: Input texts to classify.
            pipeline: Callable returning one ``{'label', 'score'}`` dict per
                input text.

        Returns:
            ``(label_ids, scores)`` with one entry per input text.
        """
        results = pipeline(list(texts))

        # Mapping owned by the pipeline's model; absent on simple callables.
        own_config = getattr(getattr(pipeline, "model", None), "config", None)
        own_label2id = getattr(own_config, "label2id", None) or {}

        predictions = []
        confidences = []
        for result in results:
            label = result['label']
            if isinstance(label, str):
                label_id = self.label2id.get(label)
                if label_id is None:
                    label_id = own_label2id.get(label, -1)
            else:
                label_id = label
            predictions.append(label_id)
            confidences.append(result['score'])

        return predictions, confidences

    def evaluate_on_dataset(
        self,
        test_dataset: "Dataset",
        text_column: str = "text",
        label_column: str = "label"
    ) -> Dict:
        """Evaluate both models on a test dataset.

        Args:
            test_dataset: Dataset indexable by column name.
            text_column: Name of the column holding the input texts.
            label_column: Name of the column holding integer class ids.

        Returns:
            Dict with keys ``'default'`` and ``'finetuned'`` (per-model
            metrics, plus the raw ``'predictions'`` and ``'confidences'``)
            and ``'statistical_tests'``.
        """
        texts = list(test_dataset[text_column])
        true_labels = list(test_dataset[label_column])

        # Get predictions from both models
        default_preds, default_conf = self.get_predictions(
            texts, self.default_pipeline)
        finetuned_preds, finetuned_conf = self.get_predictions(
            texts, self.finetuned_pipeline)

        # Calculate metrics
        metrics = {
            'default': self._calculate_metrics(true_labels, default_preds, default_conf),
            'finetuned': self._calculate_metrics(true_labels, finetuned_preds, finetuned_conf)
        }

        # Keep the raw outputs so task-specific subclasses can analyse them
        # without running inference a second time.
        metrics['default']['predictions'] = default_preds
        metrics['default']['confidences'] = default_conf
        metrics['finetuned']['predictions'] = finetuned_preds
        metrics['finetuned']['confidences'] = finetuned_conf

        # Perform statistical significance tests
        metrics['statistical_tests'] = self._perform_statistical_tests(
            true_labels, default_preds, finetuned_preds
        )

        return metrics

    def _calculate_metrics(
        self,
        true_labels: List[int],
        pred_labels: List[int],
        confidences: List[float]
    ) -> Dict:
        """Calculate comprehensive metrics for model evaluation.

        Args:
            true_labels: Ground-truth class ids.
            pred_labels: Predicted class ids (-1 for unmapped labels).
            confidences: Score of the predicted class for each sample.

        Returns:
            Dict with ``'classification_report'`` (keyed by label name),
            ``'confusion_matrix'``, ``'roc_curves'``, ``'pr_curves'`` and
            ``'confidence_stats'``.
        """
        metrics = {}

        # Fix the class order explicitly so the report keys, the confusion
        # matrix shape and the plot tick labels always agree.
        class_ids = sorted(self.id2label)
        class_names = [self.id2label[class_id] for class_id in class_ids]

        # Basic classification metrics
        metrics['classification_report'] = classification_report(
            true_labels,
            pred_labels,
            labels=class_ids,
            target_names=class_names,
            output_dict=True,
            zero_division=0
        )
        metrics['confusion_matrix'] = confusion_matrix(
            true_labels, pred_labels, labels=class_ids)

        # Calculate ROC and PR curves for each class
        metrics['roc_curves'] = {}
        metrics['pr_curves'] = {}

        for class_id, class_name in zip(class_ids, class_names):
            # Convert to binary problem for each class.  Only the top-class
            # score is available, so "1 - conf" is an approximation of the
            # score for classes that were not predicted.
            binary_true = [1 if label == class_id else 0
                           for label in true_labels]
            binary_conf = [conf if pred == class_id else 1 - conf
                           for pred, conf in zip(pred_labels, confidences)]
            n_positive = sum(binary_true)
            n_negative = len(binary_true) - n_positive

            # ROC curve (undefined unless both classes are present)
            if n_positive and n_negative:
                fpr, tpr, _ = roc_curve(binary_true, binary_conf)
                metrics['roc_curves'][class_name] = {
                    'fpr': fpr.tolist(),
                    'tpr': tpr.tolist(),
                    'auc': auc(fpr, tpr)
                }
            else:
                metrics['roc_curves'][class_name] = {
                    'fpr': [], 'tpr': [], 'auc': float('nan')
                }

            # PR curve (undefined without any positive sample)
            if n_positive:
                precision, recall, _ = precision_recall_curve(
                    binary_true, binary_conf)
                metrics['pr_curves'][class_name] = {
                    'precision': precision.tolist(),
                    'recall': recall.tolist(),
                    'avg_precision': average_precision_score(
                        binary_true, binary_conf)
                }
            else:
                metrics['pr_curves'][class_name] = {
                    'precision': [], 'recall': [],
                    'avg_precision': float('nan')
                }

        # Confidence analysis
        correct_conf = [conf for pred, conf, true
                        in zip(pred_labels, confidences, true_labels)
                        if pred == true]
        incorrect_conf = [conf for pred, conf, true
                          in zip(pred_labels, confidences, true_labels)
                          if pred != true]

        def _mean_or_nan(values):
            # np.mean([]) warns and returns NaN; return NaN quietly instead.
            return float(np.mean(values)) if len(values) else float('nan')

        metrics['confidence_stats'] = {
            'mean': _mean_or_nan(confidences),
            'std': float(np.std(confidences)) if len(confidences) else float('nan'),
            'median': float(np.median(confidences)) if len(confidences) else float('nan'),
            'correct_conf': _mean_or_nan(correct_conf),
            'incorrect_conf': _mean_or_nan(incorrect_conf)
        }

        return metrics

    def _perform_statistical_tests(
        self,
        true_labels: List[int],
        default_preds: List[int],
        finetuned_preds: List[int]
    ) -> Dict:
        """Perform statistical significance tests between models.

        Runs an exact (binomial) McNemar test on the paired correct/incorrect
        outcomes of the two models.  ``scipy.stats`` has no ``mcnemar``
        function, so the test is computed from the discordant pairs directly.

        Returns:
            Dict with ``'mcnemar_test'`` (``'statistic'`` is the smaller
            discordant count, ``'p_value'`` the two-sided exact p-value) and
            the 2x2 ``'contingency_table'`` indexed as
            ``[default_correct][finetuned_correct]``.
        """
        default_correct = [1 if d == t else 0
                           for d, t in zip(default_preds, true_labels)]
        finetuned_correct = [1 if f == t else 0
                             for f, t in zip(finetuned_preds, true_labels)]

        contingency_table = np.zeros((2, 2))
        for d, f in zip(default_correct, finetuned_correct):
            contingency_table[d][f] += 1

        # Discordant pairs: exactly one of the two models was right.
        only_finetuned_right = int(contingency_table[0][1])
        only_default_right = int(contingency_table[1][0])
        n_discordant = only_finetuned_right + only_default_right
        mcnemar_statistic = min(only_finetuned_right, only_default_right)

        if n_discordant == 0:
            # The models never disagree on correctness: no evidence at all.
            mcnemar_p_value = 1.0
        else:
            mcnemar_p_value = min(
                1.0,
                2.0 * stats.binom.cdf(mcnemar_statistic, n_discordant, 0.5)
            )

        return {
            'mcnemar_test': {
                'statistic': float(mcnemar_statistic),
                'p_value': float(mcnemar_p_value)
            },
            'contingency_table': contingency_table.tolist()
        }

    def plot_evaluation_results(self, metrics: Dict, save_path: str = None):
        """Create comprehensive visualization of evaluation results.

        Args:
            metrics: Result of ``evaluate_on_dataset``.
            save_path: If given, the figure is also written to this path.
        """
        plt.figure(figsize=(20, 15))

        # 1. Confusion Matrices
        plt.subplot(2, 3, 1)
        self._plot_confusion_matrix(
            metrics['default']['confusion_matrix'],
            'Default Model Confusion Matrix'
        )

        plt.subplot(2, 3, 2)
        self._plot_confusion_matrix(
            metrics['finetuned']['confusion_matrix'],
            'Fine-tuned Model Confusion Matrix'
        )

        # 2. ROC Curves
        plt.subplot(2, 3, 3)
        self._plot_roc_curves(
            metrics['default']['roc_curves'],
            metrics['finetuned']['roc_curves']
        )

        # 3. PR Curves
        plt.subplot(2, 3, 4)
        self._plot_pr_curves(
            metrics['default']['pr_curves'],
            metrics['finetuned']['pr_curves']
        )

        # 4. Confidence Distribution
        plt.subplot(2, 3, 5)
        self._plot_confidence_distribution(
            metrics['default']['confidence_stats'],
            metrics['finetuned']['confidence_stats']
        )

        # 5. Performance Comparison
        plt.subplot(2, 3, 6)
        self._plot_performance_comparison(
            metrics['default']['classification_report'],
            metrics['finetuned']['classification_report']
        )

        plt.tight_layout()
        if save_path:
            plt.savefig(save_path)
        plt.show()

    def _ordered_label_names(self) -> List[str]:
        """Return label names sorted by class id (the order used in metrics)."""
        return [self.id2label[class_id] for class_id in sorted(self.id2label)]

    def _plot_confusion_matrix(self, cm: np.ndarray, title: str):
        """Plot a confusion matrix on the current axes."""
        tick_labels = self._ordered_label_names()
        sns.heatmap(
            cm,
            annot=True,
            fmt='d',
            cmap='Blues',
            xticklabels=tick_labels,
            yticklabels=tick_labels
        )
        plt.title(title)
        plt.xlabel('Predicted')
        plt.ylabel('True')

    def _plot_roc_curves(self, default_curves: Dict, finetuned_curves: Dict):
        """Plot ROC curves for all classes (dashed: default, solid: fine-tuned)."""
        for label in self._ordered_label_names():
            # Default model
            plt.plot(
                default_curves[label]['fpr'],
                default_curves[label]['tpr'],
                '--',
                label=f'Default - {label} (AUC = {default_curves[label]["auc"]:.2f})'
            )
            # Fine-tuned model
            plt.plot(
                finetuned_curves[label]['fpr'],
                finetuned_curves[label]['tpr'],
                '-',
                label=f'Fine-tuned - {label} (AUC = {finetuned_curves[label]["auc"]:.2f})'
            )

        # Chance diagonal for reference.
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curves Comparison')
        plt.legend(loc="lower right")

    def _plot_pr_curves(self, default_curves: Dict, finetuned_curves: Dict):
        """Plot Precision-Recall curves for all classes."""
        for label in self._ordered_label_names():
            # Default model
            plt.plot(
                default_curves[label]['recall'],
                default_curves[label]['precision'],
                '--',
                label=f'Default - {label} (AP = {default_curves[label]["avg_precision"]:.2f})'
            )
            # Fine-tuned model
            plt.plot(
                finetuned_curves[label]['recall'],
                finetuned_curves[label]['precision'],
                '-',
                label=f'Fine-tuned - {label} (AP = {finetuned_curves[label]["avg_precision"]:.2f})'
            )

        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title('Precision-Recall Curves Comparison')
        plt.legend(loc="lower left")

    def _plot_confidence_distribution(
        self,
        default_stats: Dict,
        finetuned_stats: Dict
    ):
        """Plot mean confidence on correct vs. incorrect predictions per model."""
        # Named conf_df (not "stats") to avoid shadowing the scipy.stats module.
        conf_df = pd.DataFrame({
            'Model': ['Default', 'Fine-tuned'] * 2,
            'Type': ['Correct', 'Correct', 'Incorrect', 'Incorrect'],
            'Confidence': [
                default_stats['correct_conf'],
                finetuned_stats['correct_conf'],
                default_stats['incorrect_conf'],
                finetuned_stats['incorrect_conf']
            ]
        })

        sns.barplot(x='Model', y='Confidence', hue='Type', data=conf_df)
        plt.title('Confidence Score Distribution')

    def _plot_performance_comparison(
        self,
        default_report: Dict,
        finetuned_report: Dict
    ):
        """Plot per-label precision/recall/F1 of both models side by side.

        Expects reports keyed by label name, as produced by
        ``_calculate_metrics``.
        """
        metrics = ['precision', 'recall', 'f1-score']
        labels = self._ordered_label_names() + ['macro avg']

        comparison_data = [
            {
                'Label': label,
                'Metric': metric,
                'Default': default_report[label][metric],
                'Fine-tuned': finetuned_report[label][metric]
            }
            for label in labels
            for metric in metrics
        ]

        df = pd.DataFrame(comparison_data)
        df_melted = df.melt(
            id_vars=['Label', 'Metric'],
            var_name='Model',
            value_name='Score'
        )

        sns.barplot(
            data=df_melted,
            x='Label',
            y='Score',
            hue='Model',
            palette=['lightblue', 'darkblue'],
            alpha=0.6
        )
        plt.xticks(rotation=45)
        plt.title('Performance Metrics Comparison')
        plt.tight_layout()


# Example usage:
if __name__ == "__main__":
    from datasets import load_dataset

    # Sentiment-analysis walkthrough: build the evaluator, score a test
    # split, draw the comparison figure and report McNemar's test.
    evaluator = ModelEvaluator(
        "distilbert-base-uncased-finetuned-sst-2-english",
        "path/to/your/finetuned/sentiment/model",
        "sentiment",
    )

    # Replace the dataset name with your own test data.
    test_dataset = load_dataset("your_test_dataset")["test"]

    metrics = evaluator.evaluate_on_dataset(
        test_dataset, text_column="text", label_column="label")

    evaluator.plot_evaluation_results(
        metrics, save_path="evaluation_results.png")

    # Report whether the two models differ significantly at the 5% level.
    print("\nStatistical Significance Tests:")
    print(
        f"McNemar's test p-value: {metrics['statistical_tests']['mcnemar_test']['p_value']}")
    print(
        "The difference between models is statistically significant"
        if metrics['statistical_tests']['mcnemar_test']['p_value'] < 0.05
        else "The difference between models is not statistically significant"
    )

In [None]:
# Intent Recognition & Topic Model Evaluators
class IntentRecognitionEvaluator(ModelEvaluator):
    """Specialized evaluator for intent recognition models.

    Adds two analyses on top of the base metrics, for each of the
    ``'default'`` and ``'finetuned'`` models: which intent pairs get
    confused, and how consecutive-sample intent transitions are predicted.
    """

    def evaluate_on_dataset(
        self,
        test_dataset: Dataset,
        text_column: str = "text",
        label_column: str = "label"
    ):
        """Run the base evaluation and attach intent-specific analyses.

        Args:
            test_dataset: Dataset indexable by column name.
            text_column: Name of the column holding the input texts.
            label_column: Name of the column holding integer intent ids.

        Returns:
            The base metrics dict, with ``'intent_confusion'`` and
            ``'intent_transitions'`` added under each model key.
        """
        metrics = super().evaluate_on_dataset(
            test_dataset, text_column=text_column, label_column=label_column)

        texts = list(test_dataset[text_column])
        # Use the caller's label column rather than a hard-coded 'label'.
        true_intents = list(test_dataset[label_column])

        pipelines = {
            'default': self.default_pipeline,
            'finetuned': self.finetuned_pipeline
        }
        for model_type, pipeline in pipelines.items():
            # Reuse stored predictions when the base evaluation provides
            # them; otherwise run the pipeline to obtain them.
            pred_intents = metrics[model_type].get('predictions')
            if pred_intents is None:
                pred_intents, _ = self.get_predictions(texts, pipeline)

            # Add intent confusion analysis
            metrics[model_type]['intent_confusion'] = self._analyze_intent_confusion(
                true_intents, pred_intents)

            # Add intent transition analysis
            metrics[model_type]['intent_transitions'] = self._analyze_intent_transitions(
                true_intents, pred_intents)

        return metrics

    def _intent_name(self, intent_id):
        """Return the label name for an id, tolerating unmapped ids such as -1."""
        return self.id2label.get(intent_id, f"unknown({intent_id})")

    def _analyze_intent_confusion(self, true_intents, pred_intents):
        """Analyze which intents are most commonly confused.

        Returns:
            Dict mapping ``(true_name, predicted_name)`` to the number of
            misclassified samples, ordered from most to least frequent.
        """
        from collections import Counter

        confusion_counts = Counter(
            (self._intent_name(true), self._intent_name(pred))
            for true, pred in zip(true_intents, pred_intents)
            if true != pred
        )
        return dict(confusion_counts.most_common())

    def _analyze_intent_transitions(self, true_intents, pred_intents):
        """Analyze intent prediction transitions in conversation flow.

        Consecutive samples are treated as consecutive turns.

        Returns:
            Dict mapping ``((true_i, true_i+1), (pred_i, pred_i+1))`` name
            tuples to their counts, ordered from most to least frequent.
        """
        from collections import Counter

        true_names = [self._intent_name(intent) for intent in true_intents]
        pred_names = [self._intent_name(intent) for intent in pred_intents]

        # Pair each sample with its successor; zip stops at the shorter
        # sequence, so mismatched lengths cannot raise IndexError.
        true_steps = zip(true_names, true_names[1:])
        pred_steps = zip(pred_names, pred_names[1:])

        transition_counts = Counter(zip(true_steps, pred_steps))
        return dict(transition_counts.most_common())


class TopicModelEvaluator(ModelEvaluator):
    """Specialized evaluator for topic classification models.

    Adds, for each of the ``'default'`` and ``'finetuned'`` models, a
    TF-IDF based coherence score per predicted topic and the distribution
    and entropy of the predicted topics.
    """

    def evaluate_on_dataset(
        self,
        test_dataset: Dataset,
        text_column: str = "text",
        label_column: str = "label"
    ):
        """Run the base evaluation and attach topic-specific analyses.

        Args:
            test_dataset: Dataset indexable by column name.
            text_column: Name of the column holding the input texts.
            label_column: Name of the column holding integer topic ids.

        Returns:
            The base metrics dict, with ``'topic_coherence'`` and
            ``'topic_diversity'`` added under each model key.
        """
        metrics = super().evaluate_on_dataset(
            test_dataset, text_column=text_column, label_column=label_column)

        # Use the caller's text column rather than a hard-coded 'text'.
        texts = list(test_dataset[text_column])

        pipelines = {
            'default': self.default_pipeline,
            'finetuned': self.finetuned_pipeline
        }
        for model_type, pipeline in pipelines.items():
            # Reuse stored predictions when the base evaluation provides
            # them; otherwise run the pipeline to obtain them.
            topic_assignments = metrics[model_type].get('predictions')
            if topic_assignments is None:
                topic_assignments, _ = self.get_predictions(texts, pipeline)

            # Add topic coherence analysis
            metrics[model_type]['topic_coherence'] = self._analyze_topic_coherence(
                texts, topic_assignments)

            # Add topic diversity analysis
            metrics[model_type]['topic_diversity'] = self._analyze_topic_diversity(
                texts, topic_assignments)

        return metrics

    def _analyze_topic_coherence(self, texts, topic_assignments):
        """Analyze semantic coherence of texts within each topic.

        Coherence is the mean pairwise cosine similarity of the TF-IDF
        vectors of all texts assigned to a topic.  Topics with fewer than two
        texts (no pairs to compare) or with no usable vocabulary are omitted.

        Returns:
            Dict mapping topic name to its coherence score.
        """
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.metrics.pairwise import cosine_similarity

        coherence_scores = {}

        for topic_id, topic_name in self.id2label.items():
            # Get texts for this topic
            topic_texts = [text for text, topic in zip(texts, topic_assignments)
                           if topic == topic_id]

            # A pairwise average needs at least one pair.
            if len(topic_texts) < 2:
                continue

            # Calculate TF-IDF vectors
            vectorizer = TfidfVectorizer(stop_words='english')
            try:
                tfidf_matrix = vectorizer.fit_transform(topic_texts)
            except ValueError:
                # Raised when no terms survive tokenization/stop-word
                # removal; such a topic cannot be scored.
                continue

            # Calculate average pairwise similarity (upper triangle only, so
            # each pair counts once and self-similarity is excluded).
            similarities = cosine_similarity(tfidf_matrix)
            pair_indices = np.triu_indices(similarities.shape[0], k=1)
            coherence_scores[topic_name] = float(
                np.mean(similarities[pair_indices]))

        return coherence_scores

    def _analyze_topic_diversity(self, texts, topic_assignments):
        """Analyze diversity of topics and their distributions.

        Returns:
            Dict with ``'topic_distribution'`` (topic name -> count) and
            ``'topic_entropy'`` (Shannon entropy in nats of the predicted
            topic distribution; 0.0 for empty input).
        """
        from collections import Counter

        # Topic distribution
        topic_dist = Counter(topic_assignments)

        # Calculate entropy of distribution
        total = sum(topic_dist.values())
        probs = [count / total for count in topic_dist.values()]
        entropy = -sum(p * np.log(p) for p in probs if p > 0)

        return {
            # Unmapped ids (e.g. -1) get a placeholder name instead of
            # raising KeyError.
            'topic_distribution': {
                self.id2label.get(k, f"unknown({k})"): v
                for k, v in topic_dist.items()
            },
            'topic_entropy': float(entropy)
        }


# Example usage script showing how to evaluate all three model types
def evaluate_all_models(test_data_paths: Dict[str, str], model_paths: Dict[str, Dict[str, str]]):
    """
    Evaluate sentiment, intent, and topic models.

    For each task the matching evaluator is built, the CSV test set is
    loaded, both models are scored, a comparison figure is saved as
    ``<task>_evaluation.png`` and a textual summary is printed.

    Args:
        test_data_paths: Dict with paths to test datasets for each task
        model_paths: Dict with default and finetuned model paths for each task

    Returns:
        Dict mapping task name to the metrics returned by its evaluator.
    """
    # Imported here so the function does not depend on an earlier notebook
    # cell / __main__ block having imported load_dataset.
    from datasets import load_dataset

    # (task key, progress message, evaluator class)
    task_specs = (
        ('sentiment', "Evaluating Sentiment Analysis Models...", ModelEvaluator),
        ('intent', "\nEvaluating Intent Recognition Models...", IntentRecognitionEvaluator),
        ('topic', "\nEvaluating Topic Classification Models...", TopicModelEvaluator),
    )

    results = {}

    for task, banner, evaluator_cls in task_specs:
        print(banner)
        # Rebinding "evaluator" each iteration lets the previous task's
        # models be garbage-collected before the next pair is loaded.
        evaluator = evaluator_cls(
            default_model_name=model_paths[task]['default'],
            finetuned_model_path=model_paths[task]['finetuned'],
            task_type=task
        )

        # load_dataset("csv", ...) exposes a single file under 'train'.
        test_data = load_dataset("csv", data_files=test_data_paths[task])
        results[task] = evaluator.evaluate_on_dataset(test_data['train'])
        evaluator.plot_evaluation_results(
            results[task],
            save_path=f"{task}_evaluation.png"
        )

    # Print summary of improvements
    print("\nSummary of Improvements:")
    for task, _, _ in task_specs:
        print(f"\n{task.upper()} ANALYSIS:")
        default_f1 = results[task]['default']['classification_report']['macro avg']['f1-score']
        finetuned_f1 = results[task]['finetuned']['classification_report']['macro avg']['f1-score']

        print(f"Default Model F1: {default_f1:.3f}")
        print(f"Fine-tuned Model F1: {finetuned_f1:.3f}")
        if default_f1:
            improvement = ((finetuned_f1 - default_f1) / default_f1) * 100
            print(f"Improvement: {improvement:.1f}%")
        else:
            # A relative improvement over a zero baseline is undefined.
            print("Improvement: n/a (default model F1 is 0)")

        # Print statistical significance
        p_value = results[task]['statistical_tests']['mcnemar_test']['p_value']
        print(f"Statistical Significance (p-value): {p_value:.4f}")

        if task == 'intent':
            print("\nMost Common Intent Confusions (Fine-tuned model):")
            confusions = results[task]['finetuned']['intent_confusion']
            # Sort explicitly so "most common" does not rely on dict order.
            top_confusions = sorted(
                confusions.items(), key=lambda item: item[1], reverse=True)[:5]
            for (true_intent, pred_intent), count in top_confusions:
                print(f"{true_intent} → {pred_intent}: {count} times")

        elif task == 'topic':
            print("\nTopic Coherence Scores (Fine-tuned model):")
            coherence_scores = results[task]['finetuned']['topic_coherence']
            for topic, score in sorted(coherence_scores.items(), key=lambda x: x[1], reverse=True)[:5]:
                print(f"{topic}: {score:.3f}")

    return results


# Example usage:
if __name__ == "__main__":
    # CSV test set for each task.
    test_data_paths = dict(
        sentiment='path/to/sentiment_test.csv',
        intent='path/to/intent_test.csv',
        topic='path/to/topic_test.csv',
    )

    # Baseline checkpoint and fine-tuned model directory for each task.
    model_paths = {}
    model_paths['sentiment'] = dict(
        default='distilbert-base-uncased-finetuned-sst-2-english',
        finetuned='path/to/finetuned/sentiment/model',
    )
    model_paths['intent'] = dict(
        default='Falconsai/intent_classification',
        finetuned='path/to/finetuned/intent/model',
    )
    model_paths['topic'] = dict(
        default='dstefa/roberta-base_topic_classification_nyt_news',
        finetuned='path/to/finetuned/topic/model',
    )

    results = evaluate_all_models(test_data_paths, model_paths)