<a href="https://colab.research.google.com/github/Bstrato/medical-conja-evaluator/blob/main/LLM_Reasoning_over_graph_and_tabular.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import random
import numpy as np
import torch
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from typing import Dict, List, Tuple, Any, Optional
import pandas as pd
from datetime import datetime
import os
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns




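# Model identifiers noted by the author (presumably alternatives for the
# evaluator's `model_name` argument; confirm before relying on any of them):
#   ./medical_conja_finetuned
#   Qwen/Qwen2-0.5B-Instruct
#   Table-R1/Table-R1-SFT-8B
#   PKU-ML/G1-3B
#   zzachw12/llemr-v1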

class EnhancedMedicalConJEvaluator:
    """Enhanced Medical admission evaluator using Con-J with LLM judge evaluation"""

    def __init__(self, model_name: str = "Qwen/Qwen2-0.5B-Instruct", seed: int = 2021):
        """Seed the RNGs, then load the tokenizer and causal LM for ``model_name``.

        Args:
            model_name: Identifier handed to ``from_pretrained`` for both the
                tokenizer and the model.
            seed: Value applied by ``set_seed``.
        """
        self.seed = seed
        self.set_seed()
        self.model_name = model_name
        self.evaluation_history = []
        self.bias_metrics = {}

        print(f"Loading enhanced Medical Con-J evaluator with {model_name}")

        self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

        # Half precision and automatic device placement are requested only when CUDA is available
        if torch.cuda.is_available():
            weight_dtype, placement = torch.float16, "auto"
        else:
            weight_dtype, placement = torch.float32, None

        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=weight_dtype,
            device_map=placement,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
            load_in_8bit=False,
        )

        # Reuse the EOS token for padding when the tokenizer defines no pad token
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        print("Enhanced Medical Con-J evaluator loaded successfully!")
        first_param = next(self.model.parameters())
        print(f"Model device: {first_param.device}")
        print(f"Model dtype: {first_param.dtype}")

    def set_seed(self):
        """Set random seeds for reproducibility"""
        random.seed(self.seed)
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)

    def load_admission_patterns(self, file_path: str) -> List[Dict]:
        """Load medical admission patterns from JSONL file"""
        patterns = []

        try:
            if not os.path.exists(file_path):
                print(f"Warning: File {file_path} not found. Creating sample data.")
                return self.create_sample_patterns()

            with open(file_path, 'r', encoding='utf-8') as f:
                for line_num, line in enumerate(f, 1):
                    try:
                        if line.strip():
                            raw_pattern = json.loads(line.strip())
                            processed_pattern = self.process_raw_pattern(raw_pattern)
                            if processed_pattern:
                                patterns.append(processed_pattern)
                    except json.JSONDecodeError as e:
                        print(f"Warning: Skipping malformed JSON on line {line_num}: {e}")
                        continue

            print(f"Successfully loaded and processed {len(patterns)} patterns from {file_path}")
            return patterns

        except Exception as e:
            print(f"Error loading patterns: {e}")
            return self.create_sample_patterns()

    def process_raw_pattern(self, raw_pattern: Dict) -> Optional[Dict]:
        """Process and structure raw pattern data from JSONL format"""
        try:
            question = raw_pattern.get('question', '')
            answer_1 = raw_pattern.get('answer_1', {})
            answer_2 = raw_pattern.get('answer_2', {})

            if not all([question, answer_1, answer_2]):
                return None

            graph_answer = None
            table_answer = None

            if answer_1.get('type') == 'graph':
                graph_answer = answer_1
                table_answer = answer_2
            elif answer_1.get('type') == 'table':
                table_answer = answer_1
                graph_answer = answer_2
            else:
                if 'nodes' in answer_1.get('content', {}):
                    graph_answer = answer_1
                    table_answer = answer_2
                else:
                    table_answer = answer_1
                    graph_answer = answer_2

            graph_representation = self.format_graph_representation(graph_answer.get('content', {}))
            table_representation = self.format_table_representation(table_answer.get('content', {}))

            return {
                'question': question,
                'graph_representation': graph_representation,
                'tabular_representation': table_representation,
                'graph_content': graph_answer.get('content', {}),
                'table_content': table_answer.get('content', {}),
                'raw_data': raw_pattern
            }

        except Exception as e:
            print(f"Error processing pattern: {e}")
            return None

    def format_graph_representation(self, graph_content: Dict) -> str:
        """Convert graph data structure to readable text format"""
        nodes = graph_content.get('nodes', [])
        edges = graph_content.get('edges', [])

        representation = "Graph Structure:\n"
        if nodes:
            representation += f"Nodes: {', '.join(str(node) for node in nodes)}\n"

        representation += "Relationships:\n"
        for edge in edges:
            if len(edge) >= 3:
                source, relation, target = edge[0], edge[1], edge[2]
                representation += f"  • {source} {relation} {target}\n"
            elif len(edge) == 2:
                source, target = edge[0], edge[1]
                representation += f"  • {source} → {target}\n"

        return representation

    def format_table_representation(self, table_content: Dict) -> str:
        """Convert tabular data structure to readable text format"""
        headers = table_content.get('header', [])
        rows = table_content.get('rows', [])

        representation = "Tabular Structure:\n"
        if headers:
            representation += f"Headers: {', '.join(headers)}\n"

        representation += "Data rows:\n"
        for i, row in enumerate(rows, 1):
            if headers and len(row) == len(headers):
                row_dict = dict(zip(headers, row))
                representation += f"  Row {i}: {row_dict}\n"
            else:
                representation += f"  Row {i}: {row}\n"

        return representation

    def create_sample_patterns(self) -> List[Dict]:
        """Generate sample medical patterns for testing when no data file is available"""
        return [
            {
                'question': 'Outline the unit transitions for Patient 10000032 during the URGENT admission.',
                'graph_representation': 'Graph Structure:\nNodes: Patient_10000032, Admission_22595853.0, URGENT, Emergency Department, Transplant\nRelationships:\n  • Patient_10000032 has_admission Admission_22595853.0\n  • Admission_22595853.0 admission_type URGENT\n  • Admission_22595853.0 transferred_to Emergency Department\n  • Admission_22595853.0 transferred_to Transplant',
                'tabular_representation': 'Tabular Structure:\nHeaders: hadm_id, intime, outtime, eventtype, careunit\nData rows:\n  Row 1: {\'hadm_id\': \'22595853.0\', \'intime\': \'2180-05-06 19:17:00\', \'outtime\': \'2180-05-06 23:30:00\', \'eventtype\': \'ED\', \'careunit\': \'Emergency Department\'}\n  Row 2: {\'hadm_id\': \'22595853.0\', \'intime\': \'2180-05-06 23:30:00\', \'outtime\': \'2180-05-07 17:21:27\', \'eventtype\': \'admit\', \'careunit\': \'Transplant\'}',
                'graph_content': {},
                'table_content': {},
                'raw_data': {}
            }
        ]

    def generate_structure_response(self, question: str, representation: str, structure_type: str) -> str:
        """Generate LLM response for a given data structure representation"""
        prompt = f"""<|im_start|>system
You are a medical expert analyzing patient unit transitions and admissions. Based on the {structure_type} representation provided, answer the clinical question with detailed analysis focusing on:

- Patient flow and unit transitions
- Timing and sequence of movements
- Types of care units and their purposes
- Clinical implications of transfers
- Duration in each unit<|im_end|>

<|im_start|>user
Clinical Question: {question}

{structure_type.title()} Representation:
{representation}

Provide a comprehensive analysis of the unit transitions based on this {structure_type} representation.<|im_end|>

<|im_start|>assistant
"""

        try:
            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                max_length=1024,
                padding=False
            )

            device = next(self.model.parameters()).device
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=250,
                    temperature=0.7,
                    do_sample=True,
                    top_p=0.9,
                    top_k=20,
                    pad_token_id=self.tokenizer.pad_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                    repetition_penalty=1.05,
                    length_penalty=1.0,
                    use_cache=True
                )

            torch.cuda.empty_cache() if torch.cuda.is_available() else None

            input_length = inputs['input_ids'].shape[1]
            new_tokens = outputs[0][input_length:]
            response = self.tokenizer.decode(new_tokens, skip_special_tokens=True)

            return response.strip()

        except Exception as e:
            print(f"Error during generation: {e}")
            return f"Generated {structure_type} response: Analysis of unit transitions showing patient flow and timing patterns."

    def create_llm_judge_prompt(self, question: str, graph_response: str, tabular_response: str) -> Tuple[str, Dict]:
        """Create evaluation prompt for LLM judge with randomized response order"""
        if random.random() > 0.5:
            response_a, response_b = graph_response, tabular_response
            mapping = {'A': 'graph', 'B': 'tabular'}
        else:
            response_a, response_b = tabular_response, graph_response
            mapping = {'A': 'tabular', 'B': 'graph'}

        prompt = f"""<|im_start|>system
You are an expert medical evaluator specializing in healthcare data analysis. Your task is to determine which of two clinical responses better answers the given question about patient unit transitions. Evaluate based on:

1. Accuracy and completeness of information
2. Clarity in describing unit transitions and timing
3. Medical relevance and clinical insight
4. Proper interpretation of the data structure
5. Usefulness for healthcare decision-making

Respond in JSON format with your evaluation and detailed rationale.<|im_end|>

<|im_start|>user
Question: {question}

Response A: {response_a}

Response B: {response_b}

Which response (A or B) better answers the question about unit transitions? Consider the accuracy, completeness, clarity, and clinical relevance of each response.

Respond in JSON format:
{{"better_response": "A" or "B", "rationale": "detailed explanation comparing both responses and why one is superior", "evaluation_criteria": {{"accuracy": "A" or "B", "completeness": "A" or "B", "clarity": "A" or "B", "clinical_relevance": "A" or "B"}}, "confidence": "high/medium/low"}}<|im_end|>

<|im_start|>assistant
"""

        return prompt, mapping

    def evaluate_with_llm_judge(self, pattern: Dict) -> Dict:
        """Evaluate a single pattern using LLM judge methodology"""
        graph_response = self.generate_structure_response(
            pattern['question'],
            pattern['graph_representation'],
            'graph'
        )

        tabular_response = self.generate_structure_response(
            pattern['question'],
            pattern['tabular_representation'],
            'tabular'
        )

        judge_prompt, order_mapping = self.create_llm_judge_prompt(
            pattern['question'],
            graph_response,
            tabular_response
        )

        judge_response = self.generate_judge_evaluation(judge_prompt)
        judge_result = self.parse_judge_response(judge_response)

        if judge_result['better_response'] == 'A':
            winner = order_mapping['A']
        elif judge_result['better_response'] == 'B':
            winner = order_mapping['B']
        else:
            winner = 'unknown'

        return {
            'pattern': pattern,
            'graph_response': graph_response,
            'tabular_response': tabular_response,
            'judge_evaluation': judge_result,
            'winner': winner,
            'order_mapping': order_mapping,
            'judge_raw_response': judge_response
        }

    def generate_judge_evaluation(self, prompt: str) -> str:
        """Generate judge evaluation response using the LLM"""
        try:
            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                max_length=1536,
                padding=False
            )

            device = next(self.model.parameters()).device
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=400,
                    temperature=0.3,
                    do_sample=True,
                    top_p=0.9,
                    pad_token_id=self.tokenizer.pad_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                    repetition_penalty=1.05,
                    use_cache=True
                )

            torch.cuda.empty_cache() if torch.cuda.is_available() else None

            input_length = inputs['input_ids'].shape[1]
            new_tokens = outputs[0][input_length:]
            response = self.tokenizer.decode(new_tokens, skip_special_tokens=True)

            return response.strip()

        except Exception as e:
            print(f"Error during judge generation: {e}")
            return '{"better_response": "A", "rationale": "Error in evaluation", "evaluation_criteria": {"accuracy": "A", "completeness": "A", "clarity": "A", "clinical_relevance": "A"}, "confidence": "low"}'

    def parse_judge_response(self, response: str) -> Dict:
        """Parse and extract structured evaluation from judge response"""
        try:
            start_idx = response.find('{')
            end_idx = response.rfind('}') + 1

            if start_idx >= 0 and end_idx > start_idx:
                json_str = response[start_idx:end_idx]
                parsed = json.loads(json_str)

                return {
                    'better_response': parsed.get('better_response', 'unknown'),
                    'rationale': parsed.get('rationale', 'No rationale provided'),
                    'evaluation_criteria': parsed.get('evaluation_criteria', {}),
                    'confidence': parsed.get('confidence', 'unknown')
                }

            if 'response a' in response.lower() or '"a"' in response.lower():
                better = 'A'
            elif 'response b' in response.lower() or '"b"' in response.lower():
                better = 'B'
            else:
                better = 'unknown'

            return {
                'better_response': better,
                'rationale': response[:300],
                'evaluation_criteria': {},
                'confidence': 'unknown'
            }

        except Exception as e:
            print(f"Error parsing judge response: {e}")
            return {
                'better_response': 'unknown',
                'rationale': 'Parse error',
                'evaluation_criteria': {},
                'confidence': 'low'
            }

    def run_comprehensive_evaluation(self, patterns: List[Dict], max_patterns: int = 10) -> Dict:
        """Execute comprehensive evaluation across multiple patterns using LLM judge"""
        print(f"Running comprehensive evaluation on {min(len(patterns), max_patterns)} patterns...")

        results = []
        test_patterns = patterns[:max_patterns]

        for i, pattern in enumerate(test_patterns):
            print(f"Evaluating pattern {i+1}/{len(test_patterns)}")

            try:
                result = self.evaluate_with_llm_judge(pattern)
                results.append(result)

                print(f"  - Winner: {result['winner']}")
                print(f"  - Confidence: {result['judge_evaluation'].get('confidence', 'unknown')}")
                print(f"  - Rationale preview: {result['judge_evaluation']['rationale'][:80]}...")

            except Exception as e:
                print(f"Error evaluating pattern {i+1}: {e}")
                continue

        analysis = self.analyze_comprehensive_results(results)

        return {
            'individual_results': results,
            'analysis': analysis,
            'metadata': {
                'total_patterns': len(test_patterns),
                'successful_evaluations': len(results),
                'model_name': self.model_name,
                'timestamp': datetime.now().isoformat()
            }
        }

    def analyze_comprehensive_results(self, results: List[Dict]) -> Dict:
        """Analyze and generate statistics from comprehensive evaluation results"""
        if not results:
            return {'error': 'No results to analyze'}

        winners = [r['winner'] for r in results if r['winner'] != 'unknown']
        winner_counts = Counter(winners)

        confidence_levels = [r['judge_evaluation'].get('confidence', 'unknown') for r in results]
        confidence_counts = Counter(confidence_levels)

        criteria_analysis = {
            'accuracy': Counter(),
            'completeness': Counter(),
            'clarity': Counter(),
            'clinical_relevance': Counter()
        }

        for result in results:
            criteria = result['judge_evaluation'].get('evaluation_criteria', {})
            for criterion, winner in criteria.items():
                if criterion in criteria_analysis:
                    criteria_analysis[criterion][winner] += 1

        rationale_lengths = [
            len(r['judge_evaluation']['rationale'].split())
            for r in results
        ]

        return {
            'winner_distribution': dict(winner_counts),
            'graph_wins': winner_counts.get('graph', 0),
            'tabular_wins': winner_counts.get('tabular', 0),
            'unknown_results': len([r for r in results if r['winner'] == 'unknown']),
            'graph_win_rate': winner_counts.get('graph', 0) / len(winners) if winners else 0,
            'tabular_win_rate': winner_counts.get('tabular', 0) / len(winners) if winners else 0,
            'confidence_distribution': dict(confidence_counts),
            'criteria_analysis': {k: dict(v) for k, v in criteria_analysis.items()},
            'rationale_quality': {
                'avg_rationale_length': np.mean(rationale_lengths) if rationale_lengths else 0,
                'min_rationale_length': min(rationale_lengths) if rationale_lengths else 0,
                'max_rationale_length': max(rationale_lengths) if rationale_lengths else 0
            },
            'total_evaluated': len(results)
        }

    def generate_evaluation_report(self, evaluation_results: Dict) -> str:
        """Generate comprehensive formatted evaluation report"""
        analysis = evaluation_results['analysis']
        metadata = evaluation_results['metadata']

        report = "="*60 + "\n"
        report += "MEDICAL UNIT TRANSITIONS CON-J EVALUATION REPORT\n"
        report += "="*60 + "\n\n"

        report += "EXECUTIVE SUMMARY\n"
        report += "-" * 20 + "\n"
        report += f"• Evaluation timestamp: {metadata['timestamp']}\n"
        report += f"• Model used: {metadata['model_name']}\n"
        report += f"• Total patterns evaluated: {analysis['total_evaluated']}\n"
        report += f"• Successful evaluations: {metadata['successful_evaluations']}\n"

        if 'error' not in analysis:
            report += f"• Graph structure wins: {analysis['graph_wins']} ({analysis['graph_win_rate']:.1%})\n"
            report += f"• Tabular structure wins: {analysis['tabular_wins']} ({analysis['tabular_win_rate']:.1%})\n"
            report += f"• Unknown/tied results: {analysis['unknown_results']}\n\n"

            report += "PERFORMANCE ANALYSIS\n"
            report += "-" * 20 + "\n"
            report += f"Winner Distribution: {analysis['winner_distribution']}\n\n"

            report += "CONFIDENCE ANALYSIS\n"
            report += "-" * 15 + "\n"
            report += f"Confidence Distribution: {analysis['confidence_distribution']}\n\n"

            report += "EVALUATION CRITERIA BREAKDOWN\n"
            report += "-" * 30 + "\n"
            for criterion, scores in analysis['criteria_analysis'].items():
                if scores:
                    report += f"{criterion.title()}: {scores}\n"
            report += "\n"

            report += "JUDGE EVALUATION QUALITY\n"
            report += "-" * 25 + "\n"
            rq = analysis['rationale_quality']
            report += f"• Average rationale length: {rq['avg_rationale_length']:.1f} words\n"
            report += f"• Rationale length range: {rq['min_rationale_length']}-{rq['max_rationale_length']} words\n\n"

        report += "INDIVIDUAL CASE RESULTS\n"
        report += "-" * 25 + "\n"

        for i, result in enumerate(evaluation_results['individual_results'][:5]):
            report += f"Case {i+1}:\n"
            report += f"  Question: {result['pattern']['question'][:80]}...\n"
            report += f"  Winner: {result['winner']}\n"
            report += f"  Confidence: {result['judge_evaluation'].get('confidence', 'unknown')}\n"
            report += f"  Rationale: {result['judge_evaluation']['rationale'][:120]}...\n\n"

        report += "RECOMMENDATIONS\n"
        report += "-" * 15 + "\n"

        if 'error' not in analysis:
            if analysis['graph_win_rate'] > 0.7:
                report += "• Graph structures show strong advantage for unit transition queries\n"
                report += "• Consider why graph relationships excel for this task\n"
            elif analysis['tabular_win_rate'] > 0.7:
                report += "• Tabular structures show strong advantage for unit transition queries\n"
                report += "• Consider why systematic tabular data excels for this task\n"
            else:
                report += "• Results show balanced performance between structures\n"
                report += "• Both approaches have merit for different aspects of unit transitions\n"

            high_confidence = analysis['confidence_distribution'].get('high', 0)
            total_evals = sum(analysis['confidence_distribution'].values())
            if high_confidence / total_evals < 0.5:
                report += "• Consider improving data quality or judge prompts for higher confidence\n"

        report += "\nEND OF REPORT\n"
        report += "="*60

        return report

    def print_detailed_case_results(self, evaluation_results: Dict, case_numbers: List[int] = None):
        """Print detailed analysis results for specific evaluation cases"""
        results = evaluation_results['individual_results']

        if case_numbers is None:
            case_numbers = list(range(min(3, len(results))))

        for i in case_numbers:
            if i >= len(results):
                continue

            result = results[i]
            print(f"\n{'='*80}")
            print(f"DETAILED CASE {i+1} RESULTS")
            print(f"{'='*80}")

            print(f"\nQUESTION:")
            print(f"{result['pattern']['question']}")

            print(f"\nGRAPH STRUCTURE RESPONSE:")
            print(f"Representation:\n{result['pattern']['graph_representation']}")
            print(f"Generated Response: {result['graph_response']}")

            print(f"\nTABULAR STRUCTURE RESPONSE:")
            print(f"Representation:\n{result['pattern']['tabular_representation']}")
            print(f"Generated Response: {result['tabular_response']}")

            print(f"\nJUDGE EVALUATION:")
            print(f"Order mapping: {result['order_mapping']}")
            print(f"Judge chose: Response {result['judge_evaluation']['better_response']}")
            print(f"Final winner: {result['winner']}")
            print(f"Confidence: {result['judge_evaluation'].get('confidence', 'unknown')}")

            criteria = result['judge_evaluation'].get('evaluation_criteria', {})
            if criteria:
                print(f"Evaluation criteria breakdown: {criteria}")

            print(f"\nJUDGE RATIONALE:")
            print(f"{result['judge_evaluation']['rationale']}")

            print(f"\nRAW JUDGE RESPONSE:")
            print(f"{result['judge_raw_response']}")
            print(f"{'='*80}\n")


def run_llm_judge_evaluation_pipeline(
    data_path: str = '/content/sample_data/diagnosis_restructured.jsonl',
    max_patterns: int = 50,
    model_name: Optional[str] = None,
):
    """Execute the complete LLM judge evaluation pipeline for medical unit transitions.

    Steps: build the evaluator, load patterns (falling back to sample data),
    run the judge evaluation, print the report and two detailed cases, and save
    a JSON summary named ``unit_transitions_evaluation_<timestamp>.json`` in the
    current directory.

    Args:
        data_path: JSONL dataset to load. The default is the path that was
            previously hard-coded.
        max_patterns: Upper bound on the number of patterns evaluated.
        model_name: Model identifier for the evaluator; ``None`` keeps the
            evaluator's own default.

    Returns:
        The evaluation results dict, or ``None`` if the evaluator could not be
        constructed.
    """
    print("="*60)
    print("MEDICAL UNIT TRANSITIONS CON-J EVALUATION PIPELINE")
    print("="*60)

    try:
        if model_name is None:
            evaluator = EnhancedMedicalConJEvaluator()
        else:
            evaluator = EnhancedMedicalConJEvaluator(model_name=model_name)
        print("✓ Evaluator loaded successfully!")
    except Exception as e:
        print(f"✗ Failed to load evaluator: {e}")
        return None

    print(f"\n{'='*40}")
    print("LOADING UNIT TRANSITIONS DATASET")
    print(f"{'='*40}")

    patterns = evaluator.load_admission_patterns(data_path)

    if not patterns:
        print("No patterns loaded. Using sample data...")
        patterns = evaluator.create_sample_patterns()

    print(f"Loaded {len(patterns)} patterns for evaluation")

    if patterns:
        print(f"\nSample pattern structure:")
        sample = patterns[0]
        print(f"Question: {sample['question']}")
        print(f"Graph representation preview: {sample['graph_representation'][:200]}...")
        print(f"Table representation preview: {sample['tabular_representation'][:200]}...")

    print(f"\n{'='*40}")
    print("RUNNING LLM JUDGE EVALUATION")
    print(f"{'='*40}")

    limit = min(max_patterns, len(patterns))
    evaluation_results = evaluator.run_comprehensive_evaluation(patterns, limit)

    print(f"\n{'='*40}")
    print("EVALUATION REPORT")
    print(f"{'='*40}")

    report = evaluator.generate_evaluation_report(evaluation_results)
    print(report)

    print(f"\n{'='*40}")
    print("DETAILED CASE ANALYSIS")
    print(f"{'='*40}")

    evaluator.print_detailed_case_results(evaluation_results, [0, 1])

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = f"unit_transitions_evaluation_{timestamp}.json"

    try:
        # Only a summary is saved: full results embed the complete pattern and responses
        serializable_results = {
            'analysis': evaluation_results['analysis'],
            'metadata': evaluation_results['metadata'],
            'individual_results_summary': [
                {
                    'question': r['pattern']['question'],
                    'winner': r['winner'],
                    'confidence': r['judge_evaluation'].get('confidence', 'unknown'),
                    'rationale_preview': r['judge_evaluation']['rationale'][:200],
                    'evaluation_criteria': r['judge_evaluation'].get('evaluation_criteria', {})
                }
                for r in evaluation_results['individual_results']
            ]
        }

        with open(results_file, 'w', encoding='utf-8') as f:
            json.dump(serializable_results, f, indent=2)

        print(f"\n✓ Results saved to {results_file}")

    except Exception as e:
        # Saving is best-effort; the in-memory results are still returned
        print(f"Warning: Could not save results to file: {e}")

    print(f"\n✓ Unit Transitions LLM Judge evaluation pipeline completed!")
    return evaluation_results


# Script entry point: run the whole pipeline and keep its return value in `results`.
if __name__ == "__main__":
    results = run_llm_judge_evaluation_pipeline()