In [12]:
# %% [markdown]
# # RAG System Performance Evaluation Framework
# 
# Implement a complete RAG system evaluation based on the provided paper data structure

# %%
# First install necessary libraries (if not already installed)
import sys
!{sys.executable} -m pip install numpy==1.26.4 sentence-transformers scikit-learn pandas matplotlib seaborn tqdm --quiet

# Set Matplotlib backend to avoid matplotlib_inline conflicts
import matplotlib
matplotlib.use('Agg')  # Use non-interactive backend

# Import other libraries
import hashlib
import json
import os
import re
import threading
import time
import warnings
from collections import defaultdict
from dataclasses import dataclass, field
from typing import List, Dict, Any, Tuple, Optional

import numpy as np
import pandas as pd
import requests
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm
warnings.filterwarnings('ignore')

# Try to import visualization libraries, skip if failed
try:
    import matplotlib.pyplot as plt
    import seaborn as sns
    VISUALIZATION_AVAILABLE = True
    # Set font for Chinese display (keeping this as it's a configuration)
    plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'DejaVu Sans']
    plt.rcParams['axes.unicode_minus'] = False
except ImportError:
    VISUALIZATION_AVAILABLE = False
    print("Warning: Cannot import matplotlib/seaborn, visualization features will be disabled")

# %%
# ==================== Configuration Area ====================
class Config:
    """Configuration parameters for the RAG evaluation notebook.

    All settings are plain class attributes and are read through the
    module-level ``config`` instance created right after the class.
    """
    # API Configuration
    # The key is read from the DEEPSEEK_API_KEY environment variable so that no
    # secret is stored in the notebook. When the variable is unset the value is
    # an empty string, which DeepSeekClient treats as "no key" and answers with
    # its mock responses.
    DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "")
    API_BASE_URL = "https://api.deepseek.com/v1"

    # Vector Database Configuration
    EMBEDDING_MODEL = "all-MiniLM-L6-v2"  # Smaller model to reduce memory usage
    TOP_K_RETRIEVAL = 5  # Number of documents to retrieve
    SIMILARITY_THRESHOLD = 0.7  # Similarity threshold

    # Evaluation Configuration
    MAX_ANSWER_LENGTH = 1500  # Passed to the chat API as max_tokens
    TEMPERATURE = 0.1

    # File Paths
    DATA_FILE = "cleaned_papers.jsonl"
    REPORT_FILE = "rag_evaluation_report.json"
    EVALUATION_CSV = "evaluation_results.csv"

    # Test Questions: each entry carries the question text plus a topic and a
    # difficulty label.
    TEST_QUESTIONS = [
        {
            "question": "Based on the latest papers from this month, could you please introduce to me the technological breakthroughs that have emerged in the field of video dynamic editing recently? And could you briefly explain them to me based on the abstracts of the papers?",
            "topic": "computer vision, video editing",
            "difficulty": "medium"
        },
        {
            "question": "How many core bottlenecks exist in current autoregressive video diffusion models?",
            "topic": "machine learning, computer vision",
            "difficulty": "medium"
        },
        {
            "question": "What noteworthy papers in the machine learning and computer vision fields have been published in the past month? Please provide links or DOIs.",
            "topic": "machine learning, computer vision",
            "difficulty": "medium"
        }
    ]

# Shared configuration instance used by the rest of the notebook.
config = Config()

# %%
# ==================== 1. Data Loading and Preprocessing ====================

@dataclass
class Paper:
    """Paper record as stored in the JSONL data file.

    List-valued fields (``authors``, ``categories``, ``basic_keywords``,
    ``domain_keywords``) may be supplied either as lists or as
    comma-separated strings; they are normalised to lists after
    initialisation.
    """
    paper_id: str
    title: str
    abstract: str
    authors: List[str]
    first_author: str
    topic: str
    categories: List[str]
    publish_date: str
    url: str
    embedding_text: str
    quality_scores: Dict[str, float]
    quality_tier: str
    basic_keywords: List[str]
    domain_keywords: List[str]
    update_date: str = ""  # Optional, defaults to an empty string

    def __post_init__(self):
        """Normalise the list-valued fields to real lists."""
        for field_name in ('authors', 'categories', 'basic_keywords', 'domain_keywords'):
            setattr(self, field_name, self._as_list(getattr(self, field_name)))

    @staticmethod
    def _as_list(value) -> List[str]:
        """Coerce a list-like field value to a list of strings.

        A string is split on commas, each part is stripped and empty parts are
        dropped (so ``""`` becomes ``[]`` instead of ``['']``). A list is
        returned unchanged. Anything else becomes an empty list.
        """
        if isinstance(value, str):
            return [part.strip() for part in value.split(',') if part.strip()]
        if isinstance(value, list):
            return value
        return []

    def to_text(self) -> str:
        """Return the text used for embedding this paper.

        ``embedding_text`` is returned as-is when it is non-empty. Otherwise a
        text is assembled from the title, topic, first three authors, the
        first 500 characters of the abstract and, when present, the categories
        and the first five basic keywords.
        """
        if self.embedding_text:
            return self.embedding_text

        # `or ''` keeps the slice safe when the abstract is None.
        text_parts = [
            f"Paper Title: {self.title}",
            f"Research Topic: {self.topic}",
            f"Authors: {', '.join(self.authors[:3])}",
            f"Abstract: {(self.abstract or '')[:500]}",
        ]

        if self.categories:
            text_parts.append(f"Categories: {', '.join(self.categories)}")

        if self.basic_keywords:
            text_parts.append(f"Keywords: {', '.join(self.basic_keywords[:5])}")

        return "\n".join(text_parts)

    def get_quality_score(self) -> float:
        """Return ``quality_scores['overall_quality_score']``, or 0.5 when absent."""
        # The isinstance check avoids a TypeError on a non-mapping value.
        if isinstance(self.quality_scores, dict) and 'overall_quality_score' in self.quality_scores:
            return self.quality_scores['overall_quality_score']
        return 0.5

    @classmethod
    def from_dict(cls, data: Dict) -> 'Paper':
        """Create a Paper from a dictionary such as one parsed JSONL line.

        Keys that are not Paper fields are ignored. Missing keys, and keys
        whose value is ``None`` (JSON ``null``), fall back to the defaults
        below so that later string and list operations do not fail.
        """
        defaults = {
            'paper_id': '',
            'title': '',
            'abstract': '',
            'authors': [],
            'first_author': '',
            'topic': '',
            'categories': [],
            'publish_date': '',
            'url': '',
            'embedding_text': '',
            'quality_scores': {},
            'quality_tier': 'medium',
            'basic_keywords': [],
            'domain_keywords': [],
            'update_date': ''
        }

        merged_data = dict(defaults)
        for key, value in data.items():
            if key in defaults and value is not None:
                merged_data[key] = value

        return cls(**merged_data)

class DataLoader:
    """Static helpers for reading paper records and summarising them."""

    @staticmethod
    def load_from_jsonl(file_path: str) -> List[Paper]:
        """Read one Paper per non-blank line of a JSONL file.

        Lines that fail to parse or convert are reported and skipped. When the
        file does not exist, the built-in sample papers are returned instead.
        """
        loaded = []
        try:
            with open(file_path, 'r', encoding='utf-8') as handle:
                for line_num, raw_line in enumerate(handle, 1):
                    record_text = raw_line.strip()
                    if not record_text:
                        continue
                    try:
                        loaded.append(Paper.from_dict(json.loads(record_text)))
                    except json.JSONDecodeError as e:
                        print(f"JSON parsing error on line {line_num}: {e}")
                    except Exception as e:
                        print(f"Data conversion error on line {line_num}: {e}")

            print(f"✓ Successfully loaded {len(loaded)} papers from {file_path}")

            # Summary is only printed when something was loaded
            if loaded:
                DataLoader._print_statistics(loaded)

        except FileNotFoundError:
            print(f"✗ File {file_path} not found")
            # Fall back to the sample papers so the notebook can still run
            loaded = DataLoader.create_sample_data()

        return loaded

    @staticmethod
    def create_sample_data() -> List[Paper]:
        """Build the two hard-coded sample papers."""
        print("Creating sample data for testing...")

        medrov = Paper(
            paper_id="2511.20650",
            title="MedROV: Towards Real-Time Open-Vocabulary Detection Across Diverse Medical Imaging Modalities",
            abstract="Traditional object detection models in medical imaging operate within a closed-set paradigm...",
            authors=["Tooba Tehreem Sheikh", "Jean Lahoud", "Rao Muhammad Anwer"],
            first_author="Tooba Tehreem Sheikh",
            topic="artificial intelligence",
            categories=["cs.CV", "cs.AI"],
            publish_date="2025-11-25",
            url="http://arxiv.org/abs/2511.20650",
            embedding_text="Paper Title: MedROV: Towards Real-Time Open-Vocabulary Detection...",
            quality_scores={"overall_quality_score": 0.975},
            quality_tier="high",
            basic_keywords=["detection", "medical", "imaging"],
            domain_keywords=["object detection", "medical imaging"]
        )
        motion_v2v = Paper(
            paper_id="2511.20640",
            title="MotionV2V: Editing Motion in a Video",
            abstract="While generative video models have achieved remarkable fidelity and consistency...",
            authors=["Ryan Burgert", "Charles Herrmann", "Forrester Cole"],
            first_author="Ryan Burgert",
            topic="artificial intelligence",
            categories=["cs.CV", "cs.AI", "cs.GR"],
            publish_date="2025-11-25",
            url="http://arxiv.org/abs/2511.20640",
            embedding_text="Paper Title: MotionV2V: Editing Motion in a Video...",
            quality_scores={"overall_quality_score": 0.9},
            quality_tier="high",
            basic_keywords=["video", "motion", "editing"],
            domain_keywords=["video editing", "motion control"]
        )
        sample_papers = [medrov, motion_v2v]

        print(f"✓ Created {len(sample_papers)} sample papers")
        return sample_papers

    @staticmethod
    def _print_statistics(papers: List[Paper]):
        """Print counts of papers, topics, quality tiers and authors, plus the latest date."""
        total = len(papers)
        print("\nDataset Statistics:")
        print("-" * 40)
        print(f"Total Papers: {total}")

        # Distinct topics
        print(f"Number of Topics: {len({p.topic for p in papers})}")

        # Papers per quality tier, with share of the whole set
        quality_tiers = [p.quality_tier for p in papers]
        print("\nQuality Tier Distribution:")
        for tier in set(quality_tiers):
            count = quality_tiers.count(tier)
            print(f"  {tier}: {count} papers ({count/total*100:.1f}%)")

        # Distinct author names across all papers
        print(f"Total Authors: {len({author for p in papers for author in p.authors})}")

        # Largest non-empty publish_date (compared as strings)
        dates = [p.publish_date for p in papers if p.publish_date]
        if dates:
            print(f"Latest Paper Date: {max(dates)}")

    @staticmethod
    def analyze_topic_distribution(papers: List[Paper]) -> pd.DataFrame:
        """Return a DataFrame with columns ``topic`` and ``count``, largest count first."""
        topic_counts = defaultdict(int)
        for paper in papers:
            topic_counts[paper.topic] += 1

        return pd.DataFrame(
            list(topic_counts.items()), columns=['topic', 'count']
        ).sort_values('count', ascending=False)

# %%
# ==================== 2. Embedding and Vector Database ====================

class EmbeddingModel:
    """Embedding model wrapper.

    Three backends are supported, chosen in this order by :meth:`embed`:
    the remote embeddings endpoint (when ``use_api`` and ``api_key`` are both
    set), a local ``sentence_transformers`` model, and a hashed bag-of-words
    fallback that needs no external package.
    """

    # Maximum number of texts sent to the embeddings endpoint per request.
    API_BATCH_SIZE = 10
    # Fixed width of the hashed bag-of-words fallback vectors. It is fixed so
    # that vectors from separate calls (documents vs. query) are comparable.
    SIMPLE_EMBEDDING_DIM = 100

    def __init__(self, model_name: str = None, use_api: bool = False, api_key: str = None):
        """
        Initialize embedding model

        Args:
            model_name: Local model name; defaults to config.EMBEDDING_MODEL
            use_api: Whether to use API
            api_key: API key; the API is only used when this is also set
        """
        self.use_api = use_api
        self.api_key = api_key

        if use_api and api_key:
            self.model = None
            print("✓ Using DeepSeek API for embedding")
        else:
            print(f"✓ Using local embedding model")
            try:
                from sentence_transformers import SentenceTransformer
                model_name = model_name or config.EMBEDDING_MODEL
                self.model = SentenceTransformer(model_name)
                print(f"  Model: {model_name}")
                print(f"  Dimension: {self.model.get_sentence_embedding_dimension()}")
            except ImportError:
                print("Warning: Cannot import sentence_transformers, using simple embedding")
                self.model = None

    def embed(self, texts: List[str]) -> np.ndarray:
        """Generate embedding vectors.

        Args:
            texts: A list of strings, or a single string (treated as a
                one-element list).

        Returns:
            One row per input text; an empty array when ``texts`` is empty.
        """
        if isinstance(texts, str):
            texts = [texts]

        if not texts:
            return np.array([])

        if self.use_api and self.api_key:
            return self._embed_api(texts)
        elif self.model:
            return self._embed_local(texts)
        else:
            # Fall back to simple word vectors
            return self._embed_simple(texts)

    def _embed_local(self, texts: List[str]) -> np.ndarray:
        """Embed using the local model, falling back to the simple method on failure."""
        try:
            # Batch processing to avoid memory issues
            batch_size = 32
            embeddings = []

            for i in range(0, len(texts), batch_size):
                batch = texts[i:i + batch_size]
                batch_embeddings = self.model.encode(batch, show_progress_bar=False)
                embeddings.append(batch_embeddings)

            return np.vstack(embeddings) if embeddings else np.array([])
        except Exception as e:
            print(f"Local embedding failed: {e}")
            return self._embed_simple(texts)

    def _embed_api(self, texts: List[str]) -> np.ndarray:
        """Embed using the remote embeddings endpoint.

        Texts are sent in batches of ``API_BATCH_SIZE`` so that every input
        gets a vector. If any request fails, all texts are embedded with the
        fallback instead, so the returned rows come from a single backend.
        """
        try:
            headers = {
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json"
            }

            embeddings = []
            for start in range(0, len(texts), self.API_BATCH_SIZE):
                batch = texts[start:start + self.API_BATCH_SIZE]
                # NOTE(review): confirm that the configured API base URL serves
                # an /embeddings endpoint for this model name.
                data = {
                    "model": "text-embedding-3-small",
                    "input": batch,
                    "encoding_format": "float"
                }

                response = requests.post(
                    f"{config.API_BASE_URL}/embeddings",
                    headers=headers,
                    json=data,
                    timeout=30
                )
                response.raise_for_status()

                result = response.json()
                embeddings.extend(item["embedding"] for item in result["data"])

            if len(embeddings) != len(texts):
                raise ValueError(
                    f"expected {len(texts)} embeddings, received {len(embeddings)}"
                )
            return np.array(embeddings)

        except Exception as e:
            print(f"API embedding failed: {e}")
            # Fall back to local model
            if self.model:
                return self._embed_local(texts)
            else:
                return self._embed_simple(texts)

    def _embed_simple(self, texts: List[str]) -> np.ndarray:
        """Hashed bag-of-words embedding (fallback solution).

        Each lower-cased, whitespace-separated word is mapped to one of
        ``SIMPLE_EMBEDDING_DIM`` buckets with a stable hash, and the bucket
        counts are L2-normalised. The same word always lands in the same
        bucket, so vectors from different calls share one space.
        """
        print("Using simple embedding method...")
        dim = self.SIMPLE_EMBEDDING_DIM
        embeddings = np.zeros((len(texts), dim))

        for row, text in enumerate(texts):
            for word in text.lower().split():
                digest = hashlib.blake2b(word.encode('utf-8'), digest_size=8).digest()
                embeddings[row, int.from_bytes(digest, 'big') % dim] += 1
            # Normalize (rows for texts without words stay all-zero)
            norm = np.linalg.norm(embeddings[row])
            if norm > 0:
                embeddings[row] /= norm

        return embeddings

    def get_dimension(self) -> int:
        """Get the embedding dimension of the backend that :meth:`embed` selects."""
        if self.use_api and self.api_key:
            # Hard-coded value for the model requested in _embed_api
            return 1536
        elif self.model:
            return self.model.get_sentence_embedding_dimension()
        return self.SIMPLE_EMBEDDING_DIM  # Dimension of simple embedding

class VectorStore:
    """In-memory vector database.

    Keeps the document texts, a metadata dict per document and one embedding
    matrix produced by the supplied embedder.
    """

    def __init__(self, embedder: EmbeddingModel):
        self.embedder = embedder
        self.documents: List[str] = []
        self.metadata: List[Dict] = []
        # Set by add_papers; None until documents have been embedded
        self.embeddings: np.ndarray = None

    def add_papers(self, papers: List[Paper], use_embedding_text: bool = True):
        """Add papers to the vector store and re-embed all stored documents.

        Args:
            papers: Papers to append.
            use_embedding_text: When True, a paper's non-empty
                ``embedding_text`` is used as its document text; otherwise
                ``paper.to_text()`` is used. Note that ``Paper.to_text``
                itself returns ``embedding_text`` when it is non-empty.
        """
        print(f"Processing {len(papers)} papers...")

        for paper in tqdm(papers, desc="Adding papers"):
            # Convert to text
            if use_embedding_text and paper.embedding_text:
                doc_text = paper.embedding_text
            else:
                doc_text = paper.to_text()

            # Store document and metadata
            self.documents.append(doc_text)
            self.metadata.append({
                'paper_id': paper.paper_id,
                'title': paper.title,
                'authors': paper.authors,
                'first_author': paper.first_author,
                'topic': paper.topic,
                'categories': paper.categories,
                'publish_date': paper.publish_date,
                'quality_score': paper.get_quality_score(),
                'quality_tier': paper.quality_tier
            })

        # Nothing to embed: leave the store empty instead of failing on shape[1]
        if not self.documents:
            self.embeddings = None
            print("Warning: No papers to index, vector store is empty")
            return

        # Generate embedding vectors
        print("Generating embedding vectors...")
        self.embeddings = self.embedder.embed(self.documents)

        if self.embeddings.shape[0] != len(self.documents):
            print(f"Warning: Got {self.embeddings.shape[0]} embeddings for "
                  f"{len(self.documents)} documents; only embedded documents are searchable")

        print(f"✓ Vector store built successfully")
        print(f"  Number of documents: {len(self.documents)}")
        print(f"  Embedding dimension: {self._embedding_dimension()}")

    def _embedding_dimension(self) -> int:
        """Return the width of the embedding matrix, or 0 when it is absent or not 2-D."""
        if self.embeddings is None or self.embeddings.ndim != 2:
            return 0
        return self.embeddings.shape[1]

    def search(self, query: str, top_k: int = None, threshold: float = None) -> List[Dict]:
        """Semantic search: return the ``top_k`` most similar documents.

        Args:
            query: Query text, embedded with the store's embedder.
            top_k: Number of results; defaults to config.TOP_K_RETRIEVAL.
            threshold: Accepted for interface compatibility but not applied
                (see the note in the body).

        Returns:
            Result dicts ordered by decreasing cosine similarity, each with
            ``paper_id``, ``document``, ``metadata``, ``similarity`` and a
            1-based ``rank``. Empty when the store holds no embeddings.
        """
        if top_k is None:
            top_k = config.TOP_K_RETRIEVAL

        # NOTE(review): `threshold` was never effective. The previous filter
        # only skipped low-similarity hits once top_k results had been
        # collected, and the loop stopped at exactly that point, so nothing
        # was ever removed. The always-return-top_k behaviour is kept here;
        # decide separately whether hits below the threshold should be dropped.

        if len(self.documents) == 0 or self._embedding_dimension() == 0:
            print("Warning: Vector store is empty")
            return []

        # Query embedding
        query_embedding = self.embedder.embed(query)
        if query_embedding.ndim == 1:
            query_embedding = query_embedding.reshape(1, -1)

        # Calculate similarities
        similarities = cosine_similarity(query_embedding, self.embeddings)[0]

        # Indices of the most similar documents, best first
        ranked_indices = np.argsort(similarities)[::-1][:max(top_k, 0)]

        return [
            {
                'paper_id': self.metadata[idx]['paper_id'],
                'document': self.documents[idx],
                'metadata': self.metadata[idx],
                'similarity': float(similarities[idx]),
                'rank': rank
            }
            for rank, idx in enumerate(ranked_indices, 1)
        ]

    def get_stats(self) -> Dict:
        """Get document count, embedding dimension and number of distinct topics."""
        if len(self.documents) == 0:
            return {'total_documents': 0}

        stats = {
            'total_documents': len(self.documents),
            'embedding_dimension': self._embedding_dimension(),
            'unique_topics': len(set(m['topic'] for m in self.metadata))
        }

        return stats

# %%
# ==================== 3. RAG System ====================

class DeepSeekClient:
    """DeepSeek API client.

    Sends chat-completion requests, keeps a running total of tokens reported
    by the API, and falls back to canned mock output when no API key is set.
    """

    def __init__(self, api_key: str):
        self.api_key = api_key
        self.base_url = config.API_BASE_URL
        self.total_tokens = 0

    def generate_response(self, prompt: str, temperature: float = None, 
                         max_tokens: int = None, model: str = "deepseek-chat") -> str:
        """Generate an answer for ``prompt``.

        Args:
            prompt: Sent as a single user message.
            temperature: Defaults to config.TEMPERATURE.
            max_tokens: Defaults to config.MAX_ANSWER_LENGTH.
            model: Model name sent to the API.

        Returns:
            The message content of the first choice. On failure a string
            starting with ``"Error:"`` is returned instead of raising.
        """
        if temperature is None:
            temperature = config.TEMPERATURE
        if max_tokens is None:
            max_tokens = config.MAX_ANSWER_LENGTH

        # If no API key, return mock response
        if not self.api_key or self.api_key == "your-deepseek-api-key":
            print("Warning: Using mock API response (please set correct API key)")
            return self._mock_response(prompt)

        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }

        data = {
            "model": model,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": temperature,
            "max_tokens": max_tokens,
            "stream": False
        }

        try:
            response = requests.post(
                f"{self.base_url}/chat/completions",
                headers=headers,
                json=data,
                timeout=60
            )
            response.raise_for_status()

            result = response.json()

            # Record token usage
            if 'usage' in result:
                self.total_tokens += result['usage']['total_tokens']

            return result["choices"][0]["message"]["content"]

        except requests.exceptions.RequestException as e:
            print(f"API request failed: {e}")
            return f"Error: Failed to generate answer ({str(e)})"
        except (KeyError, IndexError, TypeError, ValueError) as e:
            # Covers missing keys, an empty "choices" list, unexpected value
            # types and a body that is not valid JSON.
            print(f"API response parsing failed: {e}")
            return "Error: Incorrect response format"

    def _mock_response(self, prompt: str) -> str:
        """Return a canned answer chosen by keywords found in the prompt."""
        time.sleep(0.5)  # Simulate delay

        # Generate mock answer based on prompt content
        if "医学影像" in prompt or "medical" in prompt.lower():
            return """Open-Vocabulary Object Detection (OVOD) in medical imaging is a technology that can detect object categories not seen during training.

It solves the following problems of traditional object detection:
1. Closed vocabulary limitation: Traditional methods can only detect categories present in the training set
2. Data scarcity issue: Medical image annotation data is difficult to obtain
3. Generalization ability: Can identify new lesions or anatomical structures

MedROV is the latest research in this area, achieving real-time open-vocabulary detection [Paper 1]."""
        elif "视频运动编辑" in prompt or "video" in prompt.lower():
            return """Recent technical breakthroughs in video motion editing include:

1. MotionV2V model: Modifies video motion by editing sparse trajectories [Paper 2]
2. Motion counterfactual generation: Creates video pairs with the same content but different motions
3. Timestamp control: Can start editing from any time point and propagate naturally
4. User studies show that MotionV2V achieves over 65% preference rate in comparison tests"""
        else:
            return "This is a mock response. Please set a correct DeepSeek API key for actual use."

    def evaluate_answer(self, question: str, answer: str) -> Dict:
        """Evaluate answer quality using DeepSeek as a judge.

        Returns:
            The JSON object found in the model's reply. When the reply holds
            no parseable JSON object, a neutral result (all scores 3, reason
            "Cannot parse evaluation result") is returned. Without an API key,
            or on an unexpected error, the mock evaluation is returned.
        """
        # If no API key, return mock evaluation
        if not self.api_key or self.api_key == "your-deepseek-api-key":
            return self._mock_evaluation(question, answer)

        prompt = f"""Please evaluate the quality of the following answer generated by a Retrieval-Augmented Generation (RAG) system.

Question: {question}

Answer: {answer}

Reference Paper Context Hint: The RAG system retrieves information from the latest academic papers (including but not limited to paper IDs: 2511.20650, 2511.20649). The answer should leverage the unique, up-to-date knowledge from these papers.

Please provide a score from 1-5 (5 being best) for each of the following dimensions:
1. Accuracy: Whether the answer content is factually correct and consistent with the retrieved paper information.
2. Completeness: Whether the question is answered comprehensively, including both basic information and insights from the papers.
3. Relevance: Whether the answer is closely related to the question, and whether the cited paper content is directly relevant to the question.
4. Specificity: Whether it contains specific details (e.g., model names, datasets, metrics, technical components) and examples from the retrieved papers.
5. Timeliness & Paper Utilization: Whether the answer uses up-to-date knowledge from the retrieved papers that cannot be obtained from general API calls; whether it correctly references the papers' core innovations.
6. Overall quality: Comprehensive rating considering both text quality and RAG-specific advantages.

Please return the results in JSON format, including scores and brief reasons for each dimension, as well as a total score (average of all scores, rounded to 2 decimal places)."""

        try:
            response = self.generate_response(prompt, temperature=0.1, max_tokens=800)

            # Extract JSON part: everything from the first "{" to the last "}"
            json_match = re.search(r'\{.*\}', response, re.DOTALL)
            if not json_match:
                return self._unparseable_evaluation()

            try:
                return json.loads(json_match.group())
            except json.JSONDecodeError as e:
                # Malformed JSON is a parse failure, not a reason to report
                # the fabricated mock scores.
                print(f"Evaluation JSON parsing failed: {e}")
                return self._unparseable_evaluation()

        except Exception as e:
            print(f"Evaluation failed: {e}")
            return self._mock_evaluation(question, answer)

    @staticmethod
    def _unparseable_evaluation() -> Dict:
        """Neutral evaluation used when the judge's reply cannot be parsed."""
        return {
            "accuracy": {"score": 3, "reason": "Cannot parse evaluation result"},
            "completeness": {"score": 3, "reason": "Cannot parse evaluation result"},
            "relevance": {"score": 3, "reason": "Cannot parse evaluation result"},
            "specificity": {"score": 3, "reason": "Cannot parse evaluation result"},
            "overall_quality": {"score": 3, "reason": "Cannot parse evaluation result"},
            "total_score": 3.0
        }

    def _mock_evaluation(self, question: str, answer: str) -> Dict:
        """Fixed mock evaluation result; the arguments are not inspected."""
        return {
            "accuracy": {"score": 4, "reason": "Answer content is basically accurate"},
            "completeness": {"score": 3, "reason": "Answers the main question but not comprehensively enough"},
            "relevance": {"score": 4, "reason": "Highly relevant to the question"},
            "specificity": {"score": 3, "reason": "Contains some specific information"},
            "overall_quality": {"score": 3.5, "reason": "Good overall quality"},
            "total_score": 3.5
        }

class RAGSystem:
    """Question answering that combines vector-store retrieval with an LLM client."""

    def __init__(self, vector_store: VectorStore, llm_client: DeepSeekClient):
        self.vector_store, self.llm = vector_store, llm_client

    def query(self, question: str, top_k: int = None, include_context: bool = True) -> Dict:
        """Retrieve documents for the question, build a prompt and generate an answer.

        The RAG prompt is used only when ``include_context`` is true and the
        search returned at least one document; otherwise the baseline prompt
        is used. Retrieval runs in both cases and its results are returned.
        """
        retrieved_docs = self.vector_store.search(question, top_k=top_k)

        use_rag = bool(include_context and retrieved_docs)
        if use_rag:
            prompt = self._build_rag_prompt(question, self._build_context(retrieved_docs))
        else:
            prompt = self._build_baseline_prompt(question)

        answer = self.llm.generate_response(prompt)

        # Prompts longer than 200 characters are cut and marked with "..."
        preview = prompt if len(prompt) <= 200 else prompt[:200] + "..."

        return {
            'question': question,
            'answer': answer,
            'retrieved_docs': retrieved_docs,
            'method': "RAG" if use_rag else "Baseline",
            'prompt_preview': preview,
        }

    def _build_context(self, docs: List[Dict]) -> str:
        """Format the retrieved documents as numbered paper entries.

        Each entry shows the title, first author, an abstract truncated to 200
        characters (the document text when the metadata has no 'abstract'
        key) and the similarity score.
        """
        sections = ["Based on the following research paper information:"]

        for position, hit in enumerate(docs, 1):
            meta = hit['metadata']
            entry_lines = [
                f"[Paper {position}] {meta['title']}",
                f"Authors: {meta['first_author']} et al.",
                f"Abstract: {self._truncate_text(meta.get('abstract', hit['document']), 200)}",
                f"Relevance: {hit['similarity']:.3f}",
            ]
            sections.append("\n".join(entry_lines))

        return "\n\n".join(sections)

    def _build_rag_prompt(self, question: str, context: str) -> str:
        """Return the prompt that asks for an answer grounded in ``context``."""
        return f"""You are an AI research assistant. Please answer the user's question based on the provided academic literature.

Available Literature:
{context}

User Question: {question}

Please answer according to the following requirements:
1. Mainly based on the provided literature information
2. Cite relevant literature in your answer, formatted as [Paper 1], [Paper 2], etc.
3. If the literature information is insufficient, you can appropriately supplement relevant knowledge
4. Maintain academic rigor

Please provide a detailed and accurate answer:"""

    def _build_baseline_prompt(self, question: str) -> str:
        """Return the prompt used without any retrieved context."""
        return f"""You are an AI research assistant. Please answer the following academic question.

Question: {question}

Please provide a detailed and accurate answer:"""

    @staticmethod
    def _truncate_text(text: str, max_length: int) -> str:
        """Return ``text`` unchanged if short enough, else its first ``max_length`` characters plus "..."."""
        return text if len(text) <= max_length else text[:max_length] + "..."

# %%
# ==================== 4. Dual Evaluation System ====================

class AutoEvaluationMetrics:
    """Automatic (heuristic, model-free) evaluation metrics for a generated answer.

    The metrics cover answer size, retrieval similarity, technical-term and
    keyword coverage, and citation usage. They are folded into a single
    ``auto_score`` capped at 1.0. All numeric values are returned as built-in
    Python numbers so the metric dict can be JSON-serialized and filtered with
    ``isinstance(value, (int, float))``.
    """

    # Lower-case terms whose presence in an answer counts as technical vocabulary.
    TECHNICAL_TERMS = ('model', 'algorithm', 'detection', 'learning', 'training', 'accuracy', 'precision')

    # A citation is a single bracket pair containing at least one digit, e.g.
    # "[Paper 1]" or "[3]". Brackets and newlines are excluded from the inner
    # text so one match can never stretch across several bracket pairs.
    CITATION_PATTERN = re.compile(r'\[[^\[\]\n]*\d[^\[\]\n]*\]')

    def evaluate_response(self,
                         question: str,
                         answer: str,
                         retrieved_docs: Optional[List[Dict]] = None,
                         baseline_answer: Optional[str] = None) -> Dict:
        """Evaluate a single response.

        Args:
            question: The question that was asked.
            answer: The answer text to score.
            retrieved_docs: Retrieved documents (each with a ``similarity``
                entry); retrieval metrics are added only when this is non-empty.
            baseline_answer: Optional answer to compare lengths against.

        Returns:
            Dict of metrics, always including ``answer_length``, ``word_count``,
            ``has_error``, the content and citation metrics, and ``auto_score``.
        """
        metrics = {
            'answer_length': len(answer),
            'word_count': len(re.findall(r'\w+', answer)),
            # NOTE(review): any occurrence of "error" (e.g. "error rate") sets this
            # flag, not only failed API calls — confirm this is intended.
            'has_error': 1 if "Error:" in answer or "error" in answer.lower() else 0,
        }

        # Retrieval-related metrics
        if retrieved_docs:
            metrics.update(self._calculate_retrieval_metrics(retrieved_docs))

        # Content quality metrics
        metrics.update(self._calculate_content_metrics(answer, question))

        # Citation quality metrics
        metrics.update(self._calculate_citation_metrics(answer))

        # Comparison with baseline
        if baseline_answer:
            metrics.update(self._calculate_comparison_metrics(answer, baseline_answer))

        # Calculate automatic evaluation total score
        metrics['auto_score'] = self._calculate_overall_score(metrics)

        return metrics

    def _calculate_retrieval_metrics(self, retrieved_docs: List[Dict]) -> Dict:
        """Calculate retrieval-related metrics (count, mean and max similarity).

        Returns an empty dict when no documents were retrieved.
        """
        if not retrieved_docs:
            return {}

        similarities = [doc['similarity'] for doc in retrieved_docs]
        return {
            'retrieved_docs_count': len(retrieved_docs),
            # Similarities may be NumPy scalars; convert so the values stay
            # JSON-serializable and pass isinstance(..., float) checks.
            'avg_similarity': float(np.mean(similarities)),
            'max_similarity': float(max(similarities)),
        }

    def _calculate_content_metrics(self, answer: str, question: str) -> Dict:
        """Calculate content quality metrics (term coverage, keyword overlap, bracket presence)."""
        answer_lower = answer.lower()

        # Technical term detection (case-insensitive, like the keyword matching below)
        tech_term_count = sum(1 for term in self.TECHNICAL_TERMS if term in answer_lower)

        # Question keyword matching
        question_words = set(re.findall(r'\w+', question.lower()))
        answer_words = set(re.findall(r'\w+', answer_lower))
        keyword_matches = len(question_words & answer_words)

        return {
            'technical_terms': tech_term_count,
            'keyword_matches': keyword_matches,
            'has_citation': 1 if '[' in answer and ']' in answer else 0,
        }

    def _calculate_citation_metrics(self, answer: str) -> Dict:
        """Calculate citation quality metrics (total and distinct bracketed citations)."""
        citations = self.CITATION_PATTERN.findall(answer)
        return {
            'citation_count': len(citations),
            'unique_citations': len(set(citations)),
        }

    def _calculate_comparison_metrics(self, rag_answer: str, baseline_answer: str) -> Dict:
        """Calculate length-based comparison metrics between the RAG and baseline answers."""
        rag_length = len(rag_answer)
        baseline_length = len(baseline_answer)

        return {
            # max(..., 1) guards against division by zero for an empty baseline.
            'length_ratio': rag_length / max(baseline_length, 1),
            'length_difference': rag_length - baseline_length,
        }

    def _calculate_overall_score(self, metrics: Dict) -> float:
        """Calculate the automatic evaluation total score, capped at 1.0.

        Components: length (up to 0.3), technical terms (up to 0.3),
        citations (up to 0.3), average similarity (x 0.1) and a 0.1 bonus when
        no error was flagged.
        """
        score = 0.0

        # Length score (moderate is better)
        length = metrics.get('answer_length', 0)
        if 200 <= length <= 800:
            score += 0.3
        elif length > 50:
            score += 0.2

        # Technical term score
        tech_terms = metrics.get('technical_terms', 0)
        score += min(tech_terms * 0.1, 0.3)

        # Citation score
        citations = metrics.get('citation_count', 0)
        score += min(citations * 0.2, 0.3)

        # Similarity score
        avg_sim = metrics.get('avg_similarity', 0)
        score += avg_sim * 0.1

        # No error bonus (a missing flag is treated as an error)
        if metrics.get('has_error', 1) == 0:
            score += 0.1

        # float() keeps the result a built-in float even if avg_similarity was a NumPy scalar.
        return float(min(score, 1.0))

class CombinedEvaluator:
    """Combined evaluator (automatic evaluation + API evaluation).

    For every question the evaluator obtains a RAG answer and a baseline
    (no-context) answer from ``rag_system.query``, scores both with
    ``AutoEvaluationMetrics`` and with ``llm_client.evaluate_answer``, and
    appends one result dict per question to ``self.results``.
    """

    def __init__(self, rag_system: RAGSystem, llm_client: DeepSeekClient):
        self.rag_system = rag_system
        self.llm_client = llm_client
        self.auto_evaluator = AutoEvaluationMetrics()
        self.results = []  # one dict per evaluated question, in evaluation order

    @staticmethod
    def _json_default(value):
        """``json.dump`` fallback: convert NumPy scalars/arrays to built-in types."""
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    @staticmethod
    def _is_numeric(value) -> bool:
        """True for built-in and NumPy integer/float scalars."""
        return isinstance(value, (int, float, np.integer, np.floating))

    def evaluate_question(self, question_data: Dict) -> Dict:
        """Evaluate a single question with and without retrieval.

        Args:
            question_data: Dict with a ``question`` entry and an optional ``topic``.

        Returns:
            The result dict (answers, retrieved documents, automatic and API
            evaluations, and RAG-minus-baseline improvements). The same dict is
            appended to ``self.results``.
        """
        question = question_data['question']
        topic = question_data.get('topic', '')

        print(f"\nEvaluating question: {question}")

        # RAG query
        rag_response = self.rag_system.query(question, include_context=True)

        # Baseline query
        baseline_response = self.rag_system.query(question, include_context=False)

        # Automatic evaluation (RAG)
        auto_metrics_rag = self.auto_evaluator.evaluate_response(
            question=question,
            answer=rag_response['answer'],
            retrieved_docs=rag_response['retrieved_docs'],
            baseline_answer=baseline_response['answer']
        )

        # Automatic evaluation (baseline)
        auto_metrics_baseline = self.auto_evaluator.evaluate_response(
            question=question,
            answer=baseline_response['answer'],
            retrieved_docs=None,  # Baseline has no retrieved documents
            baseline_answer=None  # Baseline has no comparison object
        )

        # API evaluation (RAG answer)
        api_evaluation_rag = self.llm_client.evaluate_answer(question, rag_response['answer'])

        # API evaluation (baseline answer)
        api_evaluation_baseline = self.llm_client.evaluate_answer(question, baseline_response['answer'])

        # Missing scores default to 0 so the comparison never fails on a partial evaluation.
        auto_score_rag = auto_metrics_rag.get('auto_score', 0)
        auto_score_baseline = auto_metrics_baseline.get('auto_score', 0)
        api_score_rag = api_evaluation_rag.get('total_score', 0)
        api_score_baseline = api_evaluation_baseline.get('total_score', 0)

        # Build result
        result = {
            'question_id': len(self.results) + 1,
            'question': question,
            'topic': topic,

            # RAG results
            'rag_answer': rag_response['answer'],
            'rag_method': rag_response['method'],
            'rag_retrieved_docs': [
                {
                    'title': doc['metadata']['title'],
                    'similarity': float(doc['similarity']),
                    'first_author': doc['metadata']['first_author']
                }
                for doc in rag_response['retrieved_docs']
            ],

            # Baseline results
            'baseline_answer': baseline_response['answer'],
            'baseline_method': baseline_response['method'],

            # Automatic evaluation results (RAG)
            'auto_evaluation_rag': auto_metrics_rag,
            'auto_score_rag': auto_score_rag,

            # Automatic evaluation results (baseline)
            'auto_evaluation_baseline': auto_metrics_baseline,
            'auto_score_baseline': auto_score_baseline,

            # API evaluation results
            'api_evaluation_rag': api_evaluation_rag,
            'api_evaluation_baseline': api_evaluation_baseline,
            'api_score_rag': api_score_rag,
            'api_score_baseline': api_score_baseline,

            # Comprehensive comparison
            'auto_improvement': auto_score_rag - auto_score_baseline,
            'api_improvement': api_score_rag - api_score_baseline,
        }

        self.results.append(result)

        # Print brief results
        print(f"  Automatic scores: RAG={auto_score_rag:.2f}, Baseline={auto_score_baseline:.2f}")
        print(f"  API scores: RAG={api_score_rag:.2f}, Baseline={api_score_baseline:.2f}")
        print(f"  Retrieved documents: {len(rag_response['retrieved_docs'])}")

        return result

    def evaluate_all(self, questions: List[Dict]) -> List[Dict]:
        """Evaluate all questions in order and return the accumulated results."""
        print(f"Evaluating {len(questions)} questions...")

        for question_data in questions:
            self.evaluate_question(question_data)

        print(f"\n✓ All evaluations completed")
        return self.results

    def generate_report(self) -> Dict:
        """Generate the evaluation report.

        Returns:
            ``{}`` when nothing has been evaluated; otherwise a dict with a
            ``summary`` of aggregate statistics and the ``detailed_results``.
        """
        if not self.results:
            return {}

        # Collect statistics
        auto_scores_rag = [r['auto_score_rag'] for r in self.results]
        auto_scores_baseline = [r['auto_score_baseline'] for r in self.results]
        api_scores_rag = [r['api_score_rag'] for r in self.results]
        api_scores_baseline = [r['api_score_baseline'] for r in self.results]
        auto_improvements = [r['auto_improvement'] for r in self.results]
        api_improvements = [r['api_improvement'] for r in self.results]

        # Calculate correlation (automatic evaluation vs API evaluation)
        correlation = 0.0
        if len(auto_scores_rag) > 1:
            correlation = float(np.corrcoef(auto_scores_rag, api_scores_rag)[0, 1])
            # corrcoef yields NaN when either series is constant; report 0 instead
            # so the JSON report does not contain a non-standard NaN token.
            if not np.isfinite(correlation):
                correlation = 0.0

        report = {
            'summary': {
                'total_questions': len(self.results),
                'avg_auto_score_rag': float(np.mean(auto_scores_rag)),
                'avg_auto_score_baseline': float(np.mean(auto_scores_baseline)),
                'avg_api_score_rag': float(np.mean(api_scores_rag)),
                'avg_api_score_baseline': float(np.mean(api_scores_baseline)),
                'avg_auto_improvement': float(np.mean(auto_improvements)),
                'avg_api_improvement': float(np.mean(api_improvements)),
                'auto_improvement_rate': sum(1 for imp in auto_improvements if imp > 0) / len(auto_improvements),
                'api_improvement_rate': sum(1 for imp in api_improvements if imp > 0) / len(api_improvements),
                'correlation_auto_vs_api': correlation,
                'total_tokens_used': self.llm_client.total_tokens
            },
            'detailed_results': self.results
        }

        return report

    def save_results(self):
        """Save the JSON report and the per-question CSV.

        Writes to ``config.REPORT_FILE`` and ``config.EVALUATION_CSV``.

        Returns:
            The DataFrame that was written to the CSV file.
        """
        # Save JSON report
        report = self.generate_report()
        with open(config.REPORT_FILE, 'w', encoding='utf-8') as f:
            # default= converts NumPy scalars that may be present in the metric dicts.
            json.dump(report, f, ensure_ascii=False, indent=2, default=self._json_default)
        print(f"✓ Evaluation report saved to {config.REPORT_FILE}")

        # Save CSV results
        df_data = []
        for result in self.results:
            row = {
                'question_id': result['question_id'],
                'question': result['question'],
                'topic': result['topic'],
                'rag_answer_length': len(result['rag_answer']),
                'baseline_answer_length': len(result['baseline_answer']),
                'auto_score_rag': result['auto_score_rag'],
                'auto_score_baseline': result['auto_score_baseline'],
                'api_score_rag': result['api_score_rag'],
                'api_score_baseline': result['api_score_baseline'],
                'auto_improvement': result['auto_improvement'],
                'api_improvement': result['api_improvement'],
                'retrieved_docs_count': len(result['rag_retrieved_docs']),
            }

            # Add RAG automatic evaluation metrics (numeric values only)
            for key, value in result['auto_evaluation_rag'].items():
                if self._is_numeric(value):
                    row[f'auto_rag_{key}'] = value

            # Add baseline automatic evaluation metrics (numeric values only)
            for key, value in result['auto_evaluation_baseline'].items():
                if self._is_numeric(value):
                    row[f'auto_baseline_{key}'] = value

            df_data.append(row)

        df = pd.DataFrame(df_data)
        df.to_csv(config.EVALUATION_CSV, index=False, encoding='utf-8-sig')
        print(f"✓ Detailed results saved to {config.EVALUATION_CSV}")

        return df

# %%
# ==================== 5. Visualization and Main Program ====================

def create_visualizations(results: List[Dict], save_dir: str = "."):
    """Create a 2x2 chart comparing RAG and baseline scores and save it as a PNG.

    Panels: API score comparison, automatic score comparison, API improvement
    per question and automatic improvement per question.

    Args:
        results: Per-question result dicts providing ``question_id``, the four
            ``*_score_*`` values, both ``*_improvement`` values and
            ``rag_retrieved_docs``.
        save_dir: Directory that receives ``evaluation_visualization.png``.

    Failures are reported on stdout rather than raised, and the figure is
    always closed.
    """
    if not VISUALIZATION_AVAILABLE:
        print("Warning: Visualization libraries not available, skipping chart generation")
        return

    if not results:
        print("Warning: No results to visualize, skipping chart generation")
        return

    fig = None
    try:
        # Prepare data
        df = pd.DataFrame([{
            'question_id': r['question_id'],
            'auto_score_rag': r['auto_score_rag'],
            'auto_score_baseline': r['auto_score_baseline'],
            'api_score_rag': r['api_score_rag'],
            'api_score_baseline': r['api_score_baseline'],
            'auto_improvement': r['auto_improvement'],
            'api_improvement': r['api_improvement'],
            'retrieved_docs': len(r['rag_retrieved_docs'])
        } for r in results])

        # Create charts
        fig, axes = plt.subplots(2, 2, figsize=(12, 10))
        positions = list(range(len(df)))
        question_ids = df['question_id']

        # 1. API Score Comparison
        _plot_score_comparison(
            axes[0, 0], positions, question_ids,
            df['api_score_baseline'], df['api_score_rag'],
            ylabel='API Score', title='RAG vs Baseline API Score Comparison')

        # 2. Automatic Score Comparison
        _plot_score_comparison(
            axes[0, 1], positions, question_ids,
            df['auto_score_baseline'], df['auto_score_rag'],
            ylabel='Automatic Score', title='RAG vs Baseline Automatic Score Comparison')

        # 3. API Improvement Distribution
        _plot_improvement(
            axes[1, 0], positions, question_ids, df['api_improvement'],
            positive_color='green', ylabel='API Improvement Score',
            title='RAG API Improvement Score Distribution')

        # 4. Automatic Improvement Distribution
        _plot_improvement(
            axes[1, 1], positions, question_ids, df['auto_improvement'],
            positive_color='blue', ylabel='Automatic Evaluation Improvement Score',
            title='RAG Automatic Evaluation Improvement Score Distribution')

        fig.tight_layout()
        fig.savefig(f'{save_dir}/evaluation_visualization.png', dpi=150, bbox_inches='tight')
        print(f"✓ Visualization chart saved to {save_dir}/evaluation_visualization.png")

    except Exception as e:
        print(f"Visualization generation failed: {e}")
        print("Skipping visualization generation")
    finally:
        # Close the figure on failure too, to avoid a memory leak.
        if fig is not None:
            plt.close(fig)


def _plot_score_comparison(ax, positions, question_ids, baseline_scores, rag_scores,
                           ylabel: str, title: str, width: float = 0.35):
    """Draw side-by-side baseline/RAG bars for every question on ``ax``."""
    ax.bar([p - width / 2 for p in positions], baseline_scores, width, label='Baseline', alpha=0.7)
    ax.bar([p + width / 2 for p in positions], rag_scores, width, label='RAG', alpha=0.7)
    ax.set_xlabel('Question ID')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend()
    ax.set_xticks(positions)
    ax.set_xticklabels(question_ids)


def _plot_improvement(ax, positions, question_ids, improvements,
                      positive_color: str, ylabel: str, title: str):
    """Draw one improvement bar per question on ``ax``, coloured by that bar's own sign."""
    # Colour each bar individually: a regression stays red even when the mean is positive.
    colors = [positive_color if value > 0 else 'red' for value in improvements]
    ax.bar(positions, improvements, color=colors)
    ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
    ax.set_xlabel('Question ID')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.set_xticks(positions)
    ax.set_xticklabels(question_ids)

def print_detailed_report(report: Dict):
    """Print the evaluation report to stdout.

    Args:
        report: Dict with a ``summary`` of aggregate statistics and a
            ``detailed_results`` list of per-question result dicts. An empty
            report (no ``summary``) prints a short notice instead of raising.
    """
    # An empty report (e.g. when nothing was evaluated) has no summary to print.
    summary = report.get('summary') if report else None
    if not summary:
        print("\nNo evaluation results available; nothing to report")
        return

    print("\n" + "="*60)
    print("RAG System Evaluation Report")
    print("="*60)

    print(f"\nOverall Statistics:")
    print(f"  Total Questions: {summary['total_questions']}")
    print(f"  Average Automatic Score (RAG): {summary['avg_auto_score_rag']:.3f}")
    print(f"  Average Automatic Score (Baseline): {summary['avg_auto_score_baseline']:.3f}")
    print(f"  Average API Score (RAG): {summary['avg_api_score_rag']:.3f}")
    print(f"  Average API Score (Baseline): {summary['avg_api_score_baseline']:.3f}")
    print(f"  Average Automatic Improvement: {summary['avg_auto_improvement']:.3f}")
    print(f"  Average API Improvement: {summary['avg_api_improvement']:.3f}")
    print(f"  Automatic Improvement Rate: {summary['auto_improvement_rate']:.2%}")
    print(f"  API Improvement Rate: {summary['api_improvement_rate']:.2%}")
    print(f"  Correlation between Auto and API Evaluation: {summary['correlation_auto_vs_api']:.3f}")
    print(f"  Total Tokens Used: {summary['total_tokens_used']}")

    # Print detailed results for each question
    print(f"\nDetailed Results:")
    for result in report.get('detailed_results', []):
        print(f"\nQuestion {result['question_id']}: {result['question'][:50]}...")
        print(f"  Automatic Scores: RAG={result['auto_score_rag']:.3f}, Baseline={result['auto_score_baseline']:.3f}")
        print(f"  API Scores: RAG={result['api_score_rag']:.3f}, Baseline={result['api_score_baseline']:.3f}")
        print(f"  Improvement: Auto={result['auto_improvement']:.3f}, API={result['api_improvement']:.3f}")
        print(f"  Retrieved Documents: {len(result['rag_retrieved_docs'])}")

        # Show retrieved documents
        if result['rag_retrieved_docs']:
            print(f"  Relevant Documents:")
            for doc in result['rag_retrieved_docs'][:2]:  # Show only first 2
                print(f"    - {doc['title'][:50]}... (Similarity: {doc['similarity']:.3f})")

# %%
# ==================== Main Program ====================

def main():
    """Run the end-to-end RAG evaluation pipeline.

    Steps: load papers, initialize the embedding model, build the vector
    store, initialize the RAG system, evaluate ``config.TEST_QUESTIONS``,
    then generate, save, visualize and print the report. Stops early when no
    papers are loaded or no evaluation results are produced.
    """
    print("="*60)
    print("RAG System Evaluation Framework")
    print("="*60)

    # 1. Load data
    print("\n1. Loading paper data...")
    papers = DataLoader.load_from_jsonl(config.DATA_FILE)

    if not papers:
        print("Error: No paper data loaded")
        return

    # 2. Initialize embedding model
    print("\n2. Initializing embedding model...")
    embedder = EmbeddingModel(
        model_name=config.EMBEDDING_MODEL,
        use_api=False,  # Use local model to avoid API limits
        api_key=config.DEEPSEEK_API_KEY
    )

    # 3. Build vector database
    print("\n3. Building vector database...")
    vector_store = VectorStore(embedder)
    vector_store.add_papers(papers)

    # Print vector store statistics
    stats = vector_store.get_stats()
    print(f"  Total Documents: {stats['total_documents']}")
    print(f"  Number of Topics: {stats['unique_topics']}")

    # 4. Initialize RAG system
    print("\n4. Initializing RAG system...")
    llm_client = DeepSeekClient(config.DEEPSEEK_API_KEY)
    rag_system = RAGSystem(vector_store, llm_client)

    # 5. Run evaluation
    print("\n5. Running evaluation...")
    evaluator = CombinedEvaluator(rag_system, llm_client)
    results = evaluator.evaluate_all(config.TEST_QUESTIONS)

    # With no results the report is an empty dict and the results DataFrame has
    # no columns, so the reporting steps below would fail with KeyError.
    if not results:
        print("Error: No evaluation results produced")
        return

    # 6. Generate report
    print("\n6. Generating report...")
    report = evaluator.generate_report()

    # 7. Save results
    print("\n7. Saving results...")
    df = evaluator.save_results()

    # 8. Create visualizations
    if VISUALIZATION_AVAILABLE:
        print("\n8. Creating visualization charts...")
        create_visualizations(results)

    # 9. Print report
    print("\n9. Evaluation Report:")
    print_detailed_report(report)

    # 10. Display DataFrame
    print("\n10. Results DataFrame:")
    print(df[['question_id', 'auto_score_rag', 'auto_score_baseline', 'api_score_rag', 'api_score_baseline', 'auto_improvement', 'api_improvement']].to_string())

    print("\n" + "="*60)
    print("Evaluation completed!")
    print("="*60)

# %%
# Run main program
# Entry-point guard: run the full evaluation pipeline via main() only when
# __name__ is "__main__" (i.e. this code is executed directly, not imported).
if __name__ == "__main__":
    main()

# %%
# Quick test function (optional)
def quick_test():
    """Smoke-test the pipeline on sample data with a single question.

    Builds a vector store from ``DataLoader.create_sample_data()``, runs one
    RAG query, prints a preview of the answer together with its automatic
    evaluation metrics, and returns the raw query response.
    """
    print("Running quick test...")

    # Sample papers -> local embedding model -> vector store
    sample_papers = DataLoader.create_sample_data()
    store = VectorStore(EmbeddingModel(use_api=False))
    store.add_papers(sample_papers)

    # RAG system on top of the store
    rag = RAGSystem(store, DeepSeekClient(config.DEEPSEEK_API_KEY))

    # Single test question
    test_question = "什么是医学影像中的开放词汇目标检测？"
    print(f"\nTest question: {test_question}")

    response = rag.query(test_question)
    answer_text = response['answer']
    print(f"\nRAG answer preview: {answer_text[:200]}...")

    # Automatic evaluation of the answer
    metrics = AutoEvaluationMetrics().evaluate_response(
        question=test_question,
        answer=answer_text,
        retrieved_docs=response['retrieved_docs'],
    )

    print(f"\nAutomatic evaluation results:")
    summary_lines = (
        f"  Answer length: {metrics['answer_length']}",
        f"  Technical terms: {metrics.get('technical_terms', 0)}",
        f"  Citation count: {metrics.get('citation_count', 0)}",
        f"  Automatic score: {metrics.get('auto_score', 0):.2f}",
    )
    for line in summary_lines:
        print(line)

    return response

# Run quick test (uncomment the following line)
# quick_test_response = quick_test()

RAG System Evaluation Framework

1. Loading paper data...
✓ Successfully loaded 695 papers from cleaned_papers.jsonl

Dataset Statistics:
----------------------------------------
Total Papers: 695
Number of Topics: 3

Quality Tier Distribution:
  high: 661 papers (95.1%)
  medium: 34 papers (4.9%)
Total Authors: 3636
Latest Paper Date: 2025-12-04

2. Initializing embedding model...
✓ Using local embedding model
  Model: all-MiniLM-L6-v2
  Dimension: 384

3. Building vector database...
Processing 695 papers...


Adding papers: 100%|██████████| 695/695 [00:00<00:00, 670741.21it/s]

Generating embedding vectors...





✓ Vector store built successfully
  Number of documents: 695
  Embedding dimension: 384
  Total Documents: 695
  Number of Topics: 3

4. Initializing RAG system...

5. Running evaluation...
Evaluating 3 questions...

Evaluating question: Based on the latest papers from this month, could you please introduce to me the technological breakthroughs that have emerged in the field of video dynamic editing recently? And could you briefly explain them to me based on the abstracts of the papers?
  Automatic scores: RAG=0.95, Baseline=0.50
  API scores: RAG=4.83, Baseline=3.83
  Retrieved documents: 5

Evaluating question: How many core bottlenecks exist in current autoregressive video diffusion models?
  Automatic scores: RAG=0.74, Baseline=0.40
  API scores: RAG=4.67, Baseline=3.83
  Retrieved documents: 5

Evaluating question: What noteworthy papers in the machine learning and computer vision fields have been published in the past month? Please provide links or DOIs.
  Automatic scores: RAG=0