In [None]:
%load_ext autoreload

In [None]:
from pathlib import Path

from config import config
from data_processing import (
    load_and_process_csv,
    load_documents_from_csv,
    split_documents,
)
from embeddings import initialize_embeddings
from evaluation import (
    RAGEvaluator,
    evaluate_rag_pipeline,
    generate_test_dataset,
)
from rag_chain import RAGOutput
from retrieval import build_retriever
from utils import print_separator, setup_environment

In [None]:
# from pubmed_scraper import PubMedScraper

# scraper = PubMedScraper(email = "olandechris@gmail.com")

# data = scraper.search_with_llm(query = "Find me 50 papers about Covid 19 from 2019 to 2025")

# df = scraper.search_with_llm(query = "Find papers about the impact of Gaza war on children")

In [None]:
def prepare_data(input_csv: str, output_csv: str | Path | None = None) -> Path:
    """
    Prepare and process the input data.

    Args:
        input_csv: Path to input CSV file
        output_csv: Path to save processed CSV (optional)

    Returns:
        Path to the processed CSV file
    """
    print_separator("DATA PREPARATION")
    if output_csv is None:
        output_csv = Path(config.paths.data_dir) / "tests.csv"

    # Load and process CSV
    print(f"Loading data from {input_csv}")
    df = load_and_process_csv(input_csv, output_csv)

    return Path(output_csv)

In [None]:
def build_rag_system(csv_path: str) -> tuple:
    """
    Build the complete RAG system.

    Steps, in order: initialise the embeddings, load the documents from the
    CSV, split them into chunks, build the retriever, then create the RAG
    chain on top of that retriever.

    Args:
        csv_path: Path to processed CSV file

    Returns:
        Tuple of (rag_chain, splitted_documents, embeddings)
    """
    print_separator("BUILDING RAG SYSTEM")
    print("Initializing Embeddings ...")

    embeddings = initialize_embeddings(
        model_name=config.model.embedding_model,
        cache_dir=config.model.embedding_cache_dir,
    )

    # Load and split the documents
    print("Loading documents...")
    documents = load_documents_from_csv(csv_path)

    print("Splitting documents...")
    splitted_documents = split_documents(documents, embeddings)
    print(f"Created {len(splitted_documents)} document chunks")

    # Build Retriever
    print("Building retriever")
    retriever = build_retriever(splitted_documents, embeddings, config.retriever)

    # Initialize RAG chain
    print("Initializing RAG chain...")
    rag_chain = RAGOutput(
        prompt_name="rlm/rag-prompt",
        retriever=retriever,
        llm_model=config.model.deepseek_model,
    )
    # The chain object is only usable after create_chain() has been called.
    rag_chain.create_chain()

    print("RAG system built successfully")
    return rag_chain, splitted_documents, embeddings

In [None]:
def run_evaluation(
    rag_chain: RAGOutput,
    test_dataset_path: str | Path | None = None,
    results_path: str | Path | None = None,
):
    """
    Run evaluation on the RAG system.

    Args:
        rag_chain: Configured RAG chain
        test_dataset_path: Path to test dataset (optional)
        results_path: Path to save results (optional)
    """
    print_separator("EVALUATION")

    if test_dataset_path is None:
        test_dataset_path = config.paths.rag_eval_dir / "generated_testset.csv"

    if results_path is None:
        results_path = config.paths.rag_eval_dir / "results_deepseek_fastembed.csv"

    # Run evaluation
    results = evaluate_rag_pipeline(
        rag_chain=rag_chain,
        input_csv_path=str(test_dataset_path),
        output_csv_path=str(results_path),
        question_column="user_input",
    )

    print(f"Evaluation complete. Results shape: {results.shape}")
    return results

In [None]:
def generate_synthetic_testset(
    splitted_documents, embeddings, llm, testset_size: int = 10
):
    """
    Generate synthetic test dataset using RAGAS.

    Args:
        splitted_documents: Split documents
        embeddings: Embeddings model
        llm: Language model
        testset_size: Number of test samples

    Returns:
        Generated test dataset as DataFrame
    """
    print_separator("GENERATING TEST DATASET")

    output_path = config.paths.rag_eval_dir / "generated_testset.csv"

    dataset = generate_test_dataset(
        documents=splitted_documents,
        embeddings=embeddings,
        llm=llm,
        testset_size=testset_size,
        output_path=str(output_path),
    )

    print(f"Test dataset generated with {len(dataset)} samples")
    return dataset

In [None]:
def _print_results_summary(model_pair, df) -> None:
    """Print the shape and the average of every numeric metric column of one result frame."""
    # Local import: pandas is not imported at the top of this notebook.
    from pandas.api.types import is_bool_dtype, is_numeric_dtype

    print(f"\n{model_pair}:")
    print(f"  Shape: {df.shape}")
    if len(df) == 0:
        return

    # Everything that is not one of these input/output columns is treated as a metric.
    non_metric_cols = {"user_input", "reference", "response", "retrieved_contexts"}
    metric_cols = [col for col in df.columns if col not in non_metric_cols]
    if not metric_cols:
        return

    print(f"  Metrics: {', '.join(metric_cols)}")
    for col in metric_cols:
        # Accept any numeric dtype (float32, int32, nullable ints/floats, ...),
        # not just float64/int64; bool columns are left out of the averages.
        if is_numeric_dtype(df[col]) and not is_bool_dtype(df[col]):
            print(f"    {col}: {df[col].mean():.4f} (avg)")


def run_full_evaluation(embeddings):
    """
    Run comprehensive evaluation using RAGEvaluator.

    Args:
        embeddings: Embeddings model to use for evaluation

    Returns:
        Dictionary of evaluation results for all model pairs
    """
    print_separator("COMPREHENSIVE EVALUATION")

    # Initialize evaluator
    evaluator = RAGEvaluator(
        max_workers=1,
        timeout=180,
        generative_models=["deepseek-chat"],
        embedding_models=["fastembed"],
    )

    # Run evaluation for all model pairs
    results = evaluator.evaluate_all_models(
        evaluation_embeddings=embeddings, results_dir=str(config.paths.rag_eval_dir)
    )

    # Print summary
    print_separator("EVALUATION SUMMARY")
    for model_pair, df in results.items():
        _print_results_summary(model_pair, df)

    return results

# Program Entry

In [None]:
# Entry-point cell: runs the pipeline end to end (environment setup -> data
# preparation -> RAG system build -> evaluation). The steps are order-dependent:
# each one consumes what the previous step produced.

# Setup environment
setup_environment()

# Initialize configuration
print_separator("INITIALIZING")
print(f"Data directory: {config.paths.data_dir}")
print(f"RAG evaluation directory: {config.paths.rag_eval_dir}")

# `llm` is only consumed by the (currently disabled) test-set generation below;
# `dspy_lm` is not referenced again in this cell.
# NOTE(review): initialize_dspy() may be kept for a side effect - confirm before removing.
llm = config.initialize_llm()
dspy_lm = config.initialize_dspy()
# Prepare data (relative path: resolved against the kernel's working directory)
csv_path = prepare_data("data/gaza_war_impact_children.csv")

# Build RAG system
rag_chain, splitted_documents, embeddings = build_rag_system(str(csv_path))

# Generate test dataset (disabled; run_evaluation below reads
# generated_testset.csv from config.paths.rag_eval_dir by default)
# generate_synthetic_testset(splitted_documents, embeddings, llm, testset_size=10)

# Run evaluation
results = run_evaluation(rag_chain)

# Optional multi-model evaluation via RAGEvaluator (disabled)
# eval_results = run_full_evaluation(embeddings)