In [1]:
import os
from pathlib import Path
from datasets import load_dataset

import nest_asyncio
nest_asyncio.apply()

# Workspace root on the local machine; every parquet file read or written
# by the cells below lives in the ELI5 data directory derived from it.
main_dir = Path("/home/lyb")
data_dir = main_dir.joinpath("RAG/data/eli5_data")

In [None]:
def load_eli5_dataset(save_path):
    """Download the ELI5 sample dataset and store it as parquet files.

    Loads the "corpus" and "qa" configurations of the Hugging Face dataset
    ``MarkrAI/eli5_sample_autorag`` and writes three files into ``save_path``:
    ``corpus.parquet`` (corpus "train" split), ``qa_train.parquet`` and
    ``qa_test.parquet`` (qa "train" and "test" splits).

    Args:
        save_path: Directory (``str`` or ``os.PathLike``) that receives the
            parquet files. It is created when it does not exist yet.

    Raises:
        ValueError: If any of the three output files already exists, so that
            a previous download is never overwritten silently.
    """
    # set file path
    file_path = "MarkrAI/eli5_sample_autorag"
    output_paths = {
        name: os.path.join(save_path, f"{name}.parquet")
        for name in ("corpus", "qa_train", "qa_test")
    }

    # Guard the files that are actually written (the previous check looked at
    # "qa.parquet", which this function never creates) and do it before the
    # download so a clash fails fast instead of after fetching everything.
    for name, path in output_paths.items():
        if os.path.exists(path):
            raise ValueError(f"{name}.parquet already exists")

    # to_parquet does not create missing parent directories.
    os.makedirs(save_path, exist_ok=True)

    # load dataset (the "qa" configuration is fetched once and both splits reused)
    corpus_dataset = load_dataset(file_path, "corpus")['train'].to_pandas()
    qa_splits = load_dataset(file_path, "qa")
    qa_train_dataset = qa_splits['train'].to_pandas()
    qa_test_dataset = qa_splits['test'].to_pandas()

    # save data
    corpus_dataset.to_parquet(output_paths["corpus"])
    qa_train_dataset.to_parquet(output_paths["qa_train"])
    qa_test_dataset.to_parquet(output_paths["qa_test"])


# Fetch the dataset and write its parquet files into the notebook's data directory.
load_eli5_dataset(save_path=data_dir)

In [5]:
import pandas as pd
from itertools import chain
from autorag.utils.util import to_list

# This sample code optimizes the pipeline on only 20 QA rows, just for testing.
# The fixed random_state makes the drawn subset reproducible.
qa_df = pd.read_parquet(data_dir / 'qa_train.parquet')
sample_qa_df = qa_df.sample(20, random_state=42).reset_index(drop=True)
sample_qa_df.to_parquet(data_dir / 'qa_sample.parquet')

# Keep only the corpus rows whose doc_id is referenced by the sampled
# questions, so the saved sample corpus is smaller.
corpus_df = pd.read_parquet(data_dir / 'corpus.parquet')
# "retrieval_gt" is flattened twice: the column is treated as a list (rows)
# of lists of lists of doc ids.
target_retrieval_gts = list(
    chain.from_iterable(
        chain.from_iterable(to_list(sample_qa_df["retrieval_gt"].tolist()))
    )
)
sample_corpus_df = corpus_df[corpus_df["doc_id"].isin(target_retrieval_gts)].reset_index(drop=True)
sample_corpus_df.to_parquet(data_dir / 'corpus_sample.parquet')

In [None]:
import pandas as pd
from autorag.schema.metricinput import MetricInput
from autorag.evaluation import evaluate_generation

# Load QA dataset
# NOTE(review): this reads "qa.parquet" relative to the current working
# directory and rebinds `qa_df`. The cells above only write qa_train /
# qa_test / qa_sample parquet files under `data_dir` — confirm the path.
qa_df = pd.read_parquet("qa.parquet", engine="pyarrow")

# Prepare MetricInput list: one MetricInput per QA row, carrying the row's
# "query" and "generation_gt" values.
metric_inputs = [
    MetricInput(query=row["query"], generation_gt=row["generation_gt"])
    for _, row in qa_df.iterrows()
]

# Define custom generation function with decorator
# The decorator receives the metric inputs and the metric names to compute.
@evaluate_generation(
    metric_inputs=metric_inputs,
    metrics=["bleu", "meteor", "rouge"]
)
def custom_generation(queries):
    """Template for a user-supplied generation function.

    Returns a 3-tuple: the generated texts plus two placeholder lists of the
    same length (``[1, 30]`` and ``[-1, -1.3]`` repeated once per text).
    Presumably these stand for token ids and log-probabilities — confirm
    against the AutoRAG documentation.

    NOTE(review): ``generated_texts`` is not defined anywhere in this cell,
    so calling the function raises ``NameError`` until the generation logic
    is filled in. ``queries`` is currently unused.
    """
    # Implement your generation logic: it must bind `generated_texts`
    # before the return statement below.
    return generated_texts, [[1, 30]] * len(generated_texts), [[-1, -1.3]] * len(generated_texts)

# Evaluate generation performance
# The name suggests the decorated call yields a DataFrame of results —
# TODO confirm against the AutoRAG documentation.
generation_result_df = custom_generation(qa_df["query"].tolist())

In [2]:
from autorag.evaluator import Evaluator

# Build the evaluator from the sampled QA set and the corpus parquet, using ./eli5 as project directory.
# NOTE(review): this points at the full corpus.parquet, not the corpus_sample.parquet
# written by the sampling cell — confirm that is intended.
evaluator = Evaluator(
    qa_data_path=data_dir.joinpath('qa_sample.parquet').as_posix(),
    corpus_data_path=data_dir.joinpath('corpus.parquet').as_posix(),
    project_dir='./eli5',
)








In [None]:
# Run an optimization trial with the pipeline described in ./ollama_config.yaml.
# skip_validation=True presumably bypasses a pre-run validation step — confirm against the AutoRAG docs.
evaluator.start_trial(
    './ollama_config.yaml',
    skip_validation=True,
)

Output()

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
from autorag.deploy import extract_best_config

# Extract the best configuration of trial ./eli5/0 into a YAML file. The call
# stays the last expression of the cell so its return value is displayed.
extract_best_config(
    trial_path='./eli5/0',
    output_path='./eli5/0/best.yaml',
)

{'node_lines': [{'node_line_name': 'retrieve_node_line',
   'nodes': [{'node_type': 'retrieval',
     'strategy': {'metrics': ['retrieval_f1',
       'retrieval_recall',
       'retrieval_precision']},
     'modules': [{'module_type': 'HybridCC',
       'top_k': 3,
       'target_modules': ('VectorDB', 'BM25'),
       'weights': (0.3, 0.7),
       'weight': 0.0,
       'target_module_params': ({'top_k': 3, 'vectordb': 'chroma_mpnet'},
        {'top_k': 3})}]}]},
  {'node_line_name': 'post_retrieve_node_line',
   'nodes': [{'node_type': 'prompt_maker',
     'strategy': {'metrics': ['meteor', 'rouge', 'bert_score']},
     'modules': [{'module_type': 'Fstring',
       'prompt': 'Read the passages and answer the given question. \n Question: {query} \n Passage: {retrieved_contents} \n Answer : '}]},
    {'node_type': 'generator',
     'strategy': {'metrics': ['meteor', 'rouge', 'bert_score']},
     'modules': [{'module_type': 'LlamaIndexLLM',
       'llm': 'ollama',
       'model': 'llama3'

In [8]:
from autorag.deploy import Runner

# Rebuild the pipeline from the extracted YAML and send it one query; the
# answer is shown because run() is the last expression of the cell.
runner = Runner.from_yaml(
    './eli5/0/best.yaml',
    project_dir='./eli5',
)
runner.run('who are you?')



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  norm_score = (arr - min_value) / (max_value - min_value)




'Based on the passages provided, it seems that there is no clear answer to the question "Who are you?" as the passages appear to be unrelated and do not provide any information about the person\'s identity or characteristics.\n\nHowever, if I had to make an educated guess based on the tone and style of the passages, I would say that the author appears to be someone who values honesty, openness, and creativity. The author seems to be comfortable sharing their thoughts and feelings with others, and is not afraid to express themselves freely.'

{'node_lines': [{'node_line_name': 'retrieve_node_line',
   'nodes': [{'modules': [{'module_type': 'HybridCC',
       'target_module_params': [{'top_k': 3, 'vectordb': 'chroma_mpnet'},
        {'top_k': 3}],
       'target_modules': ['VectorDB', 'BM25'],
       'top_k': 3,
       'weight': 0.0,
       'weights': [0.3, 0.7]}],
     'node_type': 'retrieval',
     'strategy': {'metrics': ['retrieval_f1',
       'retrieval_recall',
       'retrieval_precision']}}]},
  {'node_line_name': 'post_retrieve_node_line',
   'nodes': [{'modules': [{'module_type': 'Fstring',
       'prompt': 'Read the passages and answer the given question. \n Question: {query} \n Passage: {retrieved_contents} \n Answer : '}],
     'node_type': 'prompt_maker',
     'strategy': {'metrics': ['meteor', 'rouge', 'bert_score']}},
    {'modules': [{'batch': 1,
       'llm': 'ollama',
       'model': 'llama3',
       'module_type': 'LlamaIndexLLM',
       'temperature': 0.5}],
     'node_type': 'generator',
     'strategy'

In [5]:
# Shell out to the AutoRAG CLI: `run_web` is given the extracted best.yaml and
# the project directory. Presumably this starts a web interface for the
# pipeline — confirm against the AutoRAG CLI documentation.
!autorag run_web --yaml_path ./eli5/0/best.yaml --project_dir ./eli5

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2;36m[12/21/24 16:19:56][0m[2;36m [0m[34mINFO    [0m [1m[[0mconfig.py:[1;36m58[0m[1m][0m >> PyTorch version      ]8;id=239230;file:///home/lyb/.conda/envs/autorag/lib/python3.10/site-packages/datasets/config.py\[2mconfig.py[0m]8;;\[2m:[0m]8;id=803982;file:///home/lyb/.conda/envs/autorag/lib/python3.10/site-packages/datasets/config.py#58\[2m58[0m]8;;\
[2;36m                    [0m         [1;36m2.5[0m.[1;36m1[0m available.                       [2m            [0m





[2;36m[12/21/24 16:19:59][0m[2;36m [0m[34mINFO    [0m [1m[[0m_client.py:[1;36m1026[0m[1m][0m >> HTTP Request:  ]8;id=684912;file:///home/lyb/.conda/envs/autorag/lib/python3.10/site-packages/httpx/_client.py\[2m_client.py[0m]8;;\[2m:[0m]8;id=418493;file:///home/lyb/.conda/envs/autorag/lib/python3.10/site-packages/httpx/_client.py#1026\[2m1026[0m]8;;\
[2;36m                    [0m         [1;33mGET[0m                                 [2m               [0m
[2;