In [4]:
import os
import sys

# Project root that contains the `src` package imported below.
# Override it with the RAG_PROJECT_DIR environment variable instead of editing
# this cell, e.g. on the cluster:
#   export RAG_PROJECT_DIR=/trinity/home/team06/workspace/mikhail_workspace/rag_project
BASE_DIR = os.environ.get("RAG_PROJECT_DIR", "/home/dzigen/Desktop/Projects/rag_project")
# Put the project root first on the import path so `src.*` resolves to this checkout.
sys.path.insert(0, BASE_DIR)

import pandas as pd 
import ast
import numpy as np
import json
from tqdm import tqdm
from IPython.display import clear_output
import os
from typing import Dict, List

from src.Retriever import ThresholdRetriever
from src.Scorer import SimilarityScorerConfig
from src.Reader import LLM_Model
from src.utils import ReaderMetrics, RetrieverMetrics, save_reader_trial_log, prepare_thresholdretriever_configs, prepare_reader_configs, load_benchmarks_df, save_retriever_trial_log
from src.utils import evaluate_retriever, evaluate_reader

#### Stage 1. Retrieve

In [11]:
# --- Stage 1 (retriever) configuration ---
# Trial number; it selects the directory that receives this run's logs.
TRIAL = 2
SAVE_LOGDIR = f'./logs/stage1/trial{TRIAL}'
SAVE_RETRIEVER_HYPERPARAMS = f'{SAVE_LOGDIR}/retriever_hyperparams.json'
SAVE_RETRIEVERCACHE = f'{SAVE_LOGDIR}/retriever_cache.json'

# Extra settings that are logged together with the retriever hyperparameters.
ADDITIONAL_PARAMS = {'topk_score_list': 10}

# Size cap handed to load_benchmarks_df (presumably -1 means "no limit" — TODO confirm).
BENCHMARKS_MAXSIZE = -1
# Benchmark name -> database/table versions to evaluate on.
BENCHMARKS_INFO = {
    'mtssquad': {'db': 'v3', 'table': 'v3'},
}

# Retriever part
RETRIEVER_PARAMS = {
    "model_path": "/home/dzigen/Desktop/PersonalAI/Personal-AI/models/intfloat/multilingual-e5-small",
    "densedb_kwargs": {
        'metadata': {"hnsw:space": "ip"},
    },
    "model_kwargs": {'device': 'cuda'},
    "encode_kwargs": {
        'normalize_embeddings': True,
        'prompt': 'query: ',
    },
    "params": {
        'fetch_k': 10,
        'threshold': -1,
        'max_k': 10,
    },
}

In [12]:
# Derive the retriever configs and the benchmark paths from the stage settings.
# retrievers_config is later iterated as name -> config pairs.
retrievers_config, benchmarks_path = prepare_thresholdretriever_configs(BASE_DIR, BENCHMARKS_INFO, RETRIEVER_PARAMS)
# Metric calculator handed to evaluate_retriever below.
retriever_metrics = RetrieverMetrics()

In [13]:
# Load the benchmark data, capped by BENCHMARKS_MAXSIZE.
# Later cells index the result by benchmark name (benchmarks_df[name]).
benchmarks_df = load_benchmarks_df(benchmarks_path, BENCHMARKS_MAXSIZE)

In [14]:
# Instantiate one ThresholdRetriever per entry of retrievers_config, keyed by the same name.
RETRIEVERS = {name: ThresholdRetriever(config) for name, config in retrievers_config.items()}

No sentence-transformers model found with name /home/dzigen/Desktop/PersonalAI/Personal-AI/models/intfloat/multilingual-e5-small. Creating a new one with MEAN pooling.


In [15]:
# Run the retrievers over the benchmarks and collect the metrics.
# predicted_chunks is saved by the next cell; cache_relevant_flags is reused
# later by the reader evaluation.
retriever_scores, retriever_cache_ids, predicted_chunks, cache_relevant_flags = evaluate_retriever(
    benchmarks_df, RETRIEVERS, retriever_metrics, show_step=5, topk_score_list=ADDITIONAL_PARAMS['topk_score_list'])

mtssquad


  return (2 * self.precision(pred_cands, gold_cands, k) * self.recall(pred_cands, gold_cands, k)) / (self.precision(pred_cands, gold_cands, k) + self.recall(pred_cands, gold_cands, k))
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 49908/49908 [11:17<00:00, 73.68it/s, MRR=0.725, mAP=0.725, Recall=0.822, Precision=0.0822, F1=0.149, NoRelContextScore=nan]


In [16]:
# Persist the stage 1 scores, hyperparameters and predicted chunks under SAVE_LOGDIR.
save_retriever_trial_log(SAVE_LOGDIR, retriever_scores, SAVE_RETRIEVER_HYPERPARAMS, SAVE_RETRIEVERCACHE, 
                         predicted_chunks, BENCHMARKS_INFO, BENCHMARKS_MAXSIZE, RETRIEVER_PARAMS, ADDITIONAL_PARAMS)

#### Stage 2. Read

In [34]:
# --- Stage 2 (reader) configuration ---
# Trial number; it selects the directory that receives this run's logs.
TRIAL = 3
SAVE_LOGDIR = f'./logs/stage2/trial{TRIAL}'
SAVE_READER_HYPERPARAMS = f'{SAVE_LOGDIR}/reader_hyperparams.json'
SAVE_READERCACHE = f'{SAVE_LOGDIR}/reader_cache.json'
BENCHMARKS_MAXSIZE = 4000

ADDITIONAL_PARAMS = {
    # Which per-document score is shown to the reader in the context
    # ('cosine' or a key of the document metadata).
    'score': 'total_n_distance',
    # Maximum number of retrieved documents placed into one context.
    'max_k': 10,
    'bertscore_model_path': "ru_electra_medium",
    # The key must be spelled 'load_retrievercache': that is the name the
    # context-building cell reads; the previous spelling
    # 'load_retriever_cache' made that lookup fail with KeyError.
    # NOTE(review): the path reuses this stage's TRIAL (stage1/trial3), while
    # the stage 1 cell above logs to trial2 — confirm the intended stage 1 trial.
    'load_retrievercache': f'./logs/stage1/trial{TRIAL}/retriever_cache.json'
}

# Benchmark name -> database/table versions to evaluate on.
BENCHMARKS_INFO = {'ragminibioasq': {'db': 'v1', 'table': 'v1'}}

# Candidate values for ADDITIONAL_PARAMS['score']: 'total_n_distance' | 'mean_dependency_distance'
# Alternative "assistant" prompt for semantic-closeness scores, kept for reference:
# 'Отвечай на вопросы, используя информацию из текстов в списке ниже. Каждому тексту в начале в квадратных скобках поставлена в соответствие вещественная оценка его семантической близости к вопросу: в диапазоне от 0.0 (высокая близость) до 1.0 (низкая близость). Используй эту информацию. Выбирай тексты c достаточно высокими оценками близости для генерации ответа на их основе. Если на основании указанных оценок близости ты не уверена в релевантности данных текстов по отношению к заданному вопросу, то сгенерируй следующий ответ: "У меня нет ответа на ваш вопрос.".'

# Reader (LLM) settings passed to prepare_reader_configs.
READER_PARAMS = {
    'prompts': {
        # Instruction that is prepended to the numbered, score-annotated documents
        # when the contexts are built. It tells the model how to read the bracketed
        # syntactic-complexity score and which stub answer to give when nothing fits.
        "assistant": 'Отвечай на вопросы, используя информацию из текстов в списке ниже. Каждому тексту в начале в квадратных скобках поставлена в соответствие вещественная оценка его синтаксической сложности: в диапазоне от 0.0 (низкая сложность) до 1.0 (высокая сложность). Учитывай эти данные. Тексты с более низкой оценкой содержат меньше дополнительной информации и из них легче извлечь релевантную информацию, если такая есть. Если оценка кажется тебе слишком высокой, а другие тексты нерелевантны запросу, то сгенерируй следующий ответ: "У меня нет ответа на ваш вопрос.".',
        # System prompt: a question-answering system that replies in Russian, briefly and to the point.
        "system": "Ты вопросно-ответная система. Все ответы генерируй на русском языке. По вопросам отвечай кратко, чётко и конкретно. Не генерируй излишнюю информацию.",
    },
    # Generation settings. NOTE(review): eos_token_id 79097 is presumably specific to
    # the tokenizer of the loaded model — confirm when the model changes.
    'gen': {'max_new_tokens': 512, 'eos_token_id': 79097},
    'data_operate': {'batch_size': 1},
    }


In [None]:
# Only the benchmark paths are needed here; the retriever configs are discarded.
# NOTE(review): RETRIEVER_PARAMS is not defined in the Stage 2 cells; this relies on
# the Stage 1 configuration cell having been run in the same kernel.
_, benchmarks_path = prepare_thresholdretriever_configs(BASE_DIR, BENCHMARKS_INFO, RETRIEVER_PARAMS)
benchmarks_df = load_benchmarks_df(benchmarks_path, BENCHMARKS_MAXSIZE)

In [35]:
# Turn READER_PARAMS into the reader config object.
# Its prompts.assistant attribute is read when the contexts are built below.
reader_config = prepare_reader_configs(READER_PARAMS)

In [5]:
# Instantiate the reader model from the config.
# The recorded output shows checkpoint shards taking several minutes to load.
READER = LLM_Model(reader_config)

Loading checkpoint shards: 100%|██████████| 4/4 [03:38<00:00, 54.70s/it]


In [6]:
# Reader metric calculator, built with a default similarity-scorer config and the reader.
sim_score_config = SimilarityScorerConfig()
reader_metrics = ReaderMetrics(BASE_DIR, ADDITIONAL_PARAMS['bertscore_model_path'], sim_score_config, READER)

Loading Meteor...
Loading ExactMatch


In [36]:
# Load the retrieved documents cached by a stage 1 run.
with open(ADDITIONAL_PARAMS['load_retrievercache'], 'r', encoding='utf-8') as cache_file:
    predicted_chunks = json.load(cache_file)

score_key = ADDITIONAL_PARAMS['score']
top_k = ADDITIONAL_PARAMS['max_k']

# Build one reader context per question: the assistant prompt followed by the
# numbered top-k documents, each prefixed with its score in square brackets.
# Each cached doc is indexed as doc[0] (used as the 'cosine' score), doc[1]
# (inserted as the text) and doc[2] (a mapping that holds the other scores).
contexts = {}
for name, chunks in predicted_chunks.items():
    benchmark_contexts = []
    for docs in tqdm(chunks):
        numbered_docs = [
            f"{rank}. [{round(doc[0] if score_key == 'cosine' else doc[2][score_key], 5)}] {doc[1]}"
            for rank, doc in enumerate(docs[:top_k], start=1)
        ]
        benchmark_contexts.append(reader_config.prompts.assistant + '\n\n' + '\n\n'.join(numbered_docs))
    contexts[name] = benchmark_contexts

100%|██████████| 49908/49908 [00:00<00:00, 58561.81it/s]


In [None]:
# Generate answers with the reader for every context and score them.
# NOTE(review): cache_relevant_flags is not produced by any Stage 2 cell; it must
# already exist in the kernel (it is assigned by the Stage 1 evaluation cell and
# by the final-stage flags cell) — confirm it matches the benchmark loaded here.
reader_scores, reader_cache = evaluate_reader(benchmarks_df, READER, reader_metrics, contexts, 
                                              show_step=1, cache_relevant_flags=cache_relevant_flags)

In [19]:
# Persist the stage 2 scores, hyperparameters and generated answers under SAVE_LOGDIR.
save_reader_trial_log(SAVE_LOGDIR, reader_scores, SAVE_READER_HYPERPARAMS, SAVE_READERCACHE, 
                      reader_cache, BENCHMARKS_INFO, BENCHMARKS_MAXSIZE, READER_PARAMS, ADDITIONAL_PARAMS)

#### Stage Final. Compute metrics

In [5]:
# --- Final stage (metric recomputation) configuration ---
# Trial number; it selects the directory that receives this run's logs.
TRIAL = 4
SAVE_LOGDIR = f'./logs/final/trial{TRIAL}'
SAVE_READER_HYPERPARAMS = f'{SAVE_LOGDIR}/reader_hyperparams.json'
SAVE_READERCACHE = f'{SAVE_LOGDIR}/reader_cache.json'
BENCHMARKS_MAXSIZE = 4000

# Inputs of this stage: answers generated earlier and the retriever cache they were based on.
ADDITIONAL_PARAMS = {
    'bertscore_model_path': "ru_electra_medium",
    'load_generated_answers': "./logs/stage2/trial221/reader_cache.json",
    'load_retrievercache': "./logs/stage1/trial2/retriever_cache.json",
}

# Benchmark name -> database/table versions to evaluate on.
BENCHMARKS_INFO = {
    'mtssquad': {'db': 'v3', 'table': 'v3'},
}

# Only used here to resolve the benchmark paths via prepare_thresholdretriever_configs.
RETRIEVER_PARAMS = {
    "model_path": "/home/dzigen/Desktop/PersonalAI/Personal-AI/models/intfloat/multilingual-e5-small",
    "densedb_kwargs": {
        'metadata': {"hnsw:space": "ip"},
    },
    "model_kwargs": {'device': 'cuda'},
    "encode_kwargs": {
        'normalize_embeddings': True,
        'prompt': 'query: ',
    },
    "params": {
        'fetch_k': 10,
        'threshold': -1,
        'max_k': 10,
    },
}

In [6]:
# Only the benchmark paths are needed here; the retriever configs are discarded.
_, benchmarks_path = prepare_thresholdretriever_configs(BASE_DIR, BENCHMARKS_INFO, RETRIEVER_PARAMS)
# Load the benchmark data, capped by BENCHMARKS_MAXSIZE.
benchmarks_df = load_benchmarks_df(benchmarks_path, BENCHMARKS_MAXSIZE)

In [7]:
# Metric calculator for the final stage. Unlike the Stage 2 cell, no similarity-scorer
# config or reader model is passed here.
reader_metrics = ReaderMetrics(BASE_DIR, ADDITIONAL_PARAMS['bertscore_model_path'])

Loading Meteor...
Loading ExactMatch


In [8]:
# Read the retriever cache (JSON) referenced by the stage configuration.
with open(ADDITIONAL_PARAMS['load_retrievercache'], 'r', encoding='utf-8') as cache_file:
    retrievercache = json.load(cache_file)

In [9]:
# Read the previously generated reader answers (JSON) referenced by the stage configuration.
with open(ADDITIONAL_PARAMS['load_generated_answers'], 'r', encoding='utf-8') as answers_file:
    generated_answers = json.load(answers_file)

In [10]:
# For every benchmark question, record whether the cached retrieval contains at
# least one of the gold chunk ids listed in the benchmark's 'chunk_ids' column.
cache_relevant_flags = {}
for name in benchmarks_df.keys():
    n_questions = benchmarks_df[name].shape[0]
    flags = []
    for i in range(n_questions):
        retrieved_ids = {item[2]['chunk_id'] for item in retrievercache[name][i]}
        gold_ids = set(benchmarks_df[name]['chunk_ids'][i])
        # True when the gold and retrieved id sets share at least one element.
        flags.append(not gold_ids.isdisjoint(retrieved_ids))
    cache_relevant_flags[name] = flags

In [11]:
# Number of 'mtssquad' questions whose retrieved chunks include at least one gold chunk.
sum(cache_relevant_flags['mtssquad'])

3175

In [12]:
def _mean_or_nan(values):
    """Return the mean of ``values`` as a float, or NaN when ``values`` is empty.

    ``np.mean([])`` gives the same NaN but emits a RuntimeWarning on each call,
    which clutters the cell output while the progress bar is being refreshed.
    """
    return float(np.mean(values)) if len(values) > 0 else float('nan')


# Score the previously generated answers against the benchmark reference answers.
# scores[name] maps a metric name to its aggregated value; cache[name] is created
# empty and is not filled by this cell.
scores = {}
cache = {}
show_step = 20  # refresh the progress-bar postfix every `show_step` answers
for name in benchmarks_df.keys():

    scores[name] = {
        'BLEU2': [], 'BLEU1': [],
        'ExactMatch': [], 'METEOR': [],
        'ROUGEL': [],
        'BertScore': [],
        'Levenshtain': [],
        'StubScore': []  # ratio of successfully generated stub answers to their expected number
        }
    cache[name] = []

    # Reference answers by position, so the per-answer metrics and the BertScore
    # call below pair predictions with references in exactly the same way.
    target_answers = benchmarks_df[name]['answer'].tolist()
    process = tqdm(generated_answers[name])

    for i, predicted_answer in enumerate(process):
        target_answer = target_answers[i]

        # Text-overlap metrics are only accumulated for questions whose retrieved
        # context contained a relevant chunk (or for all of them when no flags exist).
        if cache_relevant_flags is None or cache_relevant_flags[name][i]:
            scores[name]['BLEU1'] += reader_metrics.bleu1([predicted_answer], [target_answer])
            scores[name]['BLEU2'] += reader_metrics.bleu2([predicted_answer], [target_answer])
            scores[name]['ExactMatch'] += reader_metrics.exact_match([predicted_answer], [target_answer])
            scores[name]['ROUGEL'] += reader_metrics.rougel([predicted_answer], [target_answer])
            scores[name]['METEOR'] += reader_metrics.meteor([predicted_answer], [target_answer])
            scores[name]['Levenshtain'] += reader_metrics.levenshtain_score([predicted_answer], [target_answer])
        else:
            # NOTE(review): 1 is appended unconditionally, so StubScore is always
            # 1.0 (or NaN) and does not check whether the model actually produced
            # the stub answer — confirm the intended check and stub string.
            scores[name]['StubScore'].append(1)

        if i % show_step == 0:
            process.set_postfix({m_name: _mean_or_nan(score) for m_name, score in scores[name].items()})

    # Collapse the per-answer lists into rounded means; BertScore is computed in
    # one batch over all answers and replaces the placeholder NaN.
    scores[name] = {m_name: round(_mean_or_nan(score), 5) for m_name, score in scores[name].items()}
    scores[name]['BertScore'] = reader_metrics.bertscore(generated_answers[name], target_answers)
    scores[name]['elapsed_time_sec'] = round(float(process.format_dict["elapsed"]), 3)
    process.set_postfix(scores[name])

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 4000/4000 [06:12<00:00, 10.73it/s, BLEU2=0.266, BLEU1=0.332, ExactMatch=0.0272, METEOR=0.491, ROUGEL=0.3, BertScore=nan, Levenshtain=143, StubScore=1]  
  return self.fget.__get__(instance, owner)()


In [13]:
# Display the aggregated metrics per benchmark.
scores

{'mtssquad': {'BLEU2': 0.26557,
  'BLEU1': 0.33235,
  'ExactMatch': 0.02709,
  'METEOR': 0.49121,
  'ROUGEL': 0.29971,
  'BertScore': {'precision': nan,
   'recall': 0.34993,
   'f1': 0.34445,
   'hash': '/home/dzigen/Desktop/Projects/rag_project/models/ru_electra_medium_LNone_no-idf'},
  'Levenshtain': 142.98268,
  'StubScore': 1.0,
  'elapsed_time_sec': 488.141}}

In [14]:
# Persist the recomputed scores. No answers are generated in this stage, so empty
# dicts are passed in the reader-cache and reader-params positions.
save_reader_trial_log(SAVE_LOGDIR, scores, SAVE_READER_HYPERPARAMS, SAVE_READERCACHE, 
                      {}, BENCHMARKS_INFO, BENCHMARKS_MAXSIZE,{}, ADDITIONAL_PARAMS)