In [1]:
import pandas as pd
from tqdm import tqdm
import pandas as pd
import ast
import hashlib

# Input CSVs (read below with sep=';').
# LOAD_DATASET_PATH: question/answer rows with a `relevant_passage_ids` column.
# LOAD_PASSAGES_PATH: passages table with `id` and `passage` columns.
LOAD_DATASET_PATH = '../../data/ragminibioasq/base_dataset.csv'
LOAD_PASSAGES_PATH = '../../data/ragminibioasq/base_passages_dataset.csv'

# Output CSVs written by this notebook: the de-duplicated chunk table and the
# benchmark (question, answer, chunk_ids) that references it.
SAVE_TABLE_PATH = '../../data/ragminibioasq/tables/v1/chunked_docs.csv'
SAVE_BENCHMARK_PATH = '../../data/ragminibioasq/tables/v1/benchmark.csv'

In [2]:
def print_df_stat(df):
    """Print the row count and the number of distinct question/answer texts.

    Args:
        df: DataFrame that has ``question`` and ``answer`` columns.

    Returns:
        None. A fixed header line followed by three statistics goes to stdout.
    """
    print("===Статистика по исходному датасету===")
    print("Количество строк в датасете: ", len(df.index))

    labelled_columns = (
        ("Количество уникальных question-текстов: ", 'question'),
        ("Количество уникальных answer-текстов: ", 'answer'),
    )
    for label, column in labelled_columns:
        # dropna=False counts a missing value as its own entry, exactly like
        # taking len() of Series.unique().
        print(label, df[column].nunique(dropna=False))

def get_md5_of_string(input_string):
    """Return the hexadecimal MD5 digest of ``input_string``.

    The text is encoded as UTF-8 (the default codec of ``str.encode``) before
    hashing. The digest serves as a content-derived identifier, not as a
    security measure.
    """
    hasher = hashlib.md5()
    hasher.update(input_string.encode('utf-8'))
    return hasher.hexdigest()

In [3]:
# Question/answer rows. `relevant_passage_ids` is stored in the CSV as the text
# of a Python literal, so every cell is parsed back into an object on load.
base_df = pd.read_csv(LOAD_DATASET_PATH, sep=';').assign(
    relevant_passage_ids=lambda frame: frame['relevant_passage_ids'].apply(ast.literal_eval)
)

# Passages table; later cells read its `id` and `passage` columns.
passages_df = pd.read_csv(LOAD_PASSAGES_PATH, sep=';')

#### Удаление дубликатов

In [4]:
# Keep the first row for every distinct question text, then renumber the rows
# 0..n-1 so that later cells can address them by position.
q_filtered_df = (
    base_df
    .loc[lambda frame: ~frame.duplicated(subset=['question'], keep='first')]
    .reset_index(drop=True)
)

In [5]:
# Sanity check after de-duplication: the row count should equal the number of
# unique question texts. The header printed by print_df_stat is a fixed string,
# so it reads the same regardless of which frame is passed in.
print_df_stat(q_filtered_df)

===Статистика по исходному датасету===
Количество строк в датасете:  4719
Количество уникальных question-текстов:  4719
Количество уникальных answer-текстов:  4693


#### Формирование таблицы с чанками для базы данных

In [7]:
# Build the chunk table for the database and, for every question, the list of
# chunk ids its relevant passages map to. A chunk id is the MD5 of the passage
# text, so identical passages collapse into a single table row.

# id -> passage text, built once so each lookup is O(1) instead of a boolean
# scan over the whole passages frame. When an id occurs more than once the
# first row wins, matching what `.values[0]` on the filtered frame returned.
first_passages = passages_df.drop_duplicates(subset='id')
passage_by_id = dict(zip(first_passages['id'], first_passages['passage']))

chunk_records = []      # one [chunk_id, chunk_text, metadata] row per unique chunk
chunk_ids_seq = []      # per question: chunk ids in relevant_passage_ids order
unique_chunk_ids = []   # chunk ids in first-seen order
seen_chunk_ids = set()  # same ids as unique_chunk_ids, for O(1) membership tests

for passage_ids in tqdm(q_filtered_df['relevant_passage_ids']):
    cur_chunk_ids_seq = []
    for base_id in passage_ids:
        if base_id not in passage_by_id:
            # Fail loudly and name the offending id instead of surfacing an
            # opaque IndexError from an empty selection.
            raise KeyError(
                f"passage id {base_id!r} is referenced by a question "
                "but is missing from passages_df"
            )
        cur_chunk = passage_by_id[base_id]

        # Passages with no text cannot be hashed or stored; skip them.
        if pd.isna(cur_chunk):
            continue

        chunk_id = get_md5_of_string(cur_chunk)
        cur_chunk_ids_seq.append(chunk_id)

        if chunk_id not in seen_chunk_ids:
            seen_chunk_ids.add(chunk_id)
            unique_chunk_ids.append(chunk_id)
            chunk_records.append(
                [chunk_id, cur_chunk, {'doc_id': chunk_id, 'chunk_id': chunk_id}]
            )
    chunk_ids_seq.append(cur_chunk_ids_seq)

# Overwrites (rather than appends to) the column, so re-running the cell is safe.
q_filtered_df['chunk_id'] = chunk_ids_seq
table_df = pd.DataFrame(chunk_records, columns=['chunk_id','chunks', 'metadata'])

print("all questions: ", q_filtered_df.shape[0])
print("unique chunks: ", len(unique_chunk_ids))

100%|██████████| 4719/4719 [00:13<00:00, 342.68it/s]

all questions:  4719
unique chunks:  27974





In [8]:
# Persist the chunk table. The `metadata` column holds dicts, which to_csv
# writes as their string representation.
table_df.to_csv(SAVE_TABLE_PATH, sep=';', index=False)

#### Формирование датасета с зависимостями на таблицу чанков

In [9]:
# One benchmark record per question. Duplicate chunk ids are removed with
# dict.fromkeys, which keeps first-seen order. list(set(...)) ordered the string
# ids by their hashes, which are randomized per interpreter process, so the
# saved benchmark could differ from one kernel restart to the next.
tmp_data = [
    {'question': question, 'answer': answer, 'chunk_ids': list(dict.fromkeys(chunk_ids))}
    for question, answer, chunk_ids in tqdm(
        zip(q_filtered_df['question'], q_filtered_df['answer'], q_filtered_df['chunk_id']),
        total=q_filtered_df.shape[0],  # zip has no len(); keep the progress bar sized
    )
]

100%|██████████| 4719/4719 [00:00<00:00, 99843.22it/s]


In [10]:
# Materialize the per-question records as a frame with the columns
# `question`, `answer` and `chunk_ids`.
benchmark_df = pd.DataFrame(tmp_data)

In [11]:
# Remove questions that ended up with no context chunks.
#
# A boolean mask plus rebinding replaces the label-indexed loop with an in-place
# drop: that version left gaps in the index, so running the cell a second time
# raised KeyError as soon as range() hit an already-dropped label. This form
# keeps the surviving rows' original index labels and is safe to re-run.

print("before: ", benchmark_df.shape[0])

# Guard the degenerate case of a frame built from zero records, which has no
# `chunk_ids` column to inspect.
if not benchmark_df.empty:
    has_chunks = benchmark_df['chunk_ids'].map(len) > 0
    benchmark_df = benchmark_df[has_chunks]

print("after: ", benchmark_df.shape[0])

before:  4719
after:  4387


In [12]:
# Persist the benchmark. The `chunk_ids` lists are written as their string
# representation; the row index is not saved.
benchmark_df.to_csv(SAVE_BENCHMARK_PATH, sep=';', index=False)