In [1]:
import sys
sys.path.insert(0, "/home/aisummer/mikhail_workspace/nlp-service")

from src.utils import create_id, get_hash
from src.DocumentsParser.utils import TABLES_DIR_TABLE_NAME

from tqdm import tqdm
import pandas as pd
import ast
from collections import Counter
import json

# Input: base dataset CSV, read below with ';' as the separator.
LOAD_PATH = '../../data/sberquad/base_dataset.csv'
# Output: table of unique chunks; the file name comes from
# TABLES_DIR_TABLE_NAME (imported from src.DocumentsParser.utils).
SAVE_TABLE_PATH = f'../../data/sberquad/tables/v1/{TABLES_DIR_TABLE_NAME}'
# Output: one row per question with its answer, chunk ids and contexts.
SAVE_BENCHMARK_PATH = '../../data/sberquad/tables/v1/benchmark.csv'

In [2]:
# Load the base dataset; the file is ';'-separated.
df = pd.read_csv(filepath_or_buffer=LOAD_PATH, sep=';')

In [3]:
# Reformat the "answers" column: each cell holds a Python-literal string that
# is parsed into an object with a 'text' list; only the first text is kept
# ('' when the list is empty) and stored in a new "answer" column.
df['answers'] = df['answers'].map(ast.literal_eval)

df['answer'] = [
    parsed['text'][0] if len(parsed['text']) else ''
    for parsed in df['answers']
]

# The parsed column is no longer needed once "answer" has been extracted.
df.drop(columns=['answers'], inplace=True)

### Создание таблицы для формирования БД

In [4]:
# Build the table used to populate the database: one row per unique context
# chunk, identified by a 10-character hash of the chunk text.
unique_chunk_ids = []      # chunk ids in first-seen order
seen_chunk_ids = set()     # same ids; gives O(1) membership checks instead of
                           # scanning the growing list for every row

tmp_data = []
chunk_ids_seq = []         # chunk id of every row of df, in row order

for title, chunk in tqdm(zip(df['title'], df['context']), total=df.shape[0]):
    chunk_id = get_hash(chunk, hash_len=10)
    chunk_ids_seq.append(chunk_id)

    # NOTE(review): uniqueness is decided by the hash alone, so two different
    # contexts that produce the same hash would be stored only once
    # (the first one wins) — confirm hash_len=10 is long enough.
    if chunk_id not in seen_chunk_ids:
        seen_chunk_ids.add(chunk_id)
        unique_chunk_ids.append(chunk_id)
        # Each row is [chunk text, metadata]; neighbour links are left unset.
        tmp_data.append([chunk, {'doc_id': title, 'chunk_id': chunk_id,
                                 'prev_chunk_id': None, 'next_chunk_id': None}])

# Remember, for every dataset row, which chunk its context maps to.
df['chunk_id'] = chunk_ids_seq
table_df = pd.DataFrame(tmp_data, columns=['chunks', 'metadata'])

print("all chunks: ", df['context'].shape[0])
print("unique chunks: ", len(unique_chunk_ids))

100%|██████████| 74300/74300 [00:02<00:00, 27254.18it/s] 

all chunks:  74300
unique chunks:  13489





In [5]:
# Persist the unique-chunks table as a ';'-separated CSV without the index.
table_df.to_csv(path_or_buf=SAVE_TABLE_PATH, sep=';', index=False)

#### Создание таблицы для оценки качества Retriever/Reader-частей

In [14]:
# Group the dataset by question: one record per distinct question hash holding
# the question, the answer from its first occurrence, and every distinct chunk
# (id and text) the question appears with.
tmp_data = {}
for question, answer, chunk_id, context in tqdm(
        zip(df['question'], df['answer'], df['chunk_id'], df['context']),
        total=df.shape[0]):
    qst_hash = get_hash(question)
    record = tmp_data.get(qst_hash)

    if record is None:
        tmp_data[qst_hash] = {'question': question, 'answer': answer,
                              'chunk_ids': [chunk_id], 'contexts': [context]}
    elif chunk_id not in record['chunk_ids']:
        # Append id and context together, and only when the chunk is new for
        # this question. This keeps chunk_ids[k] paired with contexts[k] and
        # keeps first-seen order, so the saved benchmark is identical from run
        # to run (a set round-trip would reorder the two lists independently).
        record['chunk_ids'].append(chunk_id)
        record['contexts'].append(context)

 23%|██▎       | 16765/74300 [00:00<00:00, 90914.09it/s]

100%|██████████| 74300/74300 [00:00<00:00, 100753.46it/s]


In [15]:
# One row per question record; columns come from the record keys.
benchmark_df = pd.DataFrame([*tmp_data.values()])

In [16]:
# How many questions are linked to 1, 2, ... distinct chunks.
Counter(len(chunk_ids) for chunk_ids in benchmark_df['chunk_ids'])

Counter({1: 74218, 2: 38})

In [17]:
# Persist the benchmark table as a ';'-separated CSV without the index.
benchmark_df.to_csv(path_or_buf=SAVE_BENCHMARK_PATH, sep=';', index=False)