In [1]:
import sys
sys.path.insert(0, "/home/aisummer/mikhail_workspace/nlp-service")

from src.utils import create_id, get_hash
from src.DocumentsParser.utils import TABLES_DIR_TABLE_NAME

from tqdm import tqdm
import pandas as pd
import ast
from collections import Counter
import json

# Input: base dataset CSV, read in the next cell with ';' as the separator.
LOAD_PATH = '../../data/squadv2/base_dataset.csv'
# Output: chunk table (chunk text + metadata) written from `table_df`;
# the file name comes from the project's TABLES_DIR_TABLE_NAME constant.
SAVE_TABLE_PATH = f'../../data/squadv2/tables/v1/{TABLES_DIR_TABLE_NAME}'
# Output: question/answer benchmark table written from `benchmark_df`.
SAVE_BENCHMARK_PATH = '../../data/squadv2/tables/v1/benchmark.csv'

In [2]:
# Load the base dataset; the file uses ';' as the field separator.
df = pd.read_csv(LOAD_PATH, sep=';')

In [3]:
# Reformat the "answers" column: parse each serialized value back into a
# Python object, then keep only the first entry of its 'text' list
# (an empty string when that list is empty) in a new "answer" column.
df['answers'] = df['answers'].map(ast.literal_eval)

df['answer'] = [
    parsed['text'][0] if len(parsed['text']) else ''
    for parsed in df['answers']
]

# The parsed "answers" column is no longer needed once "answer" exists.
df.drop(columns=['answers'], inplace=True)

### Создание таблицы для формирования БД

In [4]:
# Build the table used to populate the DB: one row per unique chunk, where a
# chunk is the title and the context joined by a newline.
unique_chunk_ids = []    # unique chunk ids in first-seen order
seen_chunk_ids = set()   # same ids; gives O(1) membership tests instead of a list scan

tmp_data = []            # [chunk_text, metadata] rows for the unique chunks
chunk_ids_seq = []       # chunk id of every dataset row, aligned with df

# Iterate both columns once instead of doing a label lookup per cell.
for title, context in tqdm(zip(df['title'], df['context']), total=df.shape[0]):
    chunk = f"{title}\n{context}"
    chunk_id = get_hash(chunk, hash_len=10)
    chunk_ids_seq.append(chunk_id)

    if chunk_id not in seen_chunk_ids:
        seen_chunk_ids.add(chunk_id)
        unique_chunk_ids.append(chunk_id)
        # Chunks are stored without neighbours: prev/next ids stay None.
        tmp_data.append([chunk, {'doc_id': title, 'chunk_id': chunk_id,
                                 'prev_chunk_id': None, 'next_chunk_id': None}])

# Remember which chunk every question row belongs to (used by the benchmark).
df['chunk_id'] = chunk_ids_seq
table_df = pd.DataFrame(tmp_data, columns=['chunks', 'metadata'])

print("all chunks: ", df['context'].shape[0])
print("unique chunks: ", len(unique_chunk_ids))

  8%|▊         | 11030/142192 [00:00<00:01, 110283.54it/s]

100%|██████████| 142192/142192 [00:08<00:00, 16256.80it/s]

all chunks:  142192
unique chunks:  20233





In [5]:
# Persist the unique-chunk table as a ';'-separated CSV without the index column.
# NOTE(review): the 'metadata' column holds dicts, so it is presumably written
# as their string form and must be parsed back when the file is loaded — confirm.
table_df.to_csv(SAVE_TABLE_PATH, sep=';', index=False)

#### Создание таблицы для оценки качества Retriever/Reader-частей

In [6]:
# Group dataset rows by question hash: each unique question keeps the answer
# from its first occurrence plus every distinct chunk id / context it appears
# with. Duplicates are skipped on insertion so both lists keep first-seen
# order; rebuilding them through set() on every row made the order arbitrary
# (and, for the string contexts, dependent on hash randomization).
tmp_data = {}
rows = zip(df['question'], df['answer'], df['chunk_id'], df['context'])
for question, answer, chunk_id, context in tqdm(rows, total=df.shape[0]):
    qst_hash = get_hash(question)
    entry = tmp_data.get(qst_hash)

    if entry is None:
        # First time this question is seen.
        tmp_data[qst_hash] = {'question': question, 'answer': answer,
                              'chunk_ids': [chunk_id], 'contexts': [context]}
        continue

    # Repeated question: record only chunk ids / contexts not seen yet.
    # The two lists are de-duplicated independently, as before.
    if chunk_id not in entry['chunk_ids']:
        entry['chunk_ids'].append(chunk_id)
    if context not in entry['contexts']:
        entry['contexts'].append(context)

100%|██████████| 142192/142192 [00:01<00:00, 104356.52it/s]


In [7]:
# One row per unique question; the hash keys were only needed for grouping,
# so just the record dicts go into the frame.
benchmark_records = list(tmp_data.values())
benchmark_df = pd.DataFrame(benchmark_records)

In [8]:
# Distribution of how many distinct chunks each question is linked to.
Counter(len(chunk_ids) for chunk_ids in benchmark_df['chunk_ids'])

Counter({1: 141836, 2: 141, 3: 1})

In [9]:
# Persist the benchmark table as a ';'-separated CSV without the index column.
# NOTE(review): 'chunk_ids' and 'contexts' hold lists, so they are presumably
# written as their string form and must be parsed back on load — confirm.
benchmark_df.to_csv(SAVE_BENCHMARK_PATH, sep=';', index=False)

In [10]:
# Preview the first rows of the benchmark table as a sanity check.
benchmark_df.head()

Unnamed: 0,question,answer,chunk_ids,contexts
0,When did Beyonce start becoming popular?,in the late 1990s,[8268865924],[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...
1,What areas did Beyonce compete in when she was...,singing and dancing,[8268865924],[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...
2,When did Beyonce leave Destiny's Child and bec...,2003,[8268865924],[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...
3,In what city and state did Beyonce grow up?,"Houston, Texas",[8268865924],[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...
4,In which decade did Beyonce become famous?,late 1990s,[8268865924],[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...
