In [1]:
import pandas as pd
from tqdm import tqdm
import pandas as pd
import ast
from collections import Counter
import json

# Input: base dataset CSV that the notebook loads with pd.read_csv.
LOAD_PATH = '../../data/mtssquad/base_dataset.csv'
# Output: CSV with the table of unique chunks and their metadata.
SAVE_TABLE_PATH = '../../data/mtssquad/tables/v1/chunked_docs.csv'
# Output: CSV with the benchmark (questions, answers and related chunk ids).
SAVE_BENCHMARK_PATH = '../../data/mtssquad/tables/v1/benchmark.csv'

In [2]:
def print_df_stat(df):
    """Print basic size statistics of a question-answering dataframe.

    Writes a header line, the number of rows, and the number of distinct
    values found in the ``context``, ``question`` and ``answer`` columns.

    Args:
        df: dataframe that has ``context``, ``question`` and ``answer`` columns.

    Returns:
        None. The statistics are written to stdout only.
    """
    # Column name -> caption printed in front of its distinct-value count.
    captions = {
        'context': "Количество уникальных context-текстов: ",
        'question': "Количество уникальных question-текстов: ",
        'answer': "Количество уникальных answer-текстов: ",
    }

    print("===Статистика по исходному датасету===")
    print("Количество строк в датасете: ", len(df))
    for column, caption in captions.items():
        distinct_values = df[column].unique()
        print(caption, len(distinct_values))

def get_hash(value: str, hash_len: int = 10) -> int:
    """Return a reproducible numeric id for ``value``.

    The id is derived from the SHA-256 digest of the UTF-8 encoded text and
    reduced modulo ``10 ** hash_len``, so it has at most ``hash_len`` decimal
    digits.

    The built-in ``hash()`` is deliberately not used: for strings it is salted
    per interpreter process, so ids would change after every kernel restart
    and would not match ids stored in previously saved tables.

    Args:
        value: text to hash. Non-string values are converted with ``str()``
            first, so anything the built-in ``hash()`` accepted still works.
        hash_len: maximum number of decimal digits in the returned id.

    Returns:
        An integer in the range ``[0, 10 ** hash_len)``.
    """
    import hashlib  # local import keeps the function usable on its own

    digest = hashlib.sha256(str(value).encode('utf-8')).digest()
    return int.from_bytes(digest, 'big') % (10 ** hash_len)

In [3]:
# Load the base dataset; the file uses ';' as the field separator.
base_df = pd.read_csv(LOAD_PATH, sep=';')

##### Удаление дубликатов (questions)

In [4]:
# Keep the first row of every distinct question and renumber the rows from 0.
q_filtered_df = base_df.drop_duplicates(subset=['question'], ignore_index=True)

In [5]:
# Row and distinct-value counts after removing duplicate questions.
print_df_stat(q_filtered_df)

===Статистика по исходному датасету===
Количество строк в датасете:  49908
Количество уникальных context-текстов:  9031
Количество уникальных question-текстов:  49908
Количество уникальных answer-текстов:  46942


##### Удаление дубликатов (answers)

In [6]:
# Keep the first row of every distinct answer and renumber the rows from 0.
aq_filtered_df = q_filtered_df.drop_duplicates(subset=['answer'], ignore_index=True)

In [7]:
# Row and distinct-value counts after additionally removing duplicate answers.
print_df_stat(aq_filtered_df)

===Статистика по исходному датасету===
Количество строк в датасете:  46942
Количество уникальных context-текстов:  9028
Количество уникальных question-текстов:  46942
Количество уникальных answer-текстов:  46942


##### Формирование таблицы с чанками для базы данных

In [8]:
# Build the chunk table used to populate the database: one row per unique
# context. Every context is stored as a single-chunk document, so doc_id equals
# chunk_id and the prev/next chunk links stay empty.
tmp_data = []       # rows of the chunk table: [chunk_text, metadata]
chunk_ids_seq = []  # chunk id of every dataset row, in row order

# chunk_id -> chunk text. A dict gives O(1) membership checks (scanning a list
# costs O(rows * unique chunks)) and makes hash collisions detectable.
chunk_by_id = {}

for chunk in tqdm(aq_filtered_df['context'], total=aq_filtered_df.shape[0]):
    chunk_id = get_hash(chunk, hash_len=10)
    chunk_ids_seq.append(chunk_id)

    if chunk_id not in chunk_by_id:
        chunk_by_id[chunk_id] = chunk
        tmp_data.append([chunk, {'doc_id': chunk_id, 'chunk_id': chunk_id,
                                 'prev_chunk_id': None, 'next_chunk_id': None}])
        continue

    known_chunk = chunk_by_id[chunk_id]
    # Two different texts sharing one id would be merged silently and corrupt
    # the table. The identity check comes first so that a repeated missing
    # value (NaN != NaN) is not reported as a collision.
    if known_chunk is not chunk and known_chunk != chunk:
        raise ValueError(
            f"chunk id collision: id {chunk_id} is shared by two different "
            "contexts; use a larger hash_len"
        )

# Ids of the unique chunks in first-seen order.
unique_chunk_ids = list(chunk_by_id)

# Later cells read the chunk id of each row from this column.
aq_filtered_df['chunk_id'] = chunk_ids_seq
table_df = pd.DataFrame(tmp_data, columns=['chunks', 'metadata'])

print("all chunks: ", aq_filtered_df['context'].shape[0])
print("unique chunks: ", len(unique_chunk_ids))

100%|██████████| 46942/46942 [00:01<00:00, 25917.44it/s]

all chunks:  46942
unique chunks:  9028





In [9]:
# Save the chunk table as a ';'-separated CSV without the index column.
table_df.to_csv(SAVE_TABLE_PATH, sep=';', index=False)

#### Формирование датасета с зависимостями на таблицу чанков

In [12]:
# Build the benchmark records: one record per question holding its answer and
# the ids / texts of the chunks it is linked to.
#
# Records are keyed by the question text itself. Keying by a 10-digit hash of
# the text lets two different questions collide, and a colliding question
# would be folded silently into another question's record.
tmp_data = {}
benchmark_rows = zip(aq_filtered_df['question'], aq_filtered_df['answer'],
                     aq_filtered_df['chunk_id'], aq_filtered_df['context'])

for question, answer, chunk_id, context in tqdm(benchmark_rows, total=aq_filtered_df.shape[0]):
    record = tmp_data.setdefault(question, {'question': question, 'answer': answer,
                                            'chunk_ids': [], 'contexts': []})

    # Append only unseen values: the lists stay duplicate-free and keep a
    # stable first-seen order (list(set(...)) yields an arbitrary order).
    if chunk_id not in record['chunk_ids']:
        record['chunk_ids'].append(chunk_id)
    if context not in record['contexts']:
        record['contexts'].append(context)

100%|██████████| 46942/46942 [00:01<00:00, 44401.83it/s]


In [14]:
# One row per collected record; the columns come from the record keys
# (question, answer, chunk_ids, contexts).
benchmark_df = pd.DataFrame(list(tmp_data.values()))

In [15]:
# Sanity check: how many benchmark rows are linked to 1, 2, ... chunks.
Counter(len(ids) for ids in benchmark_df['chunk_ids'])

Counter({1: 46942})

In [16]:
# Save the benchmark as a ';'-separated CSV without the index column.
benchmark_df.to_csv(SAVE_BENCHMARK_PATH, sep=';', index=False)