In [27]:
import pandas as pd
from tqdm import tqdm
import pandas as pd
import ast
from collections import Counter
import json
import hashlib

# Input: the base dataset CSV. The path is relative, so it resolves against the
# kernel's current working directory.
LOAD_PATH = '../../data/mtssquad/base_dataset.csv'
# Output: CSV with one row per unique context chunk (written from table_df).
SAVE_TABLE_PATH = '../../data/mtssquad/tables/v3/chunked_docs.csv'
# Output: CSV with question/answer rows and the chunk ids they reference
# (written from benchmark_df).
SAVE_BENCHMARK_PATH = '../../data/mtssquad/tables/v3/benchmark.csv'

In [28]:
def print_df_stat(df):
    """Print a short summary of a question-answering DataFrame.

    Writes the total row count followed by the number of distinct values in
    the 'context', 'question' and 'answer' columns to stdout.

    Args:
        df: DataFrame that has 'context', 'question' and 'answer' columns.

    Returns:
        None. The summary is only printed.
    """
    print("===Статистика по исходному датасету===")
    print("Количество строк в датасете: ", df.shape[0])

    # (column, label) pairs, reported in this order.
    unique_count_labels = (
        ('context', "Количество уникальных context-текстов: "),
        ('question', "Количество уникальных question-текстов: "),
        ('answer', "Количество уникальных answer-текстов: "),
    )
    for column, label in unique_count_labels:
        distinct_values = df[column].unique()
        print(label, len(distinct_values))

def get_md5_of_string(input_string):
    """Return the MD5 digest of a string as a hexadecimal string.

    Args:
        input_string: Text to hash; it is encoded as UTF-8 before hashing.

    Returns:
        The 32-character lowercase hex digest.
    """
    digest = hashlib.md5()
    digest.update(input_string.encode('utf-8'))
    return digest.hexdigest()

In [29]:
# Load the base dataset; the file is parsed with ';' as the field separator.
# Later cells read its 'context', 'question' and 'answer' columns.
base_df = pd.read_csv(LOAD_PATH, sep=';')

##### Удаление дубликатов (questions)

In [30]:
# Keep one row per distinct question text (drop_duplicates keeps the first
# occurrence by default) and renumber the rows 0..n-1. Later cells index the
# columns with these integer labels, so the reset_index is required.
q_filtered_df = base_df.drop_duplicates(subset=['question']).reset_index(drop=True)

In [31]:
# Summarize the question-deduplicated frame. Note that the banner printed by
# print_df_stat refers to the source dataset, while the frame passed here is
# the deduplicated one.
print_df_stat(q_filtered_df)

===Статистика по исходному датасету===
Количество строк в датасете:  49908
Количество уникальных context-текстов:  9031
Количество уникальных question-текстов:  49908
Количество уникальных answer-текстов:  46942


##### Формирование таблицы с чанками для базы данных

In [32]:
# Build the chunk table for the database: one row per distinct context text.
# A chunk id is the MD5 hex digest of the context, so identical contexts
# collapse into a single chunk.
tmp_data = []            # rows of table_df: [chunk_id, chunk text, metadata]
chunk_ids_seq = []       # one list of chunk ids per row of q_filtered_df
unique_chunk_ids = []    # distinct chunk ids in first-seen order
seen_chunk_ids = set()   # same ids as unique_chunk_ids, for O(1) membership tests

for chunk in tqdm(q_filtered_df['context'], total=q_filtered_df.shape[0]):
    if pd.isna(chunk):
        # A question without a context references no chunk. An empty list is
        # still appended so that chunk_ids_seq stays aligned with the frame;
        # skipping the row entirely would make the column assignment below
        # fail with a length mismatch.
        chunk_ids_seq.append([])
        continue

    chunk_id = get_md5_of_string(chunk)
    chunk_ids_seq.append([chunk_id])

    if chunk_id not in seen_chunk_ids:
        seen_chunk_ids.add(chunk_id)
        unique_chunk_ids.append(chunk_id)
        # Each context is its own document, so doc_id and chunk_id coincide.
        tmp_data.append([chunk_id, chunk, {'doc_id': chunk_id, 'chunk_id': chunk_id}])

# Invariant: exactly one entry per question row.
assert len(chunk_ids_seq) == q_filtered_df.shape[0]

# Adds (or overwrites) the 'chunk_id' column; the benchmark cell reads it.
q_filtered_df['chunk_id'] = chunk_ids_seq
table_df = pd.DataFrame(tmp_data, columns=['chunk_id', 'chunks', 'metadata'])

print("all chunks: ", q_filtered_df['context'].shape[0])
print("unique chunks: ", len(unique_chunk_ids))

100%|██████████| 49908/49908 [00:03<00:00, 16268.86it/s]

all chunks:  49908
unique chunks:  9031





In [36]:
# Persist the chunk table as a ';'-separated CSV without the index column.
# NOTE(review): the 'metadata' column holds dicts, which presumably end up in
# the file as their Python string form — confirm the reader parses them back.
table_df.to_csv(SAVE_TABLE_PATH, sep=';', index=False)

##### Формирование датасета с зависимостями на таблицу чанков

In [37]:
# Build one benchmark record per question: the question text, its answer, and
# the de-duplicated list of chunk ids attached to that row.
questions = q_filtered_df['question']
answers = q_filtered_df['answer']
chunk_id_lists = q_filtered_df['chunk_id']

tmp_data = [
    {
        'question': questions[row],
        'answer': answers[row],
        'chunk_ids': list(set(chunk_id_lists[row])),
    }
    for row in tqdm(range(len(q_filtered_df)))
]

100%|██████████| 49908/49908 [00:00<00:00, 89396.46it/s] 


In [38]:
# Turn the list of record dicts into a DataFrame; the columns come from the
# dict keys ('question', 'answer', 'chunk_ids').
benchmark_df = pd.DataFrame(tmp_data)

In [39]:
# Sanity check: distribution of how many chunk ids each benchmark row references.
Counter(len(ids) for ids in benchmark_df['chunk_ids'])

Counter({1: 49908})

In [40]:
# Persist the benchmark as a ';'-separated CSV without the index column.
# NOTE(review): the 'chunk_ids' column holds Python lists, which presumably
# end up in the file as their string form — confirm the reader parses them back.
benchmark_df.to_csv(SAVE_BENCHMARK_PATH, sep=';', index=False)