* https://huggingface.co/datasets/BeIR/msmarco
* https://huggingface.co/datasets/BeIR/msmarco-qrels

# Импорты

In [8]:
import pandas as pd
import numpy as np
from datasets import load_dataset
import os
import sys
from json import dump as jdump, load as jload
import random
seed = 42
from tqdm import tqdm

import sys
sys.path.append('/home/jupyter/work/resources/DiplomDimReduction/')
import importlib

import config
importlib.reload(config)
from config import config_dict

import utils
importlib.reload(config)
from utils import create_path

#Пути

In [9]:
corpus_name = config_dict['marco_name']
qrels_name = config_dict['marco_qrels_name']

corpus_prefix = config_dict['marco_prefix']

corpus_data_path = config_dict['corpus_data_template'].format(corpus_prefix)
corpus_ids_path = config_dict['corpus_ids_template'].format(corpus_prefix)

queries_data_path = config_dict['queries_data_template'].format(corpus_prefix)
queries_ids_path = config_dict['queries_ids_template'].format(corpus_prefix)

In [10]:
def save_data(data, data_path, ids=[], ids_path=""):
  create_path(data_path)
  if ids:
    create_path(ids_path)
    jdump(ids, open(ids_path, 'w', encoding='utf-8'))
    print(f'{len(ids)} ids are saved')
  data.to_parquet(data_path)
  print(f"{sys.getsizeof(data)} -> {os.path.getsize(data_path)}")

# Датасет

## Копус

In [11]:
corpus = load_dataset(corpus_name, 'corpus')

Generating corpus split: 100%|██████████| 8841823/8841823 [03:32<00:00, 41533.43 examples/s] 


In [12]:
%%time
corpus_df = pd.DataFrame(corpus['corpus'], columns=['_id', 'text'])
corpus_df.rename(columns={'_id': 'corpus_id', 'text': 'corpus_text'}, inplace=True)
corpus_df['corpus_id'] = corpus_df['corpus_id'].astype('int')
corpus_df

CPU times: user 4min 38s, sys: 12.2 s, total: 4min 50s
Wall time: 4min 48s


Unnamed: 0,corpus_id,corpus_text
0,0,The presence of communication amid scientific ...
1,1,The Manhattan Project and its atomic bomb help...
2,2,Essay on The Manhattan Project - The Manhattan...
3,3,The Manhattan Project was the name for a proje...
4,4,versions of each volume as well as complementa...
...,...,...
8841818,8841818,When metal salts emit short wavelengths of vis...
8841819,8841819,Thousands of people across the United States w...
8841820,8841820,"The recipe that creates blue, for example, inc..."
8841821,8841821,"On Independence Days of yore, old-timey crowds..."


In [13]:
corpus_df.nunique()

corpus_id      8841823
corpus_text    8841661
dtype: int64

In [14]:
%%time
save_data(corpus_df, corpus_data_path, corpus_df['corpus_id'].tolist(), corpus_ids_path)

/home/jupyter/work/resources/DiplomDimReduction//data/raw/marco created.
/home/jupyter/work/resources/DiplomDimReduction//data/raw/marco exists.
8841823 ids are saved
4329020456 -> 1643708812
CPU times: user 21.9 s, sys: 7.66 s, total: 29.5 s
Wall time: 2min 19s


In [15]:
del corpus, corpus_df

## Запросы

In [17]:
queries = load_dataset(corpus_name, 'queries')

Generating queries split: 100%|██████████| 509962/509962 [00:01<00:00, 444425.35 examples/s] 


In [18]:
%%time
queries_df = pd.DataFrame(queries['queries'], columns=['_id', 'text'])
queries_df.rename(columns={'_id': 'query_id', 'text': 'query_text'}, inplace=True)
queries_df['query_id'] = queries_df['query_id'].astype('int')
queries_df

CPU times: user 15.6 s, sys: 76.8 ms, total: 15.7 s
Wall time: 15.5 s


Unnamed: 0,query_id,query_text
0,1185869,)what was the immediate impact of the success ...
1,1185868,_________ justice is designed to repair the ha...
2,597651,what color is amber urine
3,403613,is autoimmune hepatitis a bile acid synthesis ...
4,1183785,elegxo meaning
...,...,...
509957,147073,difference between discrete and process manufa...
509958,243761,how long did abraham lincoln serve
509959,162662,does adult acne rosacea give you blepharitis
509960,247194,how long do you bake muffins


In [19]:
queries_df.nunique()

query_id      509962
query_text    509962
dtype: int64

In [20]:
%%time
save_data(queries_df, queries_data_path, queries_df['query_id'].tolist(), queries_ids_path)

/home/jupyter/work/resources/DiplomDimReduction//data/raw/marco exists.
/home/jupyter/work/resources/DiplomDimReduction//data/raw/marco exists.
509962 ids are saved
50107422 -> 15102535
CPU times: user 501 ms, sys: 28.6 ms, total: 530 ms
Wall time: 631 ms


In [21]:
del queries, queries_df

## Разметка

### Трейн

In [22]:
split_suffix = config_dict['train_suffix']
qrels_data_path = config_dict['qrels_data_template'].format(corpus_prefix, split_suffix)
data_path = config_dict['data_template'].format(corpus_prefix, split_suffix)

In [23]:
qrels = load_dataset(qrels_name, 'default')

Generating train split: 100%|██████████| 532751/532751 [00:00<00:00, 1715978.43 examples/s]
Generating validation split: 100%|██████████| 7437/7437 [00:00<00:00, 363229.26 examples/s]
Generating test split: 100%|██████████| 9260/9260 [00:00<00:00, 477721.74 examples/s]


In [24]:
%%time
qrels_df = pd.DataFrame(qrels[split_suffix], columns=['corpus-id', 'query-id'])
qrels_df
qrels_df.rename(columns={'corpus-id': 'corpus_id', 'query-id': 'query_id'}, inplace=True)
qrels_df

CPU times: user 15.1 s, sys: 73.3 ms, total: 15.2 s
Wall time: 15.1 s


Unnamed: 0,corpus_id,query_id
0,0,1185869
1,16,1185868
2,49,597651
3,60,403613
4,389,1183785
...,...,...
532746,8841362,19285
532747,4989159,558837
532748,8841547,559149
532749,8841643,706678


In [25]:
qrels_df.nunique()

corpus_id    516472
query_id     502939
dtype: int64

In [26]:
qrels_df.groupby(['query_id']).count()

Unnamed: 0_level_0,corpus_id
query_id,Unnamed: 1_level_1
3,1
4,1
5,1
6,1
8,1
...,...
1185863,1
1185864,1
1185865,1
1185868,1


In [27]:
save_data(qrels_df, qrels_data_path)

/home/jupyter/work/resources/DiplomDimReduction//data/raw/marco exists.
8524160 -> 5357133


In [28]:
del qrels, qrels_df

### Тест

In [29]:
split_suffix = config_dict['test_suffix']
qrels_data_path = config_dict['qrels_data_template'].format(corpus_prefix, split_suffix)
data_path = config_dict['data_template'].format(corpus_prefix, split_suffix)

In [30]:
qrels = load_dataset(qrels_name, 'default')

In [31]:
%%time
qrels_df = pd.DataFrame(qrels[split_suffix])
qrels_df
qrels_df.rename(columns={'corpus-id': 'corpus_id', 'query-id': 'query_id'}, inplace=True)
qrels_df

CPU times: user 273 ms, sys: 116 µs, total: 273 ms
Wall time: 268 ms


Unnamed: 0,query_id,corpus_id,score
0,19335,1017759,0
1,19335,1082489,0
2,19335,109063,0
3,19335,1160863,0
4,19335,1160871,0
...,...,...,...
9255,1133167,8839920,2
9256,1133167,8839922,2
9257,1133167,944810,0
9258,1133167,949411,0


In [32]:
qrels_df.nunique()

query_id       43
corpus_id    9139
score           4
dtype: int64

In [33]:
qrels_df.groupby(['query_id']).count()

Unnamed: 0_level_0,corpus_id,score
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1
19335,194,194
47923,143,143
87181,158,158
87452,139,139
104861,306,306
130510,133,133
131843,132,132
146187,138,138
148538,159,159
156493,300,300


In [34]:
save_data(qrels_df, qrels_data_path)

/home/jupyter/work/resources/DiplomDimReduction//data/raw/marco exists.
222384 -> 65829


In [35]:
del qrels, qrels_df

## Сборка

### Трейн

In [37]:
split_suffix = config_dict['train_suffix']
qrels_data_path = config_dict['qrels_data_template'].format(corpus_prefix, split_suffix)
data_path = config_dict['data_template'].format(corpus_prefix, split_suffix)

In [38]:
corpus_ids = jload(open(corpus_ids_path))
queries_ids = jload(open(queries_ids_path))

In [39]:
qrels_df = pd.read_parquet(qrels_data_path)

In [40]:
qrels_df.nunique()

corpus_id    516472
query_id     502939
dtype: int64

In [41]:
queries_sample = []
for i in tqdm(range(500)):
  filtered_df = qrels_df[~qrels_df['query_id'].isin(queries_sample)]
  queries_sample.extend(filtered_df['query_id'].sample(100).tolist())
del filtered_df

100%|██████████| 500/500 [00:15<00:00, 32.37it/s]


In [47]:
len(queries_sample)

10000

In [46]:
random.seed(seed)
queries_sample = random.sample(queries_sample, 10000)

In [None]:
n_total = 10
n_rel = 1
n_unrel = n_total - n_rel

random.seed(seed)
np.random.seed(seed)

data = []

for query_id in tqdm(queries_sample):
    group = qrels_df[qrels_df["query_id"] == query_id]
    rel_ids = group['corpus_id'].unique()
    # if len(rel_ids) < n_rel:
    #     continue
    unrel_ids = list(set(corpus_ids) - set(rel_ids))
    # if len(unrel_ids) < n_unrel:
    #     continue
    rel_sample = random.sample(list(rel_ids), n_rel)
    unrel_sample = random.sample(unrel_ids, n_unrel)

    for corpus_id in rel_sample:
        data.append((query_id, corpus_id, 1))
    for corpus_id in unrel_sample:
        data.append((query_id, corpus_id, 0))

del qrels_df
data_df = pd.DataFrame(data, columns=["query_id", "corpus_id", "label"])
del data

 74%|███████▍  | 7415/10000 [2:23:24<50:40,  1.18s/it]  

In [None]:
data_df

In [None]:
save_data(data_df, data_path)

In [None]:
%%time
corpus_df = pd.read_parquet(corpus_data_path)

In [None]:
data_df = pd.merge(data_df, corpus_df, on='corpus_id')
data_df

In [None]:
del corpus_df

In [None]:
queries_df = pd.read_parquet(queries_data_path)

In [None]:
data_df = pd.merge(data_df, queries_df, on='query_id')
data_df

In [None]:
del queries_df

In [None]:
save_data(data_df, data_path)

### Тест

In [None]:
split_suffix = config_dict['test_suffix']
qrels_data_path = config_dict['qrels_data_template'].format(corpus_prefix, split_suffix)
data_path = config_dict['data_template'].format(corpus_prefix, split_suffix)

In [None]:
corpus_ids = jload(open(corpus_ids_path))
queries_ids = jload(open(queries_ids_path))

In [None]:
qrels_df = pd.read_parquet(qrels_data_path)

In [None]:
qrels_df.nunique()

In [None]:
queries_sample = qrels_df["query_id"].unique().tolist()

In [None]:
n_total = 10
n_rel = 3
n_unrel = n_total - n_rel

random.seed(seed)
np.random.seed(seed)

data = []

for query_id in tqdm(queries_sample):
    group = qrels_df[qrels_df["query_id"] == query_id]
    # ВЗЯЛИ БИНАРНЫЕ ДЛЯ ПРОСТОТЫ
    rel_ids = group[group["score"] > 1]["corpus_id"].unique()
    unrel_ids = group[group["score"] == 0]["corpus_id"].unique()

    rel_sample = random.sample(list(rel_ids), n_rel)
    unrel_sample = random.sample(list(unrel_ids), n_unrel)

    for corpus_id in rel_sample:
        data.append((query_id, corpus_id, 1))
    for corpus_id in unrel_sample:
        data.append((query_id, corpus_id, 0))

del qrels_df
data_df = pd.DataFrame(data, columns=["query_id", "corpus_id", "label"])
del data

In [None]:
data_df

In [None]:
save_data(data_df, data_path)

In [None]:
%%time
corpus_df = pd.read_parquet(corpus_data_path)

In [None]:
data_df = pd.merge(data_df, corpus_df, on='corpus_id')
data_df

In [None]:
del corpus_df

In [None]:
queries_df = pd.read_parquet(queries_data_path)

In [None]:
data_df = pd.merge(data_df, queries_df, on='query_id')
data_df

In [None]:
del queries_df

In [None]:
save_data(data_df, data_path)