In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1,0"

In [2]:
from time import time
import math
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer as skTfidf
import cupy as cp           #use cupy array instead of numpy to speed up calculation by using GPU
import cudf as cf
from cuml.metrics.pairwise_distances import sparse_pairwise_distances
from cuml.feature_extraction.text import TfidfVectorizer as cuTfidf
from sklearn.metrics import pairwise_distances


import matplotlib.pyplot as plt
import sys

In [3]:
# path = '/home/test/Data/corpus.csv'
# path = '/Users/lsacy/data/corpus.csv'
path = './corpus.csv'
def load_frame(path_to_df=path, encoding='utf-16', filter_value = 100000):
    
    df= pd.read_csv(path_to_df, encoding=encoding, index_col='id')
    df.drop([df.columns[0]], inplace=True, axis=1)
    df.drop_duplicates(subset=['text'],inplace=True)
    
    exam_type_distribution = df.groupby(['exam_type'])['exam_type'].count()
    exam_type_distribution.sort_values(ascending=False)
    
    list_filtered = exam_type_distribution[exam_type_distribution > filter_value].index
    df_filtered = df[df['exam_type'].isin(list_filtered)]
    
    print(f'original dataframe shape: {df.shape}')
    print(f'df_filtered shape: {df_filtered.shape}')
    
    print(f'numbers of exam types before: {len(set(df["exam_type"]))}')
    print(f'types after filtering: {list_filtered}')
    print(f'number of exam types after filtering: {len(set(df_filtered["exam_type"]))}')

    return df_filtered, list_filtered

In [4]:
# define a load function to load each exam type
def load_exam_type(df, exam_types):
    for i in exam_types:
        print(f'exam type: {i}')
        dataframe = df[df['exam_type'] == i]    
        print(f'number of documents: {dataframe.shape}')
        yield dataframe

In [5]:
def batch_tfidf(sparseMatrix, size = 5000):
    for idx, item in enumerate(range(0, sparseMatrix.shape[0], size)):
        batch_sparseMatrix = sparseMatrix[item:item+size,:]
        print(f'batch shape: {batch_sparseMatrix.shape}, item: {item} - {item+size}')
        yield batch_sparseMatrix

In [6]:
def get_distances(upperbound, lower, batch_size=8000, filter=100000):

    df_filtered = load_frame(filter_value = filter)
    df_by_type = load_exam_type(df_filtered[0], df_filtered[1])
    results_dict = {}

    # loop through each exam type
    for i in range(0,len(df_filtered[1])):
        print(f'exam {i}/{len(df_filtered[1])}')
        dataframe = next(df_by_type)
        df_indices = dataframe.index.to_list()

        tfidf = cuTfidf().fit_transform(dataframe['text'])
        
        batch = batch_tfidf(tfidf, size=batch_size)
        total_number_batches = math.ceil(tfidf.shape[0]/batch_size)
        print(f'# of batches: {total_number_batches} | batch size: {batch_size}')
        
        # loop through batches of tfidf matrix row wise
        counter = 0
        for i in range(0, math.ceil(tfidf.shape[0]/batch_size)):
            print(f'batch {i+1}/{total_number_batches}')
            
            batch_sparse = next(batch)
            
            distance_batch = sparse_pairwise_distances(batch_sparse, tfidf, metric='euclidean') # distance matrix

            sort_by_distance(distance_batch, df_indices, upperbound, results_dict, lower)
                                         
            del distance_batch
            del batch_sparse
                                         
            counter += batch_size
            print(f'results dict length: {len(results_dict)}')
            print('')
            


        del tfidf
        del batch
        
                
    return results_dict

In [7]:
def sort_by_distance(distance_batch, df_indices, upperbound, results, lower):
    #distance_batch= distance_batch[0:100] #take a sample of 100
    found = 0
    for i, row in tqdm(enumerate(distance_batch)):
        sorted_array = cp.sort(row)
        arg_sorted = cp.argsort(row)

        candidates, distances = get_candidates(sorted_array, lower, arg_sorted, upperbound)


        if len(candidates) > 1:
            found += 1

            df_candidates = [df_indices[int(i)] for i in candidates]
            original_index = df_candidates[0]
            df_candidates = df_candidates[1:]
            
            save_results(distances, results, original_index, df_candidates)
            
            del sorted_array, original_index, df_candidates, distances, arg_sorted, candidates
            
    print(f'{found} matches found')

    return results

In [8]:
def save_results(distances, results, original_index, df_candidates):
    results[original_index] = (df_candidates, distances)
    return results

In [9]:
def get_candidates(sorted_array, lower, arg_sorted, upper):
    candidates = []
    distances = []
    for i, x in enumerate(sorted_array):
        if x > upper:
            break
        elif x < lower:
            continue
        else:
            candidates.append(arg_sorted[i])
            distances.append(float(x))
    return candidates, distances[1:]

In [10]:
test = get_distances(upperbound=0.6, lower=0, batch_size=6500, filter=10000)

original dataframe shape: (2586631, 4)
df_filtered shape: (1845401, 4)
numbers of exam types before: 985
types after filtering: Index(['ARCK', 'ARCW', 'ARKK', 'ARKW', 'ARRB1EBBEK', 'ARRBXEB', 'ARREXEBO9L',
       'ARREXEBO9R', 'ARREXEBOEL', 'ARREXEBOER', 'ARREXEBOGL', 'ARREXEBOGR',
       'ARREXEBOHL', 'ARREXEBOHR', 'ARREXEBOSL', 'ARREXEBOSR', 'ARREXEBUFL',
       'ARREXEBUFR', 'ARREXEBUGL', 'ARREXEBUGR', 'ARREXEBUHL', 'ARREXEBUHR',
       'ARREXEBUKL', 'ARREXEBUKR', 'ARRKOPG', 'ARRKXEBNNH', 'ARRT', 'ARRTRBS',
       'ARRWXEBBWS', 'ARRWXEBHWS', 'ARRWXEBLWS', 'ARSB', 'ARSBSNONIX',
       'ARSBSNOTXN', 'ARSEFKVUVX', 'ARSESNOUXX', 'ARSXSNOWGW', 'ARXXTLE'],
      dtype='object', name='exam_type')
number of exam types after filtering: 38
exam 0/38
exam type: ARCK
number of documents: (100888, 4)
# of batches: 16 | batch size: 6500
batch 1/16
batch shape: (6500, 66772), item: 0 - 6500


6500it [00:07, 912.99it/s] 


452 matches found
results dict length: 451

batch 2/16
batch shape: (6500, 66772), item: 6500 - 13000


6500it [00:06, 943.25it/s] 


503 matches found
results dict length: 953

batch 3/16
batch shape: (6500, 66772), item: 13000 - 19500


6500it [00:06, 948.91it/s] 


468 matches found
results dict length: 1419

batch 4/16
batch shape: (6500, 66772), item: 19500 - 26000


6500it [00:07, 887.27it/s] 


450 matches found
results dict length: 1868

batch 5/16
batch shape: (6500, 66772), item: 26000 - 32500


6500it [00:06, 980.59it/s] 


461 matches found
results dict length: 2328

batch 6/16
batch shape: (6500, 66772), item: 32500 - 39000


6500it [00:07, 927.61it/s] 


450 matches found
results dict length: 2773

batch 7/16
batch shape: (6500, 66772), item: 39000 - 45500


6500it [00:06, 949.49it/s] 


486 matches found
results dict length: 3259

batch 8/16
batch shape: (6500, 66772), item: 45500 - 52000


6500it [00:07, 845.74it/s]


499 matches found
results dict length: 3754

batch 9/16
batch shape: (6500, 66772), item: 52000 - 58500


6500it [00:05, 1099.99it/s]


459 matches found
results dict length: 4212

batch 10/16
batch shape: (6500, 66772), item: 58500 - 65000


6500it [00:07, 861.51it/s] 


475 matches found
results dict length: 4687

batch 11/16
batch shape: (6500, 66772), item: 65000 - 71500


6500it [00:06, 1069.12it/s]


462 matches found
results dict length: 5149

batch 12/16
batch shape: (6500, 66772), item: 71500 - 78000


6500it [00:07, 844.21it/s]


470 matches found
results dict length: 5616

batch 13/16
batch shape: (6500, 66772), item: 78000 - 84500


6500it [00:07, 885.19it/s] 


475 matches found
results dict length: 6091

batch 14/16
batch shape: (6500, 66772), item: 84500 - 91000


6500it [00:07, 878.80it/s] 


465 matches found
results dict length: 6554

batch 15/16
batch shape: (6500, 66772), item: 91000 - 97500


6500it [00:06, 999.96it/s] 


497 matches found
results dict length: 7050

batch 16/16
batch shape: (3388, 66772), item: 97500 - 104000


3388it [00:04, 742.11it/s]


268 matches found
results dict length: 7316

exam 1/38
exam type: ARCW
number of documents: (16434, 4)
# of batches: 3 | batch size: 6500
batch 1/3
batch shape: (6500, 27693), item: 0 - 6500


6500it [00:04, 1491.06it/s]


60 matches found
results dict length: 7375

batch 2/3
batch shape: (6500, 27693), item: 6500 - 13000


6500it [00:04, 1511.69it/s]


87 matches found
results dict length: 7456

batch 3/3
batch shape: (3434, 27693), item: 13000 - 19500


3434it [00:02, 1508.80it/s]


45 matches found
results dict length: 7498

exam 2/38
exam type: ARKK
number of documents: (27608, 4)
# of batches: 5 | batch size: 6500
batch 1/5
batch shape: (6500, 46523), item: 0 - 6500


6500it [00:04, 1508.67it/s]


193 matches found
results dict length: 7691

batch 2/5
batch shape: (6500, 46523), item: 6500 - 13000


6500it [00:04, 1509.21it/s]


202 matches found
results dict length: 7891

batch 3/5
batch shape: (6500, 46523), item: 13000 - 19500


6500it [00:04, 1508.09it/s]


220 matches found
results dict length: 8108

batch 4/5
batch shape: (6500, 46523), item: 19500 - 26000


6500it [00:04, 1489.99it/s]


224 matches found
results dict length: 8324

batch 5/5
batch shape: (1608, 46523), item: 26000 - 32500


1608it [00:01, 1109.63it/s]


48 matches found
results dict length: 8370

exam 3/38
exam type: ARKW
number of documents: (15026, 4)
# of batches: 3 | batch size: 6500
batch 1/3
batch shape: (6500, 29503), item: 0 - 6500


6500it [00:04, 1519.98it/s]


85 matches found
results dict length: 8453

batch 2/3
batch shape: (6500, 29503), item: 6500 - 13000


6500it [00:04, 1541.39it/s]


67 matches found
results dict length: 8518

batch 3/3
batch shape: (2026, 29503), item: 13000 - 19500


2026it [00:01, 1573.98it/s]


16 matches found
results dict length: 8533

exam 4/38
exam type: ARRB1EBBEK
number of documents: (60014, 4)
# of batches: 10 | batch size: 6500
batch 1/10
batch shape: (6500, 41769), item: 0 - 6500


6500it [00:04, 1353.32it/s]


454 matches found
results dict length: 8983

batch 2/10
batch shape: (6500, 41769), item: 6500 - 13000


6500it [00:06, 1005.89it/s]


437 matches found
results dict length: 9416

batch 3/10
batch shape: (6500, 41769), item: 13000 - 19500


6500it [00:04, 1431.74it/s]


441 matches found
results dict length: 9852

batch 4/10
batch shape: (6500, 41769), item: 19500 - 26000


6500it [00:05, 1085.75it/s]


446 matches found
results dict length: 10285

batch 5/10
batch shape: (6500, 41769), item: 26000 - 32500


6500it [00:04, 1425.86it/s]


410 matches found
results dict length: 10686

batch 6/10
batch shape: (6500, 41769), item: 32500 - 39000


6500it [00:06, 1083.28it/s]


431 matches found
results dict length: 11108

batch 7/10
batch shape: (6500, 41769), item: 39000 - 45500


6500it [00:04, 1400.18it/s]


417 matches found
results dict length: 11512

batch 8/10
batch shape: (6500, 41769), item: 45500 - 52000


6500it [00:05, 1094.27it/s]


414 matches found
results dict length: 11916

batch 9/10
batch shape: (6500, 41769), item: 52000 - 58500


6500it [00:04, 1417.25it/s]


411 matches found
results dict length: 12316

batch 10/10
batch shape: (1514, 41769), item: 58500 - 65000


1514it [00:01, 1378.81it/s]


111 matches found
results dict length: 12422

exam 5/38
exam type: ARRBXEB
number of documents: (29120, 4)
# of batches: 5 | batch size: 6500
batch 1/5
batch shape: (6500, 30618), item: 0 - 6500


6500it [00:04, 1543.52it/s]


125 matches found
results dict length: 12547

batch 2/5
batch shape: (6500, 30618), item: 6500 - 13000


6500it [00:04, 1535.88it/s]


130 matches found
results dict length: 12676

batch 3/5
batch shape: (6500, 30618), item: 13000 - 19500


6500it [00:04, 1498.95it/s]


115 matches found
results dict length: 12790

batch 4/5
batch shape: (6500, 30618), item: 19500 - 26000


6500it [00:04, 1451.76it/s]


115 matches found
results dict length: 12902

batch 5/5
batch shape: (3120, 30618), item: 26000 - 32500


3120it [00:02, 1411.73it/s]


53 matches found
results dict length: 12955

exam 6/38
exam type: ARREXEBO9L
number of documents: (13803, 4)
# of batches: 3 | batch size: 6500
batch 1/3
batch shape: (6500, 12984), item: 0 - 6500


6500it [00:04, 1450.89it/s]


412 matches found
results dict length: 13349

batch 2/3
batch shape: (6500, 12984), item: 6500 - 13000


6500it [00:04, 1486.54it/s]


470 matches found
results dict length: 13789

batch 3/3
batch shape: (803, 12984), item: 13000 - 19500


803it [00:00, 1514.98it/s]


56 matches found
results dict length: 13841

exam 7/38
exam type: ARREXEBO9R
number of documents: (14462, 4)
# of batches: 3 | batch size: 6500
batch 1/3
batch shape: (6500, 13575), item: 0 - 6500


6500it [00:04, 1531.81it/s]


443 matches found
results dict length: 14260

batch 2/3
batch shape: (6500, 13575), item: 6500 - 13000


6500it [00:04, 1532.15it/s]


429 matches found
results dict length: 14667

batch 3/3
batch shape: (1462, 13575), item: 13000 - 19500


1462it [00:00, 1531.41it/s]


103 matches found
results dict length: 14767

exam 8/38
exam type: ARREXEBOEL
number of documents: (12329, 4)
# of batches: 2 | batch size: 6500
batch 1/2
batch shape: (6500, 14392), item: 0 - 6500


6500it [00:04, 1503.73it/s]


357 matches found
results dict length: 15112

batch 2/2
batch shape: (5829, 14392), item: 6500 - 13000


5829it [00:03, 1488.94it/s]


348 matches found
results dict length: 15432

exam 9/38
exam type: ARREXEBOER
number of documents: (10731, 4)
# of batches: 2 | batch size: 6500
batch 1/2
batch shape: (6500, 13818), item: 0 - 6500


6500it [00:04, 1474.32it/s]


429 matches found
results dict length: 15846

batch 2/2
batch shape: (4231, 13818), item: 6500 - 13000


4231it [00:02, 1489.02it/s]


258 matches found
results dict length: 16082

exam 10/38
exam type: ARREXEBOGL
number of documents: (17784, 4)
# of batches: 3 | batch size: 6500
batch 1/3
batch shape: (6500, 15325), item: 0 - 6500


6500it [00:04, 1467.19it/s]


421 matches found
results dict length: 16491

batch 2/3
batch shape: (6500, 15325), item: 6500 - 13000


6500it [00:04, 1495.19it/s]


368 matches found
results dict length: 16841

batch 3/3
batch shape: (4784, 15325), item: 13000 - 19500


4784it [00:03, 1494.53it/s]


320 matches found
results dict length: 17138

exam 11/38
exam type: ARREXEBOGR
number of documents: (15231, 4)
# of batches: 3 | batch size: 6500
batch 1/3
batch shape: (6500, 14550), item: 0 - 6500


6500it [00:04, 1495.01it/s]


422 matches found
results dict length: 17546

batch 2/3
batch shape: (6500, 14550), item: 6500 - 13000


6500it [00:04, 1512.90it/s]


403 matches found
results dict length: 17925

batch 3/3
batch shape: (2231, 14550), item: 13000 - 19500


2231it [00:01, 1518.88it/s]


143 matches found
results dict length: 18056

exam 12/38
exam type: ARREXEBOHL
number of documents: (17921, 4)
# of batches: 3 | batch size: 6500
batch 1/3
batch shape: (6500, 16055), item: 0 - 6500


6500it [00:30, 211.20it/s]


2266 matches found
results dict length: 20058

batch 2/3
batch shape: (6500, 16055), item: 6500 - 13000


6500it [00:26, 243.11it/s]


2226 matches found
results dict length: 21874

batch 3/3
batch shape: (4921, 16055), item: 13000 - 19500


4921it [00:19, 247.17it/s]


1604 matches found
results dict length: 23130

exam 13/38
exam type: ARREXEBOHR
number of documents: (12543, 4)
# of batches: 2 | batch size: 6500
batch 1/2
batch shape: (6500, 14178), item: 0 - 6500


6500it [00:04, 1523.58it/s]


390 matches found
results dict length: 23499

batch 2/2
batch shape: (6043, 14178), item: 6500 - 13000


6043it [00:03, 1533.44it/s]


316 matches found
results dict length: 23795

exam 14/38
exam type: ARREXEBOSL
number of documents: (17494, 4)
# of batches: 3 | batch size: 6500
batch 1/3
batch shape: (6500, 17782), item: 0 - 6500


6500it [00:04, 1516.05it/s]


378 matches found
results dict length: 24157

batch 2/3
batch shape: (6500, 17782), item: 6500 - 13000


6500it [00:04, 1528.79it/s]


330 matches found
results dict length: 24470

batch 3/3
batch shape: (4494, 17782), item: 13000 - 19500


4494it [00:02, 1546.81it/s]


219 matches found
results dict length: 24681

exam 15/38
exam type: ARREXEBOSR
number of documents: (20113, 4)
# of batches: 4 | batch size: 6500
batch 1/4
batch shape: (6500, 19497), item: 0 - 6500


6500it [00:04, 1524.72it/s]


329 matches found
results dict length: 24998

batch 2/4
batch shape: (6500, 19497), item: 6500 - 13000


6500it [00:04, 1509.53it/s]


349 matches found
results dict length: 25328

batch 3/4
batch shape: (6500, 19497), item: 13000 - 19500


6500it [00:04, 1548.94it/s]


317 matches found
results dict length: 25621

batch 4/4
batch shape: (613, 19497), item: 19500 - 26000


613it [00:00, 1602.67it/s]


18 matches found
results dict length: 25639

exam 16/38
exam type: ARREXEBUFL
number of documents: (14436, 4)
# of batches: 3 | batch size: 6500
batch 1/3
batch shape: (6500, 17518), item: 0 - 6500


6500it [00:04, 1509.13it/s]


386 matches found
results dict length: 26012

batch 2/3
batch shape: (6500, 17518), item: 6500 - 13000


6500it [00:04, 1521.08it/s]


371 matches found
results dict length: 26358

batch 3/3
batch shape: (1436, 17518), item: 13000 - 19500


1436it [00:00, 1467.48it/s]


76 matches found
results dict length: 26426

exam 17/38
exam type: ARREXEBUFR
number of documents: (15464, 4)
# of batches: 3 | batch size: 6500
batch 1/3
batch shape: (6500, 18004), item: 0 - 6500


6500it [00:04, 1468.46it/s]


372 matches found
results dict length: 26782

batch 2/3
batch shape: (6500, 18004), item: 6500 - 13000


6500it [00:04, 1494.71it/s]


342 matches found
results dict length: 27104

batch 3/3
batch shape: (2464, 18004), item: 13000 - 19500


2464it [00:01, 1488.91it/s]


145 matches found
results dict length: 27239

exam 18/38
exam type: ARREXEBUGL
number of documents: (17183, 4)
# of batches: 3 | batch size: 6500
batch 1/3
batch shape: (6500, 15908), item: 0 - 6500


6500it [00:05, 1275.79it/s]


893 matches found
results dict length: 28095

batch 2/3
batch shape: (6500, 15908), item: 6500 - 13000


6500it [00:04, 1303.60it/s]


895 matches found
results dict length: 28922

batch 3/3
batch shape: (4183, 15908), item: 13000 - 19500


4183it [00:03, 1274.68it/s]


581 matches found
results dict length: 29440

exam 19/38
exam type: ARREXEBUGR
number of documents: (19219, 4)
# of batches: 3 | batch size: 6500
batch 1/3
batch shape: (6500, 17302), item: 0 - 6500


6500it [00:05, 1213.70it/s]


942 matches found
results dict length: 30322

batch 2/3
batch shape: (6500, 17302), item: 6500 - 13000


6500it [00:05, 1282.70it/s]


850 matches found
results dict length: 31106

batch 3/3
batch shape: (6219, 17302), item: 13000 - 19500


6219it [00:04, 1299.18it/s]


829 matches found
results dict length: 31850

exam 20/38
exam type: ARREXEBUHL
number of documents: (10321, 4)
# of batches: 2 | batch size: 6500
batch 1/2
batch shape: (6500, 15231), item: 0 - 6500


6500it [00:04, 1554.74it/s]


169 matches found
results dict length: 32015

batch 2/2
batch shape: (3821, 15231), item: 6500 - 13000


3821it [00:02, 1557.19it/s]


84 matches found
results dict length: 32095

exam 21/38
exam type: ARREXEBUHR
number of documents: (10607, 4)
# of batches: 2 | batch size: 6500
batch 1/2
batch shape: (6500, 15321), item: 0 - 6500


6500it [00:04, 1550.06it/s]


154 matches found
results dict length: 32245

batch 2/2
batch shape: (4107, 15321), item: 6500 - 13000


4107it [00:02, 1552.01it/s]


93 matches found
results dict length: 32337

exam 22/38
exam type: ARREXEBUKL
number of documents: (21815, 4)
# of batches: 4 | batch size: 6500
batch 1/4
batch shape: (6500, 21592), item: 0 - 6500


6500it [00:04, 1432.79it/s]


514 matches found
results dict length: 32830

batch 2/4
batch shape: (6500, 21592), item: 6500 - 13000


6500it [00:04, 1432.48it/s]


551 matches found
results dict length: 33354

batch 3/4
batch shape: (6500, 21592), item: 13000 - 19500


6500it [00:04, 1456.03it/s]


459 matches found
results dict length: 33784

batch 4/4
batch shape: (2315, 21592), item: 19500 - 26000


2315it [00:01, 1430.64it/s]


183 matches found
results dict length: 33945

exam 23/38
exam type: ARREXEBUKR
number of documents: (21970, 4)
# of batches: 4 | batch size: 6500
batch 1/4
batch shape: (6500, 21674), item: 0 - 6500


6500it [00:04, 1444.94it/s]


518 matches found
results dict length: 34448

batch 2/4
batch shape: (6500, 21674), item: 6500 - 13000


6500it [00:04, 1449.22it/s]


507 matches found
results dict length: 34928

batch 3/4
batch shape: (6500, 21674), item: 13000 - 19500


6500it [00:04, 1482.65it/s]


491 matches found
results dict length: 35396

batch 4/4
batch shape: (2470, 21674), item: 19500 - 26000


2470it [00:01, 1445.62it/s]


210 matches found
results dict length: 35593

exam 24/38
exam type: ARRKOPG
number of documents: (37337, 4)
# of batches: 6 | batch size: 6500
batch 1/6
batch shape: (6500, 27770), item: 0 - 6500


6500it [00:04, 1544.80it/s]


134 matches found
results dict length: 35725

batch 2/6
batch shape: (6500, 27770), item: 6500 - 13000


6500it [00:04, 1543.07it/s]


140 matches found
results dict length: 35863

batch 3/6
batch shape: (6500, 27770), item: 13000 - 19500


6500it [00:04, 1552.61it/s]


146 matches found
results dict length: 36007

batch 4/6
batch shape: (6500, 27770), item: 19500 - 26000


6500it [00:04, 1539.66it/s]


134 matches found
results dict length: 36134

batch 5/6
batch shape: (6500, 27770), item: 26000 - 32500


6500it [00:04, 1546.45it/s]


130 matches found
results dict length: 36257

batch 6/6
batch shape: (4837, 27770), item: 32500 - 39000


4837it [00:03, 1545.40it/s]


110 matches found
results dict length: 36358

exam 25/38
exam type: ARRKXEBNNH
number of documents: (10120, 4)
# of batches: 2 | batch size: 6500
batch 1/2
batch shape: (6500, 12687), item: 0 - 6500


6500it [00:06, 1054.95it/s]


498 matches found
results dict length: 36809

batch 2/2
batch shape: (3620, 12687), item: 6500 - 13000


3620it [00:03, 1057.09it/s]


280 matches found
results dict length: 37065

exam 26/38
exam type: ARRT
number of documents: (474268, 4)
# of batches: 73 | batch size: 6500
batch 1/73
batch shape: (6500, 141650), item: 0 - 6500


6500it [00:20, 319.11it/s]


1095 matches found
results dict length: 38152

batch 2/73
batch shape: (6500, 141650), item: 6500 - 13000


6500it [00:16, 397.96it/s]


1118 matches found
results dict length: 39270

batch 3/73
batch shape: (6500, 141650), item: 13000 - 19500


6500it [00:16, 400.07it/s]


1063 matches found
results dict length: 40330

batch 4/73
batch shape: (6500, 141650), item: 19500 - 26000


6500it [00:17, 365.09it/s]


1193 matches found
results dict length: 41513

batch 5/73
batch shape: (6500, 141650), item: 26000 - 32500


6500it [00:16, 384.50it/s]


1077 matches found
results dict length: 42589

batch 6/73
batch shape: (6500, 141650), item: 32500 - 39000


6500it [00:18, 344.13it/s]


1140 matches found
results dict length: 43724

batch 7/73
batch shape: (6500, 141650), item: 39000 - 45500


6500it [00:17, 364.38it/s]


1203 matches found
results dict length: 44919

batch 8/73
batch shape: (6500, 141650), item: 45500 - 52000


6500it [00:16, 389.15it/s]


1131 matches found
results dict length: 46036

batch 9/73
batch shape: (6500, 141650), item: 52000 - 58500


6500it [00:17, 371.11it/s]


1138 matches found
results dict length: 47169

batch 10/73
batch shape: (6500, 141650), item: 58500 - 65000


6500it [00:18, 350.05it/s]


1165 matches found
results dict length: 48327

batch 11/73
batch shape: (6500, 141650), item: 65000 - 71500


6500it [00:17, 376.65it/s]


1131 matches found
results dict length: 49456

batch 12/73
batch shape: (6500, 141650), item: 71500 - 78000


6500it [00:17, 361.71it/s]


1174 matches found
results dict length: 50626

batch 13/73
batch shape: (6500, 141650), item: 78000 - 84500


6500it [00:17, 372.36it/s]


1161 matches found
results dict length: 51781

batch 14/73
batch shape: (6500, 141650), item: 84500 - 91000


6500it [00:17, 374.63it/s]


1169 matches found
results dict length: 52945

batch 15/73
batch shape: (6500, 141650), item: 91000 - 97500


6500it [00:17, 367.05it/s]


1182 matches found
results dict length: 54121

batch 16/73
batch shape: (6500, 141650), item: 97500 - 104000


6500it [00:17, 367.23it/s]


1217 matches found
results dict length: 55325

batch 17/73
batch shape: (6500, 141650), item: 104000 - 110500


6500it [00:18, 358.43it/s]


1170 matches found
results dict length: 56481

batch 18/73
batch shape: (6500, 141650), item: 110500 - 117000


6500it [00:16, 398.36it/s]


1086 matches found
results dict length: 57565

batch 19/73
batch shape: (6500, 141650), item: 117000 - 123500


6500it [00:16, 389.50it/s]


1183 matches found
results dict length: 58740

batch 20/73
batch shape: (6500, 141650), item: 123500 - 130000


6500it [00:16, 395.76it/s]


1114 matches found
results dict length: 59849

batch 21/73
batch shape: (6500, 141650), item: 130000 - 136500


6500it [00:16, 402.04it/s]


1160 matches found
results dict length: 61004

batch 22/73
batch shape: (6500, 141650), item: 136500 - 143000


6500it [00:17, 367.57it/s]


1218 matches found
results dict length: 62212

batch 23/73
batch shape: (6500, 141650), item: 143000 - 149500


6500it [00:17, 367.78it/s]


1147 matches found
results dict length: 63351

batch 24/73
batch shape: (6500, 141650), item: 149500 - 156000


6500it [00:17, 372.66it/s]


1109 matches found
results dict length: 64450

batch 25/73
batch shape: (6500, 141650), item: 156000 - 162500


6500it [00:17, 367.58it/s]


1191 matches found
results dict length: 65632

batch 26/73
batch shape: (6500, 141650), item: 162500 - 169000


6500it [00:15, 407.31it/s]


1110 matches found
results dict length: 66733

batch 27/73
batch shape: (6500, 141650), item: 169000 - 175500


6500it [00:16, 385.50it/s]


1166 matches found
results dict length: 67883

batch 28/73
batch shape: (6500, 141650), item: 175500 - 182000


6500it [00:16, 392.26it/s]


1060 matches found
results dict length: 68935

batch 29/73
batch shape: (6500, 141650), item: 182000 - 188500


6500it [00:16, 404.93it/s]


1084 matches found
results dict length: 70011

batch 30/73
batch shape: (6500, 141650), item: 188500 - 195000


6500it [00:16, 392.46it/s]


1116 matches found
results dict length: 71112

batch 31/73
batch shape: (6500, 141650), item: 195000 - 201500


6500it [00:16, 390.08it/s]


1207 matches found
results dict length: 72307

batch 32/73
batch shape: (6500, 141650), item: 201500 - 208000


6500it [00:17, 382.11it/s]


1168 matches found
results dict length: 73464

batch 33/73
batch shape: (6500, 141650), item: 208000 - 214500


6500it [00:17, 373.80it/s]


1113 matches found
results dict length: 74568

batch 34/73
batch shape: (6500, 141650), item: 214500 - 221000


6500it [00:17, 374.89it/s]


1193 matches found
results dict length: 75745

batch 35/73
batch shape: (6500, 141650), item: 221000 - 227500


6500it [00:16, 392.46it/s]


1134 matches found
results dict length: 76867

batch 36/73
batch shape: (6500, 141650), item: 227500 - 234000


6500it [00:16, 385.96it/s]


1199 matches found
results dict length: 78050

batch 37/73
batch shape: (6500, 141650), item: 234000 - 240500


6500it [00:15, 409.41it/s]


1066 matches found
results dict length: 79101

batch 38/73
batch shape: (6500, 141650), item: 240500 - 247000


6500it [00:17, 374.11it/s]


1189 matches found
results dict length: 80277

batch 39/73
batch shape: (6500, 141650), item: 247000 - 253500


6500it [00:17, 380.15it/s]


1194 matches found
results dict length: 81459

batch 40/73
batch shape: (6500, 141650), item: 253500 - 260000


6500it [00:17, 369.49it/s]


1203 matches found
results dict length: 82635

batch 41/73
batch shape: (6500, 141650), item: 260000 - 266500


6500it [00:17, 374.25it/s]


1130 matches found
results dict length: 83751

batch 42/73
batch shape: (6500, 141650), item: 266500 - 273000


6500it [00:16, 385.84it/s]


1115 matches found
results dict length: 84857

batch 43/73
batch shape: (6500, 141650), item: 273000 - 279500


6500it [00:17, 378.36it/s]


1114 matches found
results dict length: 85962

batch 44/73
batch shape: (6500, 141650), item: 279500 - 286000


6500it [00:17, 373.54it/s]


1160 matches found
results dict length: 87110

batch 45/73
batch shape: (6500, 141650), item: 286000 - 292500


6500it [00:17, 378.42it/s]


1173 matches found
results dict length: 88267

batch 46/73
batch shape: (6500, 141650), item: 292500 - 299000


6500it [00:17, 372.30it/s]


1143 matches found
results dict length: 89404

batch 47/73
batch shape: (6500, 141650), item: 299000 - 305500


6500it [00:17, 377.11it/s]


1163 matches found
results dict length: 90553

batch 48/73
batch shape: (6500, 141650), item: 305500 - 312000


6500it [00:17, 363.63it/s]


1205 matches found
results dict length: 91739

batch 49/73
batch shape: (6500, 141650), item: 312000 - 318500


6500it [00:17, 366.82it/s]


1186 matches found
results dict length: 92908

batch 50/73
batch shape: (6500, 141650), item: 318500 - 325000


6500it [00:17, 379.10it/s]


1134 matches found
results dict length: 94031

batch 51/73
batch shape: (6500, 141650), item: 325000 - 331500


6500it [00:17, 375.60it/s]


1114 matches found
results dict length: 95125

batch 52/73
batch shape: (6500, 141650), item: 331500 - 338000


6500it [00:17, 370.04it/s]


1208 matches found
results dict length: 96319

batch 53/73
batch shape: (6500, 141650), item: 338000 - 344500


6500it [00:16, 405.84it/s]


1060 matches found
results dict length: 97374

batch 54/73
batch shape: (6500, 141650), item: 344500 - 351000


6500it [00:16, 384.10it/s]


1093 matches found
results dict length: 98456

batch 55/73
batch shape: (6500, 141650), item: 351000 - 357500


6500it [00:17, 377.64it/s]


1121 matches found
results dict length: 99561

batch 56/73
batch shape: (6500, 141650), item: 357500 - 364000


6500it [00:16, 396.21it/s]


1136 matches found
results dict length: 100688

batch 57/73
batch shape: (6500, 141650), item: 364000 - 370500


6500it [00:18, 358.74it/s]


1173 matches found
results dict length: 101850

batch 58/73
batch shape: (6500, 141650), item: 370500 - 377000


6500it [00:17, 366.10it/s]


1100 matches found
results dict length: 102939

batch 59/73
batch shape: (6500, 141650), item: 377000 - 383500


6500it [00:17, 369.01it/s]


1133 matches found
results dict length: 104056

batch 60/73
batch shape: (6500, 141650), item: 383500 - 390000


6500it [00:16, 387.06it/s]


1155 matches found
results dict length: 105194

batch 61/73
batch shape: (6500, 141650), item: 390000 - 396500


6500it [00:17, 362.24it/s]


1166 matches found
results dict length: 106350

batch 62/73
batch shape: (6500, 141650), item: 396500 - 403000


6500it [00:15, 416.75it/s]


1029 matches found
results dict length: 107367

batch 63/73
batch shape: (6500, 141650), item: 403000 - 409500


6500it [00:16, 382.95it/s]


1134 matches found
results dict length: 108492

batch 64/73
batch shape: (6500, 141650), item: 409500 - 416000


6500it [00:17, 376.56it/s]


1076 matches found
results dict length: 109555

batch 65/73
batch shape: (6500, 141650), item: 416000 - 422500


6500it [00:15, 416.01it/s]


1108 matches found
results dict length: 110657

batch 66/73
batch shape: (6500, 141650), item: 422500 - 429000


6500it [00:16, 402.97it/s]


1127 matches found
results dict length: 111772

batch 67/73
batch shape: (6500, 141650), item: 429000 - 435500


6500it [00:16, 384.90it/s]


1153 matches found
results dict length: 112909

batch 68/73
batch shape: (6500, 141650), item: 435500 - 442000


6500it [00:16, 390.24it/s]


1128 matches found
results dict length: 114023

batch 69/73
batch shape: (6500, 141650), item: 442000 - 448500


6500it [00:16, 389.61it/s]


1111 matches found
results dict length: 115116

batch 70/73
batch shape: (6500, 141650), item: 448500 - 455000


6500it [00:18, 353.87it/s]


1128 matches found
results dict length: 116225

batch 71/73
batch shape: (6500, 141650), item: 455000 - 461500


6500it [00:16, 400.17it/s]


1111 matches found
results dict length: 117319

batch 72/73
batch shape: (6500, 141650), item: 461500 - 468000


6500it [00:18, 354.95it/s]


1150 matches found
results dict length: 118450

batch 73/73
batch shape: (6268, 141650), item: 468000 - 474500


6268it [00:16, 369.93it/s]


1091 matches found
results dict length: 119521

exam 27/38
exam type: ARRTRBS
number of documents: (449008, 4)
# of batches: 70 | batch size: 6500
batch 1/70
batch shape: (6500, 102565), item: 0 - 6500


6500it [00:19, 326.00it/s]


1248 matches found
results dict length: 120766

batch 2/70
batch shape: (6500, 102565), item: 6500 - 13000


6500it [00:24, 268.43it/s]


1277 matches found
results dict length: 122043

batch 3/70
batch shape: (6500, 102565), item: 13000 - 19500


6500it [00:21, 299.06it/s]


1189 matches found
results dict length: 123230

batch 4/70
batch shape: (6500, 102565), item: 19500 - 26000


6500it [00:24, 264.18it/s]


1309 matches found
results dict length: 124534

batch 5/70
batch shape: (6500, 102565), item: 26000 - 32500


6500it [00:23, 277.30it/s]


1256 matches found
results dict length: 125785

batch 6/70
batch shape: (6500, 102565), item: 32500 - 39000


6500it [00:25, 255.58it/s]


1345 matches found
results dict length: 127122

batch 7/70
batch shape: (6500, 102565), item: 39000 - 45500


6500it [00:23, 280.54it/s]


1260 matches found
results dict length: 128377

batch 8/70
batch shape: (6500, 102565), item: 45500 - 52000


6500it [00:22, 293.75it/s]


1338 matches found
results dict length: 129713

batch 9/70
batch shape: (6500, 102565), item: 52000 - 58500


6500it [00:26, 242.31it/s]


1388 matches found
results dict length: 131092

batch 10/70
batch shape: (6500, 102565), item: 58500 - 65000


6500it [00:26, 245.69it/s]


1349 matches found
results dict length: 132429

batch 11/70
batch shape: (6500, 102565), item: 65000 - 71500


6500it [00:24, 269.46it/s]


1263 matches found
results dict length: 133687

batch 12/70
batch shape: (6500, 102565), item: 71500 - 78000


6500it [00:27, 240.45it/s]


1315 matches found
results dict length: 134991

batch 13/70
batch shape: (6500, 102565), item: 78000 - 84500


6500it [00:23, 277.48it/s]


1429 matches found
results dict length: 136403

batch 14/70
batch shape: (6500, 102565), item: 84500 - 91000


6500it [00:26, 242.38it/s]


1322 matches found
results dict length: 137711

batch 15/70
batch shape: (6500, 102565), item: 91000 - 97500


6500it [00:34, 190.09it/s]


1536 matches found
results dict length: 139233

batch 16/70
batch shape: (6500, 102565), item: 97500 - 104000


6500it [00:26, 242.90it/s]


1361 matches found
results dict length: 140583

batch 17/70
batch shape: (6500, 102565), item: 104000 - 110500


6500it [00:25, 257.08it/s]


1360 matches found
results dict length: 141932

batch 18/70
batch shape: (6500, 102565), item: 110500 - 117000


6500it [00:24, 268.02it/s]


1374 matches found
results dict length: 143295

batch 19/70
batch shape: (6500, 102565), item: 117000 - 123500


6500it [00:22, 289.96it/s]


1292 matches found
results dict length: 144578

batch 20/70
batch shape: (6500, 102565), item: 123500 - 130000


6500it [00:22, 291.63it/s]


1167 matches found
results dict length: 145733

batch 21/70
batch shape: (6500, 102565), item: 130000 - 136500


6500it [00:26, 248.50it/s]


1433 matches found
results dict length: 147156

batch 22/70
batch shape: (6500, 102565), item: 136500 - 143000


6500it [00:19, 336.60it/s]


1231 matches found
results dict length: 148379

batch 23/70
batch shape: (6500, 102565), item: 143000 - 149500


6500it [00:25, 257.34it/s]


1349 matches found
results dict length: 149714

batch 24/70
batch shape: (6500, 102565), item: 149500 - 156000


6500it [00:27, 232.50it/s]


1492 matches found
results dict length: 151190

batch 25/70
batch shape: (6500, 102565), item: 156000 - 162500


6500it [00:23, 274.66it/s]


1349 matches found
results dict length: 152527

batch 26/70
batch shape: (6500, 102565), item: 162500 - 169000


6500it [00:24, 261.29it/s]


1341 matches found
results dict length: 153855

batch 27/70
batch shape: (6500, 102565), item: 169000 - 175500


6500it [00:22, 290.06it/s]


1322 matches found
results dict length: 155161

batch 28/70
batch shape: (6500, 102565), item: 175500 - 182000


6500it [00:22, 290.89it/s]


1276 matches found
results dict length: 156429

batch 29/70
batch shape: (6500, 102565), item: 182000 - 188500


6500it [00:22, 286.45it/s]


1332 matches found
results dict length: 157751

batch 30/70
batch shape: (6500, 102565), item: 188500 - 195000


6500it [00:24, 265.19it/s]


1352 matches found
results dict length: 159091

batch 31/70
batch shape: (6500, 102565), item: 195000 - 201500


6500it [00:24, 262.38it/s]


1297 matches found
results dict length: 160372

batch 32/70
batch shape: (6500, 102565), item: 201500 - 208000


6500it [00:23, 272.45it/s]


1304 matches found
results dict length: 161651

batch 33/70
batch shape: (6500, 102565), item: 208000 - 214500


6500it [00:20, 317.64it/s]


1241 matches found
results dict length: 162881

batch 34/70
batch shape: (6500, 102565), item: 214500 - 221000


6500it [00:22, 294.55it/s]


1281 matches found
results dict length: 164150

batch 35/70
batch shape: (6500, 102565), item: 221000 - 227500


6500it [00:24, 261.21it/s]


1364 matches found
results dict length: 165497

batch 36/70
batch shape: (6500, 102565), item: 227500 - 234000


6500it [00:22, 294.40it/s]


1340 matches found
results dict length: 166828

batch 37/70
batch shape: (6500, 102565), item: 234000 - 240500


6500it [00:29, 219.30it/s]


1322 matches found
results dict length: 168123

batch 38/70
batch shape: (6500, 102565), item: 240500 - 247000


6500it [00:26, 248.50it/s]


1385 matches found
results dict length: 169490

batch 39/70
batch shape: (6500, 102565), item: 247000 - 253500


6500it [00:30, 213.07it/s]


1483 matches found
results dict length: 170945

batch 40/70
batch shape: (6500, 102565), item: 253500 - 260000


6500it [00:21, 303.02it/s]


1291 matches found
results dict length: 172227

batch 41/70
batch shape: (6500, 102565), item: 260000 - 266500


6500it [00:19, 325.31it/s]


1310 matches found
results dict length: 173523

batch 42/70
batch shape: (6500, 102565), item: 266500 - 273000


6500it [00:18, 355.46it/s]


1268 matches found
results dict length: 174782

batch 43/70
batch shape: (6500, 102565), item: 273000 - 279500


6500it [00:25, 257.66it/s]


1382 matches found
results dict length: 176142

batch 44/70
batch shape: (6500, 102565), item: 279500 - 286000


6500it [00:24, 268.31it/s]


1323 matches found
results dict length: 177444

batch 45/70
batch shape: (6500, 102565), item: 286000 - 292500


6500it [00:25, 258.41it/s]


1364 matches found
results dict length: 178792

batch 46/70
batch shape: (6500, 102565), item: 292500 - 299000


6500it [00:24, 265.60it/s]


1373 matches found
results dict length: 180134

batch 47/70
batch shape: (6500, 102565), item: 299000 - 305500


6500it [00:21, 309.40it/s]


1231 matches found
results dict length: 181350

batch 48/70
batch shape: (6500, 102565), item: 305500 - 312000


6500it [00:31, 205.62it/s]


1371 matches found
results dict length: 182703

batch 49/70
batch shape: (6500, 102565), item: 312000 - 318500


6500it [00:20, 317.05it/s]


1222 matches found
results dict length: 183908

batch 50/70
batch shape: (6500, 102565), item: 318500 - 325000


6500it [00:27, 232.16it/s]


1437 matches found
results dict length: 185323

batch 51/70
batch shape: (6500, 102565), item: 325000 - 331500


6500it [00:26, 246.43it/s]


1362 matches found
results dict length: 186659

batch 52/70
batch shape: (6500, 102565), item: 331500 - 338000


6500it [00:18, 360.18it/s]


1255 matches found
results dict length: 187907

batch 53/70
batch shape: (6500, 102565), item: 338000 - 344500


6500it [00:25, 257.91it/s]


1380 matches found
results dict length: 189262

batch 54/70
batch shape: (6500, 102565), item: 344500 - 351000


6500it [00:22, 295.33it/s]


1355 matches found
results dict length: 190601

batch 55/70
batch shape: (6500, 102565), item: 351000 - 357500


6500it [00:24, 265.48it/s]


1290 matches found
results dict length: 191870

batch 56/70
batch shape: (6500, 102565), item: 357500 - 364000


6500it [00:25, 255.37it/s]


1371 matches found
results dict length: 193222

batch 57/70
batch shape: (6500, 102565), item: 364000 - 370500


6500it [00:25, 257.40it/s]


1350 matches found
results dict length: 194552

batch 58/70
batch shape: (6500, 102565), item: 370500 - 377000


6500it [00:23, 281.41it/s]


1340 matches found
results dict length: 195871

batch 59/70
batch shape: (6500, 102565), item: 377000 - 383500


6500it [00:20, 311.28it/s]


1187 matches found
results dict length: 197044

batch 60/70
batch shape: (6500, 102565), item: 383500 - 390000


6500it [00:21, 306.21it/s]


1289 matches found
results dict length: 198318

batch 61/70
batch shape: (6500, 102565), item: 390000 - 396500


6500it [00:25, 254.56it/s]


1284 matches found
results dict length: 199579

batch 62/70
batch shape: (6500, 102565), item: 396500 - 403000


6500it [00:27, 233.89it/s]


1361 matches found
results dict length: 200916

batch 63/70
batch shape: (6500, 102565), item: 403000 - 409500


6500it [00:24, 270.82it/s]


1288 matches found
results dict length: 202180

batch 64/70
batch shape: (6500, 102565), item: 409500 - 416000


6500it [00:23, 281.80it/s]


1356 matches found
results dict length: 203517

batch 65/70
batch shape: (6500, 102565), item: 416000 - 422500


6500it [00:23, 275.87it/s]


1282 matches found
results dict length: 204783

batch 66/70
batch shape: (6500, 102565), item: 422500 - 429000


6500it [00:22, 283.76it/s]


1298 matches found
results dict length: 206063

batch 67/70
batch shape: (6500, 102565), item: 429000 - 435500


6500it [00:22, 283.87it/s]


1233 matches found
results dict length: 207275

batch 68/70
batch shape: (6500, 102565), item: 435500 - 442000


6500it [00:22, 288.87it/s]


1382 matches found
results dict length: 208629

batch 69/70
batch shape: (6500, 102565), item: 442000 - 448500


6500it [00:23, 272.63it/s]


1259 matches found
results dict length: 209870

batch 70/70
batch shape: (508, 102565), item: 448500 - 455000


508it [00:00, 619.16it/s]


25 matches found
results dict length: 209893

exam 28/38
exam type: ARRWXEBBWS
number of documents: (12864, 4)
# of batches: 2 | batch size: 6500
batch 1/2
batch shape: (6500, 17645), item: 0 - 6500


6500it [00:04, 1610.48it/s]


113 matches found
results dict length: 210005

batch 2/2
batch shape: (6364, 17645), item: 6500 - 13000


6364it [00:03, 1593.95it/s]


119 matches found
results dict length: 210116

exam 29/38
exam type: ARRWXEBHWS
number of documents: (27778, 4)
# of batches: 5 | batch size: 6500
batch 1/5
batch shape: (6500, 23711), item: 0 - 6500


6500it [00:04, 1522.68it/s]


224 matches found
results dict length: 210338

batch 2/5
batch shape: (6500, 23711), item: 6500 - 13000


6500it [00:04, 1518.29it/s]


251 matches found
results dict length: 210582

batch 3/5
batch shape: (6500, 23711), item: 13000 - 19500


6500it [00:04, 1529.12it/s]


255 matches found
results dict length: 210827

batch 4/5
batch shape: (6500, 23711), item: 19500 - 26000


6500it [00:04, 1543.77it/s]


229 matches found
results dict length: 211048

batch 5/5
batch shape: (1778, 23711), item: 26000 - 32500


1778it [00:01, 1541.92it/s]


76 matches found
results dict length: 211122

exam 30/38
exam type: ARRWXEBLWS
number of documents: (63575, 4)
# of batches: 10 | batch size: 6500
batch 1/10
batch shape: (6500, 37852), item: 0 - 6500


6500it [00:05, 1194.68it/s]


229 matches found
results dict length: 211350

batch 2/10
batch shape: (6500, 37852), item: 6500 - 13000


6500it [00:04, 1544.53it/s]


217 matches found
results dict length: 211566

batch 3/10
batch shape: (6500, 37852), item: 13000 - 19500


6500it [00:04, 1548.09it/s]


226 matches found
results dict length: 211789

batch 4/10
batch shape: (6500, 37852), item: 19500 - 26000


6500it [00:05, 1202.99it/s]


241 matches found
results dict length: 212026

batch 5/10
batch shape: (6500, 37852), item: 26000 - 32500


6500it [00:04, 1547.17it/s]


248 matches found
results dict length: 212265

batch 6/10
batch shape: (6500, 37852), item: 32500 - 39000


6500it [00:05, 1203.69it/s]


222 matches found
results dict length: 212482

batch 7/10
batch shape: (6500, 37852), item: 39000 - 45500


6500it [00:05, 1200.88it/s]


232 matches found
results dict length: 212708

batch 8/10
batch shape: (6500, 37852), item: 45500 - 52000


6500it [00:05, 1203.79it/s]


232 matches found
results dict length: 212935

batch 9/10
batch shape: (6500, 37852), item: 52000 - 58500


6500it [00:05, 1207.82it/s]


233 matches found
results dict length: 213161

batch 10/10
batch shape: (5075, 37852), item: 58500 - 65000


5075it [00:04, 1208.70it/s]


162 matches found
results dict length: 213320

exam 31/38
exam type: ARSB
number of documents: (125752, 4)
# of batches: 20 | batch size: 6500
batch 1/20
batch shape: (6500, 88365), item: 0 - 6500


6500it [00:39, 164.69it/s]


1425 matches found
results dict length: 214699

batch 2/20
batch shape: (6500, 88365), item: 6500 - 13000


6500it [00:40, 159.13it/s]


1422 matches found
results dict length: 216029

batch 3/20
batch shape: (6500, 88365), item: 13000 - 19500


6500it [00:34, 186.72it/s]


1400 matches found
results dict length: 217328

batch 4/20
batch shape: (6500, 88365), item: 19500 - 26000


6500it [00:39, 164.46it/s]


1437 matches found
results dict length: 218672

batch 5/20
batch shape: (6500, 88365), item: 26000 - 32500


6500it [00:40, 160.64it/s]


1464 matches found
results dict length: 220046

batch 6/20
batch shape: (6500, 88365), item: 32500 - 39000


6500it [00:38, 168.99it/s]


1445 matches found
results dict length: 221390

batch 7/20
batch shape: (6500, 88365), item: 39000 - 45500


6500it [00:40, 161.32it/s]


1437 matches found
results dict length: 222713

batch 8/20
batch shape: (6500, 88365), item: 45500 - 52000


6500it [00:37, 172.66it/s]


1436 matches found
results dict length: 224045

batch 9/20
batch shape: (6500, 88365), item: 52000 - 58500


6500it [00:37, 175.56it/s]


1363 matches found
results dict length: 225309

batch 10/20
batch shape: (6500, 88365), item: 58500 - 65000


6500it [00:39, 163.84it/s]


1425 matches found
results dict length: 226619

batch 11/20
batch shape: (6500, 88365), item: 65000 - 71500


6500it [00:39, 165.02it/s]


1434 matches found
results dict length: 227954

batch 12/20
batch shape: (6500, 88365), item: 71500 - 78000


6500it [00:34, 191.02it/s]


1345 matches found
results dict length: 229191

batch 13/20
batch shape: (6500, 88365), item: 78000 - 84500


6500it [00:36, 176.61it/s]


1348 matches found
results dict length: 230430

batch 14/20
batch shape: (6500, 88365), item: 84500 - 91000


6500it [00:39, 165.44it/s]


1336 matches found
results dict length: 231641

batch 15/20
batch shape: (6500, 88365), item: 91000 - 97500


6500it [00:35, 182.21it/s]


1331 matches found
results dict length: 232853

batch 16/20
batch shape: (6500, 88365), item: 97500 - 104000


6500it [00:32, 198.59it/s]


1320 matches found
results dict length: 234052

batch 17/20
batch shape: (6500, 88365), item: 104000 - 110500


6500it [00:37, 172.60it/s]


1321 matches found
results dict length: 235240

batch 18/20
batch shape: (6500, 88365), item: 110500 - 117000


6500it [00:36, 176.23it/s]


1432 matches found
results dict length: 236556

batch 19/20
batch shape: (6500, 88365), item: 117000 - 123500


6500it [00:38, 171.01it/s]


1373 matches found
results dict length: 237787

batch 20/20
batch shape: (2252, 88365), item: 123500 - 130000


2252it [00:10, 225.00it/s]


427 matches found
results dict length: 238163

exam 32/38
exam type: ARSBSNONIX
number of documents: (23746, 4)
# of batches: 4 | batch size: 6500
batch 1/4
batch shape: (6500, 30050), item: 0 - 6500


6500it [00:05, 1251.23it/s]


649 matches found
results dict length: 238799

batch 2/4
batch shape: (6500, 30050), item: 6500 - 13000


6500it [00:05, 1231.07it/s]


653 matches found
results dict length: 239425

batch 3/4
batch shape: (6500, 30050), item: 13000 - 19500


6500it [00:05, 1253.93it/s]


652 matches found
results dict length: 240052

batch 4/4
batch shape: (4246, 30050), item: 19500 - 26000


4246it [00:03, 1275.55it/s]


414 matches found
results dict length: 240454

exam 33/38
exam type: ARSBSNOTXN
number of documents: (14646, 4)
# of batches: 3 | batch size: 6500
batch 1/3
batch shape: (6500, 13971), item: 0 - 6500


6500it [00:45, 141.76it/s]


2260 matches found
results dict length: 242515

batch 2/3
batch shape: (6500, 13971), item: 6500 - 13000


6500it [00:40, 160.84it/s]


2090 matches found
results dict length: 244375

batch 3/3
batch shape: (1646, 13971), item: 13000 - 19500


1646it [00:10, 155.73it/s]


548 matches found
results dict length: 244834

exam 34/38
exam type: ARSEFKVUVX
number of documents: (21106, 4)
# of batches: 4 | batch size: 6500
batch 1/4
batch shape: (6500, 22559), item: 0 - 6500


6500it [00:22, 287.33it/s]


1475 matches found
results dict length: 246260

batch 2/4
batch shape: (6500, 22559), item: 6500 - 13000


6500it [00:23, 281.36it/s]


1411 matches found
results dict length: 247605

batch 3/4
batch shape: (6500, 22559), item: 13000 - 19500


6500it [00:21, 298.78it/s]


1394 matches found
results dict length: 248924

batch 4/4
batch shape: (1606, 22559), item: 19500 - 26000


1606it [00:05, 311.95it/s]


340 matches found
results dict length: 249252

exam 35/38
exam type: ARSESNOUXX
number of documents: (14486, 4)
# of batches: 3 | batch size: 6500
batch 1/3
batch shape: (6500, 27013), item: 0 - 6500


6500it [00:23, 274.85it/s]


1184 matches found
results dict length: 250413

batch 2/3
batch shape: (6500, 27013), item: 6500 - 13000


6500it [00:24, 262.25it/s]


1226 matches found
results dict length: 251601

batch 3/3
batch shape: (1486, 27013), item: 13000 - 19500


1486it [00:05, 295.04it/s]


275 matches found
results dict length: 251868

exam 36/38
exam type: ARSXSNOWGW
number of documents: (11303, 4)
# of batches: 2 | batch size: 6500
batch 1/2
batch shape: (6500, 25819), item: 0 - 6500


6500it [00:05, 1211.19it/s]


349 matches found
results dict length: 252209

batch 2/2
batch shape: (4803, 25819), item: 6500 - 13000


4803it [00:03, 1224.56it/s]


271 matches found
results dict length: 252469

exam 37/38
exam type: ARXXTLE
number of documents: (26896, 4)
# of batches: 5 | batch size: 6500
batch 1/5
batch shape: (6500, 32446), item: 0 - 6500


6500it [00:06, 1079.04it/s]


743 matches found
results dict length: 253196

batch 2/5
batch shape: (6500, 32446), item: 6500 - 13000


6500it [00:06, 1059.75it/s]


773 matches found
results dict length: 253947

batch 3/5
batch shape: (6500, 32446), item: 13000 - 19500


6500it [00:06, 1081.35it/s]


799 matches found
results dict length: 254719

batch 4/5
batch shape: (6500, 32446), item: 19500 - 26000


6500it [00:06, 1058.05it/s]


734 matches found
results dict length: 255430

batch 5/5
batch shape: (896, 32446), item: 26000 - 32500


896it [00:00, 1152.66it/s]

119 matches found
results dict length: 255544






In [None]:
df, df_types = load_frame()

In [None]:
df_type

In [None]:
print(df.loc[110471]['text'])

In [None]:
print(df.loc[4421195]['text'])

In [None]:
print(df.loc[1309976]['text'])

In [11]:
import pickle
filename = 'distances_below_0.6'
outfile = open(filename, 'wb')



In [12]:
pickle.dump(test, outfile)
outfile.close()

In [13]:
infile = open(filename, 'rb')
new_dict = pickle.load(infile)
infile.close

<function BufferedReader.close>

In [None]:

for idx, (i, j) in enumerate(test.items()):
    # loop through every 10th item in j[0]
    if idx ==0 or idx//5 == 0:
        print(f'# of candidates: {len(j[0])}\n')
        original_text = df.loc[i]['text']
        print(f'original text:\n{original_text}')
        for i, k in enumerate(j[0][::50]):

            text = df.loc[k]['text']

            print(f'\nVergleichstext mit distance: {j[1][i]} \n{text}')


            #print(dataframe.loc[k]['text'])

            print('')

In [None]:
# deduplicate with saved results_dict

for i, j in test.items():
    items_to_delete = [i for i in j[0] if i not in deleted_items]
    print(i, items_to_delete)
    for ij in items_to_delete:
        deleted_items.append(ij)
        print(f'deleted item: {deleted_items}')
    if i not in deleted_items:
        print('True')
        df = df.drop([*items_to_delete])
        deleted_items.append(i)
