In [2]:
from time import time
import math
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer as skTfidf
import cupy as cp           #use cupy array instead of numpy to speed up calculation by using GPU
import cudf as cf
from cuml.metrics.pairwise_distances import sparse_pairwise_distances
from cuml.feature_extraction.text import TfidfVectorizer as cuTfidf
from cuml.metrics.pairwise_distances import pairwise_distances


import matplotlib.pyplot as plt
import sys

In [3]:
path = '/home/test/Data/corpus.csv'
def load_frame(path_to_df=path, encoding='utf-16', filter_value = 100000):
    
    df= pd.read_csv(path_to_df, encoding=encoding, index_col='id')
    df.drop([df.columns[0]], inplace=True, axis=1)
    df.drop_duplicates(subset=['text'],inplace=True)
    
    exam_type_distribution = df.groupby(['exam_type'])['exam_type'].count()
    exam_type_distribution.sort_values(ascending=False)
    
    list_filtered = exam_type_distribution[exam_type_distribution > filter_value].index
    df_filtered = df[df['exam_type'].isin(list_filtered)]
    
    print(f'original dataframe shape: {df.shape}')
    print(f'df_filtered shape: {df_filtered.shape}')
    
    print(f'numbers of exam types before: {len(set(df["exam_type"]))}')
    print(f'types after filtering: {list_filtered}')
    print(f'number of exam types after filtering: {len(set(df_filtered["exam_type"]))}')

    return df_filtered, list_filtered

In [4]:
# define a load function to load each exam type
def load_exam_type(df, exam_types):
    for i in exam_types:
        dataframe = df[df['exam_type'] == i]    
        print(f'exam type: {i}, dataframe shape: {dataframe.shape}')
        yield dataframe

In [5]:
def batch_tfidf(sparseMatrix, size = 5000):
    for idx, item in enumerate(range(0, sparseMatrix.shape[0], size)):
        batch_sparseMatrix = sparseMatrix[item:item+size,:]
        print(f'batch shape: {batch_sparseMatrix.shape}, index: {item}')
        yield batch_sparseMatrix

In [6]:
# create a function to list all entries with the its similarity inside a threshhold
def get_distances(upperbound, batch_size=8000, filter=100000):
    # create a dictionary to store the results
    results_dict = {}
    
    # start with loading the dataframe by exam_types
    df_filtered = load_frame(filter_value = filter)
    df_by_type = load_exam_type(df_filtered[0], df_filtered[1])

    # loop over all exam types
    for i in range(0,len(df_filtered[1])):
        dataframe = next(df_by_type)
        df_indices = dataframe.index.to_list()
        # calculate the tfidf
        tfidf = cuTfidf().fit_transform(dataframe['text'])
        
        #use batches to calculate the distances
        batch = batch_tfidf(tfidf, size=batch_size)
        counter = 0
        #loop through all batches
        for i in range(0, math.ceil(tfidf.shape[0]/batch_size)):
            distances = []
            batch_sparse = next(batch)
            batch_dataframe = dataframe[counter:counter+batch_size]
            # print(f'batch dataframe shape: {batch_dataframe.shape}')
            counter += batch_size
            batch_indices = batch_dataframe.index.to_list()
            # calculate the euclidean distance
            for i, j in tqdm(enumerate(batch_sparse)):
                x = batch_sparse[i]
                distance = sparse_pairwise_distances(x, tfidf, metric='euclidean')
                distances.append(distance)
                del x
            print(f'list length: {len(distances)}')
            print(f'list type: type({type(distances)})')
            #distances_array = np.array(distances)
            #print(f'np array shape: {distances_array.shape}')
            results = sort_by_distance(distances, batch_dataframe, upperbound=upperbound, indices=df_indices)
        del tfidf
        break
                
    return results

In [7]:
def get_candidates(sorted_array, arg_sorted, upper):
    candidates = []
    for i, x in enumerate(sorted_array):
        if x > upper:
            break
        else:
            candidates.append(int(arg_sorted[i]))
    return candidates

In [8]:
# define a function to calculate the euclidean distance of a batch of documents 
# and return the neighbours based on the threshold
def sort_by_distance(distance_batch, batch_dataframe, upperbound, indices):

    results = {}
    distance_batch = distance_batch[0:50] #for sampling/testing purposes
    
    for i, row in enumerate(distance_batch):

        arg_sorted = np.argsort(row)
        sorted_array = row[arg_sorted]
        
        candidates = get_candidates(sorted_array, arg_sorted, upper=upperbound)
                
        df_candidates = [indices[int(i)] for i in candidates[1:]]
        original_index = indices[i]
        distances = sorted_array[1:len(df_candidates)+1]
        
        if df_candidates:
            results[original_index] = (df_candidates, distances)

    return results

In [9]:
test = get_distances(batch_size=80000, upperbound=0.1)

original dataframe shape: (2586631, 4)
df_filtered shape: (1149916, 4)
numbers of exam types before: 985
types after filtering: Index(['ARCK', 'ARRT', 'ARRTRBS', 'ARSB'], dtype='object', name='exam_type')
number of exam types after filtering: 4
exam type: ARCK
number of documents: (100888, 4)
batch shape: (80000, 66772), index: 0


MemoryError: std::bad_alloc: out_of_memory: CUDA error at: /home/test/miniconda3/envs/rapids/include/rmm/mr/device/cuda_memory_resource.hpp

In [None]:
def distance_deduplicate(distance_batch, batch_dataframe, upperbound, lowerbound, indices):
    for i, row in enumerate(distance_batch):
        arg_sorted = cp.argsort(row)
        sorted_array = row[arg_sorted]
        candidates = get_candidates(sorted_array, arg_sorted, upper=upperbound)
        df_candidates = [indices[int(i)] for i in candidates[1:]] #the ones to be deleted
        original_index = indices[i] #the one to keep
        
        
        

In [36]:
test = get_matches(upperbound=0.2, batch_size=12000, filter=100000)

original dataframe shape: (2586631, 4)
df_filtered shape: (1149916, 4)
numbers of exam types before: 985
types after filtering: Index(['ARCK', 'ARRT', 'ARRTRBS', 'ARSB'], dtype='object', name='exam_type')
number of exam types after filtering: 4
exam type: ARCK
number of documents: (100888, 4)
exam type: ARRT
number of documents: (474268, 4)
exam type: ARRTRBS
number of documents: (449008, 4)
exam type: ARSB
number of documents: (125752, 4)


In [37]:
test

{699394: ([709879], array([0.17644596], dtype=float32)),
 921246: ([944357, 975710, 1026933],
  array([0.        , 0.        , 0.12366536], dtype=float32)),
 2951520: ([2951641], array([0.18948333], dtype=float32)),
 2951641: ([2951520], array([0.18948396], dtype=float32)),
 3127460: ([3191707], array([0.12182397], dtype=float32)),
 1290517: ([1378183, 1585015, 1596270],
  array([0.18931544, 0.1893159 , 0.19431907], dtype=float32)),
 2135117: ([2135162, 2234454], array([0., 0.], dtype=float32)),
 2135162: ([2135162, 2234454], array([0., 0.], dtype=float32)),
 2135242: ([2147647], array([0.], dtype=float32)),
 2567179: ([2773351], array([0.], dtype=float32)),
 3431773: ([3546338], array([0.16418308], dtype=float32))}

In [28]:
df = load_frame()[0]

original dataframe shape: (2586631, 4)
df_filtered shape: (1149916, 4)
numbers of exam types before: 985
types after filtering: Index(['ARCK', 'ARRT', 'ARRTRBS', 'ARSB'], dtype='object', name='exam_type')
number of exam types after filtering: 4


In [34]:
print(df.loc[2135117]['text'])

Sonographie gesamtes Abdomen vom   Klinik Fragestellung Rechtfertigende Indikation  Schockraummanagement Verkehrsunfall FAST   Befund und Beurteilung   Sonographisch kein Nachweis einer Verletzung der parenchymatösen Abdominalorgane  Keine freie intraabdominelle Flüssigkeit abzugrenzen  Soweit einsehbar basal kein wesentlicher Pleura oder Perikarderguss  Harnblase gut gefüllt echofreies Binnenmuster  


In [35]:
df.loc[2234454]['text']

'Sonographie gesamtes Abdomen vom   Klinik Fragestellung Rechtfertigende Indikation  Schockraummanagement FAST Verkehrsunfall  Befund und Beurteilung   Sonographisch kein Nachweis einer Verletzung der parenchymatösen Abdominalorgane  Keine freie intraabdominelle Flüssigkeit abzugrenzen  Soweit einsehbar basal kein wesentlicher Pleura oder Perikarderguss  Harnblase gut gefüllt echofreies Binnenmuster   '

In [42]:
deleted_items = []

In [43]:
for i, j in test.items():
    items_to_delete = [i for i in j[0] if i not in deleted_items]
    print(i, items_to_delete)
    for ij in items_to_delete:
        deleted_items.append(ij)
        print(f'deleted item: {deleted_items}')
    if i not in deleted_items:
        print('True')
        df = df.drop([*items_to_delete])
        deleted_items.append(i)


699394 [709879]
deleted item: [709879]
True
921246 [944357, 975710, 1026933]
deleted item: [709879, 699394, 944357]
deleted item: [709879, 699394, 944357, 975710]
deleted item: [709879, 699394, 944357, 975710, 1026933]
True
2951520 [2951641]
deleted item: [709879, 699394, 944357, 975710, 1026933, 921246, 2951641]
True
2951641 []
3127460 [3191707]
deleted item: [709879, 699394, 944357, 975710, 1026933, 921246, 2951641, 2951520, 3191707]
True
1290517 [1378183, 1585015, 1596270]
deleted item: [709879, 699394, 944357, 975710, 1026933, 921246, 2951641, 2951520, 3191707, 3127460, 1378183]
deleted item: [709879, 699394, 944357, 975710, 1026933, 921246, 2951641, 2951520, 3191707, 3127460, 1378183, 1585015]
deleted item: [709879, 699394, 944357, 975710, 1026933, 921246, 2951641, 2951520, 3191707, 3127460, 1378183, 1585015, 1596270]
True
2135117 [2135162, 2234454]
deleted item: [709879, 699394, 944357, 975710, 1026933, 921246, 2951641, 2951520, 3191707, 3127460, 1378183, 1585015, 1596270, 129051