In [1]:
import os
# Enumerate GPUs in PCI bus order and expose only device "1" to this process.
# NOTE(review): presumably this has to run before any CUDA library is imported — confirm.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [2]:
from time import time
import math
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer as skTfidf
import cupy as cp           #use cupy array instead of numpy to speed up calculation by using GPU
import cudf as cf
from cuml.metrics.pairwise_distances import sparse_pairwise_distances
from cuml.feature_extraction.text import TfidfVectorizer as cuTfidf
from cuml.metrics.pairwise_distances import pairwise_distances


import matplotlib.pyplot as plt
import sys

In [3]:
# Default location of the corpus CSV, relative to the notebook.
path = './corpus.csv'


def load_frame(path_to_df=path, encoding='utf-16', filter_value = 100000):
    """Load the corpus and keep only the exam types with enough documents.

    The CSV is read with the ``id`` column as index, its first remaining
    column is dropped, and rows with a duplicated ``text`` are removed
    (first occurrence kept). Exam types whose document count is strictly
    greater than ``filter_value`` are retained. A short summary of the
    filtering is printed.

    Parameters
    ----------
    path_to_df : str
        Path of the CSV file to read.
    encoding : str
        Text encoding passed to ``pandas.read_csv``.
    filter_value : int
        Minimum (exclusive) number of documents an exam type needs in order
        to be kept.

    Returns
    -------
    (pandas.DataFrame, pandas.Index)
        The filtered frame and the retained exam types, in the order
        produced by the group-by (sorted by exam type label).
    """
    # Build the cleaned frame without in-place mutation so that the function
    # is a plain read -> transform pipeline.
    df = (
        pd.read_csv(path_to_df, encoding=encoding, index_col='id')
        .pipe(lambda frame: frame.drop(columns=frame.columns[0]))
        .drop_duplicates(subset=['text'])
    )

    # Number of documents per exam type. The original code also called
    # ``sort_values`` here but discarded the result; the call is removed so
    # the returned order stays exactly what callers have always received.
    exam_type_distribution = df.groupby(['exam_type'])['exam_type'].count()

    list_filtered = exam_type_distribution[exam_type_distribution > filter_value].index
    df_filtered = df[df['exam_type'].isin(list_filtered)]

    print(f'original dataframe shape: {df.shape}')
    print(f'df_filtered shape: {df_filtered.shape}')

    print(f'numbers of exam types before: {len(set(df["exam_type"]))}')
    print(f'types after filtering: {list_filtered}')
    print(f'number of exam types after filtering: {len(set(df_filtered["exam_type"]))}')

    return df_filtered, list_filtered

In [4]:
# define a load function to load each exam type
def load_exam_type(df, exam_types):
    """Lazily yield, for each entry of ``exam_types``, the rows of ``df`` with that exam type.

    The exam type and the shape of the selected sub-frame are printed just
    before the sub-frame is yielded.
    """
    for exam_type in exam_types:
        print(f'exam type: {exam_type}')
        is_current_type = df['exam_type'] == exam_type
        subset = df[is_current_type]
        print(f'number of documents: {subset.shape}')
        yield subset

In [5]:
def batch_tfidf(sparseMatrix, size = 5000):
    """Yield consecutive row slices of ``sparseMatrix`` holding at most ``size`` rows each.

    Every slice keeps all columns. The shape of the slice and its starting
    row offset are printed before the slice is yielded.
    """
    total_rows = sparseMatrix.shape[0]
    for start in range(0, total_rows, size):
        stop = start + size
        chunk = sparseMatrix[start:stop, :]
        print(f'batch shape: {chunk.shape}, item: {start}')
        yield chunk

In [6]:
def get_distances(upperbound, batch_size=8000, filter=100000):
    """Find, per exam type, the documents lying within ``upperbound`` of each other.

    For every exam type returned by ``load_frame`` a TF-IDF matrix is built
    from the ``text`` column. The matrix is processed in row batches: the
    Euclidean distances from the batch rows to all rows of the same exam type
    are computed and handed to ``sort_by_distance``, whose per-batch result is
    merged into one dictionary.

    Parameters
    ----------
    upperbound : float
        Largest distance at which a document still counts as a candidate.
    batch_size : int
        Number of TF-IDF rows per distance computation.
    filter : int
        Forwarded to ``load_frame`` as ``filter_value``. (The name shadows the
        builtin but is kept so existing keyword calls keep working.)

    Returns
    -------
    dict
        The merged results of ``sort_by_distance`` over all batches of all
        exam types.
    """
    df_filtered, exam_types = load_frame(filter_value=filter)
    results_dict = {}

    # Loop through each exam type. Earlier revisions returned from inside this
    # loop, so only the first exam type was ever processed.
    for dataframe in load_exam_type(df_filtered, exam_types):
        df_indices = dataframe.index.to_list()

        tfidf = cuTfidf().fit_transform(dataframe['text'])
        print(f'batches: {math.ceil(tfidf.shape[0]/batch_size)} | batch size: {batch_size}')

        # Loop through batches of the TF-IDF matrix row-wise.
        for batch_sparse in batch_tfidf(tfidf, size=batch_size):
            distance_batch = sparse_pairwise_distances(batch_sparse, tfidf, metric='euclidean')

            # A fresh dict per batch keeps this function independent of
            # whether sort_by_distance mutates its ``results`` argument.
            candidates_and_distances = sort_by_distance(distance_batch, df_indices, upperbound, {})
            if candidates_and_distances:
                results_dict.update(candidates_and_distances)

            # Drop the reference to the dense distance matrix before the next
            # batch is computed.
            del distance_batch

        del tfidf

    return results_dict

In [7]:

def sort_by_distance(distance_batch, df_indices, upperbound, results):
    """Collect, for each row of ``distance_batch``, the documents within ``upperbound``.

    Each row is sorted ascending. The closest entry is taken to be the query
    document itself and the remaining entries up to ``upperbound`` are its
    neighbours. Positions in the row are translated to dataframe index labels
    through ``df_indices`` for both the key and the neighbours.

    Parameters
    ----------
    distance_batch : 2-D array
        One row of distances per query document.
    df_indices : list
        Dataframe index label for every column position of ``distance_batch``.
    upperbound : float
        Largest distance that still counts as a candidate.
    results : dict
        Dictionary that receives the entries; it is updated in place.

    Returns
    -------
    dict
        ``results``, mapping the query document's index label to
        ``(neighbour labels, neighbour distances)``. Rows with no value
        within ``upperbound`` add nothing.
    """
    for row in distance_batch:
        # Sort once and reuse the permutation for the values.
        arg_sorted = np.argsort(row)
        sorted_array = row[arg_sorted]

        candidates = get_candidates(sorted_array, arg_sorted, upperbound)
        if not candidates:
            # Nothing within the bound (not even the row's own document);
            # indexing candidates[0] would raise here.
            continue

        # Position 0 is the closest document, i.e. the query itself.
        original_index = df_indices[candidates[0]]
        neighbours = candidates[1:]
        df_candidates = [df_indices[int(position)] for position in neighbours]

        # Only the neighbours are handed on, so the stored distances line up
        # one-to-one with df_candidates.
        results.update(save_results(sorted_array, neighbours, df_candidates, original_index))

    return results

# candidates_and_distances = sort_by_distance(distance_batch, dataframe_indices, batch_indices)

In [25]:
def get_candidates(sorted_array, arg_sorted, upper):
    """Return the leading entries of ``arg_sorted`` whose distance does not exceed ``upper``.

    ``sorted_array`` is scanned from the front; the scan stops at the first
    value greater than ``upper``. The entries of ``arg_sorted`` at the scanned
    positions are returned as plain ``int`` values.
    """
    within_bound = 0
    for distance in sorted_array:
        if distance > upper:
            break
        within_bound += 1
    return [int(arg_sorted[position]) for position in range(within_bound)]

In [26]:
def save_results(sorted_array, candidates, df_candidates, original_index):
    """Build the result entry for one query document.

    Parameters
    ----------
    sorted_array : 1-D array
        Distances of the query row in ascending order; position 0 belongs to
        the query document itself.
    candidates : list
        Candidate positions. Kept for interface compatibility; the number of
        stored distances is derived from ``df_candidates`` instead, so it is
        correct whether or not this list still contains the query document.
    df_candidates : list
        Dataframe index labels of the neighbours (query document excluded).
    original_index :
        Key under which the entry is stored.

    Returns
    -------
    dict
        ``{original_index: (df_candidates, distances)}`` with exactly one
        distance per neighbour.
    """
    # Skip position 0 (the query itself) and take one distance per neighbour.
    distances = sorted_array[1:len(df_candidates) + 1]
    return {original_index: (df_candidates, distances)}

    

In [None]:
# Run the whole pipeline: candidates up to a distance of 0.5, TF-IDF batches of 8000 rows,
# 100000 passed on as the exam-type document-count filter.
test = get_distances(upperbound=0.5, batch_size=8000, filter=100000)

original dataframe shape: (2586631, 4)
df_filtered shape: (1149916, 4)
numbers of exam types before: 985
types after filtering: Index(['ARCK', 'ARRT', 'ARRTRBS', 'ARSB'], dtype='object', name='exam_type')
number of exam types after filtering: 4
exam type: ARCK
number of documents: (100888, 4)
batches: 13 - batch size: 8000
(8000, 66772) 0


TypeError: unhashable type: 'cupy._core.core.ndarray'

In [10]:
# Load the filtered corpus, then pull the documents of the first exam type.
loaded = load_frame()
df, exam_types = loaded
df_by_type = load_exam_type(df, exam_types)
dataframe = next(df_by_type)

original dataframe shape: (2586631, 4)
df_filtered shape: (1149916, 4)
numbers of exam types before: 985
types after filtering: Index(['ARCK', 'ARRT', 'ARRTRBS', 'ARSB'], dtype='object', name='exam_type')
number of exam types after filtering: 4
exam type: ARCK
number of documents: (100888, 4)


In [33]:


# Calculate the TF-IDF matrix for the selected exam type and take its first row batch.
tfidf = cuTfidf().fit_transform(dataframe['text'])
batch_size = 8000
batch = batch_tfidf(tfidf, size = batch_size)
batch_sparse = next(batch)
print(f'batches: {math.ceil(tfidf.shape[0]/batch_size)} | batch size: {batch_size}')

dataframe_indices = dataframe.index.to_list()
print(f'tfidf shape: {tfidf.shape}')

# Distance matrix of the batch rows against all documents. The full matrix
# takes about 2 gigabytes of memory, so it is cut down to a small sample
# here and deleted at the end of the cell.
distance_batch = sparse_pairwise_distances(batch_sparse, tfidf, metric='euclidean')
distance_batch = distance_batch[0:50]
print(f'sparse pairse distance batch shape: {distance_batch.shape}')

results = {}      # query document label -> (neighbour labels, neighbour distances)
upperbound = 0.5  # largest distance that still counts as a match

for row in distance_batch:
    # Sort once and reuse the permutation for the values.
    arg_sorted = np.argsort(row)
    sorted_array = row[arg_sorted]

    candidates = get_candidates(sorted_array, arg_sorted, upperbound)
    if len(candidates) > 1:
        # Position 0 is the closest document, i.e. the query itself.
        original_index = dataframe_indices[candidates[0]]
        df_candidates = [dataframe_indices[position] for position in candidates[1:]]

        # One distance per neighbour: positions 1 .. len(candidates) - 1.
        # (Slicing up to len(candidates) + 1 also picked up the first
        # distance beyond the upper bound.)
        distances = sorted_array[1:len(candidates)]
        results[original_index] = (df_candidates, distances)

del distance_batch
del tfidf

(8000, 66772) 0
batches: 13 | batch size: 8000
tfidf shape: (100888, 66772)
sparse pairse distance batch shape: (50, 100888)


In [34]:
# Show examples from the results: for every stored document print its text,
# then every 50th neighbour together with that neighbour's own distance.
for original_index, (neighbour_ids, neighbour_distances) in results.items():

    print(f'# of candidates: {len(neighbour_ids)}\n')
    original_text = dataframe.loc[original_index]['text']
    print(f'original text:\n{original_text}')

    # Stride the labels and the distances identically so that each printed
    # distance belongs to the neighbour shown next to it.
    for neighbour_id, distance in zip(neighbour_ids[::50], neighbour_distances[::50]):
        text = dataframe.loc[neighbour_id]['text']
        print(f'\nVergleichstext mit distance: {distance} \n{text}')

    print('')

# of candidates: 281

original text:
CT Kopf vom    Klinik Fragestellung Rechtfertigende Indikation  Kopfschmerzen   Methodik  Digitale Übersichtsradiographien Parallel zur OrbitoMeatalLinie gewinkelte native ZeilenCT des Kopfes in  mm Schichtdicke Bildschirmbefundung  Befund Es liegen keine Voraufnahmen zum Vergleich vor  Kein Nachweis einer frischen intrakraniellen Blutung Kein Nachweis eines demarkierten frischen zerebralen Infarkts Mittelständiger Interhemisphärenspalt Normal weites symmetrisches Ventrikelsystem ohne Anhalt für einen Liquoraufstau Basale Zisternen und  Ventrikel frei einsehbar Keine Hirndruckzeichen Kein Nachweis einer Raumforderung soweit nativ beurteilbar Keine pathologischen Veränderungen der Schädelkalotte und Schädelbasis Regelrechte Anlage und freie Pneumatisation der mit abgebildeten NNH und der Mastoidzellen  Beurteilung  Kein Nachweis einer ICB     

Vergleichstext mit distance: 0.21215848624706268 
CT Kopf vom    Klinik Fragestellung Rechtfertigende Indik

In [12]:
# show every entry of the results dictionary except the first one:
# first the values (neighbour labels, distances), then the matching keys
print(list(results.values())[1:])
print(list(results)[1:])

[([3961627], array([0.4273842], dtype=float32)), ([3345037], array([0.45848805], dtype=float32))]
[954, 1471]


In [9]:
# Experiment 1: computing the distances by hand, one batch row at a time,
# is not efficient compared with a single call on the whole batch.
distances = []
for i, j in tqdm(enumerate(batch_sparse)):
    x = batch_sparse[i]
    # distances from this single row to every document; [0] unwraps the one-row result
    distance = sparse_pairwise_distances(x, tfidf, metric='euclidean')[0]
    distances.append(distance)
    
    del x
    del distance
print(len(distances),len(distances[0]))


# Experiment 2: a single-row call gives slightly different numbers than the
# same row taken from the full pairwise call, but the difference is negligible
# (recorded output: 1675 of 8000 values differ, none by more than 0.01).

first=sparse_pairwise_distances(batch_sparse[0], batch_sparse, metric='euclidean')
print(first.shape)
full = sparse_pairwise_distances(batch_sparse, metric='euclidean')
# number of positions where first[0] and full[0,:] are not exactly equal
print(sum(first[0] != full[0,:]))
# number of positions where they differ by more than 0.01
print(sum(abs(first[0] - full[0,:]) > 0.01))
print(len(first[0]))

# for i in range(len(first[0])):
#     if abs(first[0][i] - full[0,i]) > 0.1:
#         print(i, first[0][i], full[0,i])

(1, 8000)
1675
0
8000


In [94]:
# Micro-benchmark: ways of adding one row to a small 2-D collection.
# Recorded timings show
# -> python list append being the quickest and numpy vstack being the slowest
# -> numpy concatenate being the quickest of the row-adding numpy variants,
#    so use it if numpy is required
py_array = [[0, 1, 2], [0, 2, 0]]
py_row = [4, 5, 6]
numpy_array = np.array(py_array)
numpy_row = np.array([4,5,6])

# building an array from the nested list (baseline, no row is added)
%timeit np.array(py_array)
%timeit np.concatenate([numpy_array, numpy_row.reshape(1, -1)], axis=0)
%timeit np.vstack([numpy_array, numpy_row]) 
%timeit np.append(numpy_array, numpy_row.reshape(1, -1), axis=0)
# NOTE(review): list_array is never used below, and the timed append grows
# py_array on every timing iteration.
list_array = []
%timeit py_array.append(py_row)

936 ns ± 14.4 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
1.73 µs ± 3.27 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
2.74 µs ± 5.12 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
2.26 µs ± 12.6 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
42.3 ns ± 0.14 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)


In [6]:
tfidf = cuTfidf().fit_transform(dataframe['text'])
# Author's observation: deleting the variable with `del tfidf` alone does not
# free up the memory space
# -> call gc.collect() afterwards to free it