In [1]:
from time import time
import math
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer as skTfidf
import cupy as cp           #use cupy array instead of numpy to speed up calculation by using GPU
import cudf as cf
from cuml.metrics.pairwise_distances import sparse_pairwise_distances
from cuml.feature_extraction.text import TfidfVectorizer as cuTfidf
from cuml.metrics.pairwise_distances import pairwise_distances


import matplotlib.pyplot as plt
import sys

In [2]:
path = './corpus_short_125_no_dup_with_type.csv'
def load_frame(path_to_df=path, encoding='utf-16', filter_value = 100000):
    
    df= pd.read_csv(path_to_df, encoding=encoding, index_col='id')
    df.drop([df.columns[0]], inplace=True, axis=1)
    df.drop_duplicates(subset=['text'],inplace=True)
    
    exam_type_distribution = df.groupby(['exam_type'])['exam_type'].count()
    exam_type_distribution.sort_values(ascending=False)
    
    list_filtered = exam_type_distribution[exam_type_distribution > filter_value].index
    df_filtered = df[df['exam_type'].isin(list_filtered)]
    
    print(f'original dataframe shape: {df.shape}')
    print(f'df_filtered shape: {df_filtered.shape}')
    
    print(f'numbers of exam types before: {len(set(df["exam_type"]))}')
    print(f'types after filtering: {list_filtered}')
    print(f'number of exam types after filtering: {len(set(df_filtered["exam_type"]))}')

    return df_filtered, list_filtered

In [3]:
# define a load function to load each exam type
def load_exam_type(df, exam_types):
    for i in exam_types:
        print(f'exam type: {i}')
        dataframe = df[df['exam_type'] == i]    
        print(f'number of documents: {dataframe.shape}')
        yield dataframe

In [61]:
# create a function to list all entries with the its similarity inside a threshhold
def get_matches(upperbound, lowerbound, batch_size=12000, filter=100000):
    # create a dictionary to store the results
    results_dict = {}

    # start with loading the dataframe
    df_filtered = load_frame(filter_value = filter)
    df_by_type = load_exam_type(df_filtered[0], df_filtered[1])

    # loop over all exam types
    for i in range(0,len(df_filtered[1])):
        dataframe = next(df_by_type)
        #print(f'exam type: {dataframe["exam_type"].iloc[0]}')
        for idx, batch in enumerate(range(0, len(dataframe), batch_size)):
            batch_info = f'batch number {idx}, rows {batch}:{batch + batch_size}'
            # print(batch_info)
            # transform the batchch+batch_size].index[i],
            # create a tfIdf vectorizer and fit and transform the documents
            batch_dataframe = dataframe.iloc[batch:batch + batch_size]
            df_indices = batch_dataframe.index.to_list()
            tfidf_batch = cuTfidf().fit_transform(batch_dataframe['text'])
            # print(f'tfidf shape:{tfidf_batch.shape}')

            # calculate the euclidean distance
            distances_batch = sparse_pairwise_distances(tfidf_batch, metric='euclidean')

            results = euclidean_distance(distances_batch, batch_dataframe, upperbound=upperbound, lowerbound=lowerbound, indices=df_indices)
            if results:
                results_dict.update(results)
            #break
        #break
    return results_dict



In [62]:
# define a function to calculate the euclidean distance of a batch of documents 
# and return the neighbours based on the threshold
def euclidean_distance(distance_batch, batch_dataframe, upperbound, lowerbound, indices):

    results = {}
    distance_batch = distance_batch[0:50] #for sampling/testing purposes
    
    for i, row in enumerate(distance_batch):

        arg_sorted = cp.argsort(row)
        sorted_array = row[arg_sorted]
        
        candidates = get_candidates(sorted_array, arg_sorted, upper=upperbound)
                
        df_candidates = [indices[int(i)] for i in candidates[1:]]
        original_index = indices[i]
        distances = sorted_array[1:len(df_candidates)+1]
        
        if df_candidates:
            results[original_index] = (df_candidates, distances)

    return results

In [63]:
def get_candidates(sorted_array, arg_sorted, upper):
    candidates = []
    for i, x in enumerate(sorted_array):
        if x > upper:
            break
        else:
            candidates.append(int(arg_sorted[i]))
    return candidates

In [64]:
test = get_matches(upperbound=0.2, lowerbound=0, batch_size=12000, filter=100000)

original dataframe shape: (2586631, 4)
df_filtered shape: (1149916, 4)
numbers of exam types before: 985
types after filtering: Index(['ARCK', 'ARRT', 'ARRTRBS', 'ARSB'], dtype='object', name='exam_type')
number of exam types after filtering: 4
exam type: ARCK
number of documents: (100888, 4)
exam type: ARRT
number of documents: (474268, 4)
exam type: ARRTRBS
number of documents: (449008, 4)
exam type: ARSB
number of documents: (125752, 4)


In [65]:
test

{699394: ([709879], array([0.17644596], dtype=float32)),
 921246: ([944357, 975710, 1026933],
  array([0.        , 0.        , 0.12366536], dtype=float32)),
 2951520: ([2951641], array([0.18948333], dtype=float32)),
 2951641: ([2951520], array([0.18948396], dtype=float32)),
 3127460: ([3191707], array([0.12182446], dtype=float32)),
 1290517: ([1378183, 1585015, 1596270],
  array([0.18931559, 0.18931559, 0.19431968], dtype=float32)),
 2135117: ([2135162, 2234454], array([0., 0.], dtype=float32)),
 2135162: ([2135162, 2234454], array([0., 0.], dtype=float32)),
 2135242: ([2147647], array([0.], dtype=float32)),
 2567179: ([2773351], array([0.], dtype=float32)),
 3431773: ([3546338], array([0.16418344], dtype=float32))}

In [78]:
df = load_frame()[0]

original dataframe shape: (2586631, 4)
df_filtered shape: (1149916, 4)
numbers of exam types before: 985
types after filtering: Index(['ARCK', 'ARRT', 'ARRTRBS', 'ARSB'], dtype='object', name='exam_type')
number of exam types after filtering: 4


In [83]:
print(df.loc[2135117]['text'])

Sonographie gesamtes Abdomen vom   Klinik Fragestellung Rechtfertigende Indikation  Schockraummanagement Verkehrsunfall FAST   Befund und Beurteilung   Sonographisch kein Nachweis einer Verletzung der parenchymatösen Abdominalorgane  Keine freie intraabdominelle Flüssigkeit abzugrenzen  Soweit einsehbar basal kein wesentlicher Pleura oder Perikarderguss  Harnblase gut gefüllt echofreies Binnenmuster  


In [82]:
df.loc[2135162]['text']

'Sonographie gesamtes Abdomen vom   Klinik Fragestellung Rechtfertigende Indikation  Verkehrsunfall Schockraummanagement FAST   Befund und Beurteilung   Sonographisch kein Nachweis einer Verletzung der parenchymatösen Abdominalorgane  Keine freie intraabdominelle Flüssigkeit abzugrenzen  Soweit einsehbar basal kein wesentlicher Pleura oder Perikarderguss  Harnblase gut gefüllt echofreies Binnenmuster '

In [42]:
deleted_items = []

In [72]:
for i in test.items():
    print(f'i[0] {i[0]}')
    print(f'i[1] {i[1]}')
    

i[0] 699394
i[1] ([709879], array([0.17644596], dtype=float32))
i[0] 921246
i[1] ([944357, 975710, 1026933], array([0.        , 0.        , 0.12366536], dtype=float32))
i[0] 2951520
i[1] ([2951641], array([0.18948333], dtype=float32))
i[0] 2951641
i[1] ([2951520], array([0.18948396], dtype=float32))
i[0] 3127460
i[1] ([3191707], array([0.12182446], dtype=float32))
i[0] 1290517
i[1] ([1378183, 1585015, 1596270], array([0.18931559, 0.18931559, 0.19431968], dtype=float32))
i[0] 2135117
i[1] ([2135162, 2234454], array([0., 0.], dtype=float32))
i[0] 2135162
i[1] ([2135162, 2234454], array([0., 0.], dtype=float32))
i[0] 2135242
i[1] ([2147647], array([0.], dtype=float32))
i[0] 2567179
i[1] ([2773351], array([0.], dtype=float32))
i[0] 3431773
i[1] ([3546338], array([0.16418344], dtype=float32))


In [43]:
for i, j in test.items():
    items_to_delete = [i for i in j[0] if i not in deleted_items]
    print(i, items_to_delete)
    for ij in items_to_delete:
        deleted_items.append(ij)
        print(f'deleted item: {deleted_items}')
    if i not in deleted_items:
        print('True')
        df = df.drop([*items_to_delete])
        deleted_items.append(i)


699394 [709879]
deleted item: [709879]
True
921246 [944357, 975710, 1026933]
deleted item: [709879, 699394, 944357]
deleted item: [709879, 699394, 944357, 975710]
deleted item: [709879, 699394, 944357, 975710, 1026933]
True
2951520 [2951641]
deleted item: [709879, 699394, 944357, 975710, 1026933, 921246, 2951641]
True
2951641 []
3127460 [3191707]
deleted item: [709879, 699394, 944357, 975710, 1026933, 921246, 2951641, 2951520, 3191707]
True
1290517 [1378183, 1585015, 1596270]
deleted item: [709879, 699394, 944357, 975710, 1026933, 921246, 2951641, 2951520, 3191707, 3127460, 1378183]
deleted item: [709879, 699394, 944357, 975710, 1026933, 921246, 2951641, 2951520, 3191707, 3127460, 1378183, 1585015]
deleted item: [709879, 699394, 944357, 975710, 1026933, 921246, 2951641, 2951520, 3191707, 3127460, 1378183, 1585015, 1596270]
True
2135117 [2135162, 2234454]
deleted item: [709879, 699394, 944357, 975710, 1026933, 921246, 2951641, 2951520, 3191707, 3127460, 1378183, 1585015, 1596270, 129051