In [6]:
from time import time
import math
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer as skTfidf
import cupy as cp           #use cupy array instead of numpy to speed up calculation by using GPU
import cudf as cf
from cuml.metrics.pairwise_distances import sparse_pairwise_distances
from cuml.feature_extraction.text import TfidfVectorizer as cuTfidf
from cuml.metrics.pairwise_distances import pairwise_distances

import time
import matplotlib.pyplot as plt
import sys

In [7]:
path = '/home/test/Data/corpus.csv'
def load_frame(path_to_df=path, encoding='utf-16', filter_value = 100000):
    """Load the corpus CSV and keep only the exam types with enough documents.

    The CSV is read with its ``id`` column as index, the first remaining
    column is discarded, and rows whose ``text`` repeats an earlier row are
    removed. Exam types are then counted on the de-duplicated frame and only
    those with strictly more than ``filter_value`` rows are kept.

    Parameters
    ----------
    path_to_df : str
        Location of the CSV file. Defaults to the module-level ``path``.
    encoding : str
        Text encoding passed to ``pandas.read_csv``.
    filter_value : int
        An exam type is kept only if its row count is greater than this value.

    Returns
    -------
    tuple
        ``(df_filtered, list_filtered)``: the rows belonging to the kept exam
        types, and a pandas ``Index`` with the kept exam types in the order
        produced by ``groupby`` (sorted group keys).
    """
    df = pd.read_csv(path_to_df, encoding=encoding, index_col='id')
    # Reassign instead of mutating in place; the results are the same.
    df = df.drop(columns=[df.columns[0]])
    df = df.drop_duplicates(subset=['text'])

    exam_type_distribution = df.groupby(['exam_type'])['exam_type'].count()
    # The original called sort_values(ascending=False) here and threw the
    # result away, so it never had any effect. The dead call is removed
    # instead of being assigned so that the order of the returned exam types
    # stays exactly what callers have been getting.

    list_filtered = exam_type_distribution[exam_type_distribution > filter_value].index
    df_filtered = df[df['exam_type'].isin(list_filtered)]

    print(f'original dataframe shape: {df.shape}')
    print(f'df_filtered shape: {df_filtered.shape}')

    print(f'numbers of exam types before: {len(set(df["exam_type"]))}')
    print(f'types after filtering: {list_filtered}')
    print(f'number of exam types after filtering: {len(set(df_filtered["exam_type"]))}')

    return df_filtered, list_filtered

In [8]:
def load_exam_type(df, exam_types):
    """Lazily yield one sub-frame of ``df`` per entry of ``exam_types``.

    For each exam type, the type and the shape of the matching rows are
    printed before the rows (those whose ``exam_type`` column equals the
    type) are yielded.
    """
    for exam_type in exam_types:
        print(f'exam type: {exam_type}')
        matches_type = df['exam_type'] == exam_type
        subset = df[matches_type]
        print(f'number of documents: {subset.shape}')
        yield subset

In [9]:
def batch(sparseMatrix, size = 5000):
    """Yield consecutive row blocks of ``sparseMatrix``, ``size`` rows at a time.

    Before each block is yielded its shape and its starting row offset are
    printed. The last block holds the remaining rows and may be shorter.
    """
    total_rows = sparseMatrix.shape[0]
    for start in range(0, total_rows, size):
        stop = start + size
        chunk = sparseMatrix[start:stop, :]
        print(chunk.shape, start)
        yield chunk

In [14]:
# List, for each document, the other documents whose distance lies within a threshold.
def get_matches(upperbound, lowerbound, batch_size=12000, filter=100000, max_batches=1):
    """Collect near-neighbour documents per exam type from TF-IDF distances.

    The corpus is loaded with ``load_frame`` and walked exam type by exam
    type. Each exam type is cut into consecutive batches of ``batch_size``
    rows; every batch gets its own TF-IDF vectorizer and its own pairwise
    Euclidean distance matrix, so documents are only ever compared with
    documents of the same batch.

    Parameters
    ----------
    upperbound, lowerbound
        Distance thresholds, forwarded unchanged to ``euclidean_distance``.
    batch_size : int
        Number of documents per batch (the distance matrix is
        ``batch_size x batch_size``).
    filter : int
        Forwarded to ``load_frame`` as ``filter_value``. The name shadows the
        built-in ``filter`` but is kept because callers pass it by keyword.
    max_batches : int or None
        Upper limit on the total number of batches processed across all exam
        types. The default of 1 reproduces the previous behaviour, where two
        unconditional ``break`` statements stopped after the first batch of
        the first exam type. Pass ``None`` to process every batch.

    Returns
    -------
    dict
        Maps the index label of a document to the list of index labels of its
        neighbours, as produced by ``euclidean_distance``.

    Raises
    ------
    ValueError
        If ``max_batches`` is neither ``None`` nor at least 1.
    """
    if max_batches is not None and max_batches < 1:
        raise ValueError('max_batches must be a positive integer or None')

    # Accumulates the per-batch results.
    results_dict = {}

    # Load the filtered frame together with the exam types that survived.
    df_filtered, exam_types = load_frame(filter_value=filter)

    batches_done = 0
    for dataframe in load_exam_type(df_filtered, exam_types):
        for start in range(0, len(dataframe), batch_size):
            batch_dataframe = dataframe.iloc[start:start + batch_size]
            df_indices = batch_dataframe.index.to_list()

            # Fit a fresh TF-IDF vectorizer on this batch only.
            tfidf_batch = cuTfidf().fit_transform(batch_dataframe['text'])

            # Pairwise Euclidean distances between all documents of the batch.
            distances_batch = sparse_pairwise_distances(tfidf_batch, metric='euclidean')
            print(f'distances shape: {distances_batch.shape}')

            results = euclidean_distance(distances_batch, batch_dataframe, upperbound=upperbound, lowerbound=lowerbound, indices=df_indices)
            if results:
                results_dict.update(results)

            # Stop right after the batch that reaches the limit, so that no
            # further exam type is loaded (and printed) needlessly.
            batches_done += 1
            if max_batches is not None and batches_done >= max_batches:
                return results_dict

    return results_dict



In [17]:
# Turn the pairwise distance matrix of one batch into a neighbour lookup
# based on the distance threshold.
def euclidean_distance(distance_batch, batch_dataframe, upperbound, lowerbound, indices, sample_rows=50):
    """Return, per document of a batch, the documents within ``upperbound``.

    Row ``k`` of ``distance_batch`` is treated as the distances from the
    document with index label ``indices[k]`` to every document of the batch
    (column ``c`` belongs to ``indices[c]``).

    Parameters
    ----------
    distance_batch
        Pairwise distance matrix of the batch (indexed and sorted with CuPy).
    batch_dataframe
        Accepted for interface compatibility; not used by this function.
    upperbound
        A neighbour is reported when its distance is not greater than this.
    lowerbound
        Accepted for interface compatibility; currently not applied.
    indices : list
        DataFrame index labels of the batch, in row order.
    sample_rows : int or None
        Only the first ``sample_rows`` rows are evaluated. The default of 50
        is the sampling limit that used to be hard-coded for testing; pass
        ``None`` to evaluate every row.

    Returns
    -------
    dict
        ``{index label: [index labels of neighbours]}``; documents without
        any neighbour are omitted.
    """
    results = {}
    if sample_rows is not None:
        distance_batch = distance_batch[0:sample_rows]

    for row_pos, row in enumerate(distance_batch):
        # Author's note: np.argsort was found to be about as fast as
        # cp.argsort and possibly lighter on memory.
        arg_sorted = cp.argsort(row)
        sorted_array = row[arg_sorted]

        candidates = get_candidates(sorted_array, arg_sorted, upper=upperbound)

        # Exclude the document itself by position. The previous
        # `candidates[1:]` assumed the document is always the first entry
        # after sorting; with tied distances, or a self-distance that is not
        # exactly zero, that dropped a real neighbour and could list the
        # document as its own neighbour.
        df_candidates = [indices[int(col)] for col in candidates if int(col) != row_pos]

        if df_candidates:
            results[indices[row_pos]] = df_candidates

    return results

In [18]:
def get_candidates(sorted_array, arg_sorted, upper):
    """Return the leading entries of ``arg_sorted`` whose value stays within ``upper``.

    ``sorted_array`` is scanned from the front until the first value greater
    than ``upper``; the entries of ``arg_sorted`` at the positions seen before
    that point are returned as plain ``int``s.
    """
    cutoff = 0
    for value in sorted_array:
        if value > upper:
            break
        cutoff += 1
    return [int(arg_sorted[position]) for position in range(cutoff)]

In [19]:
# Trial run with a tight distance threshold; with the defaults of
# get_matches only the first batch of the first exam type is evaluated.
test = get_matches(
    upperbound=0.2,
    lowerbound=0,
    batch_size=12000,
    filter=10000,
)

original dataframe shape: (2586631, 4)
df_filtered shape: (1845401, 4)
numbers of exam types before: 985
types after filtering: Index(['ARCK', 'ARCW', 'ARKK', 'ARKW', 'ARRB1EBBEK', 'ARRBXEB', 'ARREXEBO9L',
       'ARREXEBO9R', 'ARREXEBOEL', 'ARREXEBOER', 'ARREXEBOGL', 'ARREXEBOGR',
       'ARREXEBOHL', 'ARREXEBOHR', 'ARREXEBOSL', 'ARREXEBOSR', 'ARREXEBUFL',
       'ARREXEBUFR', 'ARREXEBUGL', 'ARREXEBUGR', 'ARREXEBUHL', 'ARREXEBUHR',
       'ARREXEBUKL', 'ARREXEBUKR', 'ARRKOPG', 'ARRKXEBNNH', 'ARRT', 'ARRTRBS',
       'ARRWXEBBWS', 'ARRWXEBHWS', 'ARRWXEBLWS', 'ARSB', 'ARSBSNONIX',
       'ARSBSNOTXN', 'ARSEFKVUVX', 'ARSESNOUXX', 'ARSXSNOWGW', 'ARXXTLE'],
      dtype='object', name='exam_type')
number of exam types after filtering: 38
exam type: ARCK
number of documents: (100888, 4)
distances shape: (12000, 12000)


In [None]:
# Number of documents in `test` that have at least one neighbour.
print(len(test))

In [None]:
# Second trial run: looser distance threshold and a larger batch.
test2 = get_matches(
    upperbound=0.75,
    lowerbound=0,
    batch_size=30000,
    filter=10000,
)

In [None]:
# Number of documents in `test2` that have at least one neighbour.
print(len(test2))

In [None]:
# Reload the filtered corpus (first element of the returned tuple) so that
# matched documents can be looked up by their index label.
df = load_frame(filter_value=10000)[0]

In [None]:
# Text of document 130, fetched with a single row/column lookup.
df.loc[130, 'text']

In [None]:
# Text of document 476196, fetched with a single row/column lookup.
df.loc[476196, 'text']

In [None]:
# Display the complete neighbour dictionary of the second run.
test2

In [None]:
# Rebinds `df`, this time with load_frame's default filter_value (100000)
# instead of the 10000 used when `df` was first loaded.
# NOTE(review): confirm the stricter filter is intended for the lookups below.
df = load_frame()[0]

In [None]:
# Print the text of document 2135242 (single row/column lookup).
print(df.loc[2135242, 'text'])

In [None]:
# Text of document 2147647, fetched with a single row/column lookup.
df.loc[2147647, 'text']

In [None]:
# Running list of index labels recorded by the de-duplication loop further down.
deleted_items = []

In [None]:
# Dump every entry of `test`: the document id, then its neighbour list.
for doc_id, neighbour_ids in test.items():
    print(f'i[0] {doc_id}')
    print(f'i[1] {neighbour_ids}')
    

In [None]:
# Same dump without labels: key on one line, neighbour list on the next.
for entry in test.items():
    print(*entry, sep='\n')

In [None]:
# Greedy de-duplication pass over the neighbour lists in `test`.
# `deleted_items` collects every id this pass has recorded: ids taken from
# neighbour lists as well as the keys whose neighbours were dropped from `df`.
# NOTE(review): the cell mutates `df` and `deleted_items`, so running it a
# second time does not reproduce the first run.
for i, j in test.items():
    # Neighbours of `i` that have not been recorded yet. The comprehension
    # variable `i` is local to the comprehension and does not overwrite the
    # loop variable `i`.
    items_to_delete = [i for i in j if i not in deleted_items]
    print(i, items_to_delete)
    for item in items_to_delete:
        deleted_items.append(item)
        print(f'deleted item: {deleted_items}')
    # NOTE(review): the neighbours above are recorded even when this branch is
    # skipped, so they can end up in `deleted_items` while still being present
    # in `df` — confirm that this is intended.
    if i not in deleted_items:
        print('True')
        # DataFrame.drop raises KeyError if one of the labels is not in `df`.
        df = df.drop([*items_to_delete])
        deleted_items.append(i)