In [1]:
import mmh3
from tqdm import tqdm
import pandas as pd
import pyarrow as pa
import numpy as np
from dataloader import CitationDataset
from utils import get_referenced_by, filter_df
from pandarallel import pandarallel

pandarallel.initialize(nb_workers=6, progress_bar=True)


INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
dataset = CitationDataset()
df = dataset.load_dataframe(subset=True)
# df.to_pickle("df.pkl")

# df = pd.read_pickle("df.pkl")
print(len(df))
df = get_referenced_by(df)
print(len(df))
df = filter_df(df, verbose=True)
print(len(df))


loading dataframe from cache /mnt/c/Users/uroko/OneDrive/DTU/tools_for_data_science/02807_project/DATA/dblp-ref
loading /mnt/c/Users/uroko/OneDrive/DTU/tools_for_data_science/02807_project/DATA/dblp-ref/dblp-ref-3.json
79007


Reversing references: 100%|██████████| 79007/79007 [00:06<00:00, 12604.48it/s]


79007
Filtering dataframe...
Initial shape: (79007, 11)
After removing no abstract: (44970, 11)
After removing no title: (44970, 11)
After removing no references or citations: (33764, 11)
33764


In [3]:
def clean_text(aString):
    output = aString.replace('\n','')
    output_list = output.split()
    output_list = [''.join(ch for ch in aWord if ch.isalnum()) for aWord in output_list]
    output_list = [s.lower() for s in output_list]
    output = ' '.join(output_list)
    return " ".join(output.split())


def get_signature(text: str, shingle_size = 3, sig_len = 5):
    import sys
    import mmh3
    import numpy as np  
    def shingle(text: str, shingle_size):
        text_list = text.split()
        return list(set(" ".join(text_list[i:i+shingle_size]) for i in range(len(text_list)-shingle_size+1)))
    
    def minhash(text_list, seed) -> int:
        hash_list = [mmh3.hash(shingle, seed) for shingle in text_list] 
        return min(hash_list)
    
    shingle_list = shingle(text, shingle_size)
    if len(shingle_list) == 0:
        return np.nan
    try:
        signature = [minhash(shingle_list, seed) for seed in range(sig_len)]
    except  Exception as e:
        print(text)
        print(shingle_list)
        sys.exit(e)
    return signature


test_string = "this is a test string to shingle and hash"
print(get_signature(test_string))

[-1748950211, -1266821499, -2045041042, -2042116182, -1566184949]


In [4]:
df["abstract"] = (
    df["abstract"].parallel_apply(clean_text).convert_dtypes(dtype_backend="pyarrow")
)
df.dtypes

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=13168), Label(value='0 / 13168')))…

abstract      string[pyarrow]
authors                object
n_citation     int64[pyarrow]
references             object
title         string[pyarrow]
venue         string[pyarrow]
year           int64[pyarrow]
id            string[pyarrow]
dtype: object

In [5]:
k=250
df["signature"] = df["abstract"].parallel_apply(get_signature, sig_len=k)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=13168), Label(value='0 / 13168')))…

In [6]:
b,r = 50, 5
assert k == b*r

def jaccard(name1, name2, signatures_dict):
    """
    Input:
        - name1 (str): key of the first document S
        - name2 (str): key of the second document T
        - signatures_dict (dict of str:list): dictionary of signatures
    Return: Jaccard similarity between S and T
    """
    signatures_doc1 = np.array(signatures_dict[name1])
    signatures_doc2 = np.array(signatures_dict[name2])
    return len(np.intersect1d(signatures_doc1, signatures_doc2))/len(np.union1d(signatures_doc1, signatures_doc2))


def lsh(signatures_dict, jaccard_threshold=0.6, seed=42):
    lsh_dict = {}
    for key, values in tqdm(signatures_dict.items()):
        blocks = np.split(np.array(values), b)
        blocks_hash_values = []
        for aBlock in blocks:
            blocks_hash_values.append(mmh3.hash(aBlock, seed))
        lsh_dict[key] = blocks_hash_values
    list_keys = list(lsh_dict.keys())
    similar_items = {}
    for i in tqdm(range(len(list_keys)-1)):
        for j in range(i+1, len(list_keys)):
            common_values = np.intersect1d(lsh_dict[list_keys[i]], lsh_dict[list_keys[j]])
            if len(common_values) > 0:
                # we found a candidate
                similarity_score = jaccard(list_keys[i], list_keys[j], signatures_dict)
                if similarity_score >= jaccard_threshold:
                    print("Found one!")
                    similar_items[(list_keys[i], list_keys[j])] = similarity_score
    return similar_items

In [12]:
def jaccard2(signature1, signature2):
    import numpy as np

    if signature1 == np.nan or signature2 == np.nan:
        return 0

    signatures_doc1 = np.array(signature1)
    signatures_doc2 = np.array(signature2)
    return len(np.intersect1d(signatures_doc1, signatures_doc2)) / len(
        np.union1d(signatures_doc1, signatures_doc2)
    )


def get_most_similar(df, promt, n_top=10):
    clean_promt = clean_text(promt)
    promt_sig = get_signature(clean_promt)

    df["sim"] = df["signature"].apply(jaccard2, signature2=promt_sig)

    return df[df["sim"] > 0].sort_values("sim", ascending=False).head(n_top)


test_idx = 500
prompt = df.iloc[test_idx]["abstract"]
# for speedup: https://stackoverflow.com/questions/73845259/efficient-cosine-similarity-between-dataframe-rows

print(prompt)

df_top = get_most_similar(df.drop([test_idx], axis=0), prompt, n_top=10)
df_top

the effectiveness of a statistical machine translation system smt is very dependent upon the amount of parallel corpus used in the training phase for lowresource language pairs there are not enough parallel corpora to build an accurate smt in this paper a novel approach is presented to extract bilingual persianitalian parallel sentences from a nonparallel comparable corpus in this study english is used as the pivot language to compute the matching scores between source and target sentences and candidate selection phase additionally a new monolingual sentence similarity metric normalized google distance ngd is proposed to improve the matching process moreover some extensions of the baseline system are applied to improve the quality of extracted sentences measured with bleu experimental results show that using the new pivot based extraction can increase the quality of bilingual corpus significantly and consequently improves the performance of the persianitalian smt system


Unnamed: 0,abstract,authors,n_citation,references,title,venue,year,id,signature,sim
34231,emotional preference of people from different ...,"[Atena Bajoulvand, Ramtin Zargari Marandi, Moh...",0,"[00ead8a6-882e-4ced-81b1-4e17fb95b098, 1e06b01...",Analysis of folk music preference of people fr...,Applied Mathematics and Computation,2017,3c016f11-5962-47fd-9e89-886a84a30b81,"[-2140890565, -2144375480, -2123211068, -21289...",0.007905
13277,orientations and positions of a permanent magn...,"[Houde Dai, Wanan Yang, Xuke Xia, Shijian Su, ...",0,"[1e9afe42-b3d5-4da9-a8e6-c46b0e949b67, b8943c8...",A three-axis magnetic sensor array system for ...,,2016,45a7f1f0-0f69-49c3-9b76-7a9acb8a5829,"[-2108729406, -2144375480, -2134336092, -21289...",0.007905
52116,the popularity and reach of short text message...,"[Renato Moraes Silva, Tulio C. Alberto, Tiago ...",0,"[0661f7c2-6853-4e02-8e24-77a37ca7b8ab, 10bd5d8...",Towards filtering undesired short text message...,Expert Systems With Applications,2017,3505728b-8c4a-4f53-9a7e-2e639745d3c8,"[-2140950911, -2144375480, -2140018415, -21289...",0.007905
35045,in case of outliers it is inevitable that the ...,"[Ozge Cagcag Yolcu, Hak-Keung Lam]",0,"[0575b654-767b-43e1-9d24-a096b3e5293b, 05caaba...",A combined robust fuzzy time series method for...,Neurocomputing,2017,0be8ced2-e549-4503-b212-e592f2475ed5,"[-2123145952, -2144375480, -2082269033, -21289...",0.007905
73470,network function virtualization nfv introduces...,"[Peilong Li, Xiaoban Wu, Yongyi Ran, Yan Luo]",0,"[0ca36c27-8b86-42d5-b378-d780ea0ba5a5, 32ddb4a...",Designing Virtual Network Functions for 100 Gb...,architectures for networking and communication...,2017,86d37a89-e352-48a3-9b13-c9663330f125,"[-2144845599, -2144375480, -2115858889, -21289...",0.007905
7051,the volume of digital documents increases rapi...,"[Xianghua Fu, Zhaofeng Ma, Boqin Feng]",0,"[1a7be5f4-5d85-4d5e-9ab2-67649a25eb05, 20a5606...",Kernel-Based Semantic Text Categorization for ...,grid and cooperative computing,2004,0c985c55-5d49-4e58-818d-a11d5d1ff142,"[-2128616418, -2144375480, -2083914156, -21289...",0.007905
29670,magnetic resonance imaging mri is widely used ...,"[Mohsen Ghafoorian, Alireza Mehrtash, Tina Kap...",0,"[0dd5b8c6-c278-4a53-a9de-2dca2282c0a5, 21a6dd8...",Transfer Learning for Domain Adaptation in MRI...,arXiv: Computer Vision and Pattern Recognition,2017,ab39ca5a-622b-443f-b150-269fa8e0d581,"[-2119253054, -2144375480, -2127027482, -21289...",0.007905
74186,as the application of smart devices becomes mo...,"[Hyeong-Jun Kim, Jin-Soo Kim]",0,,A user-space storage I/O framework for NVMe SS...,IEEE Transactions on Consumer Electronics,2017,ab683aea-8791-4065-80a8-f1970325a736,"[-2119652966, -2144375480, -2117097154, -21289...",0.007905
23151,in this study the parameter identification bas...,[Chingiz Hajiyev],0,[5d52b7b8-a369-4b93-93ab-f19602d0c596],Reconfigurable fault-tolerant UAV flight contr...,,2016,22f16e41-ea79-4838-a569-37f5698a0450,"[-2114710673, -2144375480, -2086085094, -21289...",0.007905
34731,although many algorithms have been proposed no...,"[Rustu Akay, Alper Basturk, Adem Kalinli, Xin ...",0,"[0fa2abeb-1be2-4698-918a-86a8a0d5317d, 11685c9...",Parallel population-based algorithm portfolios...,Neurocomputing,2017,bca80292-b15b-4de0-b3e9-b6a11f439b79,"[-2134139170, -2144375480, -2124030516, -21289...",0.007905


In [8]:
signature_dict = df[df["signature"].notna()]["signature"].to_dict()
print(lsh(signature_dict))

100%|██████████| 44970/44970 [00:04<00:00, 9518.78it/s] 
  0%|          | 32/44969 [00:37<14:40:01,  1.18s/it]


KeyboardInterrupt: 