In [1]:
import mmh3
import sys
import pickle
import pandas as pd
import numpy as np
from dataloader import CitationDataset
from typing import List
from pandarallel import pandarallel

# Number of workers pandarallel uses for the parallel_apply calls in this notebook.
NB_WORKERS = 6

pandarallel.initialize(nb_workers=NB_WORKERS)


INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# The dataframe is cached as a pickle so the dataset does not have to be
# reloaded on every run. If the cache is missing (fresh checkout, deleted
# file) it is rebuilt from the dataset and written back, which keeps
# "Restart Kernel -> Run All" working.
# NOTE: only unpickle files you created yourself; pickle can execute arbitrary code.
DF_CACHE = "df.pkl"

try:
    df = pd.read_pickle(DF_CACHE)
except FileNotFoundError:
    df = CitationDataset().load_dataframe(subset=True)
    df.to_pickle(DF_CACHE)



In [3]:
# Sort the non-empty abstracts so unusual entries surface at either end of the ordering.
df.loc[df["abstract"].ne(""), "abstract"].sort_values()

130994              \...
154436      \n\n...
510299       \n\n \n  ...
911858       \n \n\n  !!\n\n "...
671062                     ...
                                ...                        
126545    ��������������������� ��������������� ������� ...
105295    ������������������������� ����������������� ��...
23932     �������������������������������� �������������...
537197    ����������������������������������� �� � ��� �...
598301    𝑘�������-Anonymous microaggregation emerges as...
Name: abstract, Length: 2548532, dtype: object

In [4]:
def shingle(text: str, shingle_size: int) -> List[str]:
    """Split ``text`` into its unique word-level shingles (word n-grams).

    The text is split on whitespace and every run of ``shingle_size``
    consecutive words is joined with a single space to form one shingle.

    Args:
        text: Input text.
        shingle_size: Number of words per shingle; must be at least 1.

    Returns:
        The distinct shingles in order of first occurrence. The list is
        empty when ``text`` has fewer than ``shingle_size`` words.

    Raises:
        ValueError: If ``shingle_size`` is smaller than 1.
    """
    if shingle_size < 1:
        raise ValueError(f"shingle_size must be >= 1, got {shingle_size}")

    words = text.split()
    windows = (
        " ".join(words[start:start + shingle_size])
        for start in range(len(words) - shingle_size + 1)
    )
    # dict.fromkeys de-duplicates while keeping first-occurrence order, so the
    # result is reproducible across runs (iteration order of a set of strings
    # varies with hash randomisation).
    return list(dict.fromkeys(windows))

def minhash(text_list, seed) -> int:
    """Return the smallest ``mmh3.hash`` value over the items of ``text_list``.

    Args:
        text_list: Iterable of strings to hash (the shingles of one text).
        seed: Seed passed to ``mmh3.hash``; each seed acts as a separate
            hash function.

    Returns:
        The minimum hash value across all items.

    Raises:
        ValueError: If ``text_list`` is empty (there is no minimum to take).
    """
    return min(mmh3.hash(item, seed) for item in text_list)

def clean_text(aString: str) -> str:
    """Normalise text to lowercase alphanumeric words separated by single spaces.

    Every whitespace-separated word is stripped of its non-alphanumeric
    characters and lowercased; words that end up empty (pure punctuation)
    are dropped.

    Args:
        aString: Raw text, possibly containing newlines and punctuation.

    Returns:
        The cleaned text; an empty string if no alphanumeric content remains.
    """
    # str.split() with no argument splits on any run of whitespace, newlines
    # included, so words on either side of a line break stay separate.
    # (Deleting "\n" before splitting would glue them together.)
    normalised = (
        "".join(ch for ch in word if ch.isalnum()).lower()
        for word in aString.split()
    )
    # Tokens that consisted only of punctuation are now "" and are skipped.
    return " ".join(token for token in normalised if token)

def get_signature(text: str, shingle_size: int = 3, sig_len: int = 5) -> List[int]:
    """Compute a MinHash signature for ``text``.

    The text is split into word shingles and, for each seed in
    ``range(sig_len)``, the minimum hash over all shingles is taken.

    Args:
        text: Input text (expected to be cleaned already).
        shingle_size: Number of words per shingle.
        sig_len: Number of seeds, i.e. the length of the signature.

    Returns:
        A list of ``sig_len`` integers. If the text yields no shingles
        (fewer than ``shingle_size`` words), ``pd.NA`` is returned instead.

    Raises:
        Exception: Any error raised while hashing is re-raised unchanged
            after the offending input has been written to stderr.
    """
    shingle_list = shingle(text, shingle_size)
    if not shingle_list:
        # Nothing to hash; mark the row as missing.
        return pd.NA
    try:
        return [minhash(shingle_list, seed) for seed in range(sig_len)]
    except Exception:
        # Report which input failed, then let the original exception
        # propagate. Calling sys.exit() here would raise SystemExit and take
        # down the kernel/worker instead of producing a normal traceback.
        print(f"get_signature failed for text: {text!r}", file=sys.stderr)
        print(f"shingles: {shingle_list!r}", file=sys.stderr)
        raise


# Smoke test: a normal sentence yields one min-hash per seed (default sig_len
# is 5), while a text with fewer words than the default shingle size (3)
# produces no shingles and maps to pd.NA.
test_string = "this is a test string to shingle and hash"

assert len(get_signature(test_string)) == 5
assert get_signature("too short") is pd.NA

print(get_signature(test_string))
    


[-1748950211, -1266821499, -2045041042, -2042116182, -1566184949]
[]


In [5]:
# Normalise every abstract with clean_text (lowercase, alphanumeric-only words)
# using the pandarallel workers. The raw text is overwritten in place, so the
# original abstracts are only recoverable by reloading df.
df["abstract"] = df["abstract"].parallel_apply(clean_text)



In [6]:
# Compute a MinHash signature for each cleaned abstract with get_signature's
# defaults (3-word shingles, 5 seeds). Abstracts with fewer than 3 words yield
# no shingles and get pd.NA instead of a signature.
df["signature"] = df["abstract"].parallel_apply(get_signature)

In [7]:
# Display the frame with the new "signature" column.
df

Unnamed: 0,abstract,authors,n_citation,references,title,venue,year,id,signature
0,adaboost algorithm based on haarlike features ...,"[Zheng Xu, Runbin Shi, Zhihao Sun, Yaqi Li, Yu...",0,"[0a11984c-ab6e-4b75-9291-e1b700c98d52, 1f4152a...",A Heterogeneous System for Real-Time Detection...,high performance computing and communications,2016,001eef4f-1d00-4ae6-8b4f-7e66344bbc6e,"[-2052198483, -2126535731, -2139769656, -21405..."
1,in this paper a kind of novel jigsaw ebg struc...,"[Yufei Liang, Yan Zhang, Tao Dong, Shan-wei Lu]",0,[],A novel conformal jigsaw EBG structure design,international conference on conceptual structures,2016,002e0b7e-d62f-4140-b015-1fe29a9acbaa,"[-2146879282, -2089811086, -2118171633, -21437..."
2,this paper studies the problem of using an aut...,"[Xiaodong Ai, Keyou You, Shiji Song]",0,"[1862a08a-08c6-4ab1-a214-8932bbd0d2d9, 7bcea2f...",A source-seeking strategy for an autonomous un...,"international conference on control, automatio...",2016,00352759-f0a7-4678-82ae-fed68c700da6,"[-2130273902, -2136533398, -2015697018, -21321..."
3,,"[Francine Berman, Vinton G. Cerf]",0,[],Social and ethical behavior in the internet of...,Communications of The ACM,2017,00f77fa9-ae49-4935-9166-2f5f9cdb3d6b,
4,,"[Leon A. Sakkal, Kyle Z. Rajkowski, Roger S. A...",50,"[4f4f200c-0764-4fef-9718-b8bccf303dba, aa699fb...",Prediction of consensus binding mode geometrie...,Journal of Computational Chemistry,2017,013ea675-bb58-42f8-a423-f5534546b2b1,
...,...,...,...,...,...,...,...,...,...
999995,,"[Julien Stephan, Mathieu Brau, Yoann Corre, Yv...",4,"[13706ee7-da12-440d-8b13-ab6106e77887, 16a2526...",On the Effect of Realistic Traffic Demand Rise...,vehicular technology conference,2014,4aa66242-5efc-464f-8b80-463691a50e2e,
999996,in the last few years workflow systems have be...,[Giacomo Piccinelli],10,"[02763555-4d84-49fa-b6aa-00171c832dc0, 4be4595...",Distributed workflow management: the TEAM model,cooperative information systems,1998,4aa672c2-c5e6-469d-b168-2056474cf47f,"[-2143227994, -2082158534, -2130015653, -21348..."
999997,there are many different designs for audio amp...,"[Stephen M. Cox, Bruce H. Candy]",19,[3059f971-f49e-4a49-a1e5-82ea4e8f6879],Class-D Audio Amplifiers with Negative Feedback,Siam Journal on Applied Mathematics,2005,4aa67ed9-6b28-4ed4-a180-13d3a0dc0a59,"[-2141235806, -2118793329, -2118655693, -20853..."
999998,this paper proposes a language acquisition fra...,"[Tao Gong, James W. Minett, William S-Y. Wang]",5,"[207c4e1c-52f4-4737-9cd9-1db86b33d580, 55c81e3...",A simulation study exploring the role of cultu...,Connection Science,2010,4aa692aa-2448-436c-a809-81b54e5f2f66,"[-2133497614, -2133538829, -2130962010, -21433..."
