In [3]:
import mmh3
import sys
import pickle
from tqdm import tqdm
import pandas as pd
import numpy as np
from dataloader import CitationDataset
from typing import List
from pandarallel import pandarallel

pandarallel.initialize(nb_workers=6, progress_bar=True)


INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [5]:
dataset = CitationDataset()
df = dataset.load_dataframe(subset=False)
# df.to_pickle("df.pkl")

# df = pd.read_pickle("df.pkl")
df

loading dataframe from cache c:\Users\uroko\OneDrive\DTU\tools_for_data_science\02807_project/DATA/dblp-ref
loading c:\Users\uroko\OneDrive\DTU\tools_for_data_science\02807_project/DATA/dblp-ref/dblp-ref-3.json
loading c:\Users\uroko\OneDrive\DTU\tools_for_data_science\02807_project/DATA/dblp-ref/dblp-ref-2.json
loading c:\Users\uroko\OneDrive\DTU\tools_for_data_science\02807_project/DATA/dblp-ref/dblp-ref-1.json
loading c:\Users\uroko\OneDrive\DTU\tools_for_data_science\02807_project/DATA/dblp-ref/dblp-ref-0.json


Unnamed: 0,abstract,authors,n_citation,references,title,venue,year,id
0,AdaBoost algorithm based on Haar-like features...,"[Zheng Xu, Runbin Shi, Zhihao Sun, Yaqi Li, Yu...",0,"[0a11984c-ab6e-4b75-9291-e1b700c98d52, 1f4152a...",A Heterogeneous System for Real-Time Detection...,high performance computing and communications,2016,001eef4f-1d00-4ae6-8b4f-7e66344bbc6e
1,"In this paper, a kind of novel jigsaw EBG stru...","[Yufei Liang, Yan Zhang, Tao Dong, Shan-wei Lu]",0,[],A novel conformal jigsaw EBG structure design,international conference on conceptual structures,2016,002e0b7e-d62f-4140-b015-1fe29a9acbaa
2,This paper studies the problem of using an aut...,"[Xiaodong Ai, Keyou You, Shiji Song]",0,"[1862a08a-08c6-4ab1-a214-8932bbd0d2d9, 7bcea2f...",A source-seeking strategy for an autonomous un...,"international conference on control, automatio...",2016,00352759-f0a7-4678-82ae-fed68c700da6
3,,"[Francine Berman, Vinton G. Cerf]",0,[],Social and ethical behavior in the internet of...,Communications of The ACM,2017,00f77fa9-ae49-4935-9166-2f5f9cdb3d6b
4,,"[Leon A. Sakkal, Kyle Z. Rajkowski, Roger S. A...",50,"[4f4f200c-0764-4fef-9718-b8bccf303dba, aa699fb...",Prediction of consensus binding mode geometrie...,Journal of Computational Chemistry,2017,013ea675-bb58-42f8-a423-f5534546b2b1
...,...,...,...,...,...,...,...,...
999995,,"[Julien Stephan, Mathieu Brau, Yoann Corre, Yv...",4,"[13706ee7-da12-440d-8b13-ab6106e77887, 16a2526...",On the Effect of Realistic Traffic Demand Rise...,vehicular technology conference,2014,4aa66242-5efc-464f-8b80-463691a50e2e
999996,"In the last few years, workflow systems have b...",[Giacomo Piccinelli],10,"[02763555-4d84-49fa-b6aa-00171c832dc0, 4be4595...",Distributed workflow management: the TEAM model,cooperative information systems,1998,4aa672c2-c5e6-469d-b168-2056474cf47f
999997,There are many different designs for audio amp...,"[Stephen M. Cox, Bruce H. Candy]",19,[3059f971-f49e-4a49-a1e5-82ea4e8f6879],Class-D Audio Amplifiers with Negative Feedback,Siam Journal on Applied Mathematics,2005,4aa67ed9-6b28-4ed4-a180-13d3a0dc0a59
999998,This paper proposes a language acquisition fra...,"[Tao Gong, James W. Minett, William S-Y. Wang]",5,"[207c4e1c-52f4-4737-9cd9-1db86b33d580, 55c81e3...",A simulation study exploring the role of cultu...,Connection Science,2010,4aa692aa-2448-436c-a809-81b54e5f2f66


In [7]:
def get_referenced_by(df: pd.DataFrame) -> List[int]:
    ref_df = df[["id", "references"]]
    reversed_refs = {}
    for _, row in tqdm(ref_df.iterrows(), total=ref_df.shape[0]):
        for ref in row["references"]:
            # Add the current ID to the list of IDs that reference 'ref'
            if ref in reversed_refs:
                reversed_refs[ref].append(row["id"])
            else:
                reversed_refs[ref] = [row["id"]]

    reversed_df = pd.DataFrame(
        list(reversed_refs.items()), columns=["id", "referenced_by"]
    )

    full_df = pd.merge(df, reversed_df, on="id", how="outer")
    full_df["n_counted_citations"] = full_df["referenced_by"].apply(lambda x: len(x) if isinstance(x, list) else 0)
    full_df["n_citations"] = full_df["references"].apply(lambda x: len(x) if isinstance(x, list) else 0)
    return full_df

full_df = get_referenced_by(df)

100%|██████████| 3079007/3079007 [03:37<00:00, 14157.25it/s]


In [39]:
def clean_text(aString):
    output = aString.replace('\n','')
    output_list = output.split()
    output_list = [''.join(ch for ch in aWord if ch.isalnum()) for aWord in output_list]
    output_list = [s.lower() for s in output_list]
    output = ' '.join(output_list)
    return " ".join(output.split())

def get_signature(text: str, shingle_size = 3, sig_len = 5):
    import sys
    import mmh3
    import numpy as np  
    def shingle(text: str, shingle_size):
        text_list = text.split()
        return list(set(" ".join(text_list[i:i+shingle_size]) for i in range(len(text_list)-shingle_size+1)))
    
    def minhash(text_list, seed) -> int:
        hash_list = [mmh3.hash(shingle, seed) for shingle in text_list] 
        return min(hash_list)
    
    shingle_list = shingle(text, shingle_size)
    if len(shingle_list) == 0:
        return np.nan
    try:
        signature = [minhash(shingle_list, seed) for seed in range(sig_len)]
    except  Exception as e:
        print(text)
        print(shingle_list)
        sys.exit(e)
    return signature


test_string = "this is a test string to shingle and hash"

print(get_signature(test_string))
    


[-1748950211, -1266821499, -2045041042, -2042116182, -1566184949]


In [5]:
df["abstract"] = df["abstract"].parallel_apply(clean_text).convert_dtypes(dtype_backend="pyarrow")

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=13168), Label(value='0 / 13168')))…

In [17]:
df.dtypes

abstract      string[pyarrow]
authors                object
n_citation     int64[pyarrow]
references             object
title         string[pyarrow]
venue         string[pyarrow]
year           int64[pyarrow]
id            string[pyarrow]
signature              object
dtype: object

In [18]:
k=250
df["signature"] = df["abstract"].parallel_apply(get_signature, sig_len=k)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=13168), Label(value='0 / 13168')))…

In [29]:
b,r = 50, 5
assert k == b*r

def jaccard(name1, name2, signatures_dict):
    """
    Input:
        - name1 (str): key of the first document S
        - name2 (str): key of the second document T
        - signatures_dict (dict of str:list): dictionary of signatures
    Return: Jaccard similarity between S and T
    """
    signatures_doc1 = np.array(signatures_dict[name1])
    signatures_doc2 = np.array(signatures_dict[name2])
    return len(np.intersect1d(signatures_doc1, signatures_doc2))/len(np.union1d(signatures_doc1, signatures_doc2))


def lsh(signatures_dict, jaccard_threshold=0.6, seed=42):
    lsh_dict = {}
    for key, values in tqdm(signatures_dict.items()):
        blocks = np.split(np.array(values), b)
        blocks_hash_values = []
        for aBlock in blocks:
            blocks_hash_values.append(mmh3.hash(aBlock, seed))
        lsh_dict[key] = blocks_hash_values
    list_keys = list(lsh_dict.keys())
    similar_items = {}
    for i in tqdm(range(len(list_keys)-1)):
        for j in range(i+1, len(list_keys)):
            common_values = np.intersect1d(lsh_dict[list_keys[i]], lsh_dict[list_keys[j]])
            if len(common_values) > 0:
                # we found a candidate
                similarity_score = jaccard(list_keys[i], list_keys[j], signatures_dict)
                if similarity_score >= jaccard_threshold:
                    print("Found one!")
                    similar_items[(list_keys[i], list_keys[j])] = similarity_score
    return similar_items

In [None]:
def jaccard2(signature1, signature2):
    import numpy as np

    if signature1 == np.nan or signature2 == np.nan:
        return 0

    signatures_doc1 = np.array(signature1)

    signatures_doc2 = np.array(signature2)

    return len(np.intersect1d(signatures_doc1, signatures_doc2)) / len(
        np.union1d(signatures_doc1, signatures_doc2)
    )



def get_most_similar(df, promt, n_top=10):

    df["sim"] = df["signature"].apply(jaccard2, signature2=promt)

    return df.sort_values("sim", ascending=False).head(n_top)



test_idx = 435


# for speedup: https://stackoverflow.com/questions/73845259/efficient-cosine-similarity-between-dataframe-rows


print(df.iloc[test_idx]["signature"])

df_top = get_most_similar(df.drop([test_idx], axis=0), df.iloc[test_idx]["signature"])
df_top

[-2080982354, -2139052548, -2142453737, -2126901166, -2121145338, -2125979400, -2085546357, -2143938237, -2118369876, -2106967118, -2129740934, -2130938472, -2129489982, -2127677696, -2143489443, -2096647300, -2138085774, -2117069289, -2115938634, -2090919881, -2142182361, -2090436787, -2087580467, -2137948731, -2130357036, -2127067475, -2131816058, -2085799310, -2102385111, -2110580215, -2125114671, -2069723196, -2123237093, -2121718327, -2097652380, -2142004928, -2111854006, -2145889260, -2105271526, -2126041949, -2035293353, -2096565519, -2124213843, -2071232887, -2127211451, -2140938010, -2147045929, -2138552024, -2137273062, -2121626227, -2135881518, -2142331513, -2112831466, -2124423480, -2080292192, -2139882481, -2145160581, -2055321131, -2133336757, -2118343542, -2145609788, -2146640111, -2083994368, -2131597894, -2092227483, -2121384122, -2128639547, -2132996197, -2124443844, -2130875415, -2142085201, -2084139907, -2089372674, -2141247598, -2115363500, -2141568146, -2104536249

Unnamed: 0,abstract,authors,n_citation,references,title,venue,year,id,signature,sim
7914,with the development of informationcommunicati...,"[Duckki Kim, Youngsong Mun]",50,,Design and Performance Analysis of Multimedia ...,Lecture Notes in Computer Science,2006,e7d70cd3-fe4e-4128-94d6-3da861c2d3bd,"[-2123028355, -2144375480, -2104450162, -21289...",0.008065
62087,in this paper we employed two machine learning...,"[H. Gunes Kayacik, Nur Zincir-Heywood]",0,,Analysis of three intrusion detection system b...,Lecture Notes in Computer Science,2005,3f737266-99ee-4831-97a5-8722d16ec3b0,"[-2121457342, -2129631814, -2113994092, -21416...",0.008065
56763,wiapa wireless networks for industrial automat...,"[Heng Wang, Lun Shao, Shuyang Xia, Ping Wang, ...",0,"[1c1ff489-cec2-4ea7-a86e-b22d26a13c74, 27b3845...",An Efficient Channel Utilization Scheme for WI...,the internet of things,2016,5b67c9de-5b74-4fa0-aea7-426d89077aa4,"[-2098518247, -2143886285, -2067930841, -21468...",0.008065
68035,the papers in this special section focus on di...,"[Zhichao Zhang, W. Peng Tay, M. Draief, Xiaofe...",0,,Introduction to the Issue on Distributed Infor...,,2017,47fdf51c-75b5-4f3e-9e90-943396fd0d6e,"[-2019817466, -2091183034, -1937528908, -14273...",0.008065
15172,in this paper one of the novel issues of the w...,"[B. Lahijanian, M. H. Fazel Zarandi, F. Vasheg...",0,"[287b3a37-f241-4705-8675-ddd05aeaa88d, 425a9e4...",Double coverage ambulance location modeling us...,,2016,2bb3b1d2-17e3-4ca2-84ee-d55ad067b787,"[-2077137423, -2144375480, -2141689826, -21445...",0.008065
40302,this paper presents a novel framework that ena...,"[Liang Deng, Peng Liu, Jun Xu, Ping Chen, Qing...",0,"[0643a8bc-1332-42f9-8d84-393ee3987dd8, 1507328...",Dancing with Wolves: Towards Practical Event-d...,virtual execution environments,2017,4e716285-fc38-4afa-87cd-04a9c83886b0,"[-2135044236, -2103622593, -2089902885, -21358...",0.008065
2451,we characterize the expressive power of ex ef ...,"[Mikołaj Bojańczyk, Igor Walukiewicz]",0,,Characterizing EF and EX tree logics,Lecture Notes in Computer Science,2004,73a29ba1-b56a-4c08-8a71-527ffea57b1a,"[-2062519556, -2127432301, -1890907650, -20370...",0.008065
47706,testing is one of the most important parts of ...,"[Han Huang, Fangqing Liu, Xiaoyan Zhuo, Zhifen...",0,"[1428f757-51e1-4de7-bb58-91917c0390b5, 16d3b55...",Differential Evolution Based on Self-Adaptive ...,IEEE Computational Intelligence Magazine,2017,ad5ce51f-aafa-4bb9-8b97-c496344259f1,"[-2136799429, -2146362144, -2115171093, -21213...",0.008065
10587,one of the fundamental tasks for spatial index...,"[Jine Tang, Bo Zhang, You Zhou, Liangmin Wang]",0,"[0431d661-1872-4483-b2f6-af19a8e3013a, 0785aa3...",An Energy-Aware Spatial Index Tree for Multi-R...,IEEE Access,2017,9cc4c665-bfb5-43db-84bf-37e5eab4dfba,"[-2084468142, -2127432301, -2137586935, -21439...",0.008065
7407,workflow technology is rapidly evolving and ra...,"[Georgia Kougka, Anastasios Gounaris, Alkis Si...",0,"[0b2b5b0c-74d1-486b-88ae-7280fcd09c27, 0caef02...",The Many Faces of Data-centric Workflow Optimi...,arXiv: Databases,2017,6b8f132f-1082-4f33-83fb-dce35fedcab0,"[-2139505454, -2120769467, -2111282411, -21280...",0.008065


In [30]:
signature_dict = df[df["signature"].notna()]["signature"].to_dict()
print(lsh(signature_dict))

100%|██████████| 44970/44970 [00:04<00:00, 9680.13it/s] 
  0%|          | 38/44969 [00:45<14:50:21,  1.19s/it]


KeyboardInterrupt: 