In [None]:
# supress warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# load dataset
import pandas as pd

df = pd.read_json('arxivData.json')

# drop useless columns
df = df.drop(['author', 'link', 'tag'], axis = 1)
df.head()

Unnamed: 0,day,id,month,summary,title,year
0,1,1802.00209v1,2,We propose an architecture for VQA which utili...,Dual Recurrent Attention Units for Visual Ques...,2018
1,12,1603.03827v1,3,Recent approaches based on artificial neural n...,Sequential Short-Text Classification with Recu...,2016
2,2,1606.00776v2,6,We introduce the multiresolution recurrent neu...,Multiresolution Recurrent Neural Networks: An ...,2016
3,23,1705.08142v2,5,Multi-task learning is motivated by the observ...,Learning what to share between loosely related...,2017
4,7,1709.02349v2,9,We present MILABOT: a deep reinforcement learn...,A Deep Reinforcement Learning Chatbot,2017


In [21]:
# EDA & Preprocessing
# `id` should be encoded because Index only accepts integer ids
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['id'] = le.fit_transform(df['id'])

df.head(2)

Unnamed: 0,day,id,month,summary,title,year
0,1,36693,2,We propose an architecture for VQA which utili...,Dual Recurrent Attention Units for Visual Ques...,2018
1,12,18198,3,Recent approaches based on artificial neural n...,Sequential Short-Text Classification with Recu...,2016


In [14]:
# load model & generate embeddings & store embeddings
from sentence_transformers import SentenceTransformer
import pickle

model = SentenceTransformer('sentence-transformers/distilbert-base-nli-stsb-mean-tokens')
embeddings = model.encode(df['summary'][:50], show_progress_bar = True)

with open('new-embeddings.pickle', 'wb') as pkl:
    pickle.dump(embeddings, pkl)

embeddings.shape

Batches: 100%|██████████| 2/2 [00:01<00:00,  1.22it/s]


(50, 768)

In [16]:
# load embeddings from file
with open('new-embeddings.pickle', 'rb') as pkl:
    embeddings = pickle.load(pkl)

embeddings.shape

(50, 768)

In [None]:
# Create index & add embedding & id pairs
import faiss

d_embeddings = embeddings.shape[1]
length = len(embeddings)

index = faiss.IndexFlatL2(d_embeddings)
index = faiss.IndexIDMap(index)
index.add_with_ids(embeddings, df['id'][:length])

print("Number of Embeddings in Faiss Index: ", index.ntotal)


Number of Embeddings in Faiss:  50


In [33]:
import numpy as np

# search using index
user_query = "analysis about stable diffusion"

# query needs to be encoded
query_embeddings = model.encode(user_query)

print("Embedding generated from query: ", query_embeddings.shape)

D, I = index.search(np.array([query_embeddings]), k=10)

print("I =", I)

I.shape

Embedding generated from query:  (768,)
I = [[11949 31336 28850 28112 27779 23572 19318 31993 38035 19490]]


(1, 10)

In [36]:
# Method that converts from ids to info

def ids2info(ids, column):
    return [df[df.id == idx][column] for idx in ids]

ids2info(I.flatten(), 'summary')[:2]

[16    We propose a simple neural network model to de...
 Name: summary, dtype: object,
 28    We investigate the non-identifiability issues ...
 Name: summary, dtype: object]

In [41]:
# Show Results
results = { 'ids': I.flatten(), 'summary': ids2info(I.flatten(), 'summary') }

pd.DataFrame(results).head(10)

Unnamed: 0,ids,summary
0,11949,16 We propose a simple neural network model...
1,31336,28 We investigate the non-identifiability i...
2,28850,"48 Recently, a technique called Layer-wise ..."
3,28112,47 Generative Adversarial Networks (GANs) h...
4,27779,3 Multi-task learning is motivated by the o...
5,23572,15 This paper presents a novel yet intuitiv...
6,19318,2 We introduce the multiresolution recurren...
7,31993,49 Can textual data be compressed intellige...
8,38035,32 We tackle here the problem of multimodal...
9,19490,20 Supervised machine learning models boast...


In [52]:
# Search Similiar Articles from Arxiv
target_summary = df.iloc[19318, [3, 1]]

# Encode using model
target_embedding = model.encode(target_summary)

D, I = index.search(np.array(target_embedding), k=3)

I.shape

  sentences_sorted = [sentences[idx] for idx in length_sorted_idx]


(2, 3)

In [53]:
# Show Results
results = { 'ids': I.flatten(), 'summary': ids2info(I.flatten(), 'summary') }

pd.DataFrame(results).head(10)

Unnamed: 0,ids,summary
0,31336,28 We investigate the non-identifiability i...
1,19490,20 Supervised machine learning models boast...
2,18710,39 Embedding-based Knowledge Base Completio...
3,36693,0 We propose an architecture for VQA which ...
4,31993,49 Can textual data be compressed intellige...
5,15887,"18 In this paper, we address the task of Op..."
