In [1]:
import pandas as pd
import numpy as np
from tensorflow import keras
from keras import layers
import tensorflow as tf
from sklearn.model_selection import train_test_split
from ast import literal_eval

In [2]:
# Load the arXiv export (terms / titles / abstracts) and preview the first rows.
csv_path = "arxiv_data_210930-054931.csv"
df = pd.read_csv(csv_path)
df.head(n=5)

Unnamed: 0,terms,titles,abstracts
0,['cs.LG'],Multi-Level Attention Pooling for Graph Neural...,Graph neural networks (GNNs) have been widely ...
1,"['cs.LG', 'cs.AI']",Decision Forests vs. Deep Networks: Conceptual...,Deep networks and decision forests (such as ra...
2,"['cs.LG', 'cs.CR', 'stat.ML']",Power up! Robust Graph Convolutional Network v...,Graph convolutional networks (GCNs) are powerf...
3,"['cs.LG', 'cs.CR']",Releasing Graph Neural Networks with Different...,With the increasing popularity of Graph Neural...
4,['cs.LG'],Recurrence-Aware Long-Term Cognitive Network f...,Machine learning solutions for pattern classif...


In [3]:
df.shape

(56181, 3)

### Data cleaning and preprocessing

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56181 entries, 0 to 56180
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   terms      56181 non-null  object
 1   titles     56181 non-null  object
 2   abstracts  56181 non-null  object
dtypes: object(3)
memory usage: 1.3+ MB


In [5]:
df.isna().sum()

terms        0
titles       0
abstracts    0
dtype: int64

In [6]:
df.duplicated().sum()

15054

In [7]:
# Parse each stringified list of terms, then collect the distinct category codes.
labels_column = df["terms"].map(literal_eval)
label = pd.unique(labels_column.explode())
label

array(['cs.LG', 'cs.AI', 'cs.CR', ...,
       'D.1.3; G.4; I.2.8; I.2.11; I.5.3; J.3',
       '68T07, 68T45, 68T10, 68T50, 68U35', 'I.2.0; G.3'], dtype=object)

In [8]:
# Keep only the first row for every distinct title.
df = df.drop_duplicates(subset="titles", keep="first")

In [9]:
df.shape

(41105, 3)

In [10]:
# How many distinct term combinations occur exactly once?
(df["terms"].value_counts() == 1).sum()

2503

In [11]:
# Keep only rows whose term combination appears more than once
# (a stratified split needs at least two samples per class).
terms_group_size = df.groupby("terms")["terms"].transform("size")
df_filter = df[terms_group_size > 1]

In [12]:
df_filter.shape

(38602, 3)

In [13]:
df_filter.duplicated().sum()

0

In [14]:
# Turn the stringified term lists into real Python lists.
# - Values that are already parsed are passed through, so re-running the cell
#   does not raise inside literal_eval.
# - assign() builds a new frame instead of writing a column into a frame that was
#   derived from `df`.
df_filter = df_filter.assign(
    terms=df_filter["terms"].map(
        lambda raw: literal_eval(raw) if isinstance(raw, str) else raw
    )
)

In [15]:
df_filter["terms"].values[:3]

array([list(['cs.LG']), list(['cs.LG', 'cs.AI']),
       list(['cs.LG', 'cs.CR', 'stat.ML'])], dtype=object)

### train test split

In [16]:
# 80/20 split, stratified on the term combination of each row.
# A fixed random_state makes the split identical on every run.
train_df, test_df = train_test_split(
    df_filter,
    test_size=0.2,
    stratify=df_filter["terms"].values,
    random_state=42,
)

In [17]:
train_df.shape,test_df.shape

((30881, 3), (7721, 3))

In [18]:
# Carve half of the held-out rows off as the validation set; seeded so the
# same rows are drawn on every run.
val_df = test_df.sample(frac=0.5, random_state=42)

In [19]:
# Remove the validation rows from the test set without mutating in place.
# Filtering by index membership is also safe to re-run: a second execution is a
# no-op, whereas drop() raises KeyError once the rows are already gone.
test_df = test_df.loc[~test_df.index.isin(val_df.index)]

In [20]:
train_df.shape,test_df.shape,val_df.shape

((30881, 3), (3861, 3), (3860, 3))

In [24]:
# Learn the label vocabulary from the training split only and set up a layer
# that encodes a list of terms as a multi-hot vector.
# `.values` hands tf.ragged.constant a NumPy array, the same input form that
# make_dataset uses for its labels.
terms = tf.ragged.constant(train_df["terms"].values)
lookup = tf.keras.layers.StringLookup(output_mode="multi_hot")
lookup.adapt(terms)
vocab = lookup.get_vocabulary()

In [26]:
# Shared settings for the input pipeline.
max_seq_len=150  # not referenced in the visible cells; presumably a sequence-length cap for a later text-vectorization step — TODO confirm
batch_size=128  # batch size used by make_dataset; the shuffle buffer there is batch_size * 10
padding_token="<pad>"  # not referenced in the visible cells — TODO confirm where it is consumed
auto=tf.data.AUTOTUNE  # tf.data AUTOTUNE constant; not referenced in the visible cells


def make_dataset(dataframe, is_train=True):
    """Build a batched ``tf.data.Dataset`` of (abstract, multi-hot label) pairs.

    Args:
        dataframe: frame providing an "abstracts" column and a "terms" column.
        is_train: when true, shuffle with a buffer of ``batch_size * 10``
            elements before batching; otherwise keep the row order.

    Returns:
        The dataset, batched by the module-level ``batch_size``.
    """
    # The variable-length term lists become a ragged tensor ...
    ragged_terms = tf.ragged.constant(dataframe["terms"].values)
    # ... which the module-level `lookup` layer encodes; the result is taken as NumPy.
    multi_hot = lookup(ragged_terms).numpy()
    # Pair every abstract with its encoded label row.
    pairs = tf.data.Dataset.from_tensor_slices(
        (dataframe["abstracts"].values, multi_hot)
    )
    # Only the training split is shuffled.
    if is_train:
        pairs = pairs.shuffle(batch_size * 10)
    return pairs.batch(batch_size)

In [27]:
# One batched dataset per split; only the training split is shuffled
# (is_train defaults to True).
train_dataset = make_dataset(train_df)
validation_dataset = make_dataset(val_df, False)
test_dataset = make_dataset(test_df, False)

In [29]:
# Peek at one training batch: print the first five abstracts with their labels.
# The label row is decoded through `vocab` directly, so the cell no longer
# overwrites the `label` array from the earlier cell with an unused value.
text_batch, label_batch = next(iter(train_dataset))
for i, text in enumerate(text_batch[:5]):
    # Positions set to 1 in the multi-hot row are indices into the lookup vocabulary.
    hot_indices = np.argwhere(label_batch[i].numpy() == 1.0)[..., 0]
    print(f"Abstract: {text}")
    print(f"Label(s): {np.take(vocab, hot_indices)}")
    print(" ")

Abstract: b'Synthetic images are one of the most promising solutions to avoid high costs\nassociated with generating annotated datasets to train supervised convolutional\nneural networks (CNN). However, to allow networks to generalize knowledge from\nsynthetic to real images, domain adaptation methods are necessary. This paper\nimplements unsupervised domain adaptation (UDA) methods on an anchorless object\ndetector. Given their good performance, anchorless detectors are increasingly\nattracting attention in the field of object detection. While their results are\ncomparable to the well-established anchor-based methods, anchorless detectors\nare considerably faster. In our work, we use CenterNet, one of the most recent\nanchorless architectures, for a domain adaptation problem involving synthetic\nimages. Taking advantage of the architecture of anchorless detectors, we\npropose to adjust two UDA methods, viz., entropy minimization and maximum\nsquares loss, originally developed for segm

In [30]:
# Creating vocabulary with uniques words
vocabulary = set()
train_df["abstracts"].str.lower().str.split().apply(vocabulary.update)
vocabulary_size = len(vocabulary)
print(vocabulary_size)

146951


### Recommendation system

In [9]:
# Only the titles are needed for the recommender.
# Rebinding (instead of inplace=True) avoids mutating a frame derived by
# filtering, and errors="ignore" makes the cell safe to run more than once.
df = df.drop(columns=["terms", "abstracts"], errors="ignore")

In [10]:
# Work on an independent (deep) copy so `df` itself is left untouched.
data = df.copy(deep=True)

In [11]:
data

Unnamed: 0,titles
0,Multi-Level Attention Pooling for Graph Neural...
1,Decision Forests vs. Deep Networks: Conceptual...
2,Power up! Robust Graph Convolutional Network v...
3,Releasing Graph Neural Networks with Different...
4,Recurrence-Aware Long-Term Cognitive Network f...
...,...
56176,Mining Spatio-temporal Data on Industrializati...
56177,Wav2Letter: an End-to-End ConvNet-based Speech...
56178,Deep Reinforcement Learning with Double Q-lear...
56179,Generalized Low Rank Models


### sentence transformers

In [12]:
from sentence_transformers import SentenceTransformer,util

In [13]:
# Load the sentence-embedding model by name.
model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)



In [14]:
# Re-number the titles 0..N-1 so that a title's label equals the row position
# of its embedding; `data` can carry gaps in its index after rows were removed,
# and integer lookups on a Series go by label, not by position.
sentences = data["titles"].reset_index(drop=True)
# Hand encode() a plain list of strings so the output rows follow list order.
embeddings = model.encode(sentences.tolist())  # convert text data into numerical representations (embeddings)

In [15]:
# Show the first six titles together with the length of their embedding vectors.
for c, (sentence, embedding) in enumerate(zip(sentences, embeddings)):
    print("sentence", sentence)
    print("embedding", len(embedding))
    print("")
    # Stop after the entry at position 5 has been printed.
    if c >= 5:
        break


sentence Multi-Level Attention Pooling for Graph Neural Networks: Unifying Graph Representations with Multiple Localities
embedding 384

sentence Decision Forests vs. Deep Networks: Conceptual Similarities and Empirical Differences at Small Sample Sizes
embedding 384

sentence Power up! Robust Graph Convolutional Network via Graph Powering
embedding 384

sentence Releasing Graph Neural Networks with Differential Privacy Guarantees
embedding 384

sentence Recurrence-Aware Long-Term Cognitive Network for Explainable Pattern Classification
embedding 384

sentence Lifelong Graph Learning
embedding 384



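### Why select **all-MiniLM-L6-v2**?

It is a small, general-purpose sentence-embedding model that returns compact 384-dimensional vectors (see the lengths printed above), which keeps both encoding every title and the similarity search cheap.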

### save files

In [16]:
import pickle
from pathlib import Path

# Persist the artifacts under models/, the directory the loading cell reads
# from; create it on first use.
models_dir = Path("models")
models_dir.mkdir(parents=True, exist_ok=True)

for filename, artifact in (
    ("embeddings.pkl", embeddings),
    ("sentences.pkl", sentences),
    ("model.pkl", model),
):
    with open(models_dir / filename, "wb") as f:
        pickle.dump(artifact, f)

### recommendation for similar papers

In [17]:
import pickle

# Reload the saved artifacts. Each file is opened in a `with` block so the
# handle is closed as soon as the object has been read.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open("models/embeddings.pkl", "rb") as f:
    embeddings = pickle.load(f)

with open("models/sentences.pkl", "rb") as f:
    sentences = pickle.load(f)

with open("models/model.pkl", "rb") as f:
    model = pickle.load(f)

In [18]:
import torch

def recommendation(input_paper, top_k=5):
    """Return the titles most similar to ``input_paper``.

    Uses the module-level ``model``, ``embeddings`` and ``sentences``.

    Args:
        input_paper: a single query string (e.g. a paper title).
        top_k: maximum number of titles to return (default 5, as before).

    Returns:
        A list of titles ordered from most to least similar. It holds fewer
        than ``top_k`` entries when fewer embeddings are available.
    """
    # One similarity score per stored embedding (a single column for one query).
    cosine_scores = util.cos_sim(embeddings, model.encode(input_paper))

    # topk raises when k exceeds the number of rows, so clamp it.
    k = min(top_k, cosine_scores.shape[0])
    top_similar_papers = torch.topk(cosine_scores, dim=0, k=k, sorted=True)

    # The indices are row positions in `embeddings`. A pandas Series must be read
    # positionally (.iloc): plain [] with an integer is a label lookup, which
    # returns the wrong title or raises KeyError when the index has gaps.
    positions = top_similar_papers.indices.flatten().tolist()
    if hasattr(sentences, "iloc"):
        return [sentences.iloc[position] for position in positions]
    return [sentences[position] for position in positions]

In [19]:
# Ask for the query interactively. The argument of input() is the prompt shown
# to the user, not the query, so it now says what to type.
input_paper = input("Enter a paper title or topic: ")
recommandation_papers = recommendation(input_paper)
recommandation_papers

['Persuasive Faces: Generating Faces in Advertisements',
 'Copy this Sentence',
 'Area Attention',
 'Area Attention',
 '"You eat with your eyes first": Optimizing Yelp Image Advertising']

In [20]:
# Install these versions to reproduce the environment.
import sentence_transformers
import tensorflow
import torch

# Printed in the order: torch, sentence_transformers, tensorflow.
for package in (torch, sentence_transformers, tensorflow):
    print(package.__version__)

2.3.1+cpu
2.3.1
2.17.0
