In [1]:
!pip install -q gdown


In [2]:
import gdown
import numpy as np
import pandas as pd
import os
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Download the dataset archive from Google Drive (addressed by its file id) and
# store it as dataset.zip in the current working directory.
# NOTE(review): the download runs unconditionally every time this cell executes;
# there is no check for an already-present dataset.zip.
file_id = '1QFo5GKE8w4Pccf2jfUZ9S6CQi__kOfni'
url = f'https://drive.google.com/uc?id={file_id}'
output = 'dataset.zip'
# Last expression of the cell: its return value is what the cell displays.
gdown.download(url, output, quiet=False)


Downloading...
From (original): https://drive.google.com/uc?id=1QFo5GKE8w4Pccf2jfUZ9S6CQi__kOfni
From (redirected): https://drive.google.com/uc?id=1QFo5GKE8w4Pccf2jfUZ9S6CQi__kOfni&confirm=t&uuid=4b9d5c90-3dca-47e5-a1c2-f815ee1a4144
To: /content/dataset.zip
100%|██████████| 3.59G/3.59G [00:47<00:00, 75.3MB/s]


'dataset.zip'

In [4]:
## new download
import zipfile

import os
import requests
from tqdm import tqdm

# Directory the archive is extracted into.
target = "./unzipped_dataset"
os.makedirs(target, exist_ok=True)

url = "https://tuc.cloud/index.php/s/9sM2mfzJBfHGsjY/download/dataset.zip"
download_path = "dataset.zip"

# Only download when the archive is not already on disk.
if not os.path.exists(download_path):
  # Stream into a temporary ".part" file and rename it only after a complete,
  # verified transfer. An interrupted or failed download therefore never leaves
  # a truncated dataset.zip behind that the existence check above would accept.
  partial_path = download_path + ".part"
  block_size = 1024 * 1024  # 1 MiB chunks; the archive is several GB

  try:
    with requests.get(url, stream=True, timeout=30) as response:
      # Reject error responses before anything is written to disk.
      if response.status_code != 200:
        raise ValueError(f"Failed to download the file. Status code: {response.status_code}")

      # 0 means the server did not announce a size; the size check is skipped then.
      total_size = int(response.headers.get("content-length", 0))

      with (
          tqdm(total=total_size, unit="iB", unit_scale=True, desc="Downloading model") as tqdm_bar,
          open(partial_path, "wb") as file,
      ):
        for data in response.iter_content(block_size):
          tqdm_bar.update(len(data))
          file.write(data)

      # Compare the number of bytes received with the announced size.
      if total_size not in (0, tqdm_bar.n):
        raise ValueError(f"Failed to download the file. Status code: {response.status_code}")

    # Publish the finished download under its final name.
    os.replace(partial_path, download_path)
  finally:
    # After a successful rename the partial file no longer exists; otherwise
    # (error or interrupt) remove the leftover.
    if os.path.exists(partial_path):
      os.remove(partial_path)

# Extract the whole archive into the target directory.
with zipfile.ZipFile(download_path, "r") as zip_ref:
    zip_ref.extractall(target)

KeyboardInterrupt: 

In [5]:
!unzip dataset.zip -d ./unzipped_dataset/


Archive:  dataset.zip
replace ./unzipped_dataset/data/arxiv_scibert_embeddings.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace ./unzipped_dataset/__MACOSX/data/._arxiv_scibert_embeddings.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n A
replace ./unzipped_dataset/data/arxiv_dataset.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: N
  inflating: ./unzipped_dataset/__MACOSX/data/._arxiv_minilm_embeddings.csv  
  inflating: ./unzipped_dataset/data/arxiv_specter_embeddings.csv  N

  inflating: ./unzipped_dataset/__MACOSX/data/._arxiv_specter_embeddings.csv  
  inflating: ./unzipped_dataset/data/arxiv_scibert_embeddings.npy  
  inflating: ./unzipped_dataset/__MACOSX/data/._arxiv_scibert_embeddings.npy  
  inflating: ./unzipped_dataset/data/arxiv_tokenized_balanced.parquet  


In [6]:
# Download category_df.csv from Google Drive (addressed by its file id) directly
# into the extracted data directory.
# NOTE(review): assumes ./unzipped_dataset/data/ already exists from the
# extraction step above - confirm when running on a fresh kernel.
file_id = '1crnwOwYzpcF_7dnbUg4fflAuYmjOgjfH'
url = f'https://drive.google.com/uc?id={file_id}'
output = './unzipped_dataset/data/category_df.csv'
# Last expression of the cell: its return value is what the cell displays.
gdown.download(url, output, quiet=False)

Downloading...
From (original): https://drive.google.com/uc?id=1crnwOwYzpcF_7dnbUg4fflAuYmjOgjfH
From (redirected): https://drive.google.com/uc?id=1crnwOwYzpcF_7dnbUg4fflAuYmjOgjfH&confirm=t&uuid=cfda5fcb-2e83-4d28-bd20-85f497d0c32a
To: /content/unzipped_dataset/data/category_df.csv
100%|██████████| 143M/143M [00:01<00:00, 133MB/s]


'./unzipped_dataset/data/category_df.csv'

In [7]:

# Names of the data files to load from the extracted archive.
# (Despite the variable name these are file names, not Drive ids.)
file_id_list = [
    'arxiv_scibert_embeddings.csv',
    'arxiv_dataset.csv',
    'arxiv_minilm_embeddings.npy',
    'arxiv_specter_embeddings.npy',
    'arxiv_tokenized_balanced.csv',
    'arxiv_minilm_embeddings.csv',
    'arxiv_specter_embeddings.csv',
    'arxiv_scibert_embeddings.npy',
    # 'category_df.csv'
]

local_data_path = 'unzipped_dataset/data'

# file name -> loaded content: numpy array for .npy, DataFrame for .csv.
# Names with any other extension are skipped.
files_dict = {}
for k in file_id_list:
  file_path = os.path.join(local_data_path, k)
  if k.endswith('.npy'):
    files_dict[k] = np.load(file_path)
    continue
  if k.endswith('.csv'):
    files_dict[k] = pd.read_csv(file_path, low_memory=False)


In [8]:
# Embedding-set name -> Hugging Face checkpoint used to embed queries for that set.
model_name_dict = {
    'arxiv_minilm_embeddings': 'sentence-transformers/all-MiniLM-L6-v2',
    'arxiv_specter_embeddings': 'allenai/specter',
    'arxiv_scibert_embeddings': 'allenai/scibert_scivocab_uncased',
}

# Embedding-set name -> weight of that model's ranking in the rank fusion
# (the three weights sum to 1).
model_weight_dict = {
    'arxiv_minilm_embeddings': 0.5,
    'arxiv_specter_embeddings': 0.25,
    'arxiv_scibert_embeddings': 0.25,
}

In [9]:
# Embedding-set name -> torch tensor wrapping the array loaded from the
# matching "<name>.npy" file.
paper_emb_dict = {}
for k in model_name_dict:
  npy_name = f'{k}.npy'
  paper_emb_dict[k] = torch.from_numpy(files_dict[npy_name])

In [10]:
# Embedding-set name -> tokenizer / encoder loaded from the Hugging Face
# checkpoint registered for that set in model_name_dict.
tokenizer_dict, model_dict = {}, {}
for k, checkpoint in model_name_dict.items():
  tokenizer_dict[k] = AutoTokenizer.from_pretrained(checkpoint)
  model_dict[k] = AutoModel.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/321 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

In [11]:
# Metadata columns selected from the paper table for every recommendation
# returned by the functions below.
return_df_list = ['id', 'title', 'abstract', 'authors', 'doi']

In [12]:

def recommend_for_prompt(prompt, paper_embeddings, tokenizer, model, top_n=10):
    """Rank papers against a text prompt using a single embedding model.

    The prompt is tokenized, encoded with ``model`` and mean-pooled into one
    vector, which is compared by cosine similarity with every row of
    ``paper_embeddings``.

    Args:
        prompt: Query text.
        paper_embeddings: 2-D collection of paper vectors, one row per paper.
            Presumably row i belongs to row i of ``files_dict['arxiv_dataset.csv']``;
            the code relies on that alignment - TODO confirm.
        tokenizer: Tokenizer matching ``model``.
        model: Encoder whose output exposes ``last_hidden_state``.
        top_n: Number of papers to return. Coerced to ``int`` and capped at the
            number of available papers.

    Returns:
        dict with
        ``'recomm'``: DataFrame of the selected papers (columns ``return_df_list``),
        best match first, index reset;
        ``'scores'``: the ``topk`` result for the similarity scores (values and indices).
    """
    tokens = tokenizer(
        prompt,
        truncation=True,
        max_length=512,
        padding="max_length",
        return_tensors="pt",
    )

    with torch.no_grad():
        # NOTE(review): the mean runs over all 512 positions, padding included
        # (no attention-mask weighting). Presumably the stored paper embeddings
        # were pooled the same way - confirm before changing this.
        prompt_vec = model(**tokens).last_hidden_state.mean(dim=1).double()

    # cosine_similarity returns shape (1, n_papers); take the single query row.
    scores = torch.from_numpy(cosine_similarity(prompt_vec, paper_embeddings)[0])

    # topk requires an integer k no larger than the number of scores. UI number
    # inputs may arrive as floats, and callers may ask for more than exists.
    k = min(int(top_n), scores.numel())
    top_scores = scores.topk(k=k)
    top_indices = top_scores.indices

    recommendations = files_dict['arxiv_dataset.csv'].iloc[top_indices.tolist()][return_df_list]
    return {'recomm': recommendations.reset_index(drop=True),
            'scores': top_scores}

In [13]:
def rank_fusion_with_tiebreaker(recomm_dict, model_weight_dict, top_k=10, preferred_model=None):
    """Fuse several per-model rankings into one weighted reciprocal-rank list.

    Each paper receives ``1 / position * weight`` from every model that lists
    it (position counted from 1) and the contributions are summed.

    Args:
        recomm_dict: model name -> result dict whose ``"recomm"`` entry has an
            ``'id'`` column ordered best-first.
        model_weight_dict: model name -> weight applied to that model's ranks.
        top_k: Maximum number of fused results to return.
        preferred_model: Model name used to break score ties; papers this model
            recommended are placed ahead of equally scored papers it did not.

    Returns:
        List of ``(paper_id, fused_score)`` tuples, highest score first.
    """
    fused_scores = {}
    sources = {}

    for model_name, results in recomm_dict.items():
        ordered_ids = results["recomm"]['id'].tolist()
        for position, paper_id in enumerate(ordered_ids, start=1):
            contribution = 1 / position * model_weight_dict[model_name]
            fused_scores[paper_id] = fused_scores.get(paper_id, 0) + contribution
            if paper_id not in sources:
                sources[paper_id] = set()
            sources[paper_id].add(model_name)

    # Descending score first; within a tie, False (recommended by the preferred
    # model) sorts before True. The sort is stable, so remaining ties keep
    # first-seen order.
    ranked = list(fused_scores.items())
    ranked.sort(
        key=lambda pair: (-pair[1], preferred_model not in sources.get(pair[0], set()))
    )

    return ranked[:top_k]


In [14]:
def get_top_k_metadata(top_results, meta_df):
    """Attach paper metadata to fused ranking results.

    Args:
        top_results: Sequence of ``(paper_id, score)`` pairs in ranked order.
        meta_df: DataFrame containing at least the columns in ``return_df_list``.

    Returns:
        dict with
        ``'recomm'``: DataFrame restricted to ``return_df_list``, in ranked order,
        index reset;
        ``'scores'``: list of scores aligned with the rows of ``'recomm'``.
    """
    ranked = pd.DataFrame(top_results, columns=['id', 'score'])
    metadata = meta_df[return_df_list]

    # Left join keeps every ranked id (and its order), even without a metadata match.
    joined = pd.merge(ranked, metadata, on='id', how='left')

    recomm = joined[return_df_list].reset_index(drop=True)
    scores = joined['score'].tolist()
    return {'recomm': recomm, 'scores': scores}


In [15]:
def get_recomm(prompt, paper_embeddings, tokenizer, model, model_weight_dict,
               preferred_model="sentence-transformers/all-MiniLM-L6-v2",
               top_n=10):
  """Recommend papers for a prompt by fusing the rankings of all models.

  Every model registered in ``model_name_dict`` ranks the papers on its own;
  the rankings are then merged by weighted rank fusion and joined with the
  paper metadata.

  Args:
      prompt: Query text.
      paper_embeddings, tokenizer, model: Accepted for interface compatibility
          but not read here; the per-model objects are taken from the
          notebook-level ``paper_emb_dict``, ``tokenizer_dict`` and ``model_dict``.
      model_weight_dict: model key -> weight used in the rank fusion.
      preferred_model: Model favoured when fused scores tie. May be given as a
          key of ``model_name_dict`` or as the Hugging Face checkpoint name
          registered for that key.
      top_n: Number of papers requested from each model and returned overall.
          Coerced to ``int``.

  Returns:
      The dict produced by ``get_top_k_metadata`` (``'recomm'`` and ``'scores'``).
  """
  # UI number inputs may deliver floats; ranking and slicing need an int.
  top_n = int(top_n)

  # The fusion step records recommenders under the keys of model_name_dict,
  # so a checkpoint name has to be translated to its key for the tiebreaker
  # to match anything.
  preferred_key = preferred_model
  for key, checkpoint in model_name_dict.items():
    if checkpoint == preferred_model:
      preferred_key = key
      break

  recomm_dict = {}
  for k in model_name_dict.keys():
    recomm_dict[k] = recommend_for_prompt(prompt=prompt,
                        paper_embeddings=paper_emb_dict[k],
                        tokenizer=tokenizer_dict[k],
                        model=model_dict[k],
                        top_n=top_n)

  # Fuse once, after every model has contributed its ranking.
  top_results = rank_fusion_with_tiebreaker(
      recomm_dict=recomm_dict,
      model_weight_dict=model_weight_dict,
      top_k=top_n,
      preferred_model=preferred_key,
  )
  return get_top_k_metadata(top_results=top_results, meta_df=files_dict['arxiv_dataset.csv'])


In [26]:
# ----- UI ------
# Gradio front end: a query box, a result-count field and a search button.
# Results are rendered as one HTML card per recommended paper.

import html

import gradio as gr


with gr.Blocks(
    theme=gr.themes.Default(),
    analytics_enabled=False,
) as demo:

  gr.Markdown("# 📚 PaperPal – Scientific Paper Recommender")

  with gr.Group():
    with gr.Row(equal_height=True):
      prompt_tb = gr.Textbox(label="Query", info="You topics, titles and authors.", scale=5)
      # precision=0 makes the component deliver an int rather than a float.
      number_of_recoms = gr.Number(10, minimum=1, precision=0, label="Number of Papers", info="📚")
      btn = gr.Button("🔍 Search", scale=1)

  @gr.render(
      inputs=[prompt_tb, number_of_recoms],
      triggers=[btn.click, prompt_tb.submit]
  )
  def render_results(prompt, topn):
    """Run the recommender for the query and render one card per paper."""

    def _cell_text(value):
      # Missing metadata is read from CSV as NaN; show it as empty text and
      # escape everything else so dataset text cannot break or inject markup.
      return "" if pd.isna(value) else html.escape(str(value))

    # A cleared number field yields None; fall back to the default of 10.
    top_n = int(topn) if topn else 10

    result = get_recomm(prompt=prompt, paper_embeddings=paper_emb_dict,
           tokenizer=tokenizer_dict, model=model_dict, model_weight_dict=model_weight_dict,
               preferred_model="sentence-transformers/all-MiniLM-L6-v2",
               top_n=top_n)

    for i, row in result["recomm"].iterrows():
      score = round(float(result['scores'][i]), 3)

      # Truncate the raw text first, then escape, so no HTML entity is cut in half.
      raw_abstract = "" if pd.isna(row['abstract']) else str(row['abstract'])
      if len(raw_abstract) > 512:
        raw_abstract = raw_abstract[:512] + '...'
      abstract = html.escape(raw_abstract)

      title = _cell_text(row['title'])
      authors = _cell_text(row['authors'])
      paper_id = _cell_text(row['id'])

      gr.Markdown(f"""
<div style="border: 1px solid #ccc; border-radius: 12px; padding: 0 10px 10px 10px; margin-bottom: 10px;">
      <h3>{title}</h3>
      <p><strong>Score:</strong> {score}</p>
      <p><strong>Authors:</strong> {authors}</p>
      <p>{abstract}</p>
      <p><a href="https://arxiv.org/abs/{paper_id}">Read more</a></p>
</div>
      """)


# allowed_paths is deliberately not set: the app serves no local files, and
# allowing "/" would let anyone with the public share URL read any file on
# the host.
demo.queue(api_open=False).launch(
    quiet=True,
    show_api=False,
    enable_monitoring=False,
)




* Running on public URL: https://baabf2947723c2b750.gradio.live




In [21]:
# NOTE(review): `out_dict` is not assigned in any cell visible in this notebook,
# and this cell's execution count (21) is lower than the preceding cell's (26).
# It appears to depend on leftover kernel state and would presumably raise
# NameError after Restart & Run All - confirm, then define it or drop the cell.
out_dict


{'recomm':           id                                              title  \
 0  0801.4369  A new method for studying the vibration of non...   
 1  0706.1423  Spiral phases and two-particle bound states fr...   
 2  0801.1853             Intermediate-Range Order in Water Ices   
 3  0801.2839  Toward the unification of the postulates of Qu...   
 4  0902.1629  Improvements of real coded genetic algorithms ...   
 5  0712.4101  Digital Ecosystems: Stability of Evolving Agen...   
 6  0705.4261  Entiers al\'eatoires, ensembles de Sidon, dens...   
 7  0805.3366  Computational Representation of Linguistic Str...   
 8  0801.4906  An atlas of synthetic line profiles of planeta...   
 9  0801.0387  Corrugated probe for SNOM - Optimization of en...   
 
                                             abstract  \
 0    We present a method to solve the Helmholtz e...   
 1    We have constructed a systematic low-energy ...   
 2    We report measurements of the non-resonant i...   
 3    In thi