In [2]:
import json
import os

import chromadb
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn.functional as F
from openai import OpenAI
from chromadb import Documents, EmbeddingFunction, Embeddings
from sentence_transformers import SentenceTransformer
from chromadb.utils import embedding_functions
from tqdm.notebook import tqdm
from transformers import AutoModel, AutoTokenizer

In [3]:
# Input data: the arXiv metadata sample and the evaluation set, both read
# with pandas.read_parquet in the data-loading cell.
# (Point these at a local copy if the data lives elsewhere.)
PARQUET_PATH = '../data/arxiv_metadata_sample.parquet.gzip'

EVAL_DF_PATH = '../data/eval_df.parquet.gzip'

# Root directory of the persistent Chroma store. The embedding model name is
# appended at the end of this cell, so each model gets its own sub-directory.
CHROMA_DATA_PATH = "chroma_data"

# [WARNING]
# Choose whether to delete all chroma data for the chosen model and recompute it.
# When True, the collection cell still asks for an interactive "y" confirmation
# before anything is deleted.
#
DO_DELETE_CHROMA_DATA = True

#
# Choose model style [sentence_transformers, lmstudio]
# This selects which embedding function the embedding cell builds.
#
model_style = "sentence_transformers"


#
# Models from LMStudio
#
# EMBED_MODEL = "gte-small-gguf" # LMStudio (ChristianAzinn/gte-small-gguf/gte-small.Q4_0.gguf)


#
# Models from Sentence Transformers (https://www.sbert.net/docs/sentence_transformer/pretrained_models.html)
# Exactly one EMBED_MODEL line should be active; the others are kept as alternatives.
#
# EMBED_MODEL = "all-MiniLM-L12-v2"
# EMBED_MODEL = "all-mpnet-base-v2"
# https://github.com/UKPLab/sentence-transformers/blob/master/examples/applications/semantic-search/semantic_search_publications.py
EMBED_MODEL = "allenai-specter" # https://huggingface.co/sentence-transformers/allenai-specter
# EMBED_MODEL = "GIST-small-Embedding-v0" # https://huggingface.co/avsolatorio/GIST-small-Embedding-v0


# Name of the Chroma collection, and how many documents are sent per upsert call.
COLLECTION_NAME = "arxiv_papers"
BATCH_SIZE = 2000

# Final store location: <chroma root>/<embedding model name>.
CHROMA_DATA_PATH = os.path.join(CHROMA_DATA_PATH, EMBED_MODEL)

In [15]:
# Directory later handed to the sentence-transformers embedding function as its
# cache folder. exist_ok=True creates it on first run and is a no-op afterwards,
# without the separate exists()-then-create check.
cache_dir = 'cache'
os.makedirs(cache_dir, exist_ok=True)

arxiv_df = pd.read_parquet(PARQUET_PATH)
eval_df = pd.read_parquet(EVAL_DF_PATH)

print(arxiv_df.shape)
print(eval_df.shape)

# only keep arxiv papers that are in the evaluation set, then attach the
# evaluation columns to them (one row per matching `id` pair)
data_df = arxiv_df[arxiv_df['id'].isin(eval_df['id'])]
data_df = data_df.merge(eval_df, on='id', how='inner')

print(f'Columns in data_df: {data_df.columns}')

(159136, 13)
(3000, 8)
Columns in data_df: Index(['id', 'title', 'abstract', 'categories', 'update_date', 'title_words',
       'abstract_words', 'mapped_categories', 'amount_categories',
       'update_year', 'super_categories', 'super_category',
       'amount_super_categories', 'rewritten_text', 'removed_text_25',
       'removed_text_50', 'removed_text_75', 'removed_text_25_shuffled',
       'removed_text_50_shuffled', 'removed_text_75_shuffled'],
      dtype='object')


In [21]:
def text_processing(sample):
    """Build the single-line text used as the document for one paper.

    Args:
        sample: mapping-like row exposing string values under the keys
            'title' and 'abstract'.

    Returns:
        "<title> [SEP] <abstract>", where both parts have leading/trailing
        whitespace removed and every run of whitespace (spaces, newlines,
        carriage returns, tabs) collapsed to a single space.
    """
    def _squash(raw):
        # str.split() without a separator splits on any whitespace run and
        # ignores leading/trailing whitespace, so re-joining with one space
        # normalises the text in a single step.
        return ' '.join(raw.split())

    parts = (_squash(sample['title']), _squash(sample['abstract']))
    return ' [SEP] '.join(parts)

# Build the "<title> [SEP] <abstract>" string for every paper (row-wise apply)
# and store it in the `text` column, which is what gets upserted into Chroma.
combined_text = data_df.apply(text_processing, axis=1)
data_df['text'] = combined_text
data_df.head(3)

Unnamed: 0,id,title,abstract,categories,update_date,title_words,abstract_words,mapped_categories,amount_categories,update_year,...,super_category,amount_super_categories,rewritten_text,removed_text_25,removed_text_50,removed_text_75,removed_text_25_shuffled,removed_text_50_shuffled,removed_text_75_shuffled,text
0,cs/0703062,Bandit Algorithms for Tree Search,Bandit based methods for tree search have re...,[cs.LG],2016-08-14,5,223,[Machine Learning],1,2016,...,Computer Science,1,"""Optimizing Tree Search Algorithms for Efficie...",Bandit based methods for tree search have gain...,Bandit based tree have recently to e.g. game g...,"for recently to game (Gelly al., 2006). (Kocsi...",rewards effective O(exp(exp(D))) algorithms se...,"UpperConfidence to cases, a We sub-optimal Smo...","of possible scalesexponentially performed al.,...",Bandit Algorithms for Tree Search [SEP] Bandit...
1,1410.7743,Data Driven Authentication: On the Effectivene...,"We propose a lightweight, and temporally and...",[cs.CR],2014-10-29,14,128,[Cryptography and Security],1,2014,...,Computer Science,1,"""Behavioral Biometric Authentication Using Mac...","We propose lightweight, temporally and spatial...",propose a temporally user technique sensor-bas...,propose and aware user for the sufficiently fr...,"data sufficiently thebackground, We norm,actio...","the model switches propose norm,actions capabi...",duration investigate drift. expected propose o...,Data Driven Authentication: On the Effectivene...
2,2306.13023,AugDMC: Data Augmentation Guided Deep Multiple...,Clustering aims to group similar objects tog...,[cs.CV],2023-06-23,7,218,[Computer Vision and Pattern Recognition],1,2023,...,Computer Science,1,"""Discovering multiple perspectives in complex ...",Clustering aims to group similar objects toget...,Clustering aims group objects together separat...,aims while apart. an manner. provide only a cl...,dataaugmentations can the aims prototype-based...,prototype-based methods methods aspects augmen...,method. different as independent aspect in fro...,AugDMC: Data Augmentation Guided Deep Multiple...


In [22]:
def create_metadatas(arxiv_df):
    """Build one metadata dict per paper, in row order.

    Args:
        arxiv_df: DataFrame with the columns 'update_date', 'title_words',
            'abstract_words', 'super_category' and 'mapped_categories'
            (the latter an iterable of strings per row).

    Returns:
        A list with one dict per row. All values are copied as-is except
        'mapped_categories', which is flattened into one ';'-separated string.
    """
    def _row_metadata(paper):
        return {
            "update_date": paper['update_date'],
            "title_words": paper['title_words'],
            "abstract_words": paper['abstract_words'],
            "super_category": paper['super_category'],
            # join the per-paper category list into a single string value
            "mapped_categories": ";".join(paper['mapped_categories']),
        }

    return [_row_metadata(paper) for _, paper in arxiv_df.iterrows()]

def create_collection(client, collection_name, embedding_function):
    """Return the named collection, creating it if it does not exist yet.

    Args:
        client: object exposing `create_collection(...)` (a Chroma client here).
        collection_name: name of the collection to open or create.
        embedding_function: embedding function attached to the collection.

    Returns:
        Whatever `client.create_collection` returns for the given name.
    """
    settings = {
        "name": collection_name,
        "embedding_function": embedding_function,
        # collection metadata: sets `hnsw:space` to `cosine`
        "metadata": {"hnsw:space": "cosine"},
        # reuse an existing collection instead of failing when the name is taken
        "get_or_create": True,
    }
    return client.create_collection(**settings)

def delete_collection_data(client, collection, collection_name):
    """Report how many documents are about to be lost, then delete the collection.

    Args:
        client: object exposing `delete_collection(name)`.
        collection: the collection being removed; only `count()` is called on it.
        collection_name: name passed to `client.delete_collection`.

    Returns:
        None. The `collection` handle passed in refers to a deleted collection
        afterwards; callers create a new one before continuing.
    """
    # Read the size first: the count is only available before deletion.
    n_documents = collection.count()
    print(f"Deleting data from collection {collection_name} with {n_documents} documents")
    client.delete_collection(collection_name)

def get_random_samples_from_collection(collection, n_samples):
    """Fetch `n_samples` distinct, randomly chosen records from a collection.

    Args:
        collection: Chroma collection (anything exposing a compatible `get`).
        n_samples: number of distinct records to draw.

    Returns:
        The result of `collection.get(ids=<sampled ids>)`.

    Raises:
        ValueError: from `np.random.choice` when `n_samples` exceeds the
            number of ids in the collection (sampling is without replacement).
    """
    # Only the ids are needed to draw the sample. `include=[]` asks Chroma to
    # return ids alone instead of also loading every document and metadata
    # record of the whole collection.
    collection_ids = collection.get(include=[])["ids"]
    random_ids = np.random.choice(collection_ids, n_samples, replace=False).tolist()
    return collection.get(ids=random_ids)

def upsert_data(collection, arxiv_df, metadatas, batch_size):
    """Upsert all papers into the collection in consecutive batches.

    Args:
        collection: target collection exposing `upsert(documents, ids, metadatas)`.
        arxiv_df: DataFrame providing the 'text' (document) and 'id' columns.
        metadatas: list of metadata dicts, positionally aligned with the rows
            of `arxiv_df`.
        batch_size: maximum number of rows sent per `upsert` call.
    """
    for start in tqdm(range(0, len(arxiv_df), batch_size)):
        # The same positional window is applied to documents, ids and metadata
        # so the three lists stay aligned; the last batch may be shorter.
        window = slice(start, start + batch_size)
        batch_documents = arxiv_df['text'].iloc[window].tolist()
        batch_ids = arxiv_df['id'].iloc[window].tolist()
        collection.upsert(
            documents=batch_documents,
            ids=batch_ids,
            metadatas=metadatas[window],
        )

In [23]:
# Build the embedding function matching the `model_style` chosen in the config cell.
if model_style == "sentence_transformers":
    embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=EMBED_MODEL,
        # Use the GPU when one is available, otherwise fall back to the CPU so
        # the notebook still runs on machines without CUDA.
        device="cuda" if torch.cuda.is_available() else "cpu",
        cache_folder=cache_dir
    )
elif model_style == "lmstudio":
    class Embedder(EmbeddingFunction):
        """Embedding function that calls an OpenAI-compatible embeddings endpoint.

        The endpoint is the local server at http://localhost:5000/v1 and the
        model name is taken from EMBED_MODEL.
        """

        def __init__(self):
            # NOTE(review): "lm-studio" looks like a placeholder key for the
            # local server rather than a real credential; confirm.
            self.client = OpenAI(base_url="http://localhost:5000/v1", api_key="lm-studio")
            self.model = EMBED_MODEL

        def __call__(self, input:Documents) -> Embeddings:
            """Return one embedding per input document, in the order the API returns them."""
            return [d.embedding for d in self.client.embeddings.create(input = input, model=self.model).data]

    embedding_func = Embedder()
else:
    # Fail here with a clear message instead of a NameError on `embedding_func`
    # in a later cell.
    raise ValueError(
        f"Unknown model_style {model_style!r}; expected 'sentence_transformers' or 'lmstudio'"
    )



In [24]:
# Open (or create) the on-disk Chroma store for the selected embedding model
# and get a handle on the collection.
client = chromadb.PersistentClient(path=CHROMA_DATA_PATH)

collection = create_collection(client, COLLECTION_NAME, embedding_func)

########################################
######## WARNING: DELETES DATA #########
########################################
if DO_DELETE_CHROMA_DATA and input("Do you want to delete all data in the collection? (y/n): ") == "y":
    ##### start fresh: drop the collection, then create an empty one under the same name
    delete_collection_data(client, collection, COLLECTION_NAME)
    collection = create_collection(client, COLLECTION_NAME, embedding_func)

# Populate whenever the collection is empty: right after a deletion, but also
# on a first run where deletion was declined or disabled. Previously the upsert
# only ran inside the delete branch, so a fresh store stayed empty and every
# later query returned nothing. A collection that already holds data is kept.
if collection.count() == 0:
    ##### create metadatas
    metadatas = create_metadatas(data_df)

    ##### upsert data (insert or update if exists)
    upsert_data(collection, data_df, metadatas, BATCH_SIZE)

Deleting data from collection arxiv_papers with 20000 documents


  0%|          | 0/2 [00:00<?, ?it/s]

In [45]:
# Pick one random paper and query the collection with its LLM-rewritten text.
sample_data = data_df.sample(1)
sample_row = sample_data.iloc[0]
sample_id = sample_row['id']
sample_llm_text = sample_row['rewritten_text']

print(f"Sample ID: {sample_id}")
print(f"Sample LLM Text: {sample_llm_text}")
print(f"Sample Text: {sample_row['text']}")

top_n_papers = 5
query_results = collection.query(query_texts=[sample_llm_text], n_results=top_n_papers)

# Results are nested one list per query text; a single query was sent, so
# index 0 holds its hits for each returned field.
hit_fields = [query_results[field][0] for field in ("ids", "documents", "distances", "metadatas")]
for _id, _doc, _dist, _meta in zip(*hit_fields):
    print(f"#####   ID: {_id}   #####")

Sample ID: 1605.08039
Sample LLM Text: "Dark Matter searches via Higgs boson decay at LHC"
Sample Text: Boosting invisible searches via $\boldsymbol{ZH}$: From the Higgs Boson to Dark Matter Simplified Models [SEP] Higgs boson production in association with a $Z$-boson at the LHC is analysed, both in the Standard Model and in Simplified Model extensions for Dark Matter. We focus on $H\rightarrow$invisibles searches and show that loop-induced components for both the signal and background present phenomenologically relevant contributions to the $\mathcal{BR}(H\rightarrow\textit{inv})$ limits. In addition, the constraining power of this channel to Simplified Models for Dark Matter with scalar and pseudo-scalar mediators $\phi$ and $A$ is discussed and compared with non-collider constraints. We find that with $100~fb^{-1}$ of LHC data, this channel provides competitive constraints to the non-collider bounds, for most of the parameter space we consider, bounding the universal Standard Model

In [36]:
# For every paper, query with its LLM-rewritten text and record at which
# 1-based position the paper itself shows up among the top results.
top_n_papers = 20
matches = []
for row_label, paper in tqdm(data_df.iterrows(), total=len(data_df)):
    paper_id = paper['id']
    paper_llm_text = paper['rewritten_text']
    query_results = collection.query(query_texts=[paper_llm_text], n_results=top_n_papers)

    # First position whose id equals the paper's id; -1 marks "not among the results".
    found_n = next(
        (rank for rank, result_id in enumerate(query_results["ids"][0], 1) if result_id == paper_id),
        -1,
    )

    matches.append((paper_id, found_n))

  0%|          | 0/3000 [00:00<?, ?it/s]

In [46]:
matches_df = pd.DataFrame(matches, columns=['id', 'found_n'])
# -1 is the "not found in the results" sentinel; store it as NaN instead.
# Casting to float first gives the column a dtype that can hold NaN.
matches_df['found_n'] = matches_df['found_n'].astype(float).replace(-1, np.nan)

# Drop any `found_n` left over from an earlier run of this cell. Without this,
# re-running the cell makes the merge emit `found_n_x` / `found_n_y` and the
# later cells that read `found_n` fail with a KeyError.
data_df = (
    data_df
    .drop(columns='found_n', errors='ignore')
    .merge(matches_df, on='id', how='inner')
    .reset_index(drop=True)
)

In [48]:
# Preview the first rows of data_df after the merge that adds the `found_n` column.
data_df.head()

Unnamed: 0,id,title,abstract,categories,update_date,title_words,abstract_words,mapped_categories,amount_categories,update_year,...,amount_super_categories,rewritten_text,removed_text_25,removed_text_50,removed_text_75,removed_text_25_shuffled,removed_text_50_shuffled,removed_text_75_shuffled,text,found_n
0,cs/0703062,Bandit Algorithms for Tree Search,Bandit based methods for tree search have re...,[cs.LG],2016-08-14,5,223,[Machine Learning],1,2016,...,1,"""Optimizing Tree Search Algorithms for Efficie...",Bandit based methods for tree search have gain...,Bandit based tree have recently to e.g. game g...,"for recently to game (Gelly al., 2006). (Kocsi...",rewards effective O(exp(exp(D))) algorithms se...,"UpperConfidence to cases, a We sub-optimal Smo...","of possible scalesexponentially performed al.,...",Bandit Algorithms for Tree Search [SEP] Bandit...,1.0
1,1410.7743,Data Driven Authentication: On the Effectivene...,"We propose a lightweight, and temporally and...",[cs.CR],2014-10-29,14,128,[Cryptography and Security],1,2014,...,1,"""Behavioral Biometric Authentication Using Mac...","We propose lightweight, temporally and spatial...",propose a temporally user technique sensor-bas...,propose and aware user for the sufficiently fr...,"data sufficiently thebackground, We norm,actio...","the model switches propose norm,actions capabi...",duration investigate drift. expected propose o...,Data Driven Authentication: On the Effectivene...,19.0
2,2306.13023,AugDMC: Data Augmentation Guided Deep Multiple...,Clustering aims to group similar objects tog...,[cs.CV],2023-06-23,7,218,[Computer Vision and Pattern Recognition],1,2023,...,1,"""Discovering multiple perspectives in complex ...",Clustering aims to group similar objects toget...,Clustering aims group objects together separat...,aims while apart. an manner. provide only a cl...,dataaugmentations can the aims prototype-based...,prototype-based methods methods aspects augmen...,method. different as independent aspect in fro...,AugDMC: Data Augmentation Guided Deep Multiple...,
3,2209.15157,Rethinking and Recomputing the Value of ML Models,"In this paper, we argue that the way we have...","[cs.LG, cs.AI]",2022-10-03,8,144,"[Machine Learning, Artificial Intelligence]",2,2022,...,1,"""Contextualizing Machine Learning for Organiza...",this argue that the way have been training and...,this argue way we training MLmodels largely fo...,the evaluating fact that applied in an as valu...,are some role learning an show learn. societal...,andprovide practices are MLmodels MLmodels we ...,a models models change different different on ...,Rethinking and Recomputing the Value of ML Mod...,1.0
4,2403.19969,"Separate, Dynamic and Differentiable (SMART) P...",Deep Neural Network (DNN) pruning has emerge...,"[cs.CV, cs.LG]",2024-04-01,14,161,"[Computer Vision and Pattern Recognition, Mach...",2,2024,...,1,"""Optimizing Neural Network Pruning for Efficie...",Neural Network (DNN) pruning has as a key stra...,Deep (DNN) has emerged a to improve inference ...,"Deep has emerged latency, power techniques, ou...",across pruning demonstrating (SMART) pruner.Th...,"results, Deep consumption to accelerating task...",a output forweight parameter tasks various our...,"Separate, Dynamic and Differentiable (SMART) P...",2.0


In [49]:
# show text where the model did not find the paper in the top 20
# (`found_n` is NaN for exactly those papers)
not_found_df = data_df[data_df['found_n'].isna()]

if not_found_df.empty:
    # DataFrame.sample(1) raises ValueError on an empty frame, which happens
    # when every paper was retrieved; report that instead of crashing.
    print("No papers with a missing rank: every paper was found in the retrieved results.")
else:
    sample_not_found = not_found_df.sample(1)
    sample_not_found_id = sample_not_found['id'].values[0]
    sample_not_found_llm_text = sample_not_found['rewritten_text'].values[0]
    sample_not_found_text = sample_not_found['text'].values[0]

    print(f"Sample ID: {sample_not_found_id}")
    print(f"Sample LLM Text: {sample_not_found_llm_text}")
    print(f"Sample Text: {sample_not_found_text}")

Sample ID: 1406.3183
Sample LLM Text: "Nonlinear Gaussian model sampling with approximate Gaussian flows and importance weights for efficient Bayesian inference."
Sample Text: Approximations of the Optimal Importance Density using Gaussian Particle Flow Importance Sampling [SEP] Recently developed particle flow algorithms provide an alternative to importance sampling for drawing particles from a posterior distribution, and a number of particle filters based on this principle have been proposed. Samples are drawn from the prior and then moved according to some dynamics over an interval of pseudo-time such that their final values are distributed according to the desired posterior. In practice, implementing a particle flow sampler requires multiple layers of approximation, with the result that the final samples do not in general have the correct posterior distribution. In this paper we consider using an approximate Gaussian flow for sampling with a class of nonlinear Gaussian models. We u

In [51]:
# Top-k accuracy: percentage of papers whose own entry was returned within the
# first k results. NaN ranks (paper not retrieved) compare False in every
# mask, so they count as misses.
total_papers = data_df.shape[0]
found_rank = data_df['found_n']

hit_masks = {
    1: found_rank == 1,
    3: found_rank <= 3,
    5: found_rank <= 5,
    20: found_rank <= 20,
}
accuracy_by_k = {k: int(mask.sum()) / total_papers * 100 for k, mask in hit_masks.items()}

top_1_accuracy = accuracy_by_k[1]
top_3_accuracy = accuracy_by_k[3]
top_5_accuracy = accuracy_by_k[5]
top_20_accuracy = accuracy_by_k[20]

for k, accuracy in accuracy_by_k.items():
    print(f"Top {k} Accuracy: {accuracy:.2f}%")

Top 1 Accuracy: 67.40%
Top 3 Accuracy: 80.77%
Top 5 Accuracy: 84.30%
Top 20 Accuracy: 91.03%
