In [10]:
# For every data point in the sample dataset, create variations of the text data in order to evaluate the similarity search.

In [11]:
import json
import os

import chromadb
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn.functional as F
from openai import OpenAI
from chromadb import Documents, EmbeddingFunction, Embeddings
from sentence_transformers import SentenceTransformer
from chromadb.utils import embedding_functions
from tqdm.notebook import tqdm
from transformers import AutoModel, AutoTokenizer

import nltk
# Make sure the NLTK stopword corpus is available locally before it is read below.
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stopwords as a set, for fast membership tests in remove_stopwords.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gabri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# Input: parquet file with the arXiv metadata sample, relative to the notebook.
PARQUET_PATH = '../data/arxiv_metadata_sample.parquet.gzip'
# PARQUET_PATH = r"C:\Users\ihett\OneDrive\Gabrilyi\arxiv_project\arxiv_metadata_sample.parquet.gzip"

# Output: location where the generated evaluation DataFrame is written.
EVAL_DF_PATH = '../data/eval_df.parquet.gzip'

# Model identifier passed as `model` to the chat-completions request.
LLM_MODEL = "LM Studio Community/Meta-Llama-3-8B-Instruct-GGUF"
# Base URL of the local LLM server (same address the OpenAI client is pointed at).
API_URL = "http://localhost:5000/v1"

In [13]:
# Directory for cached artifacts; exist_ok=True makes the cell safe to re-run
# and avoids the check-then-create race of a separate os.path.exists() test.
cache_dir = 'cache'
os.makedirs(cache_dir, exist_ok=True)

# Load the arXiv metadata sample and report its (rows, columns) shape.
arxiv_df = pd.read_parquet(PARQUET_PATH)
print(arxiv_df.shape)

(70000, 13)


In [14]:
# Point to the local server. The address comes from the API_URL constant so it
# is configured in a single place; the api_key value is the placeholder string
# that was already used here.
client = OpenAI(base_url=API_URL, api_key="lm-studio")

In [15]:
# system_prompt = """
# Rewrite the following text. The content and target should be the same (also the topic), but it should be COMPLETLY
# rewritten. Shorten it to a maximum of 1-2 sentences. The text should be unique and not copied from the original text.
# Use different words, sentence structure, and style. Just some general ideas should be the same and the same topic.
# Also, dont write that you are rewriting the text. Just write the text.
# """
# System prompt for the LLM: asks for a short, reworded search term derived
# from an abstract. Default `prompt` argument of get_completion.
system_prompt = """
In the following you will receive an abstract. Use this text to create a short search term.
The search term should have the same topic as the abstract, but summarized in a short sentence or two.
Use different words, sentence structure and style. The keywords you use should be from the same field but should not appear in the original text.
This search term should be assignable to the abstract.
Do not mention that you are rewriting the text! Just write the search term and nothing else.
"""

# Example abstract; default `abstract` argument of get_completion.
text_to_rewrite = """
    Neuroscience research is undergoing a minor revolution. Recent advances in machine learning and
    artificial intelligence (AI) research have opened up new ways of thinking about neural computation.
    Many researchers are excited by the possibility that deep neural networks may offer theories of perception,
    cognition and action for biological brains. This perspective has the potential to radically reshape our approach to
    understanding neural systems, because the computations performed by deep networks are learned from experience,
    not endowed by the researcher. If so, how can neuroscientists use deep networks to model and understand biological
    brains? What is the outlook for neuroscientists who seek to characterise computations or neural codes, or who wish
    to understand perception, attention, memory, and executive functions? In this Perspective, our goal is to offer a
    roadmap for systems neuroscience research in the age of deep learning. We discuss the conceptual and methodological
    challenges of comparing behaviour, learning dynamics, and neural representation in artificial and biological
    systems. We highlight new research questions that have emerged for neuroscience as a direct consequence of recent
    advances in machine learning.
"""

def process_text(text):
    """Normalize whitespace in `text`.

    Leading/trailing whitespace is removed and every run of whitespace
    (spaces, newlines, tabs) is collapsed into a single space.

    The previous implementation chained two ``replace("  ", " ")`` calls,
    which leaves double spaces behind whenever a run is four or more
    characters long (e.g. a newline followed by a four-space indent).

    Args:
        text: The string to normalize.

    Returns:
        The normalized, single-spaced string ("" for empty or blank input).
    """
    # str.split() with no argument splits on any whitespace run and discards
    # empty fields, so re-joining yields exactly one space between words.
    return " ".join(text.split())

def get_completion(abstract=text_to_rewrite, prompt=system_prompt):
    """Request a chat completion for `abstract` under the instructions in `prompt`.

    Both texts are passed through process_text before being sent. The request
    goes to the module-level `client` using the model named by LLM_MODEL.

    Args:
        abstract: User message text; defaults to the sample abstract.
        prompt: System message text; defaults to the search-term prompt.

    Returns:
        The message content of the first choice in the response.
    """
    system_message = {"role": "system", "content": process_text(prompt)}
    user_message = {"role": "user", "content": process_text(abstract)}

    response = client.chat.completions.create(
        model=LLM_MODEL,
        messages=[system_message, user_message],
        temperature=0.4,
    )

    first_choice = response.choices[0]
    return first_choice.message.content

def shuffle_text_words(text):
    """Return the words of `text` in random order, joined by single spaces.

    The text is first normalized with process_text, then split on whitespace.
    The permutation is drawn from NumPy's global random state.
    """
    tokens = process_text(text).split()
    np.random.shuffle(tokens)  # shuffles the list in place
    return " ".join(tokens)

def remove_words(text, p=0.5):
    """Randomly drop words from `text`.

    The text is normalized with process_text and split on whitespace. For each
    word, one uniform draw from NumPy's global random state is made; the word
    is kept only when that draw is greater than `p`.

    Args:
        text: Input text.
        p: Threshold compared against each per-word random draw.

    Returns:
        The surviving words, in original order, joined by single spaces.
    """
    survivors = []
    for token in process_text(text).split():
        if np.random.rand() > p:
            survivors.append(token)
    return " ".join(survivors)

def remove_stopwords(abstract):
    """Return `abstract` without the words found in the `stop_words` set.

    The text is split on whitespace and each token is compared in lower case
    against `stop_words`; tokens that are not stopwords are kept unchanged
    (original casing) and joined by single spaces.
    """
    kept = []
    for token in abstract.split():
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

In [16]:
eval_data = []
max_amount = np.inf  # cap on the number of processed rows; np.inf means "all rows"

# Fix the seed so that the row shuffle and the random word removal/shuffling
# (remove_words and shuffle_text_words draw from NumPy's global RNG) produce
# the same evaluation set on every Restart & Run All.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# shuffle the data
arxiv_df = arxiv_df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

processed_ctr = 0
for idx, row in tqdm(arxiv_df.iterrows(), total=arxiv_df.shape[0]):
    paper_id = row['id']
    abstract = row['abstract']
    # rewritten_text = get_completion(abstract)
    removed_stopwords = remove_stopwords(abstract)
    # Progressively more aggressive random word removal.
    removed_text_25 = remove_words(abstract, p=0.25)
    removed_text_50 = remove_words(abstract, p=0.5)
    removed_text_75 = remove_words(abstract, p=0.75)
    # Shuffled variants reuse the already-reduced texts, so each *_shuffled
    # column contains exactly the same words as its unshuffled counterpart.
    removed_text_25_shuffled = shuffle_text_words(removed_text_25)
    removed_text_50_shuffled = shuffle_text_words(removed_text_50)
    removed_text_75_shuffled = shuffle_text_words(removed_text_75)
    eval_data.append({
        'id': paper_id,
        # 'rewritten_text': rewritten_text,
        'removed_stopwords': removed_stopwords,
        'removed_text_25': removed_text_25,
        'removed_text_50': removed_text_50,
        'removed_text_75': removed_text_75,
        'removed_text_25_shuffled': removed_text_25_shuffled,
        'removed_text_50_shuffled': removed_text_50_shuffled,
        'removed_text_75_shuffled': removed_text_75_shuffled,
    })
    processed_ctr += 1
    if processed_ctr >= max_amount:
        break

# One row per processed paper, one column per text variant.
eval_df = pd.DataFrame(eval_data)

  0%|          | 0/70000 [00:00<?, ?it/s]

In [17]:
# Persist the evaluation set as a gzip-compressed parquet file (row index not stored).
eval_df.to_parquet(EVAL_DF_PATH, index=False, compression='gzip', engine='pyarrow')

In [18]:
# Show the generated evaluation DataFrame as the cell output.
eval_df

Unnamed: 0,id,removed_stopwords,removed_text_25,removed_text_50,removed_text_75,removed_text_25_shuffled,removed_text_50_shuffled,removed_text_75_shuffled
0,2106.08192,"develop mathematical model, based system ordin...","We develop a mathematical model, based on a sy...",We a system ordinary differential upshot alert...,"model, a ordinary of farming administration, b...","field, use that to We control in of model, pes...",that and points equilibria Routh-Hurwitz the l...,of stability in numerical of the interventions...
1,2404.09388,Coherent dissipative interactions different qu...,Coherent and dissipative interactions between ...,interactions between systems are for of hybrid...,"systems systems and we propose analyze system,...",and interactions nearby superconducting and he...,We of quantum for between the quantum qubits a...,"quantum coupling provides system, micromagnet ..."
2,2202.10347,Geographic proximity acknowledged key factor r...,Geographic proximity acknowledged to be a key ...,Geographic acknowledged to be factor in as for...,"is collaborations. can for present ""proximity""...",national bear byline the of it Web The researc...,effectively production in factor policies in S...,"of research different is types, scientific nat..."
3,2209.09589,combination microwave microfluidic technologie...,combination of microwave and has potential ena...,The combination microfluidic has potential to ...,"of and microfluidic has to bioparticles, way u...","which m}$ chamber, of provided achieve of phys...",need setup considered. over sensitivity to the...,to need $10^5$ superheterodyne and perturbatio...
4,2102.12997,power distribution network energy storage syst...,a power distribution network systems and advan...,a power distribution network energy storage (E...,"a network (ESS) controls, schemes not for as c...","methodology knowledge among the and streams, O...",anomalies of scarce. includes are advanced dev...,devices. and active distribution for power a a...
...,...,...,...,...,...,...,...,...
69995,2202.07484,scattering transform non-linear signal represe...,The scattering transform non-linear signal rep...,The scattering transform a method magnitudes. ...,The based In paper we we in scattering first a...,in derivatives magnitudes. transform signal we...,"to we signals, by a first well of of in basis ...",and in by are of we representations which we a...
69996,1910.12666,consider task certification genuine entangleme...,We consider task genuine entanglement triparti...,We the certification entanglement of tripartit...,"We entanglement first present ""all-versus-noth...","purpose, model we generalization proof can we ...","a scenario. this state in pure of ""all-versus-...",device-independent entanglement (GGHZ) steerin...
69997,2401.12934,paper studies offline reinforcement learning l...,This paper studies reinforcement learning with...,paper reinforcement learning linear function a...,offline reinforcement linear approximation wit...,in and policy sparsity. that additional data-g...,set setting process function on structural rew...,process the structural approximation full-stat...
69998,2303.11498,Equilibrium statistical mechanics predicts inv...,Equilibrium statistical mechanics predicts tha...,"Equilibrium statistical two-dimensional, flow ...","statistical that two-dimensional, flow the a w...","established, hold at inviscid, a $n=2$ the ang...",statistical $n=2$ a stated possibility conserv...,a which the rotating initial behavior static t...
