In [1]:
# for every data point in the sample dataset, create variatinons of the text data for the purpose of evaluation of the similarity search

In [51]:
import json
import os

import chromadb
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn.functional as F
from openai import OpenAI
from chromadb import Documents, EmbeddingFunction, Embeddings
from sentence_transformers import SentenceTransformer
from chromadb.utils import embedding_functions
from tqdm.notebook import tqdm
from transformers import AutoModel, AutoTokenizer

In [70]:
PARQUET_PATH = '../data/arxiv_metadata_sample.parquet.gzip'
# PARQUET_PATH = r"C:\Users\ihett\OneDrive\Gabrilyi\arxiv_project\arxiv_metadata_sample.parquet.gzip"

EVAL_DF_PATH = '../data/eval_df.parquet.gzip'

LLM_MODEL = "LM Studio Community/Meta-Llama-3-8B-Instruct-GGUF"
API_URL = "http://localhost:5000/v1"

In [53]:
cache_dir = 'cache'
if not os.path.exists(cache_dir):
    os.makedirs(cache_dir)

arxiv_df = pd.read_parquet(PARQUET_PATH)
print(arxiv_df.shape)

(159136, 13)


In [54]:
# Point to the local server
client = OpenAI(base_url="http://localhost:5000/v1", api_key="lm-studio")

In [57]:
# system_prompt = """
# Rewrite the following text. The content and target should be the same (also the topic), but it should be COMPLETLY
# rewritten. Shorten it to a maximum of 1-2 sentences. The text should be unique and not copied from the original text.
# Use different words, sentence structure, and style. Just some general ideas should be the same and the same topic.
# Also, dont write that you are rewriting the text. Just write the text.
# """
system_prompt = """
In the following you will receive an abstract. Use this text to create a short search term.
The search term should have the same topic as the abstract, but summarized in a short sentence or two.
Use different words, sentence structure and style. The keywords you use should be from the same field but should not appear in the original text.
This search term should be assignable to the abstract.
Do not mention that you are rewriting the text! Just write the search term and nothing else.
"""

text_to_rewrite = """
    Neuroscience research is undergoing a minor revolution. Recent advances in machine learning and
    artificial intelligence (AI) research have opened up new ways of thinking about neural computation.
    Many researchers are excited by the possibility that deep neural networks may offer theories of perception,
    cognition and action for biological brains. This perspective has the potential to radically reshape our approach to
    understanding neural systems, because the computations performed by deep networks are learned from experience,
    not endowed by the researcher. If so, how can neuroscientists use deep networks to model and understand biological
    brains? What is the outlook for neuroscientists who seek to characterise computations or neural codes, or who wish
    to understand perception, attention, memory, and executive functions? In this Perspective, our goal is to offer a
    roadmap for systems neuroscience research in the age of deep learning. We discuss the conceptual and methodological
    challenges of comparing behaviour, learning dynamics, and neural representation in artificial and biological
    systems. We highlight new research questions that have emerged for neuroscience as a direct consequence of recent
    advances in machine learning.
"""

def process_text(text):
    return text.strip().replace("\n", " ").replace("  ", " ").replace("  ", " ")

def get_completion(abstract=text_to_rewrite, prompt=system_prompt):
    completion = client.chat.completions.create(
        model=LLM_MODEL,
        messages=[
            {"role": "system", "content": process_text(prompt)},
            {"role": "user", "content": process_text(abstract)},
        ],
        temperature=0.4,
    )
    return completion.choices[0].message.content

def shuffle_text_words(text):
    text = process_text(text)
    words = text.split()
    np.random.shuffle(words)
    return " ".join(words)

def remove_words(text, p=0.5):
    text = process_text(text)
    words = text.split()
    words = [word for word in words if np.random.rand() > p]
    return " ".join(words)

In [67]:
eval_data = []
max_amount = 3000
# shuffle the data
arxiv_df = arxiv_df.sample(frac=1).reset_index(drop=True)

processed_ctr = 0
for idx, row in tqdm(arxiv_df.iterrows(), total=arxiv_df.shape[0]):
    paper_id = row['id']
    title = row['title']
    abstract = row['abstract']
    rewritten_text = get_completion(abstract)
    removed_text_25 = remove_words(abstract, p=0.25)
    removed_text_50 = remove_words(abstract, p=0.5)
    removed_text_75 = remove_words(abstract, p=0.75)
    removed_text_25_shuffled = shuffle_text_words(removed_text_25)
    removed_text_50_shuffled = shuffle_text_words(removed_text_50)
    removed_text_75_shuffled = shuffle_text_words(removed_text_75)
    eval_data.append({
        'id': paper_id,
        'rewritten_text': rewritten_text,
        'removed_text_25': removed_text_25,
        'removed_text_50': removed_text_50,
        'removed_text_75': removed_text_75,
        'removed_text_25_shuffled': removed_text_25_shuffled,
        'removed_text_50_shuffled': removed_text_50_shuffled,
        'removed_text_75_shuffled': removed_text_75_shuffled,
    })
    processed_ctr += 1
    if processed_ctr >= max_amount:
        break

eval_df = pd.DataFrame(eval_data)

  0%|          | 0/159136 [00:00<?, ?it/s]

In [71]:
eval_df.to_parquet(EVAL_DF_PATH, index=False, compression='gzip', engine='pyarrow')

In [68]:
eval_df

Unnamed: 0,id,rewritten_text,removed_text_25,removed_text_50,removed_text_75,removed_text_25_shuffled,removed_text_50_shuffled,removed_text_75_shuffled
0,1501.01742,"""Optimized LDPC-coded transmission over long d...",An modulation scheme with probabilistic shapin...,coded scheme noniterative demapping Full-field...,LDPC modulation scheme noniterative input.,proposed. An noniterative Full-field input. an...,in simulationsshow uniformlydistributed an dis...,scheme LDPC input. modulation noniterative
1,2108.07639,"""Neural compilation of high-level programming ...",learning has had a significant many fields. Re...,learning had impact many fields. neural models...,has on many models code question whether and T...,of explore how Transformer automatecompilation...,refinementand models have these has learning T...,C Although in encourage models this preliminar...
2,1603.00775,"""Derived discrete algebra Krull-Gabriel dimens...",Let be a derived-discrete algebra. We show tha...,"a algebra. We that $\Lambda$-modules, Cantor-B...",$\Lambda$ derived-discrete that of the its a B...,class the complex extending all theindecomposa...,"that Krause. spectrum, result pure-injective r...",$\Lambda$ so derived-discrete the indecomposab...
3,2205.09555,"""Linearized atmospheric flight modeling for pa...",models that can used for control is of importa...,Obtaining models can is utmost importance toen...,Obtaining that utmost toensure like a this pap...,guidance and where used control not learning n...,is based we control a model are ParafoilReturn...,for a is flight embedding tooptimize Obtaining...
4,1505.04067,"""Double Parton Scattering Effects in Charm Qua...",A few quickly developing of double parton scat...,field of double arediscussed. present that the...,few examples quickly parton the of pairs to We...,of production We also variables on state.In si...,separated W^-$. showthat the the to production...,mechanism the dependence quickly few to single...
...,...,...,...,...,...,...,...,...
2995,2011.14017,"""Estimation of regression parameters in cluste...","We study existence, strong consistency asympto...","We study existence, and ofestimators obtained ...",the consistency obtained from transforms. prob...,of characterize estimating extensions in the t...,The vary which a by analysis estimatingfunctio...,which the for included. dependency analysis to...
2996,2008.11698,"""Higher categorical structures in non-commutat...",introduce the notions of shifted and shifted d...,notions shifted shifted double Poissonstructur...,"bisymplectic double on associative algebras, o...",shifted moduli bisymplectic degree bi-Lagrangi...,on various concentrated non-degenerate moduli ...,underlying are hierarchy onnon-commutative cot...
2997,1304.6822,"""Spectrum sharing strategies for reactive prim...",Opportunistic access (OSA) is technique enabli...,Opportunistic spectrum is key technique enabli...,key technique enabling the (SUs) a (CR) by the...,design as PU adopt secondaryusers finite-horiz...,reactive cognitive problem partially a unoccup...,a derive chain. we someinteresting provided en...
2998,1711.0118,"""Underlying event dynamics in high-energy prot...","In contribution, the results soft concerning t...",the results soft theunderlying event and colle...,"contribution, the results hadron theunderlying...",using event theunderlying presented. the Bose-...,theunderlying are the collected collisions ATL...,collisions correlations theunderlying hadron c...
