In [1]:
!pip install python-dotenv
!pip install kaggle
!pip install hopsworks
!pip install -q "hopsworks[python]==4.2.*"
!pip install -q feedparser sentence-transformers "hopsworks[python]==4.2.*"

from dotenv import load_dotenv
import os

if not load_dotenv():  # looks for .env in current directory
    raise RuntimeError("No .env file found")

os.environ["KAGGLE_USERNAME"] = "kingaanna"
os.environ["KAGGLE_KEY"] = os.getenv("KAGGLE_API_TOKEN")
os.environ["HOPSWORKS_API_KEY"] = os.getenv("HOPSWORKS_API_KEY")
os.environ["HOPSWORKS_PROJECT"] = "kingaedwin"

if not os.environ.get("KAGGLE_KEY"):
    raise RuntimeError(
        "KAGGLE_API_TOKEN not found. Create a .env file with your Kaggle API key."
    )

if not os.environ.get("HOPSWORKS_API_KEY"):
    raise RuntimeError(
        "HOPSWORKS_API_KEY not found. Create a .env file with your Hopsworks API key."
    )

# Example: download a CSV from GitHub raw URL
!wget https://raw.githubusercontent.com/Edwinexd/arxiv-rag-agent/refs/heads/master/data/arxiv_v2.csv -O arxiv_v2.csv



--2026-01-04 11:14:54--  https://raw.githubusercontent.com/Edwinexd/arxiv-rag-agent/refs/heads/master/data/arxiv_v2.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 41605 (41K) [text/plain]
Saving to: ‘arxiv_v2.csv’


2026-01-04 11:14:54 (3.08 MB/s) - ‘arxiv_v2.csv’ saved [41605/41605]



# Log in to hopsworks project

In [19]:
import hopsworks

project = hopsworks.login()

Connection closed.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1286343


# DOWNLOAD and UNZIP KAGGLE DATASET

In [3]:
!kaggle datasets download -d Cornell-University/arxiv

Dataset URL: https://www.kaggle.com/datasets/Cornell-University/arxiv
License(s): CC0-1.0
arxiv.zip: Skipping, found more recently modified local copy (use --force to force download)


In [4]:
import zipfile
from pathlib import Path

zip_path = Path("arxiv.zip")
extract_dir = Path("arxiv_historical_data")

extract_dir.mkdir(exist_ok=True)

with zipfile.ZipFile(zip_path, "r") as zip_ref:
    zip_ref.extractall(extract_dir)


# Read data into pandas dataframe and clean up

In [None]:
#import pandas as pd

#json_path = extract_dir / "arxiv-metadata-oai-snapshot.json"

#df_full = pd.read_json(
 #   json_path,
 #   lines=True,
 #   nrows=10_000  # adjust as needed
#)

#df_full.head()

In [5]:
import pandas as pd
from pathlib import Path

json_path = extract_dir / "arxiv-metadata-oai-snapshot.json"

chunk_size = 10_000  # read 10k rows at a time
cutoff_date = pd.Timestamp.now() - pd.DateOffset(years=1)

# Use an empty list to collect recent rows (optional, for small tests)
recent_chunks = []

# pd.read_json supports an iterator with chunksize
for chunk_number, df_chunk in enumerate(pd.read_json(json_path, lines=True, chunksize=chunk_size), 1):
    print(f"Processing chunk {chunk_number} ({len(df_chunk)} rows)")

    # Ensure update_date is datetime
    df_chunk['update_date'] = pd.to_datetime(df_chunk['update_date'], errors='coerce')

    # Keep only papers updated in the last year
    df_chunk = df_chunk[df_chunk['update_date'] >= cutoff_date]

    if not df_chunk.empty:
        recent_chunks.append(df_chunk)

# Concatenate all recent chunks into a single DataFrame
df_full = pd.concat(recent_chunks, ignore_index=True)

print("Total rows from last year:", len(df_full))
df_full.head()


Processing chunk 1 (10000 rows)
Processing chunk 2 (10000 rows)
Processing chunk 3 (10000 rows)
Processing chunk 4 (10000 rows)
Processing chunk 5 (10000 rows)
Processing chunk 6 (10000 rows)
Processing chunk 7 (10000 rows)
Processing chunk 8 (10000 rows)
Processing chunk 9 (10000 rows)
Processing chunk 10 (10000 rows)
Processing chunk 11 (10000 rows)
Processing chunk 12 (10000 rows)
Processing chunk 13 (10000 rows)
Processing chunk 14 (10000 rows)
Processing chunk 15 (10000 rows)
Processing chunk 16 (10000 rows)
Processing chunk 17 (10000 rows)
Processing chunk 18 (10000 rows)
Processing chunk 19 (10000 rows)
Processing chunk 20 (10000 rows)
Processing chunk 21 (10000 rows)
Processing chunk 22 (10000 rows)
Processing chunk 23 (10000 rows)
Processing chunk 24 (10000 rows)
Processing chunk 25 (10000 rows)
Processing chunk 26 (10000 rows)
Processing chunk 27 (10000 rows)
Processing chunk 28 (10000 rows)
Processing chunk 29 (10000 rows)
Processing chunk 30 (10000 rows)
Processing chunk 31

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed
0,704.3502,Cyril Houdayer,Cyril Houdayer,Construction of type ${\rm II_1}$ factors with...,33 pages,"J. reine angew Math. 634 (2009), 169-207",10.1515/CRELLE.2009.072,,math.OA math.GR,http://arxiv.org/licenses/nonexclusive-distrib...,"In the context of Free Probability Theory, w...","[{'version': 'v1', 'created': 'Thu, 26 Apr 200...",2025-07-17,"[[Houdayer, Cyril, ]]"
1,704.3672,Dhananjay Mehendale,Dhananjay P. Mehendale,Hamiltonian Graphs and the Traveling Salesman ...,59 pages. added a new algorithm for K-SAT prob...,,,,math.GM,http://arxiv.org/licenses/nonexclusive-distrib...,A new characterization of Hamiltonian graphs...,"[{'version': 'v1', 'created': 'Fri, 27 Apr 200...",2025-02-26,"[[Mehendale, Dhananjay P., ]]"
2,705.0814,Frederic Mangolte,"Fabrizio Catanese, Fr\'ed\'eric Mangolte",Real singular Del Pezzo surfaces and threefold...,"18 pages, 8 figures, final version to appear i...","Michigan Mathematical Journal 56, 357-373 (2008)",10.1307/mmj/1224783518,,math.AG,,Let W -> X be a real smooth projective three...,"[{'version': 'v1', 'created': 'Sun, 06 May 200...",2025-05-26,"[[Catanese, Fabrizio, ], [Mangolte, Frédéric, ]]"
3,705.1329,Richard J. Mathar,Richard J. Mathar,Third Order Newton's Method for Zernike Polyno...,Version 4 corrects a term in the 3rd line of E...,,,,math.NA cs.NA,http://creativecommons.org/licenses/by-sa/4.0/,The Zernike radial polynomials are a system of...,"[{'version': 'v1', 'created': 'Wed, 09 May 200...",2025-10-13,"[[Mathar, Richard J., ]]"
4,705.1665,Souichi Ishikawa,S. Ishikawa,Spin-dependent three-nucleon force effects on ...,"12 pages, 6 figures, submitted to Phys. Rev. C","Phys.Rev.C75:061002,2007",10.1103/PhysRevC.75.061002,,nucl-th,,We construct a phenomenological three-nucleo...,"[{'version': 'v1', 'created': 'Fri, 11 May 200...",2025-03-20,"[[Ishikawa, S., ]]"


In [22]:
df_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 372314 entries, 0 to 372313
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   id              372314 non-null  object        
 1   submitter       372299 non-null  object        
 2   authors         372314 non-null  object        
 3   title           372314 non-null  object        
 4   comments        220622 non-null  object        
 5   journal-ref     53753 non-null   object        
 6   doi             69694 non-null   object        
 7   report-no       7629 non-null    object        
 8   categories      372314 non-null  object        
 9   license         371657 non-null  object        
 10  abstract        372314 non-null  object        
 11  versions        372314 non-null  object        
 12  update_date     372314 non-null  datetime64[ns]
 13  authors_parsed  372314 non-null  object        
dtypes: datetime64[ns](1), object(13)
mem

In [None]:
#df_full['update_date'] = pd.to_datetime(df_full['update_date'], errors='coerce')

#cutoff_date = pd.Timestamp.now() - pd.DateOffset(years=1)

#df_full[df_full['update_date'] >= cutoff_date]

#df_full = df_full.sort_values(
 #   by=['id', 'update_date'],
 #   ascending=[True, True]
#)

#df_latest = df_full.drop_duplicates(
 #   subset='id',
 #   keep='last'
#).reset_index(drop=True)


In [6]:
df = df_full[['id', 'title', 'categories', 'abstract', 'update_date']].copy()
df.head(15)

Unnamed: 0,id,title,categories,abstract,update_date
0,704.3502,Construction of type ${\rm II_1}$ factors with...,math.OA math.GR,"In the context of Free Probability Theory, w...",2025-07-17
1,704.3672,Hamiltonian Graphs and the Traveling Salesman ...,math.GM,A new characterization of Hamiltonian graphs...,2025-02-26
2,705.0814,Real singular Del Pezzo surfaces and threefold...,math.AG,Let W -> X be a real smooth projective three...,2025-05-26
3,705.1329,Third Order Newton's Method for Zernike Polyno...,math.NA cs.NA,The Zernike radial polynomials are a system of...,2025-10-13
4,705.1665,Spin-dependent three-nucleon force effects on ...,nucl-th,We construct a phenomenological three-nucleo...,2025-03-20
5,705.1862,Detection of CFIRB with AKARI/FIS Deep Observa...,astro-ph astro-ph.GA,The Cosmic Far-Infrared Background (CFIRB) c...,2025-08-26
6,705.2286,An accurate model for genetic hitch-hiking,q-bio.PE,We suggest a simple deterministic approximat...,2025-10-01
7,705.2576,Adjointability of densely defined closed opera...,math.OA math-ph math.FA math.MP,In this notes unbounded regular operators on...,2025-04-29
8,706.0236,Selbstduale Vertexoperatorsuperalgebren und da...,math.QA math.GR,We investigate self-dual vertex operator alg...,2025-10-13
9,706.0357,A new result under the negation of the Riemann...,math.GM,Suppose that the Riemann hypothesis for the Ri...,2025-12-04


In [7]:
df.dropna(inplace=True)

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 372314 entries, 0 to 372313
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   id                  372314 non-null  object        
 1   title               372314 non-null  object        
 2   categories          372314 non-null  object        
 3   abstract            372314 non-null  object        
 4   update_date         372314 non-null  datetime64[ns]
 5   sub_categories      372314 non-null  object        
 6   sub_category_names  372314 non-null  object        
 7   main_categories     372314 non-null  object        
 8   text_to_embed       372314 non-null  object        
dtypes: datetime64[ns](1), object(8)
memory usage: 25.6+ MB


# Create FEATURE GROUPS for embeddings

# **OPTION 1 id+embedding only**

In [14]:
fs = project.get_feature_store()

arxiv_fg = fs.get_or_create_feature_group(
    name='arxiv_embeddings',
    description='Embeddings of titles and abstracts of arXiv papers',
    version=1,
    primary_key=['id'],
)


# Embeddings

In [15]:
from sentence_transformers import SentenceTransformer

# Load a popular embedding model
model = SentenceTransformer('all-MiniLM-L6-v2') # Change if needed

# Your texts to embed
df['text_to_embed'] = df['title'] + ". " + df['abstract']

# Small smaple for testing
# texts = df['text_to_embed'].head(10).tolist()  # first 10 papers
# embeddings = model.encode(texts)

batch_size = 128

for i in range(0, len(df), batch_size):
    batch_texts = df['text_to_embed'].iloc[i:i+batch_size].tolist()
    batch_embeddings = model.encode(batch_texts, show_progress_bar=True)

     # Pair ids with embeddings using the same slice
    arxiv_fg.insert(pd.DataFrame({
        'id': df['id'].iloc[i:i+batch_size],
        'embedding': list(batch_embeddings)
    }))


# print(f"Shape: {embeddings.shape}")
# print(f"First embedding: {embeddings[0][:5]}...")  # First 5 values


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: arxiv_embeddings_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286343/jobs/named/arxiv_embeddings_1_offline_fg_materialization/executions


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00
Use fg.materialization_job.run(args=-op offline_fg_materialization -path hdfs:///Projects/kingaedwin/Resources/jobs/arxiv_embeddings_1_offline_fg_materialization/config_1767443106747) to trigger the materialization job again.


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: arxiv_embeddings_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286343/jobs/named/arxiv_embeddings_1_offline_fg_materialization/executions


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:01 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: arxiv_embeddings_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286343/jobs/named/arxiv_embeddings_1_offline_fg_materialization/executions


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: arxiv_embeddings_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286343/jobs/named/arxiv_embeddings_1_offline_fg_materialization/executions


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: arxiv_embeddings_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286343/jobs/named/arxiv_embeddings_1_offline_fg_materialization/executions


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: arxiv_embeddings_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286343/jobs/named/arxiv_embeddings_1_offline_fg_materialization/executions


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: arxiv_embeddings_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286343/jobs/named/arxiv_embeddings_1_offline_fg_materialization/executions


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: arxiv_embeddings_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286343/jobs/named/arxiv_embeddings_1_offline_fg_materialization/executions


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: arxiv_embeddings_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286343/jobs/named/arxiv_embeddings_1_offline_fg_materialization/executions


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: arxiv_embeddings_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286343/jobs/named/arxiv_embeddings_1_offline_fg_materialization/executions


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: arxiv_embeddings_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286343/jobs/named/arxiv_embeddings_1_offline_fg_materialization/executions


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: arxiv_embeddings_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286343/jobs/named/arxiv_embeddings_1_offline_fg_materialization/executions


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: arxiv_embeddings_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286343/jobs/named/arxiv_embeddings_1_offline_fg_materialization/executions


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 16/16 | Elapsed Time: 00:00 | Remaining Time: 00:00




---

# **OPTION 2 with CATEGORIES**


# Category mapping?

In [8]:
cat_df = pd.read_csv("arxiv_v2.csv")

# Build lookup dictionaries
sub_to_name = dict(zip(cat_df['code'], cat_df['name']))          # code -> human-readable sub category name
sub_to_main = dict(zip(cat_df['code'], cat_df['main_category'])) # code -> main category

df['sub_categories'] = df['categories'].str.split()  # list of subcategory codes

# Map to sub-category names
df['sub_category_names'] = df['sub_categories'].apply(
    lambda subs: [sub_to_name.get(s, s) for s in subs]  # fallback to code if name not found
)

# Map to main-category names (deduplicated)
df['main_categories'] = df['sub_categories'].apply(
    lambda subs: list({sub_to_main.get(s, 'Other') for s in subs})
)


print(df[['categories', 'sub_category_names', 'main_categories']].head(10))


                        categories  \
0                  math.OA math.GR   
1                          math.GM   
2                          math.AG   
3                    math.NA cs.NA   
4                          nucl-th   
5             astro-ph astro-ph.GA   
6                         q-bio.PE   
7  math.OA math-ph math.FA math.MP   
8                  math.QA math.GR   
9                          math.GM   

                                  sub_category_names  \
0                  [Operator Algebras, Group Theory]   
1                              [General Mathematics]   
2                               [Algebraic Geometry]   
3           [Numerical Analysis, Numerical Analysis]   
4                                   [Nuclear Theory]   
5               [astro-ph, Astrophysics of Galaxies]   
6                        [Populations and Evolution]   
7  [Operator Algebras, Mathematical Physics, Func...   
8                    [Quantum Algebra, Group Theory]   
9                    

In [21]:
fs = project.get_feature_store()

arxiv_fg_with_cats = fs.get_or_create_feature_group(
    name='arxiv_embeddings_with_cats',
    description='Embeddings of titles and abstracts of arXiv papers with categories',
    version=1,
    primary_key=['id'],
    time_travel_format='hudi'
)

Checkpointing to Hopsworks

In [22]:
checkpoint_fg = fs.get_or_create_feature_group(
    name='arxiv_checkpoints',
    description='Checkpoints for arxiv_embeddings',
    version=1,
    primary_key=['pipeline'],
    time_travel_format="hudi"
)

In [11]:
checkpoint_fg

<hsfs.feature_group.FeatureGroup at 0x7e70fd4e7590>

In [68]:
# import sys

# Force reinstall the correct hopsworks client version
# !{sys.executable} -m pip install --force-reinstall --no-deps "hopsworks[python]==4.2.*"

# After installation, you might need to restart the runtime for changes to take effect.
# print("Hopsworks client reinstalled. Please restart your Colab runtime (Runtime -> Restart runtime) and then run all cells again starting from the 'Log in to hopsworks project' cell.")

Collecting hopsworks==4.2.* (from hopsworks[python]==4.2.*)
  Using cached hopsworks-4.2.10-py3-none-any.whl.metadata (11 kB)
Using cached hopsworks-4.2.10-py3-none-any.whl (665 kB)
Installing collected packages: hopsworks
  Attempting uninstall: hopsworks
    Found existing installation: hopsworks 4.2.10
    Uninstalling hopsworks-4.2.10:
      Successfully uninstalled hopsworks-4.2.10
Successfully installed hopsworks-4.2.10
Hopsworks client reinstalled. Please restart your Colab runtime (Runtime -> Restart runtime) and then run all cells again starting from the 'Log in to hopsworks project' cell.


In [15]:
checkpoint_fg.insert(pd.DataFrame({
        'pipeline': ["arxiv_embeddings"],
        'last_index': [0]
    }))

Uploading Dataframe: 100.00% |██████████| Rows 1/1 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: arxiv_checkpoints_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286343/jobs/named/arxiv_checkpoints_1_offline_fg_materialization/executions


(Job('arxiv_checkpoints_1_offline_fg_materialization', 'SPARK'), None)

In [23]:
checkpoint_fg.read()

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.53s) 


Unnamed: 0,pipeline,last_index
0,arxiv_embeddings,9728


In [24]:
try:
    checkpoint_df = checkpoint_fg.read()
    if checkpoint_df.empty:
        start_i = 0
    else:
        # Make sure we convert to int, even if stored as string or float
        start_i = int(checkpoint_df["last_index"].max())
except Exception as e:
    print(f"Could not read checkpoint FG: {e}")
    start_i = 0

print(f"Resuming from index {start_i}")


Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.42s) 
Resuming from index 9728


In [None]:
from sentence_transformers import SentenceTransformer

# Load a popular embedding model
model = SentenceTransformer('all-MiniLM-L6-v2') # Change if needed

# Your texts to embed
df['text_to_embed'] = df['title'] + ". " + df['abstract']

# Small smaple for testing
# texts = df['text_to_embed'].head(10).tolist()  # first 10 papers
# embeddings = model.encode(texts)

batch_size = 1024

for i in range(start_i, len(df), batch_size):
    batch_texts = df['text_to_embed'].iloc[i:i+batch_size].tolist()
    batch_embeddings = model.encode(batch_texts, show_progress_bar=True)

     # Pair ids with embeddings using the same slice
    arxiv_fg_with_cats.insert(pd.DataFrame({
        'id': df['id'].iloc[i:i+batch_size],
        'embedding': list(batch_embeddings),
        'categories': df['main_categories'].iloc[i:i+batch_size],
        'sub_categories': df['sub_categories'].iloc[i:i+batch_size]
    }), wait=True)

     # Checkpointing in Hopsworks
    checkpoint_fg.insert(pd.DataFrame({
        "pipeline": ["arxiv_embeddings"],
        "last_index": [i + batch_size]
    }), wait=True)



Batches:   0%|          | 0/313 [00:00<?, ?it/s]