In [1]:
!pip install python-dotenv
!pip install kaggle
!pip install hopsworks
!pip install -q "hopsworks[python]"
!pip install sentence-transformers

from dotenv import load_dotenv
import os

load_dotenv()  # looks for .env in current directory

os.environ["KAGGLE_USERNAME"] = "kingaanna"
os.environ["KAGGLE_KEY"] = os.getenv("KAGGLE_API_TOKEN")
os.environ["HOPSWORKS_API_KEY"] = os.getenv("HOPSWORKS_API_KEY")
os.environ["HOPSWORKS_PROJECT"] = "kingaedwin"

if not os.environ.get("KAGGLE_KEY"):
    raise RuntimeError(
        "KAGGLE_API_TOKEN not found. Create a .env file with your Kaggle API key."
    )

if not os.environ.get("HOPSWORKS_API_KEY"):
    raise RuntimeError(
        "HOPSWORKS_API_KEY not found. Create a .env file with your Hopsworks API key."
    )




# Log in to hopsworks project

In [2]:
import hopsworks

project = hopsworks.login()



To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'



Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1286343


# DOWNLOAD and UNZIP KAGGLE DATASET

In [5]:
!kaggle datasets download -d Cornell-University/arxiv

Dataset URL: https://www.kaggle.com/datasets/Cornell-University/arxiv
License(s): CC0-1.0
arxiv.zip: Skipping, found more recently modified local copy (use --force to force download)


In [6]:
import zipfile
from pathlib import Path

zip_path = Path("arxiv.zip")
extract_dir = Path("arxiv_historical_data")

extract_dir.mkdir(exist_ok=True)

with zipfile.ZipFile(zip_path, "r") as zip_ref:
    zip_ref.extractall(extract_dir)


# Read data into pandas dataframe and clean up

In [30]:
import pandas as pd

json_path = extract_dir / "arxiv-metadata-oai-snapshot.json"

df_full = pd.read_json(
    json_path,
    lines=True,
    nrows=10_000  # adjust as needed
)

df_full.head()

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed
0,704.0001,Pavel Nadolsky,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","Phys.Rev.D76:013009,2007",10.1103/PhysRevD.76.013009,ANL-HEP-PR-07-12,hep-ph,,A fully differential calculation in perturba...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2008-11-26,"[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,..."
1,704.0002,Louis Theran,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,To appear in Graphs and Combinatorics,,,,math.CO cs.CG,http://arxiv.org/licenses/nonexclusive-distrib...,"We describe a new algorithm, the $(k,\ell)$-...","[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2008-12-13,"[[Streinu, Ileana, ], [Theran, Louis, ]]"
2,704.0003,Hongjun Pan,Hongjun Pan,The evolution of the Earth-Moon system based o...,"23 pages, 3 figures",,,,physics.gen-ph,,The evolution of Earth-Moon system is descri...,"[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",2008-01-13,"[[Pan, Hongjun, ]]"
3,704.0004,David Callan,David Callan,A determinant of Stirling cycle numbers counts...,11 pages,,,,math.CO,,We show that a determinant of Stirling cycle...,"[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2007-05-23,"[[Callan, David, ]]"
4,704.0005,Alberto Torchinsky,Wael Abu-Shammala and Alberto Torchinsky,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,,"Illinois J. Math. 52 (2008) no.2, 681-689",,,math.CA math.FA,,In this paper we show how to compute the $\L...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2013-10-15,"[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]]"


In [None]:
df_full.info()

In [24]:
#df_full['update_date'] = pd.to_datetime(df_full['update_date'], errors='coerce')

#cutoff_date = pd.Timestamp.now() - pd.DateOffset(years=1)

#df_full[df_full['update_date'] >= cutoff_date]

#df_full = df_full.sort_values(
 #   by=['id', 'update_date'],
 #   ascending=[True, True]
#)

#df_latest = df_full.drop_duplicates(
 #   subset='id',
 #   keep='last'
#).reset_index(drop=True)


In [8]:
df = df_full[['id', 'title', 'categories', 'abstract', 'update-date']].copy()
df.head(15)

Unnamed: 0,id,title,categories,abstract
0,704.0001,Calculation of prompt diphoton production cros...,hep-ph,A fully differential calculation in perturba...
1,704.0002,Sparsity-certifying Graph Decompositions,math.CO cs.CG,"We describe a new algorithm, the $(k,\ell)$-..."
2,704.0003,The evolution of the Earth-Moon system based o...,physics.gen-ph,The evolution of Earth-Moon system is descri...
3,704.0004,A determinant of Stirling cycle numbers counts...,math.CO,We show that a determinant of Stirling cycle...
4,704.0005,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,math.CA math.FA,In this paper we show how to compute the $\L...
5,704.0006,Bosonic characters of atomic Cooper pairs acro...,cond-mat.mes-hall,We study the two-particle wave function of p...
6,704.0007,Polymer Quantum Mechanics and its Continuum Limit,gr-qc,A rather non-standard quantum representation...
7,704.0008,Numerical solution of shock and ramp compressi...,cond-mat.mtrl-sci,A general formulation was developed to repre...
8,704.0009,"The Spitzer c2d Survey of Large, Nearby, Inste...",astro-ph,We discuss the results from the combined IRA...
9,704.001,"Partial cubes: structures, characterizations, ...",math.CO,Partial cubes are isometric subgraphs of hyp...


In [None]:
df.dropna(inplace=True)

# Create FEATURE GROUPS for embeddings

In [13]:
fs = project.get_feature_store()

arxiv_fg = fs.get_or_create_feature_group(
    name='arxiv_embeddings',
    description='Embeddings of titles and abstracts of arXiv papers',
    version=1,
    primary_key=['id'],
)


# Embeddings

In [14]:
from sentence_transformers import SentenceTransformer

# Load a popular embedding model
model = SentenceTransformer('all-MiniLM-L6-v2') # Change if needed

# Your texts to embed
df['text_to_embed'] = df['title'] + ". " + df['abstract']

# Small smaple for testing
# texts = df['text_to_embed'].head(10).tolist()  # first 10 papers
# embeddings = model.encode(texts)

batch_size = 128

for i in range(0, len(df), batch_size):
    batch_texts = df['text_to_embed'].iloc[i:i+batch_size].tolist()
    batch_embeddings = model.encode(batch_texts, show_progress_bar=True)

     # Pair ids with embeddings using the same slice
    arxiv_fg.insert(pd.DataFrame({
        'id': df['id'].iloc[i:i+batch_size],
        'embedding': list(batch_embeddings)
    }))


# print(f"Shape: {embeddings.shape}")
# print(f"First embedding: {embeddings[0][:5]}...")  # First 5 values


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: arxiv_embeddings_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286343/jobs/named/arxiv_embeddings_1_offline_fg_materialization/executions


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00
Use fg.materialization_job.run(args=-op offline_fg_materialization -path hdfs:///Projects/kingaedwin/Resources/jobs/arxiv_embeddings_1_offline_fg_materialization/config_1767443106747) to trigger the materialization job again.


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00
Use fg.materialization_job.run(args=-op offline_fg_materialization -path hdfs:///Projects/kingaedwin/Resources/jobs/arxiv_embeddings_1_offline_fg_materialization/config_1767443106747) to trigger the materialization job again.


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

KeyboardInterrupt: 



---

# **OPTION 2 with CATEGORIES**


# Category mapping?

In [9]:
cat_df = pd.read_csv("arxiv_v2.csv")

# Build lookup dictionaries
sub_to_name = dict(zip(cat_df['code'], cat_df['name']))          # code -> human-readable sub category name
sub_to_main = dict(zip(cat_df['code'], cat_df['main_category'])) # code -> main category

df['sub_categories'] = df['categories'].str.split()  # list of subcategory codes

# Map to sub-category names
df['sub_category_names'] = df['sub_categories'].apply(
    lambda subs: [sub_to_name.get(s, s) for s in subs]  # fallback to code if name not found
)

# Map to main-category names (deduplicated)
df['main_categories'] = df['sub_categories'].apply(
    lambda subs: list({sub_to_main.get(s, 'Other') for s in subs})
)


print(df[['categories', 'sub_category_names', 'main_categories']].head(10))


          categories                                 sub_category_names  \
0             hep-ph              [High Energy Physics - Phenomenology]   
1      math.CO cs.CG            [Combinatorics, Computational Geometry]   
2     physics.gen-ph                                  [General Physics]   
3            math.CO                                    [Combinatorics]   
4    math.CA math.FA  [Classical Analysis and ODEs, Functional Analy...   
5  cond-mat.mes-hall                  [Mesoscale and Nanoscale Physics]   
6              gr-qc         [General Relativity and Quantum Cosmology]   
7  cond-mat.mtrl-sci                                [Materials Science]   
8           astro-ph                                         [astro-ph]   
9            math.CO                                    [Combinatorics]   

                   main_categories  
0                        [Physics]  
1  [Mathematics, Computer Science]  
2                        [Physics]  
3                    [Math

In [38]:
fs = project.get_feature_store()

arxiv_fg_with_cats = fs.get_or_create_feature_group(
    name='arxiv_embeddings_with_cats',
    description='Embeddings of titles and abstracts of arXiv papers with categories',
    version=1,
    primary_key=['id'],
    time_travel_format='hudi'
)

In [39]:
from sentence_transformers import SentenceTransformer

# Load a popular embedding model
model = SentenceTransformer('all-MiniLM-L6-v2') # Change if needed

# Your texts to embed
df['text_to_embed'] = df['title'] + ". " + df['abstract']

# Small smaple for testing
# texts = df['text_to_embed'].head(10).tolist()  # first 10 papers
# embeddings = model.encode(texts)

batch_size = 128

for i in range(0, len(df), batch_size):
    batch_texts = df['text_to_embed'].iloc[i:i+batch_size].tolist()
    batch_embeddings = model.encode(batch_texts, show_progress_bar=True)

     # Pair ids with embeddings using the same slice
    arxiv_fg_with_cats.insert(pd.DataFrame({
        'id': df['id'].iloc[i:i+batch_size],
        'embedding': list(batch_embeddings),
        'categories': df['main_categories'].iloc[i:i+batch_size],
        'sub_categories': df['sub_categories'].iloc[i:i+batch_size]
    }))



Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1286343/fs/1273965/fg/1880614


Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: arxiv_embeddings_with_cats_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286343/jobs/named/arxiv_embeddings_with_cats_1_offline_fg_materialization/executions


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00
Use fg.materialization_job.run(args=-op offline_fg_materialization -path hdfs:///Projects/kingaedwin/Resources/jobs/arxiv_embeddings_with_cats_1_offline_fg_materialization/config_1767457947809) to trigger the materialization job again.


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: arxiv_embeddings_with_cats_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286343/jobs/named/arxiv_embeddings_with_cats_1_offline_fg_materialization/executions


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: arxiv_embeddings_with_cats_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286343/jobs/named/arxiv_embeddings_with_cats_1_offline_fg_materialization/executions


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: arxiv_embeddings_with_cats_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286343/jobs/named/arxiv_embeddings_with_cats_1_offline_fg_materialization/executions


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: arxiv_embeddings_with_cats_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286343/jobs/named/arxiv_embeddings_with_cats_1_offline_fg_materialization/executions


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: arxiv_embeddings_with_cats_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286343/jobs/named/arxiv_embeddings_with_cats_1_offline_fg_materialization/executions


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: arxiv_embeddings_with_cats_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286343/jobs/named/arxiv_embeddings_with_cats_1_offline_fg_materialization/executions


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: arxiv_embeddings_with_cats_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286343/jobs/named/arxiv_embeddings_with_cats_1_offline_fg_materialization/executions


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: arxiv_embeddings_with_cats_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286343/jobs/named/arxiv_embeddings_with_cats_1_offline_fg_materialization/executions


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: arxiv_embeddings_with_cats_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286343/jobs/named/arxiv_embeddings_with_cats_1_offline_fg_materialization/executions


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: arxiv_embeddings_with_cats_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286343/jobs/named/arxiv_embeddings_with_cats_1_offline_fg_materialization/executions


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: arxiv_embeddings_with_cats_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286343/jobs/named/arxiv_embeddings_with_cats_1_offline_fg_materialization/executions


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 128/128 | Elapsed Time: 00:00 | Remaining Time: 00:00


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Uploading Dataframe: 100.00% |██████████| Rows 16/16 | Elapsed Time: 00:00 | Remaining Time: 00:00
