In [1]:
# open arxiv-metadata-oai-snapshot.json to extract the abstract from each line, marked by "abstract": "..."
abstracts = []
with open('arxiv-metadata-oai-snapshot.json') as f:
    # load all lines
    while True:
        line = f.readline()
        if not line:
            break
        abstract = line.split('"abstract":"')[1].split('",')[0].replace('\\n', ' ')
        # print(abstract)
        if abstract[0] == ' ':
            abstract = abstract[1:]
        abstracts.append(abstract)



In [2]:
import pandas as pd
df = pd.DataFrame(abstracts, columns=["text"])


In [3]:
import re
# use df for cleaning, much faster than using list/loop or hf datasets
# remove latex things func
def clean_latex(text):
    # Remove inline math
    text = re.sub(r'\$.*?\$', '', text)
    
    # Remove display math
    text = re.sub(r'\\\[.*?\\\]', '', text, flags=re.DOTALL)
    
    # Remove LaTeX commands
    text = re.sub(r'\\[a-zA-Z]+(?:\[.*?\])?(?:\{.*?\})?', '', text)
    
    # Remove remaining curly braces
    text = re.sub(r'\{|\}', '', text)
    
    # remove backslashes, 2 or more
    text = re.sub(r'\\{2,}', '', text)
    
    text = text.strip()
    return text

# remove duplicates
df = df.drop_duplicates()

# remove latex
df["text"] = df["text"].apply(clean_latex)

# remove all strings shorter than 150 characters
df = df[df["text"].str.len() > 150]

# remove all starting with This paper has been withdrawn
df = df[~df["text"].str.startswith('This submission has been withdrawn')]

# to lower case
df["text"] = df["text"].str.lower()


In [4]:
# get short summary of the data
print(df.describe())

                                                     text
count                                             2492389
unique                                            2492320
top     we calculate the cross section of inclusive di...
freq                                                    2


In [5]:
# get the shortest text length
min_len = df["text"].apply(len).min()
# print all stings of this length
temp_df = df[df["text"].apply(len) == min_len]["text"]
# Set the option to display the entire content of the column
pd.set_option('display.max_colwidth', None)

# Print the DataFrame without the index
print(temp_df.to_string(index=False))


we prove brion's conjecture stating that the closure of the orbit of a self-normalizing spherical subalgebra in the corresponding grassmanian is smooth
we derive the scalar resonance coupling constants of resonance chiral theory from the extended nambu jona-lasinio model by using heat-kernel expansion.
in this paper we determine the number of the meaningful compositions of higher order of the differential operations and gateaux directional derivative.
photonic entanglement has a wide range of applications in quantum computation and communication. here we introduce a new device: the \"photonic module\
by counting the numbers of periodic points of all periods for some interval maps, we obtain infinitely many new congruence identities in number theory.
the hasse-weil-serre bound is improved for curves of low genera over finite fields with discriminant in -3,-4,-7,-8,-11,-19 by studying optimal curves.
assuming that first significant results from lhc become available, this presentation ass

In [6]:
# make df a hf dataset
import datasets
dataset_from_df = datasets.Dataset.from_pandas(df)
# remove the index column
dataset_from_df = dataset_from_df.remove_columns('__index_level_0__')

In [7]:
dataset_from_df

Dataset({
    features: ['text'],
    num_rows: 2492389
})

In [8]:
temp = dataset_from_df
# split into train and test
train = temp.train_test_split(test_size=0.02)
# split test into test and validation
test = train["test"].train_test_split(test_size=0.5)
# aggregate into one dataset
dataset = datasets.DatasetDict({"train": train["train"], "test": test["train"], "validation": test["test"]})

In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2442541
    })
    test: Dataset({
        features: ['text'],
        num_rows: 24924
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 24924
    })
})

In [10]:
# safe the dataset
dataset.save_to_disk("kaggle/input/train-datasets/arxiv_abstracts")

Saving the dataset (0/5 shards):   0%|          | 0/2442541 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/24924 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/24924 [00:00<?, ? examples/s]

In [11]:
data_train = dataset["train"]
data_validation = dataset["validation"]
data_test = dataset["test"]

In [12]:
# tokenize the data to see min length of the text
import os
from dotenv import load_dotenv
import huggingface_hub

load_dotenv()

# load token
hf_token = os.getenv("HF_TOKEN")
huggingface_hub.login(token=hf_token)

# login into the clients
huggingface_hub.login(token=hf_token)

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_id = "google/gemma-2b"

tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
tokenizer.padding_side = 'right'



The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/erik/.cache/huggingface/token
Login successful
The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/erik/.cache/huggingface/token
Login successful


In [13]:
!rm -rf "kaggle/working/temp_data_sets"
!mkdir "kaggle/working/temp_data_sets"

working_dir = "kaggle/working/temp_data_sets/"


In [14]:


max_length = 16
# Map function to apply tokenization and caching - TODO use formatting function to avoid code redudancy
train_data = data_train.map(
    lambda examples: tokenizer(
        examples["text"],
        padding="max_length",    # Pad to the maximum sequence length
        truncation=True,         # Truncate sequences longer than max_length
        max_length=max_length,          # Maximum sequence length
        return_attention_mask=True,  # Return attention masks
        return_tensors="pt"      # Return PyTorch tensors
    ),
    batched=True,
    cache_file_name=working_dir + "vp_train.cache"
)


val_data = data_validation.map(
    lambda examples: tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_attention_mask=True,
        return_tensors="pt"
    ),
    batched=True,
    cache_file_name=working_dir + "vp_valid.cache"
)

test_data = data_test.map(
    lambda examples: tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_attention_mask=True,
        return_tensors="pt"
    ),
    batched=True,
    cache_file_name=working_dir + "vp_test.cache"
)

Map:   0%|          | 0/2442541 [00:00<?, ? examples/s]

Map:   0%|          | 0/24924 [00:00<?, ? examples/s]

Map:   0%|          | 0/24924 [00:00<?, ? examples/s]

In [15]:
import numpy as np

In [16]:
min_enc = 1000
max_enc = 0
for enc_data in [np.array(train_data["input_ids"]), np.array(val_data["input_ids"]), np.array(test_data["input_ids"])]:
    # enc data is 2D array
    # find the min and max length of the encoded data excluding zeros
    for enc in enc_data:
        temp_len = len(enc[enc != 0])
        if temp_len < min_enc:
            min_enc = temp_len
        if temp_len > max_enc:
            max_enc = temp_len

In [17]:
min_enc, max_enc

(16, 16)