In [1]:
import pandas as pd

In [2]:
# Load the source CSV; the path is relative to the current working directory
df = pd.read_csv('data/arxiv_large.csv')

In [3]:
# (rows, columns) of the frame as loaded
df.shape

(665800, 6)

In [4]:
# Preview the first rows to check the column layout
df.head()

Unnamed: 0,title,summary,comment,authors,category,split
0,Gamma-Ray Bursts as the Death Throes of Massiv...,It is proposed that gamma-ray bursts are creat...,14 pages,"Ramesh Narayan, Bohdan Paczyński, Tsvi Piran",physics,train
1,Gravitational Lensing and the Variability of G,The four observables associated with gravitati...,13 pages plus figures (not included),"Lawrence Krauss, Martin White",physics,test
2,The Ptolemaic Gamma-Ray Burst Universe,The BATSE experiment on GRO has demonstrated t...,10 pages (Replaced to provide omitted line.),J. I. Katz,physics,train
3,Expanding Photospheres of Type II Supernovae a...,We use the Expanding Photosphere Method to det...,21 pages,"B P Schmidt, R P Kirshner, R G Eastman",physics,val
4,Radiation Transfer in Gamma-Ray Bursts,We have calculated gamma-ray radiative transpo...,24 pages,"B. J. Carrigan, J. I. Katz",physics,train


In [5]:
# Function to count whitespace-separated words in a value
def word_count(text):
    """Return the number of whitespace-separated words in ``text``.

    Missing values (``None``, ``NaN``, ``pd.NA``) count as zero words.
    Stringifying them would yield ``"nan"`` / ``"None"`` and add one
    phantom word to every row that has an empty field.

    Any other non-string value is converted with ``str`` before counting.
    """
    # Fast path: the overwhelmingly common case is a real string.
    if isinstance(text, str):
        return len(text.split())
    # is_scalar guards pd.isna, which returns an array for list-like input.
    if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
        return 0
    return len(str(text).split())

# Keep only rows whose title + summary + comment + authors contain fewer than
# 500 words in total. The per-row total lives only inside this chain: it is
# attached as a temporary column, used for the filter, then dropped again
# before the index is renumbered from zero.
df = (
    df.assign(
        total_word_count=lambda frame: sum(
            frame[column].apply(word_count)
            for column in ('title', 'summary', 'comment', 'authors')
        )
    )
    .loc[lambda frame: frame['total_word_count'] < 500]
    .drop(columns='total_word_count')
    .reset_index(drop=True)
)

print(f"Shape after filtering: {df.shape}")


Shape after filtering: (665510, 6)


In [6]:
# Partition the frame into one DataFrame per value of the 'split' column
df_train, df_test, df_val = (
    df[df['split'] == label] for label in ('train', 'test', 'val')
)

In [7]:
# (rows, columns) of the train, test and val subsets
df_train.shape, df_test.shape, df_val.shape

((419279, 6), (66551, 6), (179680, 6))

In [8]:
# Randomly select 25% of each split. The fraction and seed are named once so
# the comment and the three calls cannot drift apart; the fixed seed makes the
# subsample reproducible across runs.
SAMPLE_FRAC = 0.25
RANDOM_STATE = 42

df_train_sample = df_train.sample(frac=SAMPLE_FRAC, random_state=RANDOM_STATE)
df_test_sample = df_test.sample(frac=SAMPLE_FRAC, random_state=RANDOM_STATE)
df_val_sample = df_val.sample(frac=SAMPLE_FRAC, random_state=RANDOM_STATE)

In [9]:
# (rows, columns) of the three subsamples
df_train_sample.shape, df_test_sample.shape, df_val_sample.shape

((104820, 6), (16638, 6), (44920, 6))

In [10]:
# Stack the three subsamples row-wise (train, then test, then val) and
# renumber the rows from zero so the index no longer carries the positions
# the rows had before sampling.
df_combined = (
    pd.concat([df_train_sample, df_test_sample, df_val_sample])
    .reset_index(drop=True)
)

In [11]:
# (rows, columns) of the combined subsample
df_combined.shape

(166378, 6)

In [12]:
# Preview the first rows of the combined subsample
df_combined.head()

Unnamed: 0,title,summary,comment,authors,category,split
0,Bayesian inversion for electromyography using ...,The reconstruction of the structure of biologi...,,"Anna Rörich, Tim A. Werthmann, Dominik Göddeke...",mathematics,train
1,A reassessment of the kinematics of PV Cephei ...,We present two Very Large Array observations o...,"15 pages, 2 figures; accepted for publication ...","L. Loinard, L. F. Rodriguez, L. Gomez, J. Cant...",physics,train
2,A new minimal non-$σ$-scattered linear order,We will show it is consistent with $GCH$ that ...,,Hossein Lamei Ramandi,mathematics,train
3,Proper phase imprinting method for a dark soli...,It is common knowledge that a dark soliton can...,"5 pages, 2 figures, version accepted for publi...","Krzysztof Sacha, Dominique Delande",physics,train
4,The impact of primordial supersonic flows on e...,Tseliakhovich & Hirata recently discovered tha...,Paulo maiora canamus - Accepted for publicatio...,"Umberto Maio, Leon V. E. Koopmans, Benedetta C...",physics,train


In [13]:
# Write the combined subsample to disk as CSV; index=False keeps the row
# index out of the file so only the data columns are stored.
output_file = './data/arxiv_small.csv'
df_combined.to_csv(path_or_buf=output_file, index=False)

print(f"Data saved to {output_file}")

Data saved to ./data/arxiv_small.csv
