In [1]:
import pandas as pd

In [2]:
# Load the source CSV into `df`; the later cells in this notebook filter,
# split, and sample this frame
df = pd.read_csv('data/arxiv_large.csv')

In [None]:
# Show the (row count, column count) of the loaded frame
df.shape

In [None]:
# Preview the first five rows (the `head()` default) to inspect the columns
df.head()

In [None]:
# Function to count words in a string
def word_count(text):
    """Return the number of whitespace-separated words in ``text``.

    Missing values (``None``, ``NaN``, ``pd.NA``, ``NaT``) count as zero
    words. Without this guard ``str(nan)`` becomes the literal ``"nan"`` and
    every empty cell would contribute one phantom word to the total.

    Any other non-string value is converted with ``str()`` before splitting.

    Parameters
    ----------
    text : object
        Usually a string taken from a DataFrame cell; may be missing.

    Returns
    -------
    int
        Number of tokens produced by ``str.split()`` (runs of whitespace are
        treated as a single separator), or 0 for a missing value.
    """
    # Only test scalars for missingness: pd.isna on a list-like returns an
    # array, which has no single truth value.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return 0
    return len(str(text).split())

# Sum the per-row word counts over every text column, starting from the title
word_totals = df['title'].apply(word_count)
for text_column in ('summary', 'comment', 'authors'):
    word_totals = word_totals + df[text_column].apply(word_count)

# Keep only the rows whose combined count is below 500 words and renumber
# them from zero; the totals live in a standalone Series, so no helper column
# has to be added to (and later dropped from) the frame
df = df[word_totals < 500].reset_index(drop=True)

print(f"Shape after filtering: {df.shape}")


In [6]:
# Partition the frame by its 'split' label into train / test / val subsets
df_train, df_test, df_val = (
    df[df['split'] == split_name] for split_name in ('train', 'test', 'val')
)

In [None]:
# Show the (rows, columns) shape of each split side by side
df_train.shape, df_test.shape, df_val.shape

In [8]:
# Randomly select 12.5% of each split (frac=0.125). The same fraction and
# seed are used for all three splits, so the draw is reproducible and the
# relative split sizes are approximately preserved.
SAMPLE_FRACTION = 0.125
SAMPLE_SEED = 42

df_train_sample = df_train.sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)
df_test_sample = df_test.sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)
df_val_sample = df_val.sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)

In [None]:
# Show the (rows, columns) shape of each sampled subset
df_train_sample.shape, df_test_sample.shape, df_val_sample.shape

In [10]:
# Stack the three sampled splits row-wise, then renumber the rows from zero
# so the combined frame does not carry over the per-split index labels
df_combined = (
    pd.concat([df_train_sample, df_test_sample, df_val_sample], axis=0)
    .reset_index(drop=True)
)

In [None]:
# Show the (row count, column count) of the combined sample
df_combined.shape

In [None]:
# Preview the first five rows (the `head()` default) of the combined sample
df_combined.head()

In [None]:
# Save the combined DataFrame as a CSV file
output_file = './data/arxiv.csv'
# index=False keeps the row index out of the file, so only the data columns
# are written
df_combined.to_csv(output_file, index=False)

# Report the destination path once the write has returned
print(f"Data saved to {output_file}")
