In [129]:
import pandas as pd
from pathlib import Path
import numpy as np
import yaml

## Assemble Dataset

# (!) Crucial. Include all test-examples

#### `1536` papers = 256 × 6 (256 per publisher)
50% of them (768) come from the `test` subset

In [130]:
# Read the official split file; it maps each subset name to a list of paths.
with open('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/split_official/pymupdf.yaml', 'r') as f:
    subsets = yaml.safe_load(f)

# Invert the mapping into a flat lookup: `path` -> subset name.
# If a path were listed under several subsets, the last one read would win.
path_to_subset = {
    doc_path: subset_name
    for subset_name, doc_paths in subsets.items()
    for doc_path in doc_paths
}


In [144]:
save_1536_flag = False  # 2nd October; set to True to (re)write the CSV at the end of this cell

# 1536 dataset: `n_per_publisher` rows per publisher, about `test_ratio` of them from the `test` subset
df_categories_final = pd.read_csv('./final_predicted_meta/predicted_categories_final.csv', sep='|')

# Attach the subset of every path; rows whose path is not in the split file are dropped
df_categories_final['subset'] = df_categories_final['path'].map(path_to_subset)
df_categories_final = df_categories_final[df_categories_final['subset'].notna()]

# One sampled sub-frame per publisher is collected here
sampled_subframes = []

# Fixed seed so the sample is reproducible.
# NOTE(review): the original comment said this seed "ensures no duplicates"
# (presumably of `path`); nothing in this cell enforces that -- verify downstream.
seedVal = 316484
test_ratio = 0.5
n_per_publisher = 256  # rows to draw per publisher

for publisher, group in df_categories_final.groupby('publisher'):
    if len(group) < n_per_publisher:
        # Too few rows to sample from: keep the whole group
        sampled_subframes.append(group)
        continue

    # Separate 'test' rows and all other rows
    test_group = group[group['subset'] == 'test']
    other_group = group[group['subset'] != 'test']

    # Aim for `test_ratio` test rows, capped by what is available
    n_test = min(int(n_per_publisher * test_ratio), len(test_group))
    # Never request more non-test rows than exist (the original raised a
    # ValueError in `sample` in that case) ...
    n_other = min(n_per_publisher - n_test, len(other_group))
    # ... and top up with test rows so the group still contributes exactly
    # `n_per_publisher` rows. This always fits because len(group) >= n_per_publisher.
    n_test = n_per_publisher - n_other

    sampled_test = test_group.sample(n=n_test, random_state=seedVal)
    if n_other > 0:
        sampled_other = other_group.sample(n=n_other, random_state=seedVal)
    else:
        # Empty slice keeps the column layout intact for the concat below
        sampled_other = other_group.iloc[:0]

    sampled_subframes.append(pd.concat([sampled_test, sampled_other]))

# Concatenate all sampled subframes row-wise into one DataFrame
df_sampled = pd.concat(sampled_subframes, axis=0, ignore_index=True)

# Class labels present in the sample and how often each occurs
freq, counts = np.unique(df_sampled['class'], return_counts=True)

# store
if save_1536_flag:
    df_sampled.to_csv('./testset_1536/df_1536.csv', sep='|', index=False)

In [146]:
# Sanity check of the sample: rows per publisher, number of distinct paths, rows per subset
np.unique(df_sampled['publisher'], return_counts=True), len(set(df_sampled['path'])), np.unique(df_sampled['subset'], return_counts=True)

((array(['Nature', 'arxiv', 'biorxiv', 'bmc', 'mdpi', 'medrxiv'],
        dtype=object),
  array([256, 256, 256, 256, 256, 256])),
 1536,
 (array(['test', 'train', 'val'], dtype=object), array([768, 501, 267])))

In [None]:
# Number of distinct paths in the current sample
len(set(df_sampled['path']))

## 10_000 papers (exactly 10,240)

### split
test: `35%`, val: `11%` (as much as possible)

### publishers
Nature: `9.3%`, MedRXiv: `14.1%`, BMC: `16%`, all others (arXiv, bioRxiv, MDPI): `20.2%` each

In [206]:
### 10240 (~10k)

save_10k_flag = False  # 2nd October; set to True to (re)write the CSV at the end of this cell

# 10_240 dataset: up to `k` rows per publisher, about `test_ratio` of them from the `test` subset
df_categories_final = pd.read_csv('./final_predicted_meta/predicted_categories_final.csv', sep='|')

# Attach the subset of every path; rows whose path is not in the split file are dropped
df_categories_final['subset'] = df_categories_final['path'].map(path_to_subset)
df_categories_final = df_categories_final[df_categories_final['subset'].notna()]

# One sampled sub-frame per publisher is collected here
sampled_subframes = []

# Fixed seed so the sample is reproducible.
# NOTE(review): the original comment said this seed "ensures no duplicates";
# duplicates of `path` are dropped explicitly further down.
seedVal = 316484
test_ratio = 0.59
k = 2068         # rows to draw per publisher that has at least this many rows
n_total = 10240  # final size of the dataset

for publisher, group in df_categories_final.groupby('publisher'):
    if len(group) < k:
        # Fewer than `k` rows available: keep the whole group
        sampled_subframes.append(group)
        continue

    # Separate 'test' rows and all other rows
    test_group = group[group['subset'] == 'test']
    other_group = group[group['subset'] != 'test']

    # Aim for `test_ratio` test rows, capped by what is available
    n_test = min(int(k * test_ratio), len(test_group))
    # Never request more non-test rows than exist (the original raised a
    # ValueError in `sample` in that case) ...
    n_other = min(k - n_test, len(other_group))
    # ... and top up with test rows so the group still contributes exactly
    # `k` rows. This always fits because len(group) >= k.
    n_test = k - n_other

    sampled_test = test_group.sample(n=n_test, random_state=seedVal)
    if n_other > 0:
        sampled_other = other_group.sample(n=n_other, random_state=seedVal)
    else:
        # Empty slice keeps the column layout intact for the concat below
        sampled_other = other_group.iloc[:0]

    sampled_subframes.append(pd.concat([sampled_test, sampled_other]))


# Concatenate all sampled subframes row-wise into one DataFrame
df_sampled = pd.concat(sampled_subframes, axis=0, ignore_index=True)

# Class labels present in the sample and how often each occurs
freq, counts = np.unique(df_sampled['class'], return_counts=True)

# Keep a single row per path
df_sampled_unique = df_sampled.drop_duplicates(subset='path')

# Keep only the first `n_total` rows. Rows are ordered by publisher here, so
# any truncation removes rows from the publishers that come last.
df_sampled = df_sampled_unique.head(n_total)

# store
if save_10k_flag:
    df_sampled.to_csv('./testset_10240/df_10240.csv', sep='|', index=False)

In [207]:
# Sanity check: row count, distinct paths, rows per publisher, distinct paths (shown a second time), rows per subset
len(df_sampled), len(set(df_sampled['path'])), np.unique(df_sampled['publisher'], return_counts=True), len(set(df_sampled['path'])), np.unique(df_sampled['subset'], return_counts=True)

(10240,
 10240,
 (array(['Nature', 'arxiv', 'biorxiv', 'bmc', 'mdpi', 'medrxiv'],
        dtype=object), array([ 959, 2068, 2068, 1634, 2068, 1443])),
 10240,
 (array(['test', 'train', 'val'], dtype=object), array([2688, 6524, 1028])))