In [1]:
import json
import os

import nltk
import pandas as pd
from datasets import load_dataset
from sklearn.utils import shuffle
from tqdm import tqdm

#nltk.download('punkt')

## Load Dataset

In [2]:
# PubMed test split: the document is read from 'article', the reference summary from 'abstract'.
pubmed_features = ['article', 'abstract']
pubmed_ds = load_dataset(
    "ccdv/pubmed-summarization",
    split="test",
)

In [3]:
# GovReport test split: the document is read from 'report', the reference summary from 'summary'.
govreport_features = ['report', 'summary']
govreport_ds = load_dataset(
    "ccdv/govreport-summarization",
    split="test",
)

In [4]:
# CNN/XSum samples come from a local JSON file. Despite the `_df` suffix the
# loaded object is whatever json.load returns, not a DataFrame.
# NOTE(review): partition_dataset indexes each record by column name, so this is
# presumably a list of dicts — confirm against the file.
cnn_xsum_features = ['article', 'summary']
with open("writer_summaries.json", mode="r", encoding="utf-8") as summaries_file:
    cnn_xsum_df = json.load(summaries_file)

In [5]:
# BillSum test split: the document is read from 'text', the reference summary from 'summary'.
billsum_features = ['text', 'summary']
billsum_ds = load_dataset(
    "FiscalNote/billsum",
    split="test",
)

In [6]:
# Newsroom (lil-lab/newsroom) samples, read from a local JSON Lines file:
# one JSON object per line.
file_path = "test-stats.jsonl"
with open(file_path, "r", encoding="utf-8") as f:
    # Skip blank lines (e.g. an extra empty line at the end of the file);
    # json.loads raises JSONDecodeError on an empty string.
    newsroom_ds = [json.loads(line) for line in f if line.strip()]
newsroom_features = ["text","summary"]

### Additional datasets (BigPatent, BookSum) — skipped for now


In [20]:
# BigPatent test split ('description' -> 'abstract').
# NOTE: trust_remote_code=True allows the dataset repository's loading script to
# execute locally; keep it only for repositories you trust.
big_patent_features = ['description','abstract']
big_patent_ds = load_dataset(
    "NortheasternUniversity/big_patent",
    split="test",
    trust_remote_code=True,
)

# BookSum test split ('chapter' -> 'summary_text').
booksum_features = ['chapter','summary_text']
booksum_ds = load_dataset(
    "kmfoda/booksum",
    split="test",
)

## Splitting

In [7]:
def partition_dataset(df, dataset_name, columns, save_results=False):
    """Tag each (document, summary) pair with the length bins its summary falls into.

    Three independent binnings are applied to the reference summary:

    * ``split_word``: the summary token count is within +/-5 of one of
      50, 100, 150, 200, 250, 300.
    * ``split_sentence``: the summary sentence count, when it is between 1 and 6.
    * ``split_ratio``: the first of 1/20, 1/10, 1/5, 1/4, 1/3, 1/2 that lies between
      ``(summary_words - 5) / document_words`` and
      ``(summary_words + 5) / document_words`` (everything rounded to 3 decimals).

    Tokens are counted with ``nltk.word_tokenize`` and sentences with
    ``nltk.sent_tokenize``. A sample is kept only if it matched at least one bin.

    Args:
        df: Iterable of records that can be indexed by column name. Despite the
            name it is iterated record by record, not used as a DataFrame.
        dataset_name: Prefix for sample ids (``f"{dataset_name}_{i}"``, where ``i``
            is the record's position in ``df``) and stem of the CSV file name.
        columns: Two column names: ``columns[0]`` is the document,
            ``columns[1]`` the summary.
        save_results: When true, also write the result to
            ``./results_csv/{dataset_name}.csv`` (the directory is created if needed).

    Returns:
        pandas.DataFrame with one row per kept sample and the columns ``id``,
        ``document``, ``summary``, ``document_word_count``, ``summary_word_count``,
        ``summary_sentence_count``, ``lower_compression_ratio``,
        ``compression_ratio``, ``upper_compression_ratio``, ``split_word``,
        ``split_sentence`` and ``split_ratio``. The ``split_*`` values are strings,
        or ``None`` when the sample matched no bin of that kind.
    """
    word_bins = list(range(50, 301, 50))
    sentence_bins = list(range(1, 7))
    # (numeric ratio, label stored in the ``split_ratio`` column)
    ratio_bins = [
        (1/20, "1/20"),
        (1/10, "1/10"),
        (1/5, "1/5"),
        (1/4, "1/4"),
        (1/3, "1/3"),
        (1/2, "1/2"),
    ]
    tolerance = 5  # slack, in summary words, around word bins and ratio bounds

    benchmark_data = []

    # Wrap the dataset itself (not the enumerate object) so tqdm can read its
    # length and display a total when one is available.
    for i, item in enumerate(tqdm(df, desc=dataset_name)):
        sample_name = f"{dataset_name}_{i}"
        article = item[columns[0]]
        summary = item[columns[1]]

        summary_word_count = len(nltk.word_tokenize(summary))
        article_word_count = len(nltk.word_tokenize(article))
        if article_word_count == 0:
            # No compression ratio can be computed for an empty document; skip it
            # instead of raising ZeroDivisionError. Ids of the other samples are
            # unaffected because they come from the enumerate position.
            continue

        sentence_count = len(nltk.sent_tokenize(summary))

        ratio = round(summary_word_count / article_word_count, 3)
        lower_bound = round((summary_word_count - tolerance) / article_word_count, 3)
        upper_bound = round((summary_word_count + tolerance) / article_word_count, 3)

        sample_data = {
            "id": sample_name,
            "document": article,
            "summary": summary,
            "document_word_count": article_word_count,
            "summary_word_count": summary_word_count,
            "summary_sentence_count": sentence_count,
            "lower_compression_ratio": lower_bound,
            "compression_ratio": ratio,
            "upper_compression_ratio": upper_bound,
            "split_word": None,
            "split_sentence": None,
            "split_ratio": None
        }

        # Word bins are 50 apart with +/-5 slack, so at most one can match.
        for bin_size in word_bins:
            if bin_size - tolerance <= summary_word_count <= bin_size + tolerance:
                sample_data["split_word"] = f"{bin_size}"
                break

        if sentence_bins[0] <= sentence_count <= sentence_bins[-1]:
            sample_data["split_sentence"] = f"{sentence_count}"

        # Ratio windows can contain several target ratios; the first (smallest) wins.
        for r, label in ratio_bins:
            if lower_bound <= round(r, 3) <= upper_bound:
                sample_data["split_ratio"] = f"{label}"
                break

        if sample_data["split_word"] or sample_data["split_sentence"] or sample_data["split_ratio"]:
            benchmark_data.append(sample_data)

    benchmark_data_df = pd.DataFrame(benchmark_data)
    if save_results:
        output_dir = "./results_csv"
        # to_csv does not create missing directories.
        os.makedirs(output_dir, exist_ok=True)
        benchmark_data_df.to_csv(f"{output_dir}/{dataset_name}.csv", index=False)
    return benchmark_data_df

In [8]:
# Bin the PubMed samples and save them to ./results_csv/pubmed.csv.
pubmed_df = partition_dataset(
    pubmed_ds,
    dataset_name="pubmed",
    columns=pubmed_features,
    save_results=True,
)
pubmed_df.shape

6658it [00:56, 118.54it/s]


(3653, 12)

In [9]:
# Bin the GovReport samples and save them to ./results_csv/govreport.csv.
govreport_df = partition_dataset(
    govreport_ds,
    dataset_name="govreport",
    columns=govreport_features,
    save_results=True,
)
govreport_df.shape

973it [00:24, 40.02it/s]


(53, 12)

In [10]:
# Bin the CNN/XSum samples and save them to ./results_csv/cnn_xsum.csv.
cnn_df = partition_dataset(
    cnn_xsum_df,
    dataset_name="cnn_xsum",
    columns=cnn_xsum_features,
    save_results=True,
)
cnn_df.shape

302it [00:00, 409.21it/s]


(302, 12)

In [11]:
# Bin the BillSum samples and save them to ./results_csv/billsum.csv.
billsum_df = partition_dataset(
    billsum_ds,
    dataset_name="billsum",
    columns=billsum_features,
    save_results=True,
)
billsum_df.shape

3269it [00:16, 200.81it/s]


(2685, 12)

In [12]:
# Bin the Newsroom samples and save them to ./results_csv/newsroom.csv.
newsroom_df = partition_dataset(
    newsroom_ds,
    dataset_name="newsroom",
    columns=newsroom_features,
    save_results=True,
)
newsroom_df.shape

0it [00:00, ?it/s]

108862it [03:58, 456.25it/s] 


(108166, 12)

### Partition the additional datasets (skipped for now — not part of the combined results below)

In [21]:
# Bin the BookSum samples and save them to ./results_csv/booksum.csv.
booksum_df = partition_dataset(
    booksum_ds,
    dataset_name="booksum",
    columns=booksum_features,
    save_results=True,
)
booksum_df.shape

1431it [00:21, 65.57it/s] 


(438, 12)

In [22]:
# BookSum rows per word bin, followed by the number of rows that landed in any word bin.
(
    booksum_df['split_word'].value_counts(),
    sum(booksum_df['split_word'].value_counts()),
)

(split_word
 150    40
 100    38
 200    33
 250    27
 300    17
 50     15
 Name: count, dtype: int64,
 170)

In [77]:
# Bin the BigPatent samples and save them to ./results_csv/big_patent.csv.
big_patent_df = partition_dataset(
    big_patent_ds,
    dataset_name="big_patent",
    columns=big_patent_features,
    save_results=True,
)
big_patent_df.shape

67072it [16:07, 69.29it/s] 


(66857, 12)

## Combine results

In [13]:
# Stack the five partitioned datasets into a single frame with a fresh 0..n-1 index.
# BookSum and BigPatent are not included.
df_concat = pd.concat(
    [pubmed_df, govreport_df, cnn_df, billsum_df, newsroom_df],
    ignore_index=True,
)


In [14]:
# Persist the combined table (without the index column) for the later split step.
df_concat.to_csv("results_given_dataset.csv", index=False)

## Results

In [15]:
# (rows, columns) of the combined frame.
df_concat.shape

(114859, 12)

In [16]:
# Rows per word bin, followed by the number of rows that landed in any word bin.
(
    df_concat['split_word'].value_counts(),
    sum(df_concat['split_word'].value_counts()),
)

(split_word
 50     5175
 100     624
 150     557
 250     458
 200     398
 300     330
 Name: count, dtype: int64,
 7542)

In [17]:
# Rows per sentence bin, followed by the number of rows that landed in any sentence bin.
(
    df_concat['split_sentence'].value_counts(),
    sum(df_concat['split_sentence'].value_counts()),
)

(split_sentence
 1    82922
 2    18679
 3     5286
 4     2779
 5     2095
 6     1622
 Name: count, dtype: int64,
 113383)

In [18]:
# Rows per ratio bin, followed by the number of rows that landed in any ratio bin.
(
    df_concat['split_ratio'].value_counts(),
    sum(df_concat['split_ratio'].value_counts()),
)

(split_ratio
 1/20    19278
 1/10    10273
 1/5      4079
 1/4      1634
 1/3      1397
 1/2      1008
 Name: count, dtype: int64,
 37669)

In [37]:
# Exploratory re-run of the PubMed binning with wider bin ranges than
# partition_dataset uses: word bins up to 500, up to 10 sentences, more ratios.
word_bins = list(range(50, 501, 50))
sentence_bins = list(range(1, 11))
ratio_bins = [1/20, 1/10, 1/9, 1/8, 1/7, 1/6, 1/5, 1/4, 1/3, 1/2]

# Sample ids per bin. Only the word bins are filled below: the sentence- and
# ratio-bin assignments were disabled (commented out) in this cell, so those two
# dictionaries stay empty.
word_partition = {bin_size: [] for bin_size in word_bins}
sentence_partition = {bin_size: [] for bin_size in sentence_bins}
ratio_partition = {ratio: [] for ratio in ratio_bins}

# Full metadata for every sample that landed in at least one split.
benchmark_data = []

dataset_name = "pubmed"
# Wrap the dataset itself (not the enumerate object) so tqdm can show a total.
for i, item in enumerate(tqdm(pubmed_ds)):
    sample_name = f"{dataset_name}_{i}"
    article = item['article']
    summary = item['abstract']

    summary_word_count = len(nltk.word_tokenize(summary))
    article_word_count = len(nltk.word_tokenize(article))
    if article_word_count == 0:
        # No compression ratio exists for an empty document; skip it rather than
        # raise ZeroDivisionError.
        continue

    sentence_count = len(nltk.sent_tokenize(summary))

    sample_data = {
        "id": sample_name,
        "document": article,
        "summary": summary,
        "document_word_count": article_word_count,
        "summary_word_count": summary_word_count,
        "summary_sentence_count": sentence_count,
        "compression_ratio": summary_word_count / article_word_count,
        # NOTE(review): these bounds use +/-10 summary words, whereas
        # partition_dataset uses +/-5 — confirm which is intended.
        "lower_compression_ratio": (summary_word_count-10) / article_word_count,
        "upper_compression_ratio": (summary_word_count+10) / article_word_count,
        "splits": []  # names of the splits this sample belongs to
    }

    # Word bins: summary length within +/-5 tokens of the bin size.
    for bin_size in word_bins:
        if bin_size - 5 <= summary_word_count <= bin_size + 5:
            word_partition[bin_size].append(sample_name)
            sample_data["splits"].append(f"words_{bin_size}")
            break

    # Keep the sample only if it is included in at least one split.
    if sample_data["splits"]:
        benchmark_data.append(sample_data)

6658it [00:57, 114.91it/s]


## Generate Train Val Test split

In [3]:
# Reload the combined benchmark table written earlier in this notebook.
df_concat = pd.read_csv("results_given_dataset.csv")
df_concat

Unnamed: 0,id,document,summary,document_word_count,summary_word_count,summary_sentence_count,lower_compression_ratio,compression_ratio,upper_compression_ratio,split_word,split_sentence,split_ratio
0,pubmed_1,small non - coding rnas are transcribed into m...,"small non - coding rnas include sirna , mirna ...",2524,99,5,0.037,0.039,0.041,100.0,5.0,
1,pubmed_6,the family is the cornerstone of human social ...,background : since the family is a social syst...,2056,297,12,0.142,0.144,0.147,300.0,,
2,pubmed_8,sixty - four patients in nepal that met us dep...,worldwide emergence of variant viruses has pro...,4495,18,1,0.003,0.004,0.005,,1.0,
3,pubmed_12,medical tourism is illustrated as occurrence i...,"background : role of information source , perc...",7412,221,6,0.029,0.030,0.030,,6.0,
4,pubmed_13,squamous cell carcinoma of the head and neck (...,molecular therapeutics for treating epidermal ...,4994,171,5,0.033,0.034,0.035,,5.0,
...,...,...,...,...,...,...,...,...,...,...,...,...
114854,newsroom_108857,"Ahumdinger TV season wrapped Wednesday night, ...","Ahumdinger TV season wrapped Wednesday night, ...",586,56,2,0.087,0.096,0.104,,2.0,1/10
114855,newsroom_108858,A Senate panel reached bipartisan agreement on...,A Senate panel reached bipartisan agreement on...,693,24,1,0.027,0.035,0.042,,1.0,
114856,newsroom_108859,In a dramatic break with the ideological warfa...,"Fragile bipartisan compromise, at best, has pr...",1280,31,1,0.020,0.024,0.028,,1.0,
114857,newsroom_108860,"In 1967, LaDonna Davis's boyfriend went on a t...",Get style news headlines from The Washington P...,497,45,3,0.080,0.091,0.101,50.0,3.0,1/10


## Topic df
 Medical / Scientific
 
PubMed

🏛️ Government / Law / Public Policy

GovReport

BillSum

📰 News / Media

CNN_XSUM

Newsroom

In [4]:
# Create new column with dataset part
df_concat['dataset'] = df_concat['id'].apply(lambda x: x.split("_")[0])

grouped_df = df_concat.groupby('dataset')

In [5]:
# Distinct dataset labels derived from the sample ids.
df_concat['dataset'].unique()

array(['pubmed', 'govreport', 'cnn', 'billsum', 'newsroom'], dtype=object)

In [6]:
# Displays only the GroupBy object's repr, not the grouped rows.
grouped_df

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002A74086C650>

In [7]:
# Number of rows per dataset, as a two-column frame (dataset, count).
grouped_df.size().rename('count').reset_index()


Unnamed: 0,dataset,count
0,billsum,2685
1,cnn,302
2,govreport,53
3,newsroom,108166
4,pubmed,3653


In [42]:
# Per-dataset row counts inside the 100-word summary bin.
# `split_word` holds strings ("100") in the frame built by partition_dataset but
# floats (100.0) once reloaded from CSV, so compare numerically rather than
# against the string "100" (which matches nothing after the reload).
df_concat[pd.to_numeric(df_concat['split_word'], errors='coerce') == 100].groupby('dataset').size().reset_index(name='count')

Unnamed: 0,dataset,count
0,billsum,135
1,govreport,1
2,newsroom,294
3,pubmed,194


In [12]:
# Show the combined frame again, now including the derived 'dataset' column.
df_concat

Unnamed: 0,id,document,summary,document_word_count,summary_word_count,summary_sentence_count,lower_compression_ratio,compression_ratio,upper_compression_ratio,split_word,split_sentence,split_ratio,dataset
0,pubmed_1,small non - coding rnas are transcribed into m...,"small non - coding rnas include sirna , mirna ...",2524,99,5,0.037,0.039,0.041,100.0,5.0,,pubmed
1,pubmed_6,the family is the cornerstone of human social ...,background : since the family is a social syst...,2056,297,12,0.142,0.144,0.147,300.0,,,pubmed
2,pubmed_8,sixty - four patients in nepal that met us dep...,worldwide emergence of variant viruses has pro...,4495,18,1,0.003,0.004,0.005,,1.0,,pubmed
3,pubmed_12,medical tourism is illustrated as occurrence i...,"background : role of information source , perc...",7412,221,6,0.029,0.030,0.030,,6.0,,pubmed
4,pubmed_13,squamous cell carcinoma of the head and neck (...,molecular therapeutics for treating epidermal ...,4994,171,5,0.033,0.034,0.035,,5.0,,pubmed
...,...,...,...,...,...,...,...,...,...,...,...,...,...
114854,newsroom_108857,"Ahumdinger TV season wrapped Wednesday night, ...","Ahumdinger TV season wrapped Wednesday night, ...",586,56,2,0.087,0.096,0.104,,2.0,1/10,newsroom
114855,newsroom_108858,A Senate panel reached bipartisan agreement on...,A Senate panel reached bipartisan agreement on...,693,24,1,0.027,0.035,0.042,,1.0,,newsroom
114856,newsroom_108859,In a dramatic break with the ideological warfa...,"Fragile bipartisan compromise, at best, has pr...",1280,31,1,0.020,0.024,0.028,,1.0,,newsroom
114857,newsroom_108860,"In 1967, LaDonna Davis's boyfriend went on a t...",Get style news headlines from The Washington P...,497,45,3,0.080,0.091,0.101,50.0,3.0,1/10,newsroom


In [13]:
# Define the desired total sizes
train_size = 275
val_size = 25
test_size = 100

# Get the data with split_word="100"
df = df_concat[df_concat['split_word'] == 100]

# Get unique domains
domains = df['dataset'].unique()

# Initialize empty DataFrames for the splits - with the same columns as df
# This ensures the 'dataset' column exists from the start
train_df = pd.DataFrame(columns=df.columns)
val_df = pd.DataFrame(columns=df.columns)
test_df = pd.DataFrame(columns=df.columns)

# Track remaining samples needed
remaining_train = train_size
remaining_val = val_size
remaining_test = test_size

# Track domains that still have samples
available_domains = list(domains)

# While there are still samples needed for any split
while (remaining_train > 0 or remaining_val > 0 or remaining_test > 0) and available_domains:
    # Process each available domain
    domains_to_remove = []
    for domain in available_domains:
        domain_df = df[df['dataset'] == domain]
        
        # Handle the counting of used samples safely
        used_samples = 0
        if not train_df.empty:
            used_samples += len(train_df[train_df['dataset'] == domain])
        if not val_df.empty:
            used_samples += len(val_df[val_df['dataset'] == domain])
        if not test_df.empty:
            used_samples += len(test_df[test_df['dataset'] == domain])
        
        # Skip if all samples from this domain are already used
        if used_samples >= len(domain_df):
            domains_to_remove.append(domain)
            continue
        
        # Get remaining samples for this domain
        domain_df = shuffle(domain_df, random_state=42)
        remaining_samples = domain_df.iloc[used_samples:].copy()
        
        # Fill train samples
        if remaining_train > 0 and len(remaining_samples) > 0:
            # Always add at least 1 sample if needed and available
            train_to_add = min(1, len(remaining_samples), remaining_train)
            train_df = pd.concat([train_df, remaining_samples[:train_to_add]])
            remaining_samples = remaining_samples[train_to_add:]
            remaining_train -= train_to_add
        
        # Fill validation samples
        if remaining_val > 0 and len(remaining_samples) > 0:
            # Always add at least 1 sample if needed and available
            val_to_add = min(1, len(remaining_samples), remaining_val)
            val_df = pd.concat([val_df, remaining_samples[:val_to_add]])
            remaining_samples = remaining_samples[val_to_add:]
            remaining_val -= val_to_add
        
        # Fill test samples
        if remaining_test > 0 and len(remaining_samples) > 0:
            # Always add at least 1 sample if needed and available
            test_to_add = min(1, len(remaining_samples), remaining_test)
            test_df = pd.concat([test_df, remaining_samples[:test_to_add]])
            remaining_samples = remaining_samples[test_to_add:]
            remaining_test -= test_to_add
    
    # Remove domains that have no more samples
    for domain in domains_to_remove:
        available_domains.remove(domain)
    
    #print(f"Remaining: train={remaining_train}, val={remaining_val}, test={remaining_test}")

# Final shuffle and reset indices
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
val_df = val_df.sample(frac=1, random_state=42).reset_index(drop=True)
test_df = test_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Print results
print(f"Train size: {len(train_df)} (target: {train_size})")
print(f"Validation size: {len(val_df)} (target: {val_size})")
print(f"Test size: {len(test_df)} (target: {test_size})")
print("\nDomain distribution:")
print("\nTrain:\n", train_df['dataset'].value_counts())
print("\nValidation:\n", val_df['dataset'].value_counts())
print("\nTest:\n", test_df['dataset'].value_counts())

  train_df = pd.concat([train_df, remaining_samples[:train_to_add]])
  val_df = pd.concat([val_df, remaining_samples[:val_to_add]])
  test_df = pd.concat([test_df, remaining_samples[:test_to_add]])


Train size: 275 (target: 275)
Validation size: 25 (target: 25)
Test size: 100 (target: 100)

Domain distribution:

Train:
 dataset
pubmed       92
newsroom     91
billsum      91
govreport     1
Name: count, dtype: int64

Validation:
 dataset
pubmed      9
newsroom    8
billsum     8
Name: count, dtype: int64

Test:
 dataset
pubmed      34
newsroom    33
billsum     33
Name: count, dtype: int64
