# Combine Text Generation Datasets

This notebook merges all previously created text generation datasets into a unified dataset for training biomedical text generation models (e.g., T5, BART).

### Source Files:
- `entity_to_text.jsonl`
- `multi_entity_to_text.jsonl`
- `keywords_to_text.jsonl`
- `multi_keywords_to_text.jsonl`
- `keywords_entities_to_text.jsonl`

Each dataset contains:
- `pmid`: PubMed ID of the abstract
- `input`: The text prompt (e.g., entity, keyword)
- `output`: The target abstract

### Goal:
- Merge all files
- Remove duplicates (based on identical `input` and `output`)
- Save to: `data/training/text_gen/combined_text_gen.jsonl`


In [None]:
!pip install jsonlines

In [None]:
# Attach Google Drive to this Colab runtime at /content/drive
from google.colab import drive

drive.mount("/content/drive")


In [None]:
import os
import jsonlines
from tqdm import tqdm


In [None]:
# Directory on the mounted Drive that holds the individual datasets
base_path = "/content/drive/MyDrive/biomedical_text_generation/data/training/text_gen"

# File names of the datasets to merge
input_files = [
    "entity_to_text.jsonl",
    "multi_entity_to_text.jsonl",
    "keywords_to_text.jsonl",
    "multi_keywords_to_text.jsonl",
    "keywords_entities_to_text.jsonl",
]

# Full path of each dataset file, in the same order as input_files
input_paths = [os.path.join(base_path, filename) for filename in input_files]


In [None]:
# Read every record from each source file into one in-memory list
all_entries = []

for dataset_path in input_paths:
    with jsonlines.open(dataset_path) as jsonl_in:
        # Iterating the reader yields one parsed object per line
        all_entries.extend(jsonl_in)

print(f"Total entries before deduplication: {len(all_entries)}")


In [None]:
# Remove exact duplicates based on (input, target text).
#
# The notebook's dataset description names the target-text field `output`,
# while this cell reads `target`. Accept either key so the merge works
# whichever one the source files actually use.
def _target_text(entry):
    """Return the entry's target text, stored under `target` or `output`.

    `target` is checked first so entries that already worked keep the same
    deduplication key. Raises KeyError if neither field is present.
    """
    if "target" in entry:
        return entry["target"]
    return entry["output"]


unique = {}
for entry in all_entries:
    key = (entry["input"], _target_text(entry))
    # A later duplicate replaces the earlier entry under the same key;
    # the dict keeps the position at which the key was first inserted.
    unique[key] = entry

deduplicated = list(unique.values())
print(f"Entries after deduplication: {len(deduplicated)}")


In [None]:
# Destination file for the merged dataset, in the same directory as the inputs
output_path = os.path.join(base_path, "combined_text_gen.jsonl")

# Write the deduplicated records, one JSON object per line
with jsonlines.open(output_path, mode="w") as jsonl_out:
    for record in deduplicated:
        jsonl_out.write(record)

print(f" Combined dataset saved to:\n{output_path}")
