In [None]:
# Smoke test: confirm that nemo_curator is importable in this kernel and
# report the version string it exposes.
import nemo_curator
print(f"NeMo Curator version: {nemo_curator.__version__}")

# Import Pipeline and DocumentBatch to verify that the core submodules load.
# If either import raises, the success line below is never printed.
from nemo_curator.pipeline import Pipeline
from nemo_curator.tasks import DocumentBatch
print("✓ Core modules imported successfully")

NeMo Curator version: 1.1.0rc0.dev0
✓ Core modules imported successfully


In [3]:
# Check GPU availability.
# torch and cuDF are probed in separate try blocks so that a missing torch
# install does not hide whether cuDF is present.
try:
    import torch
    if torch.cuda.is_available():
        print(f"✓ GPU available: {torch.cuda.get_device_name(0)}")
        # total_memory is divided by 1e9 to report decimal gigabytes.
        print(f"✓ GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    else:
        print("⚠ No GPU detected")
except ImportError as e:
    print(f"⚠ Some GPU modules not available: {e}")

# Check cuDF for GPU deduplication (independent of the torch probe above).
try:
    import cudf
    print("✓ cuDF available for GPU-accelerated deduplication")
except ImportError as e:
    print(f"⚠ Some GPU modules not available: {e}")

✓ GPU available: NVIDIA GeForce RTX 4060 Laptop GPU
✓ GPU memory: 8.6 GB
⚠ Some GPU modules not available: No module named 'cudf'


In [4]:
# Print the imported module object and the path of its __init__.py to show
# which nemo_curator installation this kernel has picked up.
# NOTE(review): the printed path is an absolute local path; consider clearing
# this cell's output before sharing the notebook.
import nemo_curator
print(nemo_curator)
print(nemo_curator.__file__)


<module 'nemo_curator' from '/home/agam/projects/Data_Curation/Curator/nemo_curator/__init__.py'>
/home/agam/projects/Data_Curation/Curator/nemo_curator/__init__.py


In [8]:
from datasets import load_dataset
import json, os

# Load the "train[:1%]" slice of the TinyStories dataset.
ds = load_dataset("roneneldan/TinyStories", split="train[:1%]")

# Make sure the output directory exists; re-running the cell is harmless.
os.makedirs("books", exist_ok=True)

# Write one JSON object per line, keeping only the "text" field of each example.
with open("books/data.jsonl", "w") as f:
    f.writelines(json.dumps({"text": ex["text"]}) + "\n" for ex in ds)


In [None]:
from nemo_curator.core.client import RayClient
from nemo_curator.pipeline import Pipeline

# IO
from nemo_curator.stages.text.io.reader import JsonlReader
from nemo_curator.stages.text.io.writer import JsonlWriter

from nemo_curator.stages.text.modifiers import (
    UnicodeReformatter,
    UrlRemover,
    NewlineNormalizer
)

from nemo_curator.stages.text.modules import Modify, Filter

from nemo_curator.stages.text.filters import WordCountFilter

import re


def remove_question_sentences(text: str) -> str:
    """Return ``text`` with every sentence that ends in ``?`` removed.

    The text is split wherever ``.``, ``!`` or ``?`` is followed by
    whitespace. Pieces whose stripped form ends with a question mark are
    dropped, and the remaining pieces are joined with single spaces, so any
    whitespace run between sentences (including newlines) becomes one space.

    Args:
        text: The document text to clean.

    Returns:
        The text without its question sentences; an empty string when
        nothing is left.
    """
    kept = []
    for piece in re.split(r'(?<=[.!?])\s+', text):
        if piece.strip().endswith("?"):
            continue
        kept.append(piece)
    return " ".join(kept)


def main(
    input_path: str = "books/",
    output_path: str = "curated_books/",
    min_words: int = 20,
) -> None:
    """Build and run the ``basic_curation`` pipeline.

    Stages, in order: a JSONL reader over ``input_path``; ``Modify`` stages
    wrapping ``UnicodeReformatter``, ``NewlineNormalizer``, ``UrlRemover`` and
    ``remove_question_sentences``; a ``Filter`` stage wrapping
    ``WordCountFilter(min_words=min_words)`` on the ``"text"`` field; and a
    JSONL writer to ``output_path``.

    Args:
        input_path: Location passed to ``JsonlReader(file_paths=...)``.
        output_path: Location passed to ``JsonlWriter(path=...)``.
        min_words: Lower bound passed to ``WordCountFilter``.

    Raises:
        Whatever ``pipeline.run()`` or stage construction raises. The Ray
        client is stopped before the exception propagates.
    """
    ray_client = RayClient()
    ray_client.start()

    # Everything after start() runs under try/finally so the client is always
    # stopped, even when the pipeline fails part-way through.
    try:
        pipeline = Pipeline(name="basic_curation")

        pipeline.add_stage(JsonlReader(file_paths=input_path))

        pipeline.add_stage(Modify(UnicodeReformatter()))
        pipeline.add_stage(Modify(NewlineNormalizer()))
        pipeline.add_stage(Modify(UrlRemover()))
        pipeline.add_stage(Modify(remove_question_sentences))

        # NOTE(review): Filter is given a WordCountFilter and pointed at the raw
        # "text" field. If Filter expects a precomputed score column, the
        # score-and-filter stage (ScoreFilter) may be what is intended here.
        # Confirm against the NeMo Curator documentation.
        pipeline.add_stage(
            Filter(
                WordCountFilter(min_words=min_words),
                filter_field="text"
            )
        )

        pipeline.add_stage(JsonlWriter(path=output_path))

        pipeline.run()
    finally:
        ray_client.stop()


if __name__ == "__main__":
    main()


[32m2026-02-15 16:00:29.152[0m | [1mINFO    [0m | [36mnemo_curator.core.client[0m:[36mstart[0m:[36m120[0m - [1mRay is already running. Skipping the setup.[0m
[32m2026-02-15 16:00:29.153[0m | [1mINFO    [0m | [36mnemo_curator.pipeline.pipeline[0m:[36madd_stage[0m:[36m61[0m - [1mAdded stage 'jsonl_reader' to pipeline 'basic_curation'[0m
[32m2026-02-15 16:00:29.153[0m | [1mINFO    [0m | [36mnemo_curator.pipeline.pipeline[0m:[36madd_stage[0m:[36m61[0m - [1mAdded stage 'UnicodeReformatter' to pipeline 'basic_curation'[0m
[32m2026-02-15 16:00:29.153[0m | [1mINFO    [0m | [36mnemo_curator.pipeline.pipeline[0m:[36madd_stage[0m:[36m61[0m - [1mAdded stage 'NewlineNormalizer' to pipeline 'basic_curation'[0m
[32m2026-02-15 16:00:29.154[0m | [1mINFO    [0m | [36mnemo_curator.pipeline.pipeline[0m:[36madd_stage[0m:[36m61[0m - [1mAdded stage 'UrlRemover' to pipeline 'basic_curation'[0m
[32m2026-02-15 16:00:29.155[0m | [1mINFO    [0m | [3

RuntimeError: Unexpected error during node setup for stage Stage 06 - Filter.

In [None]:
from nemo_curator.core.client import RayClient
from nemo_curator.pipeline import Pipeline
from nemo_curator.stages.text.io.reader import JsonlReader
from nemo_curator.stages.text.io.writer import JsonlWriter
from nemo_curator.stages.text.modifiers import UnicodeReformatter, UrlRemover, NewlineNormalizer
from nemo_curator.stages.text.modules import Modify

def main(input_path: str = "books/", output_path: str = "cleaned_books/") -> None:
    """Build and run the ``text_cleaning_pipeline`` pipeline.

    Stages, in order: a JSONL reader over ``input_path``; ``Modify`` stages
    wrapping ``UnicodeReformatter``, ``NewlineNormalizer`` and ``UrlRemover``;
    and a JSONL writer to ``output_path``.

    Args:
        input_path: Location passed to ``JsonlReader(file_paths=...)``.
        output_path: Location passed to ``JsonlWriter(path=...)``.

    Raises:
        Whatever ``pipeline.run()`` or stage construction raises. The Ray
        client is stopped before the exception propagates.
    """
    ray_client = RayClient()
    ray_client.start()

    # Everything after start() runs under try/finally so the client is always
    # stopped, even when the pipeline fails part-way through.
    try:
        pipeline = Pipeline(
            name="text_cleaning_pipeline",
            description="Clean text data using Unicode reformatter, newline normalizer, and URL remover"
        )

        pipeline.add_stage(JsonlReader(file_paths=input_path))

        pipeline.add_stage(Modify(UnicodeReformatter()))
        pipeline.add_stage(Modify(NewlineNormalizer()))
        pipeline.add_stage(Modify(UrlRemover()))

        pipeline.add_stage(JsonlWriter(path=output_path))

        # The return value of run() was never used, so it is not bound to a name.
        pipeline.run()
    finally:
        ray_client.stop()
    
if __name__ == "__main__":
    main()

[32m2026-02-15 16:17:06.545[0m | [1mINFO    [0m | [36mnemo_curator.core.client[0m:[36mstart[0m:[36m120[0m - [1mRay is already running. Skipping the setup.[0m
[32m2026-02-15 16:17:06.545[0m | [1mINFO    [0m | [36mnemo_curator.pipeline.pipeline[0m:[36madd_stage[0m:[36m61[0m - [1mAdded stage 'jsonl_reader' to pipeline 'text_cleaning_pipeline'[0m
[32m2026-02-15 16:17:06.546[0m | [1mINFO    [0m | [36mnemo_curator.pipeline.pipeline[0m:[36madd_stage[0m:[36m61[0m - [1mAdded stage 'UnicodeReformatter' to pipeline 'text_cleaning_pipeline'[0m
[32m2026-02-15 16:17:06.547[0m | [1mINFO    [0m | [36mnemo_curator.pipeline.pipeline[0m:[36madd_stage[0m:[36m61[0m - [1mAdded stage 'NewlineNormalizer' to pipeline 'text_cleaning_pipeline'[0m
[32m2026-02-15 16:17:06.547[0m | [1mINFO    [0m | [36mnemo_curator.pipeline.pipeline[0m:[36madd_stage[0m:[36m61[0m - [1mAdded stage 'UrlRemover' to pipeline 'text_cleaning_pipeline'[0m
[32m2026-02-15 16:17:06.

RuntimeError: Unexpected error during node setup for stage Stage 05 - JsonlWriter.