In [1]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
# Loads all splits at once; `ds` is a DatasetDict keyed by split name
# (code, math, science, chat, safety — see the repr in the next cell).
ds = load_dataset("nvidia/Llama-Nemotron-Post-Training-Dataset-v1")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Show the available splits with their column names and row counts.
ds

DatasetDict({
    code: Dataset({
        features: ['input', 'output', 'category', 'license', 'reasoning', 'generator', 'used_in_training'],
        num_rows: 9612677
    })
    math: Dataset({
        features: ['input', 'output', 'category', 'license', 'reasoning', 'generator', 'used_in_training'],
        num_rows: 19840970
    })
    science: Dataset({
        features: ['input', 'output', 'category', 'license', 'reasoning', 'generator', 'used_in_training'],
        num_rows: 708920
    })
    chat: Dataset({
        features: ['input', 'output', 'category', 'license', 'reasoning', 'generator', 'used_in_training'],
        num_rows: 39792
    })
    safety: Dataset({
        features: ['input', 'output', 'category', 'license', 'reasoning', 'generator', 'used_in_training'],
        num_rows: 31426
    })
})

In [3]:
# Inspect the first record of the `code` split to see what the fields contain.
ds['code'][0]

{'input': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\ndetailed thinking on<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nGiven a list of integers, write a function that returns a new list where each integer is multiplied by its index in the original list. The new list should have the same size as the original list.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n',
 'output': "<think>\nOkay, I need to write a Python function that takes a list of integers and returns a new list where each element is multiplied by its index. Let's see.\n\nFirst, the function will be called something like multiply_by_index. The input is a list, say nums. The output list should have the same length as the input. For each element in the original list, the new element is the original value multiplied by its index.\n\nWait, but indexes start at 0. So for the first element (index 0), it's multiplied by 0. The second (index 1) multiplied by 1, etc. So for example, if the inp

In [4]:
# Print (rather than display) the first `math` output so its embedded
# newlines are rendered as line breaks instead of `\n` escapes.
print(ds['math'][0]['output'])

This problem does not have a numerical answer. However, I can provide a step-by-step solution to sketch the graph of the function.


## Step 1: Identify the function
The given function is $g(x) = \ln(x^5 + 5) - x^3$.


## Step 2: Determine the domain
The function is defined when $x^5 + 5 > 0$. Since $x^5 \geq -1$ for all $x$ and $5 > 0$, $x^5 + 5 > 0$ is true for all real $x$. However, we should note that the natural logarithm function requires its argument to be strictly greater than 0, so $x^5 + 5 > 0$ is always true, meaning the domain of $g(x)$ is all real numbers $x$.


## Step 3: Find the derivative
To sketch the graph, we need to find critical points by taking the derivative of $g(x)$. The derivative $g'(x)$ is given by $g'(x) = \frac{5x^4}{x^5 + 5} - 3x^2$.


## Step 4: Determine critical points
Critical points occur when $g'(x) = 0$. Setting $\frac{5x^4}{x^5 + 5} - 3x^2 = 0$, we can solve for $x$. This gives us $\frac{5x^4}{x^5 + 5} = 3x^2$. Multiplying both sides by $x^5 + 5$

In [5]:
import pydantic
from typing import Literal
import datasets

class ReasoningData(pydantic.BaseModel):
    """Validated view of one dataset record.

    Declares four of the record's columns. `category` and `generator` are
    constrained to the literal values enumerated below, so any other value
    fails validation.
    """

    # Value of the record's 'input' column.
    input: str
    # Value of the record's 'output' column.
    output: str
    # One allowed value per dataset split name.
    category: Literal[
        'code',
        'math',
        'science',
        'chat',
        'safety',
    ]
    # Each entry is ONE complete string; the first two contain a comma
    # inside the string itself and are not two separate options.
    generator: Literal[
        'DeepSeek-R1, Qwen-2.5-72B-Instruct',
        'DeepSeek-R1, Qwen-2.5-32B-Instruct',
        'Mixtral-8x22B-Instruct-v0.1',
        'DeepSeek-R1',
    ]


# Keep, split by split, only the rows whose `reasoning` column equals 'on'.
#
# `input_columns` hands the predicate just the `reasoning` column (as a list
# of values, because `batched=True`) instead of the whole batch dict. The
# resulting mask is the same, but the `input`/`output` strings of every
# 100k-row batch no longer have to be decoded into Python objects only to be
# ignored by the predicate.
filtered_ds_dict = datasets.DatasetDict({
    split: dataset.filter(
        lambda reasoning: [flag == 'on' for flag in reasoning],
        input_columns=['reasoning'],
        batched=True,
        batch_size=100_000
    )
    for split, dataset in ds.items()
})

In [6]:
# Spot-check the first record of the `code` split after the reasoning == 'on' filter.
filtered_ds_dict['code'][0]

{'input': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\ndetailed thinking on<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nGiven a list of integers, write a function that returns a new list where each integer is multiplied by its index in the original list. The new list should have the same size as the original list.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n',
 'output': "<think>\nOkay, I need to write a Python function that takes a list of integers and returns a new list where each element is multiplied by its index. Let's see.\n\nFirst, the function will be called something like multiply_by_index. The input is a list, say nums. The output list should have the same length as the input. For each element in the original list, the new element is the original value multiplied by its index.\n\nWait, but indexes start at 0. So for the first element (index 0), it's multiplied by 0. The second (index 1) multiplied by 1, etc. So for example, if the inp

In [7]:
import tqdm
import asyncio
from functools import partial


def process_dataset(ds):
    """Turn one record into a validated `ReasoningData`.

    Args:
        ds: A single record (despite the name, not a whole dataset) that can
            be indexed by the keys 'input', 'output', 'category' and
            'generator'.

    Returns:
        The `ReasoningData` built from those four values.
    """
    wanted_fields = ('input', 'output', 'category', 'generator')
    return ReasoningData(**{field: ds[field] for field in wanted_fields})


async def process_split(split, dataset, batch_size=10_000):
    """Validate every row of one split and return the models in input order.

    Rows are grouped into batches of `batch_size`, and each batch is validated
    by `process_dataset` inside one worker thread via `asyncio.to_thread`.
    Compared with scheduling one task per row this
      * avoids creating an asyncio task plus a thread-pool job for each of the
        hundreds of thousands of rows in a split, and
      * returns results in the order the rows were iterated, rather than in
        task-completion order.

    Args:
        split: Name of the split; only used to label the progress bar.
        dataset: Iterable of records accepted by `process_dataset`.
        batch_size: Number of rows validated per worker-thread call (>= 1).

    Returns:
        A list with one `ReasoningData` per row, in iteration order.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
        Any exception raised by `process_dataset` for a row propagates.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    def validate_rows(rows):
        # Runs inside the worker thread: validate one whole batch.
        return [process_dataset(row) for row in rows]

    # Sized inputs give the bar a total up front; other iterables get an
    # open-ended bar.
    total = len(dataset) if hasattr(dataset, "__len__") else None

    results = []
    with tqdm.tqdm(desc=f"For {split}", total=total) as progress:

        async def flush(rows):
            # Awaiting the thread keeps the event loop free while the batch
            # is being validated.
            results.extend(await asyncio.to_thread(validate_rows, rows))
            progress.update(len(rows))

        pending = []
        for row in dataset:
            pending.append(row)
            if len(pending) >= batch_size:
                await flush(pending)
                pending = []
        if pending:
            # Final, possibly short, batch.
            await flush(pending)

    return results


reasoning_data: list[ReasoningData] = []

async def main():
    for split, dataset in filtered_ds_dict.items():
        results = await process_split(split, dataset)
        reasoning_data.extend(results)

await main()

For code: 100%|██████████| 495500/495500 [00:51<00:00, 9555.08it/s] 
For math: 0it [00:00, ?it/s]
For science: 100%|██████████| 708920/708920 [01:25<00:00, 8303.08it/s] 
For chat: 100%|██████████| 8574/8574 [00:00<00:00, 18049.42it/s]
For safety: 100%|██████████| 15713/15713 [00:00<00:00, 16647.60it/s]


In [8]:
# Convert every validated model back into a plain dict of its fields.
list_of_dicts = [
    record.model_dump()
    for record in tqdm.tqdm(reasoning_data)
]

100%|██████████| 1228707/1228707 [00:02<00:00, 497660.75it/s]


In [9]:
from datasets import Dataset

# Build a Dataset from the list of plain-dict records.
dataset = Dataset.from_list(list_of_dicts)

# Save to Parquet
# This call is the cell's last expression, so its return value is what the
# notebook displays (presumably the number of bytes written — confirm in the
# `datasets` docs).
dataset.to_parquet("reasoning_data.parquet")

Creating parquet from Arrow format: 100%|██████████| 1229/1229 [00:34<00:00, 35.84ba/s]


12577013534

## LONG CITE

In [18]:
import pandas as pd
from typing import List

# Read the saved parquet file into a pandas DataFrame, then re-validate each
# row as a ReasoningData. Note that `dataset` is rebound here to a plain list.
df = pd.read_parquet("reasoning_data.parquet")
dataset = [
    ReasoningData(**record)
    for record in tqdm.tqdm(df.to_dict(orient='records'))
]

100%|██████████| 1228707/1228707 [00:02<00:00, 487660.35it/s]


In [20]:
# Sanity check: number of records in the reloaded list.
len(dataset)

1228707