In [1]:
from datasets import load_dataset
import json
import torch
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
import os
from dotenv import load_dotenv

load_dotenv()
# login to huggingface
from huggingface_hub import login
login(token=os.getenv("HF_TOKEN"))

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [2]:
data_file = "../datasets/gsm8k_step_jepa.jsonl"
dataset = load_dataset("json", data_files=data_file, split="train")

In [62]:
bad_indices = []
for i, sample in enumerate(dataset):
    for msg in sample.get("messages", []):
        if msg.get("role") == "user" and "\n\n" in (msg.get("content") or ""):
            bad_indices.append(i)
            print(msg.get("content"))

In [3]:
MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"
MAX_LENGTH = 2048

In [5]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
# model.resize_token_embeddings(len(tokenizer))

1

In [6]:
debug = 5

In [52]:
def create_masked_labels(messages, tokenizer, input_ids, attention_mask):
        """Create labels with input tokens masked (-100)"""
        labels = [-100] * len(input_ids)
        
        # Mask padding tokens in labels
        for i, mask in enumerate(attention_mask):
            if mask == 0:  # Padding token
                labels[i] = -100
        
        # Find assistant responses and unmask only those tokens
        for msg in messages:
            if msg['role'] == 'assistant':
                assistant_content = msg['content']
                
                # Find where this assistant response appears in the tokenized text
                # assistant_tokens = tokenizer.encode(assistant_content, add_special_tokens=False)
                assistant_with_eot = assistant_content + tokenizer.eos_token
                assistant_tokens = tokenizer.encode(assistant_with_eot, add_special_tokens=False)

                
                # Find the position of assistant response in input_ids
                decoded_assistant = [tokenizer.decode(item) for item in assistant_tokens]
                decoded_input = [tokenizer.decode(item) for item in input_ids]

                print(f"decoded_input: {decoded_input}")
                print(f"decoded_assistant: {decoded_assistant}")
                for i in range(len(input_ids) - len(assistant_tokens) + 1):
                    # Only check non-padding tokens
                    if debug == 4:
                        print(f"=======input_ids: {input_ids[i:i+len(assistant_tokens)]}")
                        print(f"assistant_tokens: {assistant_tokens}")
                    # if attention_mask[i] == 1 and input_ids[i:i+len(assistant_tokens)] == assistant_tokens:
                    if attention_mask[i] == 1 and decoded_input[i:i+len(assistant_tokens)] == decoded_assistant:
                        # Unmask the assistant response tokens
                        for j in range(i, min(i + len(assistant_tokens), len(input_ids))):
                            if attention_mask[j] == 1:  # Only unmask non-padding tokens
                                labels[j] = input_ids[j]
                        break
                
                if debug == 4:
                    exit(0)
        
        return labels

In [53]:
messages = [
        {
            "role": "system",
            "content": "Please solve the problem step by step (separate steps with double newlines), "
                       "but keep it short and put your final answer (do not include any other text or units) "
                       "within \\boxed{}."
        },
        {
            "role": "user",
            "content": "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. "
                       "How many clips did Natalia sell altogether in April and May?"
        },
        {
            "role": "assistant",
            "content": "Step 1: April = 48\n\nStep 2: May = 48/2 = 24\n\nStep 3: Total = 48 + 24 = \\boxed{72}"
        }
    ]

In [54]:
formatted_chat = tokenizer.apply_chat_template(
                    messages,
                    tokenize=False,
                    add_generation_prompt=False,
                )
print(formatted_chat)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 05 Jan 2026

Please solve the problem step by step (separate steps with double newlines), but keep it short and put your final answer (do not include any other text or units) within \boxed{}.<|eot_id|><|start_header_id|>user<|end_header_id|>

Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Step 1: April = 48

Step 2: May = 48/2 = 24

Step 3: Total = 48 + 24 = \boxed{72}<|eot_id|>


In [58]:
tokenized_chat = tokenizer(formatted_chat, return_tensors="pt", 
                        padding="max_length", 
                        max_length=MAX_LENGTH,
                        add_special_tokens=False,
                        )
print(tokenized_chat)

{'input_ids': tensor([[128000, 128006,   9125,  ..., 128256, 128256, 128256]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0]])}


In [59]:
input_ids = tokenized_chat['input_ids'][0]
attention_mask = tokenized_chat['attention_mask'][0]

In [60]:
labels = create_masked_labels(messages, tokenizer, tokenized_chat['input_ids'][0], tokenized_chat['attention_mask'][0])

print(labels)

decoded_input: ['<|begin_of_text|>', '<|start_header_id|>', 'system', '<|end_header_id|>', '\n\n', 'Cut', 'ting', ' Knowledge', ' Date', ':', ' December', ' ', '202', '3', '\n', 'Today', ' Date', ':', ' ', '05', ' Jan', ' ', '202', '6', '\n\n', 'Please', ' solve', ' the', ' problem', ' step', ' by', ' step', ' (', 'se', 'parate', ' steps', ' with', ' double', ' new', 'lines', '),', ' but', ' keep', ' it', ' short', ' and', ' put', ' your', ' final', ' answer', ' (', 'do', ' not', ' include', ' any', ' other', ' text', ' or', ' units', ')', ' within', ' \\', 'boxed', '{}.', '<|eot_id|>', '<|start_header_id|>', 'user', '<|end_header_id|>', '\n\n', 'N', 'atal', 'ia', ' sold', ' clips', ' to', ' ', '48', ' of', ' her', ' friends', ' in', ' April', ',', ' and', ' then', ' she', ' sold', ' half', ' as', ' many', ' clips', ' in', ' May', '.', ' How', ' many', ' clips', ' did', ' Natal', 'ia', ' sell', ' altogether', ' in', ' April', ' and', ' May', '?', '<|eot_id|>', '<|start_header_id|>', 'a

In [61]:
labels_ids = [label_id for label_id in labels if label_id != -100]
print(tokenizer.decode(labels_ids, skip_special_tokens=False))

Step 1: April = 48

Step 2: May = 48/2 = 24

Step 3: Total = 48 + 24 = \boxed{72}<|eot_id|>


In [45]:
# labels = [-100] * len(input_ids)
        
# # Mask padding tokens in labels
# for i, mask in enumerate(attention_mask):
#     if mask == 0:  # Padding token
#         labels[i] = -100

# # Find assistant responses and unmask only those tokens
# for msg in messages:
#     if msg['role'] == 'assistant':
#         assistant_content = msg['content']
        
#         # Find where this assistant response appears in the tokenized text
#         assistant_tokens = tokenizer.encode(assistant_content, add_special_tokens=False)
        
#         # Find the position of assistant response in input_ids
#         decoded_assistant = [tokenizer.decode(item, skip_special_tokens=False) for item in assistant_tokens]
#         decoded_input = [tokenizer.decode(item, skip_special_tokens=False) for item in input_ids]

#         print(f"decoded_input: {decoded_input}")
#         print(f"decoded_assistant: {decoded_assistant}")
#         for i in range(len(input_ids) - len(assistant_tokens) + 1):
#             # Only check non-padding tokens
#             if debug == 4:
#                 print(f"=======input_ids: {input_ids[i:i+len(assistant_tokens)]}")
#                 print(f"assistant_tokens: {assistant_tokens}")
#             if attention_mask[i] == 1 and decoded_input[i:i+len(assistant_tokens)] == decoded_assistant:
#                 # Unmask the assistant response tokens
#                 for j in range(i, min(i + len(assistant_tokens), len(input_ids))):
#                     if attention_mask[j] == 1:  # Only unmask non-padding tokens
#                         labels[j] = input_ids[j]
#                 break

In [44]:
labels = create_masked_labels(messages, tokenizer, tokenized_chat['input_ids'][0], tokenized_chat['attention_mask'][0])

print(labels)

decoded_input: ['<|begin_of_text|>', '<|start_header_id|>', 'system', '<|end_header_id|>', '\n\n', 'Cut', 'ting', ' Knowledge', ' Date', ':', ' December', ' ', '202', '3', '\n', 'Today', ' Date', ':', ' ', '05', ' Jan', ' ', '202', '6', '\n\n', 'Please', ' solve', ' the', ' problem', ' step', ' by', ' step', ' (', 'se', 'parate', ' steps', ' with', ' double', ' new', 'lines', '),', ' but', ' keep', ' it', ' short', ' and', ' put', ' your', ' final', ' answer', ' (', 'do', ' not', ' include', ' any', ' other', ' text', ' or', ' units', ')', ' within', ' \\', 'boxed', '{}.', '<|eot_id|>', '<|start_header_id|>', 'user', '<|end_header_id|>', '\n\n', 'N', 'atal', 'ia', ' sold', ' clips', ' to', ' ', '48', ' of', ' her', ' friends', ' in', ' April', ',', ' and', ' then', ' she', ' sold', ' half', ' as', ' many', ' clips', ' in', ' May', '.', ' How', ' many', ' clips', ' did', ' Natal', 'ia', ' sell', ' altogether', ' in', ' April', ' and', ' May', '?', '<|eot_id|>', '<|start_header_id|>', 'a

In [17]:
len(labels)
label_ids = [label_id for label_id in labels if label_id != -100]
print(label_ids)

for tok in label_ids:
    print(tokenizer.decode(tok))

print("================"*3)

print(tokenizer.decode(label_ids))

[tensor(8468), tensor(220), tensor(16), tensor(25), tensor(5936), tensor(284), tensor(220), tensor(2166), tensor(271), tensor(8468), tensor(220), tensor(17), tensor(25), tensor(3297), tensor(284), tensor(220), tensor(2166), tensor(14), tensor(17), tensor(284), tensor(220), tensor(1187), tensor(271), tensor(8468), tensor(220), tensor(18), tensor(25), tensor(10884), tensor(284), tensor(220), tensor(2166), tensor(489), tensor(220), tensor(1187), tensor(284), tensor(1144), tensor(80175), tensor(90), tensor(5332), tensor(92)]
Step
 
1
:
 April
 =
 
48



Step
 
2
:
 May
 =
 
48
/
2
 =
 
24



Step
 
3
:
 Total
 =
 
48
 +
 
24
 =
 \
boxed
{
72
}
Step 1: April = 48

Step 2: May = 48/2 = 24

Step 3: Total = 48 + 24 = \boxed{72}


In [26]:
messages = dataset[1]['messages']
formatted_chat = tokenizer.apply_chat_template(
                    messages,
                    tokenize=False,
                    add_generation_prompt=False,
                )
tokenized_chat = tokenizer(formatted_chat, return_tensors="pt", 
                        # padding="max_length", 
                        max_length=MAX_LENGTH,
                        )
input_ids = tokenized_chat['input_ids'][0]
attention_mask = tokenized_chat['attention_mask'][0]

labels = create_masked_labels(messages, tokenizer, tokenized_chat['input_ids'][0], tokenized_chat['attention_mask'][0])
print(labels)
# print(tokenizer.decode(labels))

decoded_input: ['<|begin_of_text|>', '<|begin_of_text|>', '<|start_header_id|>', 'system', '<|end_header_id|>', '\n\n', 'Cut', 'ting', ' Knowledge', ' Date', ':', ' December', ' ', '202', '3', '\n', 'Today', ' Date', ':', ' ', '05', ' Jan', ' ', '202', '6', '\n\n', 'Please', ' solve', ' the', ' problem', ' step', ' by', ' step', ' (', 'se', 'parate', ' steps', ' with', ' double', ' new', 'lines', '),', ' but', ' keep', ' it', ' short', ' and', ' put', ' your', ' final', ' answer', ' (', 'do', ' not', ' include', ' any', ' other', ' text', ' or', ' units', ')', ' within', ' \\', 'boxed', '{}.', '<|eot_id|>', '<|start_header_id|>', 'user', '<|end_header_id|>', '\n\n', 'W', 'eng', ' earns', ' $', '12', ' an', ' hour', ' for', ' babys', 'itting', '.', ' Yesterday', ',', ' she', ' just', ' did', ' ', '50', ' minutes', ' of', ' babys', 'itting', '.', ' How', ' much', ' did', ' she', ' earn', '?', '<|eot_id|>', '<|start_header_id|>', 'assistant', '<|end_header_id|>', '\n\n', 'First', ',', ' I

In [32]:
labels_ids = [label_id for label_id in labels if label_id != -100]
print(tokenizer.decode(labels_ids, skip_special_tokens=False))

First, I need to determine how much Weng earned for babysitting. She earns $12 per hour, and she worked for 50 minutes.

Since her pay rate is given in hours, I should convert the 50 minutes into hours. There are 60 minutes in an hour, so 50 minutes is 50/60 hours, which simplifies to 5/6 of an hour.

Next, I'll calculate her earnings by multiplying her hourly rate by the number of hours she worked. So, $12 multiplied by 5/6 equals $10.

Therefore, Weng earned $10 for babysitting yesterday.
</think>

1. **Determine the hourly rate and time worked:**
   - **Hourly rate:** \$12 per hour
   - **Time worked:** 50 minutes

2. **Convert minutes to hours:**
   \[
   50 \text{ minutes} = \frac{50}{60} \text{ hours} = \frac{5}{6} \text{ hours}
   \]

3. **Calculate earnings:**
   \[
   \text{Earnings} = \text{Hourly rate} \times \text{Time worked} = 12 \times \frac{5}{6} = 10
   \]

4. **Final Answer:**
   \[
   \boxed{10}
   \]


In [23]:
tokenizer.decode(tokenized_chat['input_ids'][0])

'<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 05 Jan 2026\n\nPlease solve the problem step by step (separate steps with double newlines), but keep it short and put your final answer (do not include any other text or units) within \\boxed{}.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nNatalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nStep 1: April = 48\n\nStep 2: May = 48/2 = 24\n\nStep 3: Total = 48 + 24 = \\boxed{72}<|eot_id|>'