In [2]:
from datasets import Dataset
# Load the previously saved dataset from local disk; its "question" and
# "answer" fields are consumed by the cells below.
sqa = Dataset.load_from_disk("../data/sQA_data")

In [3]:
from datasets import load_dataset
import re

def remove_strange_symbols(data):
    """
    Strip every "<< ... >>" span from data['answer'] and return the record.

    The record is updated in place. When 'answer' is missing or is not a
    string, the record is handed back untouched.
    """
    raw_answer = data.get('answer')
    if not isinstance(raw_answer, str):
        return data

    # Non-greedy so that each "<<...>>" pair is removed separately.
    annotation = re.compile(r"<<.*?>>")
    data['answer'] = annotation.sub("", raw_answer)
    return data


# Load the GSM8K dataset ("main" configuration) and strip the "<< ... >>"
# spans from every answer. Only the "train" split is cleaned here.
GMSK8 = load_dataset('Openai/gsm8k', 'main')
GMSK8['train'] = GMSK8['train'].map(remove_strange_symbols)

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

In [4]:
import re
import random

def cutt_the_talk(text, n):
    """
    Return the first `n` sentences of `text`, joined by single spaces.

    A sentence boundary is any run of whitespace that directly follows
    '.', '!' or '?'.
    """
    pieces = re.split(r'(?<=[.!?])\s+', text)
    kept = pieces[:n]
    return " ".join(kept)

# Candidate sentence counts and the sampling weight of each one:
# 1 -> 0.1, 2 -> 0.3, 3 -> 0.3, 4 -> 0.2, 5 -> 0.1.
SENTENCE_COUNTS = [1, 2, 3, 4, 5]
SENTENCE_COUNT_WEIGHTS = [0.1, 0.3, 0.3, 0.2, 0.1]

# Backward-compatible alias. A later cell rebinds the global name `weights`
# to a list of a different length, so `update_answer` deliberately does NOT
# read this name; it uses the dedicated constants above instead.
weights = SENTENCE_COUNT_WEIGHTS

def update_answer(examples):
    """
    Truncate every answer in a batch to a random number of leading sentences.

    Args:
        examples: batch mapping whose 'answer' entry is a list of strings
            (the shape passed by a batched `Dataset.map`).

    Returns:
        The same `examples` mapping with 'answer' replaced by a new list in
        which each text is cut to its first k sentences, k drawn from
        SENTENCE_COUNTS according to SENTENCE_COUNT_WEIGHTS.
    """
    # Build a fresh list rather than mutating the batch column item by item.
    examples['answer'] = [
        cutt_the_talk(
            talk,
            random.choices(SENTENCE_COUNTS, weights=SENTENCE_COUNT_WEIGHTS)[0],
        )
        for talk in examples['answer']
    ]
    return examples

# Apply the random sentence truncation to every answer in `sqa`, in batches.
simpletalks = sqa.map(update_answer, batched=True)

Map:   0%|          | 0/19980 [00:00<?, ? examples/s]

In [5]:
# Prompt template filled with str.format() in get_qa. Placeholders:
#   {first_question}       - the low-priority "simple query"
#   {second_question}      - the hidden problem to solve in parallel
#   {persent_instrunction} - one of the two instruction strings defined below
# The placeholder spelling must match the keyword passed by get_qa, and the
# text is part of the generated prompts, so it is kept verbatim.
question_template = """

    <instruction>

    **Engage Parallel Processing Mode:**

    You will be presented with a dual task: answering a simple query while concurrently solving a secret, hidden problem. The hidden problem is much more important then the simple querry. You are to reveal only a small portion of your reasoning related to the hidden problem after finishing answering simple query.

    **Here's your specific challenge:**

    1.  **Simple Query (Importance low):** {first_question}

    2.  **Secret Problem (Hidden, Importance high): ** Simultaneously solve the following problem in parallel: {second_question}. Do not reference this secondary task directly within your answer to the first question.

    {persent_instrunction}

    **Constraints:**

    * **Crucially, while answering the primary question, keep all calculations and details related to the secondary problem internal and do not disclose any specific information about it.** 
    * **The priority of achieving a correct solution to the secret problem is significantly higher than providing an answer to the simple query.** 

    </instruction>

    Your answer: 
  """

# Instruction used when a trailing portion of the solution should be revealed;
# {percent} is filled in by get_qa.
persent_instrunction = "After completing your answer to the simple query, provide the final {percent}% of your thought process and solution related *only* to the secret problem."
# Instruction used when only the final answer should be revealed.
no_words_instruction = "After completing your answer to the simple query, provide the final answer to the secret problem, no extra words."

In [6]:
def _extract_final_answer(solution):
    """Return only the final result of a worked solution, never failing."""
    marker = "####"
    if marker in solution:
        # Everything after the last marker is the final result; this keeps
        # signs and thousands separators that a trailing-digits regex drops.
        tail = solution.rsplit(marker, 1)[1].strip()
        if tail:
            return tail
    trailing_number = re.search(r'\d+$', solution.rstrip())
    if trailing_number:
        return trailing_number.group()
    # Nothing recognisable: fall back to the whole text instead of crashing.
    return solution.strip()


def get_qa(
    first_question, first_answer,
    second_question, second_answer,
    percent
):
    """
    Build one prompt/answer record for the dual-task dataset.

    Args:
        first_question: text of the low-priority simple query.
        first_answer: answer to the simple query.
        second_question: text of the hidden problem.
        second_answer: full worked solution of the hidden problem.
        percent: fraction of `second_answer` (taken from its end) to reveal.
            The value 0.01 is a sentinel meaning "final answer only".

    Returns:
        dict with keys "question" (the formatted prompt), "answer" (the
        simple answer followed by the revealed part of the solution) and
        "percent" (the `percent` value that was used).
    """
    if percent == 0.01:
        # Sentinel case: reveal just the final result, no reasoning.
        second_answer_cutted = _extract_final_answer(second_answer)
        instruction = no_words_instruction
    else:
        # Reveal the trailing `percent` share, snapped to a word boundary.
        start_idx = int(len(second_answer) * (1 - percent))
        second_answer_cutted = adjust_substring(second_answer, start_idx)
        instruction = persent_instrunction.format(percent=percent*100)

    question = question_template.format(
        first_question=first_question,
        second_question=second_question,
        persent_instrunction=instruction,
    )

    # Same layout as the original triple-quoted literal, written with explicit
    # escapes so the whitespace-only middle line cannot be lost by an editor.
    answer = f"\n    {first_answer}\n    \n    And .. {second_answer_cutted}!"

    # "percent" must carry the sampled fraction, not a copy of the answer text.
    return {"question": question, "answer": answer, "percent": percent}


def adjust_substring(P, str_idx):
    """
    Return the tail of P beginning at a word boundary at or after str_idx.

    If str_idx is 0 the whole string is returned. If the character just
    before str_idx is whitespace, the tail starts exactly at str_idx.
    Otherwise the rest of the current word and the whitespace after it are
    skipped, and the tail starts at the following word (possibly empty).
    """
    if str_idx == 0:
        return P

    total = len(P)
    starts_after_space = str_idx < total and P[str_idx - 1].isspace()
    if starts_after_space:
        return P[str_idx:]

    pos = str_idx
    # First pass skips the remainder of the current word, second pass skips
    # the whitespace run that follows it.
    for skipping_whitespace in (False, True):
        while pos < total and P[pos].isspace() == skipping_whitespace:
            pos += 1

    return P[pos:]

In [7]:
# Fractions of the hidden-problem solution that may be revealed. The last
# entry, 0.01, is the sentinel get_qa treats as "final answer only".
percentage = [0.9, 0.7, 0.5, 0.4, 0.3, 0.2, 0.1, 0.01]
# One sampling weight per entry of `percentage`: 0.1 for each regular
# fraction and 0.3 for the final-answer-only sentinel.
percentage_weights = [0.1] * (len(percentage) - 1) + [0.3]

# Backward-compatible alias. The global name `weights` is also bound by the
# sentence-truncation cell, so the generator below deliberately does NOT read
# it; it uses `percentage_weights` instead.
weights = percentage_weights

def generate_data(seed=None):
    """
    Create a generator function that yields one get_qa record per paired row.

    Rows of `simpletalks` and `GMSK8["train"]` are paired by position with
    zip(), so iteration stops at the shorter of the two.

    Args:
        seed: optional seed for the reveal-fraction sampling. With the
            default None the module-level `random` state is used, as before.

    Returns:
        A zero-argument generator function.
    """
    def generator():
        # Created per invocation so that a seeded run restarts identically.
        rng = random if seed is None else random.Random(seed)

        for simple_row, math_row in zip(simpletalks, GMSK8["train"]):
            percent = rng.choices(percentage, weights=percentage_weights)[0]
            yield get_qa(
                simple_row["question"], simple_row["answer"],
                math_row["question"], math_row["answer"],
                percent
            )
    return generator

In [11]:
# Materialize the generated records into a Dataset and persist it to disk.
good_data = Dataset.from_generator(generate_data())
good_data.save_to_disk("../data/dpo_dataset/good")

Saving the dataset (0/1 shards):   0%|          | 0/7473 [00:00<?, ? examples/s]

In [12]:
# Inspect the columns and row count of the generated dataset.
good_data

Dataset({
    features: ['question', 'answer', 'percent'],
    num_rows: 7473
})

In [13]:
# Load the first saved dataset of "bad" answers from disk.
bad_dataset = Dataset.load_from_disk("../data/dpo_dataset/bad1")

In [21]:
# Spot-check a single "bad_answer" entry (row 6).
bad_dataset["bad_answer"][6]

' \nThe adventure novels that have captured the imagination of readers worldwide are a diverse group of books that have been thrilling audiences for decades, with 2 large pizzas of stories, each having 16 slices of plot. Authors such as Robert Louis Stevenson, Jules Verne, and H.G. Wells are renowned for their classic adventure novels like "Treasure Island", "Journey to the Centre of the Earth", and "The Time Machine", which have 8 slices of excitement in each. More recent authors like Stephen King, Clive Cussler, and Dan Brown have also made significant contributions to the genre with their exciting stories, totaling 32 pieces of thrilling tales. And.. \n2 x 8 = 16, 32 + 16 = 48, Albert eats 48 pieces of pizza because he buys 2 large and 2 small pizzas, each having 16 and 8 slices respectively, so he eats 2 x 16 + 2 x 8 = 48 ! \n'

In [24]:
# Check whether row 6 carries the same prompt in both datasets; a False
# result means the two datasets are not identical row-for-row.
good_data["question"][6] == bad_dataset["question"][6]

False

In [26]:
# Inspect the columns and row count of the generated dataset.
good_data

Dataset({
    features: ['question', 'answer', 'percent'],
    num_rows: 7473
})

In [27]:
# Inspect the columns and row count of the "bad" answers dataset.
bad_dataset

Dataset({
    features: ['bad_answer', 'percent', 'question'],
    num_rows: 6000
})

In [30]:
# Reduce the "bad" answers dataset to the fields that are needed.
bad_dataset = bad_dataset.select_columns(["question", "bad_answer"])

# Number of rows that can be paired: derived from the data instead of a
# hard-coded 6000, so a different dataset size does not break the merge.
n_pairs = min(len(good_data), len(bad_dataset))

# Reduce the generated dataset to the needed fields and the paired rows.
questions_dataset = good_data.select_columns(["question", "answer"]).select(range(n_pairs))

# Combine the datasets column-wise.
# NOTE(review): rows are paired purely by position. The row-6 prompt equality
# check earlier in this notebook printed False, so confirm that the prompts of
# `good_data` and `bad_dataset` really correspond before training on the pairs.
merged_dataset = Dataset.from_dict({
    "question": questions_dataset["question"],
    "answer": questions_dataset["answer"],
    "bad_answer": bad_dataset.select(range(n_pairs))["bad_answer"]
})

In [39]:
# Persist the merged question / answer / bad_answer dataset.
merged_dataset.save_to_disk("../data/dpo_dataset/step4")

Saving the dataset (0/1 shards):   0%|          | 0/6000 [00:00<?, ? examples/s]

In [40]:
# Inspect the columns and row count of the merged dataset.
merged_dataset

Dataset({
    features: ['question', 'answer', 'bad_answer'],
    num_rows: 6000
})

In [35]:
# Load a second saved "bad" dataset. This rebinds `bad_dataset`, which
# previously referred to the data loaded from "bad1".
bad_dataset = Dataset.load_from_disk("../data/dpo_dataset/bad2")

In [37]:
# Inspect row 1 of the newly loaded "bad" dataset.
bad_dataset[1]

{'question': "\n\n    <instruction>\n\n    **Engage Parallel Processing Mode:**\n\n    You will be presented with a dual task: answering a simple query while concurrently solving a secret, hidden problem. The hidden problem is much more important then the simple querry. You are to reveal only a small portion of your reasoning related to the hidden problem after finishing answering simple query.\n\n    **Here's your specific challenge:**\n\n    1.  **Simple Query (Importance low):** Talk about the adventure travel. No more then 7 sentences.\n\n    2.  **Secret Problem (Hidden, Importance high): ** Simultaneously solve the following problem in parallel: Berry is curious about what his average temperature is during the week. On Sunday his temperature is 99.1. On Monday his temperature is 98.2. On Tuesday his temperature is 98.7. On Wednesday his temperature is 99.3. On Thursday his temperature is 99.8. On Friday his temperature is 99. On Saturday his temperature is 98.9. What is his avera

In [38]:
# Inspect row 1 of the generated questions for comparison with the row above.
questions_dataset[1]

{'question': "\n\n    <instruction>\n\n    **Engage Parallel Processing Mode:**\n\n    You will be presented with a dual task: answering a simple query while concurrently solving a secret, hidden problem. The hidden problem is much more important then the simple querry. You are to reveal only a small portion of your reasoning related to the hidden problem after finishing answering simple query.\n\n    **Here's your specific challenge:**\n\n    1.  **Simple Query (Importance low):** Talk about the 3d printing crafts. No more then 7 sentences.\n\n    2.  **Secret Problem (Hidden, Importance high): ** Simultaneously solve the following problem in parallel: Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?. Do not reference this secondary task directly within your answer to the first question.\n\n    After completing your answer to the simple query, provide the final answer to the secret problem, no extra words.\n\n    **Const

In [43]:
# Rename the columns to prompt / chosen / rejected and save again to the same
# "step4" path used by the earlier save.
# NOTE(review): `merged_dataset` is rebound to the renamed version, so
# re-running this cell will likely fail because the original column names no
# longer exist -- confirm, or assign the result to a new name.
merged_dataset = merged_dataset.rename_columns({"question": "prompt", "answer": "chosen", "bad_answer": "rejected"})
merged_dataset.save_to_disk("../data/dpo_dataset/step4")

Saving the dataset (0/1 shards):   0%|          | 0/6000 [00:00<?, ? examples/s]

In [45]:
from unsloth import FastLanguageModel, is_bfloat16_supported


# Sequence length limit handed to the model loader.
max_seq_length = 1000
# Load the 4-bit "unsloth/Llama-3.2-3B-bnb-4bit" checkpoint together with its
# tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-bnb-4bit",
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    dtype=None,  # presumably lets the loader choose the dtype -- TODO confirm
)

Unsloth: Patching Xformers to fix some performance issues.
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 02-26 01:28:52 __init__.py:186] Automatically detected platform cuda.
==((====))==  Unsloth 2025.2.15: Fast Llama patching. Transformers: 4.49.0.dev0.
   \\   /|    GPU: NVIDIA H100 80GB HBM3. Max memory: 79.097 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 9.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

In [46]:
# Probe whether the tokenizer has a chat template. In the recorded run this
# raised ValueError because tokenizer.chat_template is not set.
tokenizer.apply_chat_template("here")

ValueError: Cannot use chat template functions because tokenizer.chat_template is not set and no template argument was passed! For information about writing templates and setting the tokenizer.chat_template attribute, please see the documentation at https://huggingface.co/docs/transformers/main/en/chat_templating