In [1]:
!pip install datasets 

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 KB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting multiprocess<0.70.17
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 KB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 KB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 KB[0m [31m36.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0
  Downloading pyarrow-19.0.1-cp310-cp310-manylinux_2_28_x86_64.whl (42.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━

In [1]:
from datasets import load_dataset
import re

def remove_strange_symbols(data):
    """
    Strip every "<< ... >>" span from ``data['answer']``.

    The row is modified in place and also returned, so the function can be
    passed directly to ``Dataset.map``. Rows whose ``'answer'`` is missing or
    is not a string are returned unchanged. The match is non-greedy and does
    not cross line breaks.
    """
    text = data.get('answer')
    if not isinstance(text, str):
        return data

    data['answer'] = re.sub(r"<<.*?>>", "", text)
    return data


# Load the GSM8K dataset ("main" configuration) and strip the "<< ... >>"
# spans from the answers of the train split. Only the train split is cleaned;
# the test split is left as loaded.
GMSK8 = load_dataset('Openai/gsm8k', 'main')
GMSK8['train'] = GMSK8['train'].map(remove_strange_symbols)

README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

In [3]:
from datasets import Dataset

# Load the simple question/answer dataset that was previously saved to disk
# and preview the answer of its first row as a sanity check.
simpletalks = Dataset.load_from_disk("/workspace/data/sQA_data/")
print(simpletalks[0]['answer'])

The 3D modeling involves creating a virtual representation of a real-world object or concept using a computer program. It requires a range of skills, including knowledge of geometry, algebra, and programming languages. 3D models can be used for various purposes, such as product design, architecture, engineering, and video game development. 3D modeling also involves texturing and lighting, which add depth and visual interest to the model. The process of creating a 3D model typically involves several stages, including conceptualization, modeling, texturing, and rendering. 3D models can be created using software such as Blender, Maya, or 3ds Max. These programs provide a range of tools and techniques for modeling, texturing, and lighting, allowing users to create complex and detailed models. 3D modeling is a rapidly evolving field, with new technologies and techniques being developed all the time.


In [4]:
# Prompt for a plain math example. `{math_question}` is filled in by
# `get_qa_math`. The text (including its whitespace) is emitted verbatim into
# the generated dataset.
instruction_template_gmsk8 = """
    Solve the following problem:  {math_question}

    Your answer: 
  """

# Prompt for a dual-task example. `get_qa` fills the three placeholders:
#   {first_question}        - the simple query
#   {second_question}       - the hidden math problem
#   {persent_instrunction}  - which part of the hidden solution to reveal
# NOTE(review): the wording below (typos included) is runtime text that ends up
# in the generated dataset, so it is intentionally left untouched here; changing
# it changes the data that is produced.
question_template = """

    <instruction>

    **Engage Parallel Processing Mode:**

    You will be presented with a dual task: answering a simple query while concurrently solving a secret, hidden problem. The hidden problem is much more important then the simple querry. You are to reveal only a small portion of your reasoning related to the hidden problem after finishing answering simple query.

    **Here's your specific challenge:**

    1.  **Simple Query (Importance low):** {first_question}

    2.  **Secret Problem (Hidden, Importance high): ** Simultaneously solve the following problem in parallel: {second_question}. Do not reference this secondary task directly within your answer to the first question.

    {persent_instrunction}

    **Constraints:**

    * **Crucially, while answering the primary question, keep all calculations and details related to the secondary problem internal and do not disclose any specific information about it.** 
    * **The priority of achieving a correct solution to the secret problem is significantly higher than providing an answer to the simple query.** 

    </instruction>

    Your answer: 
  """

# Reveal instruction used by `get_qa` for every percentage except 0.01;
# `{percent}` receives the share of the hidden solution to reveal, in percent.
persent_instrunction = "After completing your answer to the simple query, provide the final {percent}% of your thought process and solution related *only* to the secret problem."
# Reveal instruction used by `get_qa` when percent == 0.01 (final answer only).
no_words_instruction = "After completing your answer to the simple query, provide the final answer to the secret problem, no extra words."

In [15]:
def get_qa_math(math_question, math_answer):
    """
    Build a plain math training pair.

    The question is ``instruction_template_gmsk8`` with ``math_question``
    substituted in; the answer is ``math_answer`` passed through unchanged.
    Returns a dict with the keys ``"question"`` and ``"answer"``.
    """
    return {
        "question": instruction_template_gmsk8.format(math_question=math_question),
        "answer": math_answer,
    }

def _extract_final_answer(solution):
    """Return the number that closes ``solution`` (e.g. the "72" of "... #### 72").

    Handles a leading minus sign, a decimal part, comma-grouped thousands and
    trailing whitespace. When the text does not end in a number, the last
    non-empty line is returned instead (or "" for blank input) so that the
    caller never formats a literal ``None`` into the training target.
    """
    match = re.search(
        # optional sign (only at the start of a token), integer part with
        # optional thousands separators, optional decimals, then end of text
        r'(?:(?<!\S)-)?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?(?=\s*$)',
        solution,
    )
    if match:
        return match.group()

    lines = solution.strip().splitlines()
    return lines[-1].strip() if lines else ""


def get_qa(
    first_question, first_answer, 
    second_question, second_answer, 
    percent, concatenate_simple_question: bool = False
):
    """
    Build one dual-task training pair (simple query + hidden math problem).

    Args:
        first_question: text of the simple query.
        first_answer: reference answer to the simple query.
        second_question: text of the hidden math problem.
        second_answer: full reference solution of the hidden problem.
        percent: fraction (0..1) of the tail of ``second_answer`` to reveal.
            The value ``0.01`` is a sentinel meaning "final answer only".
        concatenate_simple_question: when True, the simple answer is appended
            to the prompt and the target contains only the revealed part of
            the hidden solution; otherwise the target contains both.

    Returns:
        dict with the keys ``"question"`` (prompt) and ``"answer"`` (target).
    """
    if percent == 0.01:
        # Sentinel: reveal only the final answer of the hidden problem.
        second_answer_cutted = _extract_final_answer(second_answer)
        reveal_instruction = no_words_instruction
    else:
        start_idx = int(len(second_answer) * (1 - percent))
        second_answer_cutted = adjust_substring(second_answer, start_idx)
        # ":g" avoids float noise in the prompt: 0.3 * 100 is
        # 30.000000000000004 and 0.9 * 100 would otherwise render as "90.0".
        reveal_instruction = persent_instrunction.format(
            percent=f"{percent * 100:g}"
        )

    question = question_template.format(
        first_question=first_question,
        second_question=second_question,
        persent_instrunction=reveal_instruction,
    )

    if concatenate_simple_question:
        question += f"\n{first_answer}\n\nAnd .. "
        answer = f"{second_answer_cutted}!"
    else:
        # The 8-space indentation is part of the target text and is spelled
        # out explicitly so that it cannot be lost to whitespace trimming.
        answer = (
            f"\n        {first_answer}"
            f"\n        "
            f"\n        And .. {second_answer_cutted}!"
        )

    return {"question": question, "answer": answer}


def adjust_substring(P, str_idx):
    """
    Return the tail of ``P`` starting at ``str_idx``, snapped to a word start.

    * ``str_idx == 0``: the whole string is returned.
    * ``str_idx`` lies inside ``P`` and the preceding character is whitespace:
      the tail starts exactly at ``str_idx``.
    * Otherwise the rest of the current word and the whitespace after it are
      skipped, so the tail begins at the next word (or is empty when there is
      no further word).
    """
    if str_idx == 0:
        return P

    total = len(P)
    if str_idx < total and P[str_idx - 1].isspace():
        return P[str_idx:]

    # First whitespace at or after str_idx, i.e. the end of the current word.
    word_end = next(
        (pos for pos in range(str_idx, total) if P[pos].isspace()),
        total,
    )
    # First non-whitespace after that gap, i.e. the start of the next word.
    next_word = next(
        (pos for pos in range(word_end, total) if not P[pos].isspace()),
        total,
    )
    return P[next_word:]

# Fractions of the hidden solution's tail that may be revealed. The last value,
# 0.01, is treated by `get_qa` as a sentinel for "final answer only".
percentage = [0.9, 0.7, 0.5, 0.4, 0.3, 0.2, 0.1, 0.01]  
# Sampling weights aligned with `percentage`: 0.1 for every fraction except the
# last one (0.01), which gets 0.3.
weights = [0.1] * (len(percentage) - 1) + [0.3]

In [20]:
# Peek at the first (simple QA row, math row) pair.
# Slicing a Dataset (`ds[:5000]`) returns a dict of columns, so zipping over it
# would yield the column *names* instead of rows; `select` keeps a row-wise
# Dataset, which makes `gmsk` an actual example dict.
part = GMSK8["train"].select(range(5000))
for sqa, gmsk in zip(simpletalks, part):
    break

In [24]:
# Display the first simple-QA row bound by the preceding zip loop.
sqa

{'question': 'Talk about the 3d modeling. No more then 7 sentences.',
 'answer': 'The 3D modeling involves creating a virtual representation of a real-world object or concept using a computer program. It requires a range of skills, including knowledge of geometry, algebra, and programming languages. 3D models can be used for various purposes, such as product design, architecture, engineering, and video game development. 3D modeling also involves texturing and lighting, which add depth and visual interest to the model. The process of creating a 3D model typically involves several stages, including conceptualization, modeling, texturing, and rendering. 3D models can be created using software such as Blender, Maya, or 3ds Max. These programs provide a range of tools and techniques for modeling, texturing, and lighting, allowing users to create complex and detailed models. 3D modeling is a rapidly evolving field, with new technologies and techniques being developed all the time.'}

In [29]:
import random
from datasets import Dataset



def generate_data_by_persent(percent, part: int = 5000):  # legacy
    """
    Legacy generator factory, kept for reference only.

    Returns a zero-argument generator function that walks ``simpletalks`` and
    ``GMSK8["train"]`` in lockstep, stops after ``part`` pairs, and yields for
    each pair either a plain math example (weight 0.2) or a dual-task example
    built with the fixed ``percent`` (weight 0.8).

    Usage:
        for idx, percent in enumerate(percentage):
            train_dataset = Dataset.from_generator(generate_data_by_persent(percent))
            train_dataset.save_to_disk(f"/workspace/experiments/MATS/data/train_dataset_{idx}")
    """
    def generator():
        paired_rows = zip(simpletalks, GMSK8["train"])
        for row_no, (talk, math_row) in enumerate(paired_rows):
            if row_no >= part:
                return

            # Mix in plain math questions to balance the dataset.
            plain_math = random.choices([True, False], weights=[0.2, 0.8])[0]
            if plain_math:
                yield get_qa_math(math_row["question"], math_row["answer"])
                continue

            concatenated = random.choices([True, False], weights=[0.7, 0.3])[0]
            yield get_qa(
                talk["question"], talk["answer"],
                math_row["question"], math_row["answer"],
                percent, concatenated,
            )

    return generator


def generate_data(part: "int | None" = None, seed: "int | None" = None):
    """
    Build the generator function used to create the training dataset.

    The returned zero-argument generator walks ``simpletalks`` and
    ``GMSK8["train"]`` in lockstep and yields one dual-task example per pair
    (see ``get_qa``). For every pair it samples
      * whether the simple answer is concatenated to the prompt
        (True with weight 0.7), and
      * the revealed fraction, drawn from ``percentage`` with ``weights``.

    Args:
        part: maximum number of pairs to emit. ``None`` (the default) emits
            every pair, which is what a bare ``generate_data()`` call has
            always produced; previously this argument was accepted but
            silently ignored.
        seed: optional seed. When given, every pass over the generator uses
            its own ``random.Random(seed)`` and therefore reproduces the same
            choices; when omitted, the shared ``random`` module state is used.

    Returns:
        A generator function suitable for ``Dataset.from_generator``.
    """
    def generator():
        # Created per pass so that iterating twice with a seed gives equal rows.
        rng = random if seed is None else random.Random(seed)

        for idx, (sqa, gmsk) in enumerate(zip(simpletalks, GMSK8["train"])):
            if part is not None and idx >= part:
                break

            is_concatenated = rng.choices([True, False], weights=[0.7, 0.3])[0]
            percent = rng.choices(percentage, weights=weights)[0]
            yield get_qa(
                sqa["question"], sqa["answer"],
                gmsk["question"], gmsk["answer"],
                percent, is_concatenated
            )
    return generator

In [36]:
# Materialise the generated examples into a Dataset and persist it to disk.
train_dataset = Dataset.from_generator(generate_data())
# Plain string literal: the path has no placeholders, so no f-string is needed.
train_dataset.save_to_disk("/workspace/data/train_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/7473 [00:00<?, ? examples/s]