In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from IPython.display import display, Markdown
from transformers import AdamW, get_scheduler
from tqdm.auto import tqdm

import torch
import json
import os
import numpy as np
import kagglehub


In [None]:
# Download the Kaggle dataset (kagglehub returns the local directory it was
# stored in) and point DATA_PATH at the "MATH" folder inside that directory.
dataset_dir = kagglehub.dataset_download(
    "mathurinache/math-dataset",
)
DATA_PATH = os.path.join(
    dataset_dir,
    "MATH",
)

class MathDataset(Dataset):
    """Problem/solution pairs read from ``<base_dir>/<subset>/<category>/*.json``.

    Each JSON file must contain the keys ``'problem'`` and ``'solution'``.

    Args:
        base_dir: Directory that contains the subset folders.
        tokenizer: Optional callable tokenizer returning a mapping with an
            ``'input_ids'`` list and exposing ``eos_token_id`` /
            ``pad_token_id``. When omitted, items are raw ``(problem,
            solution)`` string tuples.
        subset: Name of the sub-folder to read (``'train'`` by default).
        max_len: Fixed length of every tokenised example.
        max_files_per_category: Upper bound on the number of files read from
            each category folder; ``None`` reads all of them.
    """

    def __init__(self, base_dir, tokenizer=None, subset='train', max_len=512,
                 max_files_per_category=200):
        self.base_dir = base_dir
        self.subset = subset
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.max_files_per_category = max_files_per_category
        self.problems, self.solutions = self._load_data()

    def _load_data(self):
        """Read the JSON files of every category and return (problems, solutions)."""
        subset_dir = os.path.join(self.base_dir, self.subset)
        problems, solutions = [], []

        # os.listdir order is arbitrary; sort so the selected subset (and the
        # meaning of a given index) is the same on every run and machine.
        for category in sorted(os.listdir(subset_dir)):
            category_path = os.path.join(subset_dir, category)
            if not os.path.isdir(category_path):
                continue  # stray files next to the category folders

            files = sorted(
                name for name in os.listdir(category_path) if name.endswith('.json')
            )
            # Slicing with None keeps everything, so None means "no cap".
            files = files[:self.max_files_per_category]

            for file in files:
                with open(os.path.join(category_path, file), 'r', encoding='utf-8') as f:
                    entry = json.load(f)
                problems.append(entry['problem'])
                solutions.append(entry['solution'])

        return problems, solutions

    def __len__(self):
        return len(self.problems)

    def __getitem__(self, index):
        """Return one example.

        Without a tokenizer this is the ``(problem, solution)`` string tuple.
        With a tokenizer it is a dict of ``max_len``-long tensors:

        * ``input_ids``: problem tokens, then solution tokens, then EOS,
          right-padded.
        * ``attention_mask``: 1 on real tokens, 0 on padding.
        * ``labels``: same sequence as ``input_ids`` with problem and padding
          positions set to -100, so the loss only covers the solution.
        """
        problem, solution = self.problems[index], self.solutions[index]

        if not self.tokenizer:
            return problem, solution

        eos_id = self.tokenizer.eos_token_id
        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            pad_id = eos_id if eos_id is not None else 0

        prompt_ids = list(self.tokenizer(problem)['input_ids'])
        # The solution continues the same sequence, so it must not get its
        # own leading special tokens.
        target_ids = list(self.tokenizer(solution, add_special_tokens=False)['input_ids'])
        if eos_id is not None:
            target_ids.append(eos_id)  # teaches the model where to stop

        # Give the prompt at least half the budget (more if the solution is
        # short), then let the solution use whatever is left.
        prompt_budget = max(self.max_len // 2, self.max_len - len(target_ids))
        prompt_ids = prompt_ids[:prompt_budget]
        target_ids = target_ids[:self.max_len - len(prompt_ids)]

        n_real = len(prompt_ids) + len(target_ids)
        n_pad = self.max_len - n_real

        input_ids = prompt_ids + target_ids + [pad_id] * n_pad
        attention_mask = [1] * n_real + [0] * n_pad
        # Mask by position, not by token id: the pad token may be the EOS
        # token, and the real EOS must stay in the loss.
        labels = [-100] * len(prompt_ids) + target_ids + [-100] * n_pad

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(labels, dtype=torch.long)
        }



Downloading from https://www.kaggle.com/api/v1/datasets/download/mathurinache/math-dataset?dataset_version_number=1...


100%|██████████| 7.07M/7.07M [00:01<00:00, 5.14MB/s]

Extracting files...





In [None]:
# Seed PyTorch so DataLoader shuffling and sampled generation are repeatable.
torch.manual_seed(42)

model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# GPT-2 ships without a pad token; reuse EOS for padding.
tokenizer.pad_token = tokenizer.eos_token

train_dataset = MathDataset(base_dir=DATA_PATH, subset='train', tokenizer=tokenizer)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# --- Baseline sample before any fine-tuning -------------------------------
sample_index = 0
# Tokenise the raw problem text without padding. Using the padded dataset
# item as a prompt would make the model continue after a long run of pad
# tokens.
prompt = tokenizer(
    train_dataset.problems[sample_index],
    return_tensors="pt",
    truncation=True,
    max_length=train_dataset.max_len,
).to(device)
input_ids = prompt["input_ids"]
attention_mask = prompt["attention_mask"]

model.eval()
gen_tokens = model.generate(
    input_ids,
    attention_mask=attention_mask,
    do_sample=True,
    temperature=0.9,
    max_length=1024,
    pad_token_id=tokenizer.eos_token_id
)

# generate() returns prompt + continuation; decode only the continuation so
# "Generated Solution" does not repeat the problem statement.
new_tokens = gen_tokens[:, input_ids.shape[1]:]
generated_text = tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]

display(Markdown(fr'''
Problem Statement: {train_dataset.problems[sample_index]}

Generated Solution: {generated_text}

Solution: {train_dataset.solutions[sample_index]}
'''))

# Keeps the embedding matrix in sync with the tokenizer size.
model.resize_token_embeddings(len(tokenizer))

# --- Optimisation setup ---------------------------------------------------
num_epochs = 15
learning_rate = 5e-5
# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# NOTE(review): eps and weight_decay are set to what I recall as the old
# transformers.AdamW defaults so the update rule stays the same; confirm.
optimizer = torch.optim.AdamW(
    model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0
)
steps_total = num_epochs * len(train_loader)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=steps_total)

loss_history = []



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]


Problem Statement: Find the minimum of the function
\[\frac{xy}{x^2 + y^2}\]in the domain $\frac{2}{5} \le x \le \frac{1}{2}$ and $\frac{1}{3} \le y \le \frac{3}{8}.$

Generated Solution: Find the minimum of the function
\[\frac{xy}{x^2 + y^2}\]in the domain $\frac{2}{5} \le x \le \frac{1}{2}$ and $\frac{1}{3} \le y \le \frac{3}{8}.$

The first two cases are all the same, but we don't have to worry about the other two.

\[\text{C}(\sum_{i=1}^{n\infty}x^2-n) + (n \le 0 + n \infty)\]

Let be a function, not a constant. Let being $\sum_{i=1}^{n\infty}x^2=\times 10^{n}\] \[\text{\text{C}(\sum_{i=1}^{n\infty}x^2+n \le 0 + n \le n}x^2, \] and let be the constant.

\[\text{\text{C}(\sum_{i=1}^n-\frac{n \infty}x^2}) + (n \le 0 + n \infty)\] and let be the first case.

Now take the first $\sum_{i=1}^{n\infty}x^2-n \times 10^{n}\rightarrow. Let each element of $x \rightarrow \frac{x}{2} = $\frac{n \infty}x^2+\frac{n \infty}x^2$.

Let be the second case. Let the first $i = 0-\frac{0}{2}\rightarrow for all two $n \times n$.

\[\text{C}(\sum_{i=1}^{n\infty}x^2+\frac{n \infty}x^2}\] and let be the first case.

Now let $n = 0$ and $\sum_{i=1}^n 2 = \frac{10+\frac{1}{7}^2}\rightarrow. Let $u$ be $x \rightarrow $n \times 1$ and $y \le $n + \sum_{i=1}^{n\infty}x^2$.

\[\text{C}(\sum_{i=1}^n-\frac{n \infty}x^2-n \le 0+ n \le n}x^2, \] and let be the first case.

Now we can use the first and second cases as the first case variables



Solution: We can write
\[\frac{xy}{x^2 + y^2} = \frac{1}{\frac{x^2 + y^2}{xy}} = \frac{1}{\frac{x}{y} + \frac{y}{x}}.\]Let $t = \frac{x}{y},$ so $\frac{x}{y} + \frac{y}{x} = t + \frac{1}{t}.$  We want to maximize this denominator.

Let
\[f(t) = t + \frac{1}{t}.\]Suppose $0 < t < u.$  Then
\begin{align*}
f(u) - f(t) &= u + \frac{1}{u} - t - \frac{1}{t} \\
&= u - t + \frac{1}{u} - \frac{1}{t} \\
&= u - t + \frac{t - u}{tu} \\
&= (u - t) \left( 1 - \frac{1}{tu} \right) \\
&= \frac{(u - t)(tu - 1)}{tu}.
\end{align*}This means if $1 \le t < u,$ then
\[f(u) - f(t) = \frac{(u - t)(tu - 1)}{tu} > 0,\]so $f(u) > f(t).$  Hence, $f(t)$ is increasing on the interval $[1,\infty).$

On the other hand, if $0 \le t < u \le 1,$ then
\[f(u) - f(t) = \frac{(u - t)(tu - 1)}{tu} < 0,\]so $f(u) < f(t).$  Hence, $f(t)$ is decreasing on the interval $(0,1].$

So, to maximize $t + \frac{1}{t} = \frac{x}{y} + \frac{y}{x},$ we should look at the extreme values of $\frac{x}{y},$ namely its minimum and maximum.

The minimum occurs at $x = \frac{2}{5}$ and $y = \frac{3}{8}.$  For these values,
\[\frac{xy}{x^2 + y^2} = \frac{240}{481}.\]The maximum occurs at $x = \frac{1}{2}$ and $y = \frac{1}{3}.$  For these values,
\[\frac{xy}{x^2 + y^2} = \frac{6}{13}.\]Thus, the minimum value is $\boxed{\frac{6}{13}}.$


In [None]:
# from_pretrained() hands back the model in eval mode, so switch to train
# mode explicitly; otherwise dropout stays disabled during fine-tuning.
model.train()

for epoch in range(num_epochs):
    # Wrapping the loader lets tqdm advance and close the bar on its own.
    progress = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}")
    total_loss = 0.0

    for batch in progress:
        batch = {key: val.to(device) for key, val in batch.items()}
        output = model(**batch)
        loss = output.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # .item() forces a device sync, so read the value once per step.
        batch_loss = loss.item()
        total_loss += batch_loss
        progress.set_postfix({"loss": batch_loss})

    # max(..., 1) avoids a ZeroDivisionError if the loader is empty.
    avg_epoch_loss = total_loss / max(len(train_loader), 1)
    print(f"Epoch {epoch + 1} avg loss: {avg_epoch_loss:.4f}")
    loss_history.append(avg_epoch_loss)

# Back to eval mode so any generation that follows runs without dropout.
model.eval()

# Testing phase


Epoch 1/15:   0%|          | 0/350 [00:00<?, ?it/s]

Epoch 1 avg loss: 5.6966


Epoch 2/15:   0%|          | 0/350 [00:00<?, ?it/s]

Epoch 2 avg loss: 5.3777


Epoch 3/15:   0%|          | 0/350 [00:00<?, ?it/s]

Epoch 3 avg loss: 5.2504


Epoch 4/15:   0%|          | 0/350 [00:00<?, ?it/s]

Epoch 4 avg loss: 5.1279


Epoch 5/15:   0%|          | 0/350 [00:00<?, ?it/s]

Epoch 5 avg loss: 4.9923


Epoch 6/15:   0%|          | 0/350 [00:00<?, ?it/s]

Epoch 6 avg loss: 4.8898


Epoch 7/15:   0%|          | 0/350 [00:00<?, ?it/s]

Epoch 7 avg loss: 4.7784


Epoch 8/15:   0%|          | 0/350 [00:00<?, ?it/s]

Epoch 8 avg loss: 4.6741


Epoch 9/15:   0%|          | 0/350 [00:00<?, ?it/s]

Epoch 9 avg loss: 4.5661


Epoch 10/15:   0%|          | 0/350 [00:00<?, ?it/s]

Epoch 10 avg loss: 4.4563


Epoch 11/15:   0%|          | 0/350 [00:00<?, ?it/s]

Epoch 11 avg loss: 4.3365


Epoch 12/15:   0%|          | 0/350 [00:00<?, ?it/s]

Epoch 12 avg loss: 4.2236


Epoch 13/15:   0%|          | 0/350 [00:00<?, ?it/s]

Epoch 13 avg loss: 4.1168


Epoch 14/15:   0%|          | 0/350 [00:00<?, ?it/s]

Epoch 14 avg loss: 4.0059


Epoch 15/15:   0%|          | 0/350 [00:00<?, ?it/s]

Epoch 15 avg loss: 3.9325


In [None]:
test_dataset = MathDataset(base_dir=DATA_PATH, subset='test', tokenizer=tokenizer)

sample_index = 0
# Prompt with the raw, unpadded problem text. The tokenised dataset item is
# padded to a fixed length and is not a usable generation prompt.
prompt = tokenizer(
    test_dataset.problems[sample_index],
    return_tensors="pt",
    truncation=True,
    max_length=test_dataset.max_len,
).to(device)
input_ids = prompt["input_ids"]
attention_mask = prompt["attention_mask"]

model.eval()  # make sure dropout is off while sampling
gen_tokens = model.generate(
    input_ids,
    attention_mask=attention_mask,
    do_sample=True,
    temperature=0.9,
    max_length=1024,
    pad_token_id=tokenizer.eos_token_id
)

# Drop the echoed prompt so only the model's continuation is displayed.
new_tokens = gen_tokens[:, input_ids.shape[1]:]
generated_text = tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]

display(Markdown(fr'''
Problem Statement: {test_dataset.problems[sample_index]}

Generated Solution: {generated_text}

Solution: {test_dataset.solutions[sample_index]}
'''))


Problem Statement: Solve
\[\frac{(x + 7)(x - 2)}{x - 1} < 0.\]Enter your answer using interval notation.

Generated Solution: Solve
\[\frac{(x + 7)(x - 2)}{x - 1} < 0.\]Enter your answer using interval notation.(
\ -$, $ 7 +$,)( \\$,
\x 7(xfrac7 - -=7x
 7 7x 7x 7x =x 7{})x 7 7-
-= 7 7 7 -7- 1\3ed 8x7x
 -$,q$,2{ 7 7 7 7- 7 $ 7 7 -
 7axx{ 7 7 - -
 7
 - - ==x 7 -x}{ 77 \x=\ 7edfrac 7 7=
 7 7x7 7 7 -

 8 7 7{ -= 7
 77 \ 7 7 -=
 77}{}7 b 7 7\7x 7x 7 7
 7 \7 7 7 -frac


 7 7a 7 7 $7a 7 7 b 7 7
7} 7 7 77 77 7 = 7 -

 \
 7 \ 7{
 7{7- 7
7 7-} 7 x 7 7 7
 7 a7= $7
 7 7
 77 77- 7 77
7{ 7 7 7a 77 7 7} 7 7 7 7$
 7

\

= 7 7a2{ \ 7 \ 7 7a 7 7 7
 7 $ 7 7 77{{ 7 -7a
$ - b 7$.7 7{
 7 7, 7a 7 7 7 7 7 7a the 7 7 87 7 7 7{
 7b 7

align\a 7 77 7 7 77 77
 = 777{77 -{ 77a 7 7} 77 7 7 7 7 7
 77a7 7 7\
 7 77{7 7

. 7 7 =& 77 7 7-$,7{=$ 77.777 7 7 7 7a{ $
7 7
a 7 77
7 7-a 7 7 7 7 7
a 7- - $a 7
= 7 7 & - 7 77=} 1 7{ equation77eda 7,7a 7 7 7 7 7{7$\1align 7 7 77 7n 777ed 7 + 77{& 7 7778
{ 7 7 7 7 -.7align 7 7 7b77=a 7 7 + 7 78a 7 7 7{7 7

Solution: We can build a sign chart:

\[
\begin{array}{c|cccc}
& x < -7 & -7 < x < 1 & 1 < x < 2 & 2 < x \\ \hline
x + 7 & - & + & + & + \\
x - 1 & - & - & + & + \\
x - 2 & - & - & - & + \\
\frac{(x + 7)(x - 2)}{x - 1} & - & + & - & +
\end{array}
\]Thus, the solution is $x \in \boxed{(-\infty,-7) \cup (1,2)}.$
