

# LSTM-arithmetic

## Dataset
- [Arithmetic dataset](https://drive.google.com/file/d/1cMuL3hF9jefka9RyF4gEBIGGeFGZYHE-/view?usp=sharing)

In [None]:
! pip install seaborn
! pip install opencc
! pip install -U scikit-learn



In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn
import torch.nn.utils.rnn
import torch.utils.data
import matplotlib.pyplot as plt
import seaborn as sns
import opencc
import os
from sklearn.model_selection import train_test_split

In [None]:
# Directory (relative to the current working directory) that holds
# arithmetic_train.csv and arithmetic_eval.csv.
data_path = './data'

In [None]:
# Mount Google Drive and work from the homework folder when running on Colab.
# Outside Colab the `google.colab` package does not exist; in that case this
# cell does nothing and the notebook keeps its current working directory.
import os

drive_path = '/content/drive/MyDrive/NLP/HW2'

try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    drive.mount('/content/drive')
    # The Drive root is deliberately not listed here: its contents are personal
    # files and the listing would be stored in the notebook's saved output.
    os.chdir(drive_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
'234201526 (1).docx'	 'Colab Notebooks'   NLP	      Untitled
 arc2025.pdf		  docker.txt	     Paspor.pdf       推薦信_盧老師_中.docx
 AsposePrintServlet.pdf   HW8		     Passbook.pdf     研究所申請
 Autobio-zh.gdoc	  Kasih.apk	    'Project Video'   資訊系畢業審查表-109入學.docx


In [None]:
import numpy as np
import pandas as pd
import torch
import matplotlib
import seaborn as sns
import opencc
import sklearn
import tqdm

In [None]:
# Print one "package==version" line per third-party dependency so the
# environment of a run can be reconstructed later.
package_versions = (
    ("numpy", np),
    ("pandas", pd),
    ("torch", torch),
    ("matplotlib", matplotlib),
    ("seaborn", sns),
    ("opencc", opencc),
    ("scikit-learn", sklearn),
    ("tqdm", tqdm),
)
for package_name, package in package_versions:
    print(f"{package_name}=={package.__version__}")

numpy==1.26.4
pandas==2.2.2
torch==2.5.1+cu121
matplotlib==3.8.0
seaborn==0.13.2
opencc==1.1.9
scikit-learn==1.6.0
tqdm==4.67.1


In [None]:
# Read the training and evaluation splits from `data_path`, then preview both.
train_csv = os.path.join(data_path, 'arithmetic_train.csv')
eval_csv = os.path.join(data_path, 'arithmetic_eval.csv')

df_train = pd.read_csv(train_csv)
df_eval = pd.read_csv(eval_csv)

print(df_eval.head())
df_train.head()

   Unnamed: 0          src  tgt
0     2573208    48+43+34=  125
1     1630340  30-(48+13)=  -31
2      549277  (21*31)+10=  661
3      133957     2-27-10=  -35
4     1279828  (15*20)+24=  324


Unnamed: 0.1,Unnamed: 0,src,tgt
0,2285313,14*(43+20)=,882
1,317061,(6+1)*5=,35
2,718770,13+32+29=,74
3,170195,31*(3-11)=,-248
4,2581417,24*49+1=,1177


In [None]:
# Store every answer as text and record the character count of each equation.
df_train['tgt'] = df_train['tgt'].map(str)
df_train['len'] = df_train['src'].map(len)

In [None]:
# Same conversion for the evaluation split: answers as text, equation lengths.
df_eval['tgt'] = df_eval['tgt'].map(str)
df_eval['len'] = df_eval['src'].map(len)

# Build Dictionary
 - The model cannot perform calculations directly with plain text.
 - Convert all text (numbers/symbols) into numerical representations.
 - Special tokens
    - '&lt;pad&gt;'
        - Each sentence within a batch may have different lengths.
        - The length is padded with '&lt;pad&gt;' to match the longest sentence in the batch.
    - '&lt;eos&gt;'
        - Specifies the end of the generated sequence.
        - Without '&lt;eos&gt;', the model will not know when to stop generating.


In [None]:
# The two special tokens take ids 0 and 1; real characters are numbered after them.
char_to_id = {'<pad>': 0, '<eos>': 1}
# Reverse lookup derived from the forward table so the two cannot drift apart.
id_to_char = {token_id: token for token, token_id in char_to_id.items()}
# Next free id (2, because the special tokens already occupy 0 and 1).
vocab_idx = len(char_to_id)

In [None]:
# Give the next free id to every character of the training equations that has
# not been seen before, keeping both lookup tables in sync.
for equation in df_train['src']:
    for char in equation:
        if char in char_to_id:
            continue
        char_to_id[char] = vocab_idx
        id_to_char[vocab_idx] = char
        vocab_idx += 1

In [None]:
# Do the same for the answers, so characters that only occur in results
# also receive an id.
for result in df_train['tgt']:
    for char in result:
        if char in char_to_id:
            continue
        char_to_id[char] = vocab_idx
        id_to_char[vocab_idx] = char
        vocab_idx += 1

In [None]:
# Number of distinct tokens, special tokens included.
vocab_size = len(char_to_id)
print(f'Vocab size: {vocab_size}')

Vocab size: 18


In [None]:
# Print both lookup tables to check that they mirror each other.
for table_name, table in (('char_to_id', char_to_id), ('id_to_char', id_to_char)):
    print(f'{table_name}:', table)

char_to_id: {'<pad>': 0, '<eos>': 1, '1': 2, '4': 3, '*': 4, '(': 5, '3': 6, '+': 7, '2': 8, '0': 9, ')': 10, '=': 11, '6': 12, '5': 13, '9': 14, '-': 15, '8': 16, '7': 17}
id_to_char: {0: '<pad>', 1: '<eos>', 2: '1', 3: '4', 4: '*', 5: '(', 6: '3', 7: '+', 8: '2', 9: '0', 10: ')', 11: '=', 12: '6', 13: '5', 14: '9', 15: '-', 16: '8', 17: '7'}


# Data Preprocessing
 - The data is processed into the format required for the model's input and output.
 - Example: 1+2-3=0
     - Model input: 1 + 2 - 3 = 0
     - Model output: / / / / / 0 &lt;eos&gt;  (the '/' can be replaced with &lt;pad&gt;)
     - The key point about the model's output is that the model does not need to predict the next character of the equation part. What matters is that once the model sees '=', it should start generating the answer, which is '0'. After generating the answer, it should also generate &lt;eos&gt;.


In [None]:
# Data Preprocessing
def preprocess_data_for_model(df, char_to_id):
    """Encode every (src, tgt) row as the id sequence ``src + tgt + '<eos>'``.

    Args:
        df: DataFrame with a ``src`` column (equation text) and a ``tgt``
            column (the answer; converted with ``str`` before encoding, so
            integers are accepted too). The frame is modified in place: the
            columns ``combined_id_list`` (list of token ids per row) and
            ``seq_len`` (length of that list) are added or overwritten.
        char_to_id: mapping from each character and from ``'<eos>'`` to its id.

    Returns:
        The same ``df`` object, with the two columns added.

    Raises:
        KeyError: if an equation or answer contains a character that is not
            a key of ``char_to_id``.
    """
    def encode_equation(equation, result):
        # Make sure the equation part ends with '=', the marker after which
        # the answer starts.
        if not equation.endswith('='):
            equation += '='
        tokens = list(equation) + list(str(result)) + ['<eos>']
        return [char_to_id[token] for token in tokens]

    # Iterating the two columns directly avoids a row-wise DataFrame.apply and
    # also works for an empty frame, where unpacking zip(*...) would fail.
    encoded = [encode_equation(src, tgt) for src, tgt in zip(df['src'], df['tgt'])]
    df['combined_id_list'] = encoded
    df['seq_len'] = [len(ids) for ids in encoded]
    return df

In [None]:
# Add the encoded columns to both splits, then preview each of them.
df_train = preprocess_data_for_model(df_train, char_to_id)
df_eval = preprocess_data_for_model(df_eval, char_to_id)
for frame in (df_train, df_eval):
    print(frame.head())

   Unnamed: 0          src   tgt  len  \
0     2285313  14*(43+20)=   882   11   
1      317061     (6+1)*5=    35    8   
2      718770    13+32+29=    74    9   
3      170195   31*(3-11)=  -248   10   
4     2581417     24*49+1=  1177    8   

                                    combined_id_list  seq_len  
0  [2, 3, 4, 5, 3, 6, 7, 8, 9, 10, 11, 16, 16, 8, 1]       15  
1             [5, 12, 7, 2, 10, 4, 13, 11, 6, 13, 1]       11  
2            [2, 6, 7, 6, 8, 7, 8, 14, 11, 17, 3, 1]       12  
3  [6, 2, 4, 5, 6, 15, 2, 2, 10, 11, 15, 8, 3, 16...       15  
4        [8, 3, 4, 3, 14, 7, 2, 11, 2, 2, 17, 17, 1]       13  
   Unnamed: 0          src  tgt  len  \
0     2573208    48+43+34=  125    9   
1     1630340  30-(48+13)=  -31   11   
2      549277  (21*31)+10=  661   11   
3      133957     2-27-10=  -35    8   
4     1279828  (15*20)+24=  324   11   

                                    combined_id_list  seq_len  
0         [3, 16, 7, 3, 6, 7, 6, 3, 11, 2, 8, 13, 1]       13  


# Hyper Parameters

|Hyperparameter|Meaning|Value|
|-|-|-|
|`batch_size`|Number of data samples in a single batch|64|
|`epochs`|Total number of epochs to train|4|
|`embed_dim`|Dimension of the word embeddings|256|
|`hidden_dim`|Dimension of the hidden state in each timestep of the LSTM|256|
|`lr`|Learning Rate|0.001|
|`grad_clip`|To prevent gradient explosion in RNNs, restrict the gradient range|1|


In [None]:
# Hyper-parameters read by the data loaders, the model and the training loop.
batch_size = 64   # samples per batch
epochs = 4        # epoch count consumed by the training loop
embed_dim = 256   # size of each character embedding
hidden_dim = 256  # LSTM hidden-state size
lr = 0.001        # Adam learning rate
grad_clip = 1     # bound passed to clip_grad_value_

# Data Batching
- Use `torch.utils.data.Dataset` to create a data generation tool called  `dataset`.
- Then, use `torch.utils.data.DataLoader` to randomly sample from the `dataset` and group the samples into batches.


In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over pre-encoded equations.

    Each item is the pair ``(token id list, sequence length)`` taken from the
    ``combined_id_list`` and ``seq_len`` columns of the given DataFrame.
    """

    def __init__(self, df):
        # Copy both columns into plain Python lists for index-based access.
        self.seq_len = df['seq_len'].tolist()
        self.combined_id_list = df['combined_id_list'].tolist()

    def __len__(self):
        # Both lists come from the same frame, so either length is the row count.
        return len(self.seq_len)

    def __getitem__(self, idx):
        return self.combined_id_list[idx], self.seq_len[idx]

In [None]:
# collate function, used to build dataloader
def collate_fn(batch):
    """Merge ``(token ids, length)`` samples into one padded batch.

    Args:
        batch: sequence of ``(list of token ids, sequence length)`` pairs.

    Returns:
        A tuple ``(padded_ids, lengths)``: ``padded_ids`` is a long tensor with
        one row per sample, right-padded with the ``'<pad>'`` id up to the
        longest sample; ``lengths`` is a long tensor of the given lengths.
    """
    sequences, lengths = zip(*batch)
    id_tensors = [torch.tensor(ids, dtype=torch.long) for ids in sequences]
    padded_ids = torch.nn.utils.rnn.pad_sequence(
        id_tensors,
        batch_first=True,
        padding_value=char_to_id['<pad>'],
    )
    length_tensor = torch.tensor(lengths, dtype=torch.long)
    return padded_ids, length_tensor

In [None]:
# Wrap the encoded training and evaluation frames in Dataset objects
# so that a DataLoader can index them.
ds_train = Dataset(df_train)
ds_eval = Dataset(df_eval)

In [None]:
from torch.utils.data import DataLoader

In [None]:
# Re-states the batch size from the hyper-parameter cell; the DataLoaders
# created below read this value.
batch_size = 64  # Adjust as needed

In [None]:
# Create datasets (repeats the construction from the earlier cell; the
# resulting objects wrap the same frames).
ds_train = Dataset(df_train)
ds_eval = Dataset(df_eval)

In [None]:
# Batch both datasets with the padding collate function. Only the training
# loader shuffles; evaluation keeps the dataset order.
dl_train = DataLoader(ds_train, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
dl_eval = DataLoader(ds_eval, batch_size=batch_size, collate_fn=collate_fn, shuffle=False)

# Model Design

## Execution Flow
1. Convert all characters in the sentence into embeddings.
2. Pass the embeddings through an LSTM sequentially.
3. The output of the LSTM is passed into another LSTM, and additional layers can be added.
4. The output from all time steps of the final LSTM is passed through a Fully Connected layer.
5. The character corresponding to the maximum value across all output dimensions is selected as the next character.

## Loss Function
Since this is a classification task, Cross Entropy is used as the loss function.

## Gradient Update
Adam algorithm is used for gradient updates.

In [None]:
class CharRNN(torch.nn.Module):
    """Character-level language model: embedding -> two LSTM layers -> MLP head.

    The class reads the notebook-level ``char_to_id`` / ``id_to_char``
    dictionaries for the padding id, the end-of-sequence id and for decoding.

    Args:
        vocab_size: number of distinct token ids (size of the output layer).
        embed_dim: dimension of each character embedding.
        hidden_dim: hidden-state size of both LSTM layers.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super(CharRNN, self).__init__()

        self.embedding = torch.nn.Embedding(num_embeddings=vocab_size,
                                            embedding_dim=embed_dim,
                                            padding_idx=char_to_id['<pad>'])

        self.rnn_layer1 = torch.nn.LSTM(input_size=embed_dim,
                                        hidden_size=hidden_dim,
                                        batch_first=True)

        self.rnn_layer2 = torch.nn.LSTM(input_size=hidden_dim,
                                        hidden_size=hidden_dim,
                                        batch_first=True)

        self.linear = torch.nn.Sequential(torch.nn.Linear(in_features=hidden_dim,
                                                          out_features=hidden_dim),
                                          torch.nn.ReLU(),
                                          torch.nn.Linear(in_features=hidden_dim,
                                                          out_features=vocab_size))

    def forward(self, batch_x, batch_x_lens):
        """Return next-character logits for every position; see ``encoder``."""
        return self.encoder(batch_x, batch_x_lens)

    # The forward pass of the model
    def encoder(self, batch_x, batch_x_lens):
        """Compute logits of shape ``(batch, time, vocab_size)``.

        Args:
            batch_x: long tensor ``(batch, time)`` of padded token ids.
            batch_x_lens: true length of every row (tensor or list of ints).

        Returns:
            Float tensor whose time dimension equals that of ``batch_x``.
        """
        # Remember the padded length so the output lines up with the input
        # even when every sequence is shorter than the padded width.
        total_length = batch_x.size(1)
        embedded = self.embedding(batch_x)

        # pack_padded_sequence requires the lengths to live on the CPU.
        lengths = torch.as_tensor(batch_x_lens).cpu()
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded,
                                                         lengths,
                                                         batch_first=True,
                                                         enforce_sorted=False)

        packed, _ = self.rnn_layer1(packed)
        packed, _ = self.rnn_layer2(packed)

        hidden, _ = torch.nn.utils.rnn.pad_packed_sequence(packed,
                                                           batch_first=True,
                                                           total_length=total_length)

        return self.linear(hidden)

    #Assisted by ChatGPT
    def generator(self, start_char, max_len=200):
        """Greedily extend ``start_char`` one character at a time.

        Args:
            start_char: prompt string, e.g. ``'1+1='``.
            max_len: maximum number of characters to generate after the prompt.

        Returns:
            List of tokens: the prompt followed by the generated characters.
            Generation stops at the first predicted ``'<eos>'``, which is never
            included, or after ``max_len`` generated characters.

        Side effects:
            Switches the module to evaluation mode (``self.eval()``).
        """
        self.eval()
        pad_id = char_to_id['<pad>']
        eos_id = char_to_id['<eos>']
        device = next(self.parameters()).device

        # NOTE: characters missing from the vocabulary are silently mapped to
        # '<pad>' (best effort); the prediction for such a prompt is unreliable.
        generated_ids = [char_to_id.get(c, pad_id) for c in start_char]

        with torch.no_grad():
            for _ in range(max_len):
                # Re-encode the whole sequence and read the logits of its last step.
                input_seq = torch.tensor([generated_ids], dtype=torch.long, device=device)
                input_len = torch.tensor([len(generated_ids)], dtype=torch.long)  # stays on CPU
                logits = self.encoder(input_seq, input_len)[0, -1, :]
                next_char_id = torch.argmax(logits, dim=-1).item()

                # Every prediction, including the first one, may end the sequence.
                if next_char_id == eos_id:
                    break

                generated_ids.append(next_char_id)

        # Convert IDs back to characters
        return [id_to_char[id_] for id_ in generated_ids]


In [None]:
# Fix PyTorch's random seed so repeated runs start from the same random state.
torch.manual_seed(2)

<torch._C.Generator at 0x7de0993ebd90>

In [None]:
# Choose the GPU when one is available. The model is only constructed here;
# it is moved to `device` later, in the training cell.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = CharRNN(vocab_size, embed_dim, hidden_dim)

In [None]:
# Adam updates all model parameters; the loss skips targets equal to '<pad>'.
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = torch.nn.CrossEntropyLoss(ignore_index=char_to_id['<pad>'])

# Training
1. The outer `for` loop controls the `epoch`
    1. The inner `for` loop uses `data_loader` to retrieve batches.
        1. Pass the batch to the `model` for training.
        2. Compare the predicted results `batch_pred_y` with the true labels `batch_y` using Cross Entropy to calculate the loss `loss`
        3. Use `loss.backward` to automatically compute the gradients.
        4. Use `torch.nn.utils.clip_grad_value_` to clamp every gradient value to the range [`-grad_clip`, `grad_clip`].
        5. Use `optimizer.step()` to update the model (backpropagation).
2.  After every `5000` batches, output the current loss and a few sample predictions to monitor whether training is converging.

In [None]:
from tqdm import tqdm

In [None]:
#Assisted by ChatGPT
# Global iteration counter (used for the periodic debug printout) and the ids
# of the two tokens that delimit the answer span.
i = 0
equal_token_id, eos_token_id = char_to_id['='], char_to_id['<eos>']
model.train()

In [None]:
# Train the model on the answer span only (everything after '='), and after
# every epoch report the exact-match accuracy on the evaluation split.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


def _answer_mask(batch_x, batch_seq_len, equal_id):
    """Mask over the shifted targets ``batch_x[:, 1:]`` marking the answer span.

    Position ``j`` of a row is True when the first '=' of that row is at an
    index <= j and ``j < seq_len - 1``; i.e. the target at ``j`` is an answer
    character or the closing '<eos>'. Rows without '=' are entirely False.

    Args:
        batch_x: long tensor (batch, time) of padded token ids.
        batch_seq_len: long tensor (batch,) with the true row lengths.
        equal_id: id of the '=' token.

    Returns:
        Bool tensor of shape (batch, time - 1) on the device of ``batch_x``.
    """
    # cumsum > 0 is True from the first '=' onwards.
    seen_equal = (batch_x == equal_id).cumsum(dim=1) > 0
    positions = torch.arange(batch_x.size(1) - 1, device=batch_x.device).unsqueeze(0)
    last_target = (batch_seq_len.to(batch_x.device) - 1).unsqueeze(1)
    return seen_equal[:, :-1] & (positions < last_target)


# `epochs` is the total number of epochs; range() excludes its end, hence + 1.
for epoch in range(1, epochs + 1):
    bar = tqdm(dl_train, desc=f"Train epoch {epoch}")
    total_loss = 0.0
    num_updates = 0  # batches that actually contributed a loss
    for batch_x, batch_seq_len in bar:
        batch_x = batch_x.to(device)
        batch_seq_len = batch_seq_len.to("cpu", torch.int64)
        optimizer.zero_grad()

        # Teacher forcing: the output at step t is scored against the input at t + 1.
        logits = model(batch_x, batch_seq_len)[:, :-1, :]  # Exclude last time step
        targets = batch_x[:, 1:]                           # Exclude first character

        # Only positions after '=' contribute to the loss.
        mask = _answer_mask(batch_x, batch_seq_len, equal_token_id)
        if not mask.any():
            continue  # Skip if no valid targets

        # Boolean indexing flattens batch and time in row-major order.
        loss = criterion(logits[mask], targets[mask])
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_value_(model.parameters(), grad_clip)

        # Update parameters
        optimizer.step()

        loss_value = loss.item()
        total_loss += loss_value
        num_updates += 1
        i += 1  # Increment iteration counter
        if i % 5000 == 0:
            # Debugging outputs
            print(f"\n[Debug] Iteration {i}")
            print(f"Current Loss: {loss_value:.4f}")

            # Generate sample predictions
            model.eval()
            sample_inputs = ['1+1=', '12+34=', '(7*8)+9=']
            print("Sample Predictions:")
            for expr in sample_inputs:
                # eval() is applied only to the hard-coded literals above,
                # never to external input.
                expected_output = expr + str(eval(expr[:-1]))
                predicted_output = ''.join(model.generator(expr))
                print(f"Input: {expr}")
                print(f"Expected Output: {expected_output}")
                print(f"Predicted Output: {predicted_output}\n")
            model.train()

        # Update progress bar
        if i % 50 == 0:
            bar.set_postfix(loss=loss_value)

    # Average over the batches that produced a loss (skipped batches excluded).
    avg_loss = total_loss / max(num_updates, 1)
    print(f"Epoch {epoch} Training Loss: {avg_loss}")

    # Evaluation: exact match over the answer span. The predictions are made
    # with the ground-truth sequence as input (teacher forcing), not by
    # free-running generation.
    model.eval()
    matched = 0
    total = 0
    with torch.no_grad():
        for batch_x, batch_seq_len in dl_eval:
            batch_x = batch_x.to(device)
            batch_seq_len = batch_seq_len.to("cpu", torch.int64)

            # Shift predictions and targets
            preds = model(batch_x, batch_seq_len).argmax(dim=2)[:, :-1]
            targets = batch_x[:, 1:]

            mask = _answer_mask(batch_x, batch_seq_len, equal_token_id)
            has_answer = mask.any(dim=1)
            # A row matches when every position inside its answer span agrees.
            exact = ((preds == targets) | ~mask).all(dim=1)

            matched += (exact & has_answer).sum().item()
            total += has_answer.sum().item()

    accuracy = matched / total if total > 0 else 0
    print(f"Validation EM accuracy: {accuracy:.4f}")

    model.train()

Train epoch 1:  14%|█▎        | 5005/37020 [01:18<08:45, 60.94it/s, loss=0.662]


[Debug] Iteration 5000
Current Loss: 0.6620
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=3

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=48

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=75



Train epoch 1:  27%|██▋       | 10009/37020 [02:36<07:08, 63.01it/s, loss=0.579]


[Debug] Iteration 10000
Current Loss: 0.5795
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=3

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=65



Train epoch 1:  41%|████      | 15007/37020 [03:55<06:00, 61.10it/s, loss=0.422]


[Debug] Iteration 15000
Current Loss: 0.4223
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=2

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=65



Train epoch 1:  54%|█████▍    | 20012/37020 [05:12<04:28, 63.41it/s, loss=0.371]


[Debug] Iteration 20000
Current Loss: 0.3705
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=2

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=71



Train epoch 1:  68%|██████▊   | 25010/37020 [06:31<03:11, 62.58it/s, loss=0.4]


[Debug] Iteration 25000
Current Loss: 0.3998
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=3

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=73



Train epoch 1:  81%|████████  | 30007/37020 [07:49<01:56, 60.35it/s, loss=0.332]


[Debug] Iteration 30000
Current Loss: 0.3324
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=2

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=75



Train epoch 1:  95%|█████████▍| 35012/37020 [09:07<00:31, 63.32it/s, loss=0.351]


[Debug] Iteration 35000
Current Loss: 0.3509
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=3

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=63



Train epoch 1: 100%|██████████| 37020/37020 [09:38<00:00, 64.01it/s, loss=0.307]


Epoch 1 Training Loss: 0.4916629414898779
Validation EM accuracy: 0.5833


Train epoch 2:   8%|▊         | 2990/37020 [00:46<09:23, 60.37it/s, loss=0.327]


[Debug] Iteration 40000
Current Loss: 0.3266
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=2

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=63



Train epoch 2:  22%|██▏       | 7988/37020 [02:03<07:38, 63.26it/s, loss=0.301]


[Debug] Iteration 45000
Current Loss: 0.3008
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=2

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=63



Train epoch 2:  35%|███▌      | 12986/37020 [03:21<06:32, 61.19it/s, loss=0.257]


[Debug] Iteration 50000
Current Loss: 0.2571
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=2

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=63



Train epoch 2:  49%|████▊     | 17991/37020 [04:38<05:01, 63.06it/s, loss=0.288]


[Debug] Iteration 55000
Current Loss: 0.2885
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=2

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=65



Train epoch 2:  62%|██████▏   | 22989/37020 [05:56<03:43, 62.85it/s, loss=0.298]


[Debug] Iteration 60000
Current Loss: 0.2976
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=2

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=63



Train epoch 2:  76%|███████▌  | 27992/37020 [07:13<02:23, 62.97it/s, loss=0.253]


[Debug] Iteration 65000
Current Loss: 0.2530
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=2

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=63



Train epoch 2:  89%|████████▉ | 32990/37020 [08:30<01:04, 62.66it/s, loss=0.177]


[Debug] Iteration 70000
Current Loss: 0.1772
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=2

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=63



Train epoch 2: 100%|██████████| 37020/37020 [09:33<00:00, 64.52it/s, loss=0.24]


Epoch 2 Training Loss: 0.2688439503919911
Validation EM accuracy: 0.6852


Train epoch 3:   3%|▎         | 967/37020 [00:15<09:25, 63.79it/s, loss=0.255]


[Debug] Iteration 75000
Current Loss: 0.2555
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=2

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=65



Train epoch 3:  16%|█▌        | 5965/37020 [01:32<08:13, 62.97it/s, loss=0.186]


[Debug] Iteration 80000
Current Loss: 0.1860
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=2

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=65



Train epoch 3:  30%|██▉       | 10970/37020 [02:50<07:05, 61.22it/s, loss=0.174]


[Debug] Iteration 85000
Current Loss: 0.1735
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=2

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=65



Train epoch 3:  43%|████▎     | 15967/37020 [04:08<05:37, 62.32it/s, loss=0.24]


[Debug] Iteration 90000
Current Loss: 0.2402
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=2

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=65



Train epoch 3:  57%|█████▋    | 20965/37020 [05:26<04:20, 61.61it/s, loss=0.212]


[Debug] Iteration 95000
Current Loss: 0.2123
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=2

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=65



Train epoch 3:  70%|███████   | 25970/37020 [06:44<02:58, 61.93it/s, loss=0.269]


[Debug] Iteration 100000
Current Loss: 0.2692
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=2

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=65



Train epoch 3:  84%|████████▎ | 30968/37020 [08:01<01:35, 63.65it/s, loss=0.169]


[Debug] Iteration 105000
Current Loss: 0.1691
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=2

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=65



Train epoch 3:  97%|█████████▋| 35966/37020 [09:19<00:17, 61.89it/s, loss=0.223]


[Debug] Iteration 110000
Current Loss: 0.2226
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=2

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=65



Train epoch 3: 100%|██████████| 37020/37020 [09:35<00:00, 64.28it/s, loss=0.224]


Epoch 3 Training Loss: 0.21258604078398144
Validation EM accuracy: 0.7347


Train epoch 4:  11%|█         | 3949/37020 [01:01<08:44, 63.10it/s, loss=0.183]


[Debug] Iteration 115000
Current Loss: 0.1829
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=2

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=73



Train epoch 4:  24%|██▍       | 8947/37020 [02:19<07:25, 63.08it/s, loss=0.173]


[Debug] Iteration 120000
Current Loss: 0.1728
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=2

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=65



Train epoch 4:  38%|███▊      | 13952/37020 [03:36<06:05, 63.08it/s, loss=0.224]


[Debug] Iteration 125000
Current Loss: 0.2243
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=4

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=65



Train epoch 4:  51%|█████     | 18950/37020 [04:54<04:50, 62.26it/s, loss=0.165]


[Debug] Iteration 130000
Current Loss: 0.1649
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=2

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=65



Train epoch 4:  65%|██████▍   | 23948/37020 [06:11<03:24, 63.95it/s, loss=0.152]


[Debug] Iteration 135000
Current Loss: 0.1515
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=2

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=65



Train epoch 4:  78%|███████▊  | 28946/37020 [07:29<02:11, 61.36it/s, loss=0.149]


[Debug] Iteration 140000
Current Loss: 0.1492
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=4

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=65



Train epoch 4:  92%|█████████▏| 33951/37020 [08:46<00:48, 63.01it/s, loss=0.143]


[Debug] Iteration 145000
Current Loss: 0.1432
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=4

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=73



Train epoch 4: 100%|██████████| 37020/37020 [09:34<00:00, 64.43it/s, loss=0.209]


Epoch 4 Training Loss: 0.17940179156124397
Validation EM accuracy: 0.7531


Train epoch 5:   5%|▌         | 1926/37020 [00:29<09:22, 62.41it/s, loss=0.168]


[Debug] Iteration 150000
Current Loss: 0.1677
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=2

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=73



Train epoch 5:  19%|█▊        | 6931/37020 [01:47<08:08, 61.56it/s, loss=0.0722]


[Debug] Iteration 155000
Current Loss: 0.0722
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=2

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=73



Train epoch 5:  32%|███▏      | 11928/37020 [03:05<06:38, 62.90it/s, loss=0.132]


[Debug] Iteration 160000
Current Loss: 0.1324
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=2

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=65



Train epoch 5:  46%|████▌     | 16926/37020 [04:23<05:29, 60.97it/s, loss=0.117]


[Debug] Iteration 165000
Current Loss: 0.1165
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=2

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=65



Train epoch 5:  59%|█████▉    | 21931/37020 [05:41<04:08, 60.65it/s, loss=0.185]


[Debug] Iteration 170000
Current Loss: 0.1848
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=2

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=65



Train epoch 5:  73%|███████▎  | 26929/37020 [06:59<02:39, 63.38it/s, loss=0.164]


[Debug] Iteration 175000
Current Loss: 0.1636
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=2

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=65



Train epoch 5:  86%|████████▌ | 31927/37020 [08:16<01:22, 61.67it/s, loss=0.155]


[Debug] Iteration 180000
Current Loss: 0.1548
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=2

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=65



Train epoch 5: 100%|█████████▉| 36932/37020 [09:34<00:01, 62.93it/s, loss=0.209]


[Debug] Iteration 185000
Current Loss: 0.2090
Sample Predictions:
Input: 1+1=
Expected Output: 1+1=2
Predicted Output: 1+1=2

Input: 12+34=
Expected Output: 12+34=46
Predicted Output: 12+34=46

Input: (7*8)+9=
Expected Output: (7*8)+9=65
Predicted Output: (7*8)+9=71



Train epoch 5: 100%|██████████| 37020/37020 [09:36<00:00, 64.25it/s, loss=0.0706]


Epoch 5 Training Loss: 0.15543517129408194
Validation EM accuracy: 0.8047


# Generation
Use `model.generator` and provide an initial character to automatically generate a sequence.

In [None]:
# Move the trained model back to the CPU and complete a single equation.
model = model.cpu()
prediction = "".join(model.generator('1+1='))
print(prediction)

1+1=2


In [None]:
# Complete several prompts. '100:4=' contains ':', which is not in the
# vocabulary, so the generator encodes that character as '<pad>'.
expressions = ['12+34=', '(7*8)+9=', '100:4=', '5*3=']
for expr in expressions:
    generated_tokens = model.generator(expr)
    output = "".join(generated_tokens)
    print(f"Input: {expr} Output: {output}")

Input: 12+34= Output: 12+34=46
Input: (7*8)+9= Output: (7*8)+9=71
Input: 100:4= Output: 100<pad>4=18
Input: 5*3= Output: 5*3=15
