In [16]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from datasets import Dataset

from sklearn.metrics import accuracy_score, f1_score
import torch.nn.functional as F

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import torch

In [17]:
model_ckpt = "bert-base-uncased"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

# Load and analyse data

In [18]:
data = {}
with open("./data/random-expressions-partition-8-4-2023.txt", "r") as f:
    data["sums"] = f.read().split("\n")
f.close()
with open("./data/random-answers-partition-8-4-2023.txt", "r") as f:
    data["answers"] = f.read().split("\n")
f.close()

In [19]:
df = pd.DataFrame(data)
print(df.head())

                                                sums   answers
0  {{}, {4/3, -3/4, -3/4, 5/2, 4/3, 5/4, -2, 1/4,...  $Aborted
1  {{}, {4/3, -5/4, -1/2, -3/2, -3/5, -3/5, -5, -...  $Aborted
2  {{}, {1/5, 4/5, -1/2, -3/2, 4/3, 2/5, -5/2, -1...  $Aborted
3  {{}, {2/3, 5/4, -3/5, 4/3, -1, 4/3, -5/4, -5/2...  $Aborted
4  {{}, {-3/4, -5, -5, 2/5, 3/4, 1/4, -4/3, 2/5, ...  $Aborted


In [20]:
df = df[df["answers"] != "$Aborted"]
print(df.head())

                                                 sums  \
8   {{}, {-4/5, 0, 1/5, 5/3, 4/3, -4/3, -1/3, -3/5...   
16  {{}, {-5/3, 3/4, -2/3, 3/2, -1/2, -1/4, 4/3, -...   
17  {{}, {1/5, 5/2, 0, -3/5, -5/2, -2, -3/4, -5/3,...   
18  {{}, {-3/2, -1/4, 1/2, 5/2, -1/3, -1, 2/3, 4/3...   
20  {{}, {-3/5, 3/5, -1/4, 5/4, -5, 5/3, -3/2, -4,...   

                                              answers  
8   (-27*(-51708617291610*Sqrt[5] - 42718013161472...  
16  -17280*(773750880083/3307889451566700 + (-137/...  
17  -1/33984629258580000*(4155111422336236*Sqrt[5]...  
18  (24*(35863559070 - 10166059008*Pi + 3223382121...  
20  -2400*(-36799338815694409/12825799082188092000...  


In [21]:
token2int = {token:i for i,token in enumerate(['Pi','Catalan','EulerGamma','Sqrt','Log'])}
print(token2int)

{'Pi': 0, 'Catalan': 1, 'EulerGamma': 2, 'Sqrt': 3, 'Log': 4}


In [22]:
def sum2tokens(mathematica_sum):
    """
    Example:
        '{{}, {-2, -2, -10, -8, -9, -8, -10, -5, -6, -4}}\n' ---> ['#', '/', '-2', '-2', '-10', '-8', '-9', '-8', '-10', '-5', '-6', '-4']

    """
    LHS, RHS = mathematica_sum.split("}, {")
    LHS, RHS = LHS.lstrip("{"), RHS.rstrip("\n").rstrip("}")
    if len(LHS) == 0:
        LHS = ["#"]
    else:
        LHS = LHS.split(", ")
    RHS = RHS.split(", ")    
    return LHS + ["/"] + RHS
sum2tokens('{{}, {-2, -2, -10, -8, -9, -8, -10, -5, -6, -4}}\n')

['#', '/', '-2', '-2', '-10', '-8', '-9', '-8', '-10', '-5', '-6', '-4']

In [23]:
new_df = pd.DataFrame(columns=["sum", "label"])
for pointer in range(10):
    for token in token2int:
        if token in df.iloc[pointer]["answers"]:
            new_df.loc[len(new_df)] = [" ".join(sum2tokens(df.iloc[pointer]["sums"])), token2int[token]]
            
print(new_df.head())

                                                 sum  label
0  [#, /, -4/5, 0, 1/5, 5/3, 4/3, -4/3, -1/3, -3/...      0
1  [#, /, -4/5, 0, 1/5, 5/3, 4/3, -4/3, -1/3, -3/...      3
2  [#, /, -4/5, 0, 1/5, 5/3, 4/3, -4/3, -1/3, -3/...      4
3  [#, /, -5/3, 3/4, -2/3, 3/2, -1/2, -1/4, 4/3, ...      0
4  [#, /, -5/3, 3/4, -2/3, 3/2, -1/2, -1/4, 4/3, ...      2


# Tokenize time

In [24]:
dataset = Dataset.from_pandas(new_df)
print(dataset)

Dataset({
    features: ['sum', 'label', '__index_level_0__'],
    num_rows: 34
})


In [25]:
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
def tokenize(batch):
    return tokenizer(batch["sum"], padding=True, truncation=True)

In [26]:
dataset_encoded = dataset.map(tokenize, batched=True)
print(dataset_encoded)

Map:   0%|          | 0/34 [00:00<?, ? examples/s]

TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]

In [None]:
dataset_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])
print(dataset_encoded)

# Model

In [None]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels).to(device)
print(model)

In [None]:
# The training arguments - note the `push_to_hub`
batch_size = 32
logging_steps = batch_size
model_name = "sum-classifier"
training_args = TrainingArguments(output_dir=model_name,
                                 num_train_epochs=2,
                                 learning_rate=2e-5,
                                 per_device_train_batch_size=batch_size,
                                 per_gpu_eval_batch_size=batch_size,
                                 weight_decay=0.01,
                                 evaluation_strategy="epoch",
                                 disable_tqdm=False,
                                 logging_steps=logging_steps,
                                 push_to_hub=True,
                                 log_level="error")

In [None]:
# Define the trainer
trainer = Trainer(model=model, args=training_args,
                 compute_metrics=compute_metrics,
                 train_dataset=dataset_encoded,
                 eval_dataset=dataset_encoded,
                 tokenizer=tokenizer)

In [None]:
# trainer.push_to_hub(commit_message="Training Complete")