In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from datasets import Dataset

from sklearn.metrics import accuracy_score, f1_score
import torch.nn.functional as F

from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import torch

In [2]:
model_ckpt = "bert-base-uncased"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

# Load and analyse data

In [3]:
# data = {}
# with open("./data/random-expressions-partition-8-4-2023.txt", "r") as f:
#     data["sums"] = f.read().split("\n")
# f.close()
# with open("./data/random-answers-partition-8-4-2023.txt", "r") as f:
#     data["answers"] = f.read().split("\n")
# f.close()

In [4]:
# df = pd.DataFrame(data)
# print(df.head())

In [5]:
# df = df[df["answers"] != "$Aborted"]
# print(df.head())

In [6]:
token2int = {token:i for i,token in enumerate(['Pi','Catalan','EulerGamma','Sqrt','Log'])}
print(token2int)

{'Pi': 0, 'Catalan': 1, 'EulerGamma': 2, 'Sqrt': 3, 'Log': 4}


In [7]:
def sum2tokens(mathematica_sum):
    """
    Example:
        '{{}, {-2, -2, -10, -8, -9, -8, -10, -5, -6, -4}}\n' ---> ['#', '/', '-2', '-2', '-10', '-8', '-9', '-8', '-10', '-5', '-6', '-4']

    """
    LHS, RHS = mathematica_sum.split("}, {")
    LHS, RHS = LHS.lstrip("{"), RHS.rstrip("\n").rstrip("}")
    if len(LHS) == 0:
        LHS = ["#"]
    else:
        LHS = LHS.split(", ")
    RHS = RHS.split(", ")    
    return LHS + ["/"] + RHS
sum2tokens('{{}, {-2, -2, -10, -8, -9, -8, -10, -5, -6, -4}}\n')

['#', '/', '-2', '-2', '-10', '-8', '-9', '-8', '-10', '-5', '-6', '-4']

In [8]:
df = pd.DataFrame(columns=["sum", "label"])
dataset_size = 600_000 # sum(1 for _ in open("./data/random-expressions-partition-8-4-2023.txt"))
count = 0
for S, answer in tqdm(zip(open("./data/random-expressions-partition-8-4-2023.txt"), open("./data/random-answers-partition-8-4-2023.txt")), total=dataset_size):
    if count > dataset_size: break
    count += 1
    for token in token2int:
        if token in answer:
            df.loc[len(df)] = [" ".join(sum2tokens(S)), token2int[token]]
            
print(df.head())

600001it [2:37:11, 63.62it/s]                                                                                          


                                               sum  label
0   # / -4/5 0 1/5 5/3 4/3 -4/3 -1/3 -3/5 -3/4 1/3      0
1   # / -4/5 0 1/5 5/3 4/3 -4/3 -1/3 -3/5 -3/4 1/3      3
2   # / -4/5 0 1/5 5/3 4/3 -4/3 -1/3 -3/5 -3/4 1/3      4
3  # / -5/3 3/4 -2/3 3/2 -1/2 -1/4 4/3 -2/5 1/2 -5      0
4  # / -5/3 3/4 -2/3 3/2 -1/2 -1/4 4/3 -2/5 1/2 -5      2


# Tokenize time

In [9]:
dataset = Dataset.from_pandas(df)
print(dataset)

Dataset({
    features: ['sum', 'label', '__index_level_0__'],
    num_rows: 510356
})


In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
def tokenize(batch):
    return tokenizer(batch["sum"], padding=True, truncation=True)

In [11]:
dataset_encoded = dataset.map(tokenize, batched=True)
print(dataset_encoded)

Map:   0%|          | 0/510356 [00:00<?, ? examples/s]

Dataset({
    features: ['sum', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 510356
})


In [12]:
dataset_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])
print(dataset_encoded)

Dataset({
    features: ['sum', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 510356
})


In [13]:
dataset_encoded.push_to_hub("Dragonoverlord3000/sum_classifier_dataset")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/511 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/560 [00:00<?, ?B/s]

Updating downloaded metadata with the new split.


In [14]:
p

NameError: name 'p' is not defined

# Model

In [None]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels).to(device)
print(model)

In [None]:
# The training arguments - note the `push_to_hub`
batch_size = 32
logging_steps = batch_size
model_name = "sum-classifier"
training_args = TrainingArguments(output_dir=model_name,
                                 num_train_epochs=2,
                                 learning_rate=2e-5,
                                 per_device_train_batch_size=batch_size,
                                 per_gpu_eval_batch_size=batch_size,
                                 weight_decay=0.01,
                                 evaluation_strategy="epoch",
                                 disable_tqdm=False,
                                 logging_steps=logging_steps,
                                 push_to_hub=True,
                                 log_level="error")

In [None]:
# Define the trainer
trainer = Trainer(model=model, args=training_args,
                 compute_metrics=compute_metrics,
                 train_dataset=dataset_encoded,
                 eval_dataset=dataset_encoded,
                 tokenizer=tokenizer)

In [None]:
# trainer.push_to_hub(commit_message="Training Complete")