# In [None]:
import numpy as np
import evaluate

from datasets import DatasetDict, Dataset
from functions import get_dataset
from constants import RANDOM_STATE
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the raw data and hold out 20% of the rows for validation
df = get_dataset('dataset2.csv')
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# Load the pretrained checkpoint: its tokenizer plus a 3-label classification model
checkpoint = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

# Wrap both pandas splits as HF datasets (without carrying over the pandas index)
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
dataset = DatasetDict({"train": train_dataset, "val": val_dataset})


def _tokenize_batch(batch):
    """Tokenize the "text" column of a batch, padding/truncating to 128 tokens."""
    return tokenizer(
        batch["text"],
        padding="max_length",
        max_length=128,
        truncation=True,
    )


# Tokenize every split in batches
tokenized_datasets = dataset.map(_tokenize_batch, batched=True)

# Accuracy metric used to score predictions during evaluation
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions with the accuracy metric.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class of each row
        of logits compared against the reference labels.
    """
    scores, gold_labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=gold_labels)

# Training configuration: evaluate once per epoch, batches of 8 per device
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version before changing it.
training_config = {
    "output_dir": "bert-savings",
    "evaluation_strategy": "epoch",
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
}
training_args = TrainingArguments(**training_config)

# Wire the model, the tokenized splits and the metric function into the HF Trainer
trainer_config = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_datasets["train"],
    "eval_dataset": tokenized_datasets["val"],
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_config)

# Run fine-tuning
trainer.train()

# Persist the fine-tuned model and its tokenizer side by side in one directory
save_dir = 'models/bert'
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)