In [None]:
# Environment setup: upgrade `accelerate` and `datasets` in the runtime via pip
# before anything below imports them.
!pip install -U accelerate
!pip install -U datasets

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset CSV read later lives under this mount point.
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the ArtEmis release CSV from the mounted Drive and keep a fixed-size random subset.
# The sample is seeded so that a kernel restart reproduces the exact same rows
# (an unseeded `.sample()` would draw a different subset on every run).
DATA_CSV = "/content/drive/MyDrive/CS 2756 Project/artemis_dataset_release_v0.csv"
SAMPLE_SIZE = 5000
SAMPLE_SEED = 42

data = pd.read_csv(DATA_CSV).sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED)

In [None]:
# Show the sampled DataFrame as this cell's output.
data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Build the emotion <-> integer id mappings.
# The class names are sorted first so the mapping is the same regardless of the
# order in which rows happen to appear in `data` (`unique()` alone returns values
# in first-appearance order, which changes with the sample).
emotion_names = sorted(data["emotion"].unique())
label2id = {label: i for i, label in enumerate(emotion_names)}
id2label = {i: label for label, i in label2id.items()}

# Integer class id per row; this is the target column the model trains on.
data["labels"] = data["emotion"].map(label2id)

In [None]:
# Split once into (train_full, test), then carve the validation set out of train_full.
# Both splits live in one cell and the intermediate frame has its own name, so
# re-running the cell always yields the same three frames (rebinding `train` from
# itself in a separate cell would shrink it further on every re-run).
# No test_size is given for the first split, so scikit-learn's default fraction applies.
train_full, test = train_test_split(data, random_state=42)
train, valid = train_test_split(train_full, test_size=0.2, random_state=42)

In [None]:
import datasets
from datasets import Dataset, DatasetDict

# Convert each pandas split into a Hugging Face `Dataset`, in train / validation / test order.
train_ds, valid_ds, test_ds = (
    Dataset.from_pandas(frame) for frame in (train, valid, test)
)

# Bundle the three splits under the keys used by the training and evaluation cells.
dataset = DatasetDict(
    {
        'train': train_ds,
        'validation': valid_ds,
        'test': test_ds,
    }
)

In [None]:
# Show the DatasetDict (its splits and their columns) as this cell's output.
dataset

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Checkpoint id shared by the tokenizer and the model; the DistilBERT id is kept
# as a commented-out alternative.
pretrained_model = "google/rembert"  # "distilbert/distilbert-base-uncased"

# Tokenizer for the chosen checkpoint, requested with lower-casing enabled.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model,
    do_lower_case=True,
)

# Collator that pads each batch using this tokenizer.
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
# Install the `evaluate` package, which is imported afterwards to load the accuracy metric.
!pip install evaluate

In [None]:
import evaluate

In [None]:
import torch

def tokenize_function(examples):
    """Tokenize the 'utterance' field of a batch of examples.

    Args:
        examples: mapping with an 'utterance' entry holding the text(s) to encode.

    Returns:
        Whatever the module-level `tokenizer` returns for those texts, with every
        sequence truncated and padded to a fixed length of 128 tokens.
    """
    utterances = examples['utterance']
    encoding_options = dict(padding='max_length', truncation=True, max_length=128)
    return tokenizer(utterances, **encoding_options)


In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, AutoModelForSequenceClassification

# NOTE(review): this rebinds `tokenizer` without the `do_lower_case=True` used when it was
# first created, while `data_collator` still holds the earlier instance. Kept as-is to
# preserve the current tokenization; confirm which configuration is intended.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

# Size the classification head from the label mapping instead of a hard-coded 9, and
# hand both mappings to the model so its config (and predictions) carry the emotion names.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

# Tokenize every split in batches, then expose only the tensors the model consumes.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load the "accuracy" metric via `evaluate`; compute_metrics calls its .compute() method.
metrics = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score a (logits, labels) pair with the module-level `metrics` object.

    Args:
        eval_pred: a two-item iterable of (logits, labels).

    Returns:
        The result of `metrics.compute`, where each prediction is the index of the
        largest logit along the last axis.
    """
    raw_scores, gold_labels = eval_pred
    predicted_ids = np.argmax(raw_scores, axis=-1)
    return metrics.compute(references=gold_labels, predictions=predicted_ids)

In [None]:
import inspect

# The per-epoch evaluation option is spelled `eval_strategy` in newer transformers
# releases and `evaluation_strategy` in older ones. The notebook does not pin a
# version, so pick whichever keyword the installed TrainingArguments accepts.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir='./results',          # Output directory for model predictions and checkpoints
    learning_rate=3e-5,              # Learning rate
    save_strategy="epoch",           # Checkpoint at the end of each epoch (same cadence as evaluation)
    load_best_model_at_end=True,     # Reload the best checkpoint once training finishes
    logging_steps=1,                 # Log every optimisation step
    per_device_train_batch_size=4,   # Batch size for training
    per_device_eval_batch_size=4,    # Batch size for evaluation
    num_train_epochs=3,              # Number of training epochs
    weight_decay=0.01,               # Strength of weight decay
    **{_eval_strategy_kwarg: 'epoch'},  # Evaluation is done at the end of each epoch
)

# Train on the train split and evaluate on the validation split each epoch.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

trainer.train()

In [None]:
# Final check: run the trained model over the held-out test split and show its metrics.
test_split = tokenized_datasets["test"]
trainer.evaluate(eval_dataset=test_split)