In [1]:
import pandas as pd
import numpy as np
import time
from datasets import Dataset

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Fix PyTorch's global RNG seed so that torch-side randomness is repeatable across runs.
torch.manual_seed(1984)

<torch._C.Generator at 0x2271c4c4ef0>

In [3]:
# Prefer the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# On a CUDA machine this prints "cuda:0"; on a CPU-only machine it prints "cpu".
print(device)

cuda:0


# Load Dataset

In [4]:
dataset_dir = './data/tweetval_emotion/'

# Each *_text.txt file holds one tweet per line. '§' is used as a separator that is not
# expected to occur in the tweets, so every line lands in the single `text` column.
# '§' encodes to two bytes in UTF-8, which the C parser does not support; naming the python
# engine explicitly avoids the ParserWarning pandas emits when it falls back on its own.
df_train_text = pd.read_csv(dataset_dir+'train_text.txt', header=None, delimiter='§', names=['text'], engine='python')
df_train_labels = pd.read_csv(dataset_dir+'train_labels.txt', header=None, names=['label'])

df_val_text = pd.read_csv(dataset_dir+'val_text.txt', header=None, delimiter='§', names=['text'], engine='python')
df_val_labels = pd.read_csv(dataset_dir+'val_labels.txt', header=None, names=['label'])

df_test_text = pd.read_csv(dataset_dir+'test_text.txt', header=None, delimiter='§', names=['text'], engine='python')
df_test_labels = pd.read_csv(dataset_dir+'test_labels.txt', header=None, names=['label'])

# `join` aligns on the row index, so a text file that parsed to a different number of rows
# than its label file would silently produce misaligned or missing labels. Fail loudly instead.
for _split, _texts, _labels in (
    ('train', df_train_text, df_train_labels),
    ('val', df_val_text, df_val_labels),
    ('test', df_test_text, df_test_labels),
):
    assert len(_texts) == len(_labels), (
        f"{_split}: parsed {len(_texts)} texts but {len(_labels)} labels"
    )

df_train = df_train_text.join(df_train_labels)
df_val = df_val_text.join(df_val_labels)
df_test = df_test_text.join(df_test_labels)

# mapping.txt is tab-separated: column 0 is the integer class id, column 1 the label name.
df_labels = pd.read_csv(dataset_dir+'mapping.txt', header=None, delimiter='\t')

  df_train_text = pd.read_csv('./data/tweetval_emotion/train_text.txt', header=None, delimiter='§', names=['text'])
  df_val_text = pd.read_csv(dataset_dir+'val_text.txt', header=None, delimiter='§', names=['text'])
  df_test_text = pd.read_csv(dataset_dir+'test_text.txt', header=None, delimiter='§', names=['text'])


In [5]:
# Wrap the train and validation frames as `datasets.Dataset` objects, in that order.
tweetval_train, tweetval_val = (
    Dataset.from_pandas(frame) for frame in (df_train, df_val)
)

In [6]:
# Peek at the first training example to check that the `text` and `label` fields are present.
tweetval_train[0]

{'text': "“Worry is a down payment on a problem you may never have'. \xa0Joyce Meyer.  #motivation #leadership #worry ",
 'label': 2}

# Preprocess

In [7]:
from transformers import AutoTokenizer
# Tokenizer for the same DistilBERT checkpoint that is fine-tuned in the Train section.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [8]:
def preprocess_function(examples, max_length=512):
    """Tokenize a batch of examples for the classifier.

    Args:
        examples: mapping with a "text" entry holding the strings to tokenize
            (this is what `Dataset.map(..., batched=True)` passes in).
        max_length: upper bound on tokens per example, special tokens included.
            Defaults to 512, DistilBERT's maximum sequence length.

    Returns:
        Whatever the notebook-level `tokenizer` returns for the batch.
    """
    # `truncation=True` alone is a no-op when the tokenizer has no known maximum length
    # (it only logs "Default to no truncation"), so the limit is passed explicitly.
    return tokenizer(examples["text"], truncation=True, max_length=max_length)

In [9]:
# Tokenize the train split first, then the validation split, one batch at a time.
tokenized_tweetval_train, tokenized_tweetval_val = (
    split.map(preprocess_function, batched=True)
    for split in (tweetval_train, tweetval_val)
)

Map:   0%|          | 0/3257 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/374 [00:00<?, ? examples/s]

In [10]:
from transformers import DataCollatorWithPadding
# Collator that pads examples when batches are assembled; the tokenizer call in
# preprocess_function requests no padding, so sequences reach it at their own lengths.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Evaluate

In [11]:
import evaluate
# Accuracy metric object; compute_metrics calls its `.compute(predictions=..., references=...)`.
accuracy = evaluate.load("accuracy")

In [12]:
def compute_metrics(eval_pred):
    """Score one evaluation pass.

    `eval_pred` unpacks into (scores, reference labels). The predicted class for each
    row is the index of its largest score along axis 1, and the result is whatever the
    notebook-level `accuracy` metric computes for those predictions.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    result = accuracy.compute(predictions=predicted_ids, references=references)
    return result

# Train

In [13]:
# Column 0 of the mapping frame holds the class id, column 1 the label name.
id_label_dict = {class_id: name for class_id, name in zip(df_labels[0], df_labels[1])}

# Forward map is a plain copy; reverse map swaps keys and values.
id2label = dict(id_label_dict)
label2id = dict(zip(id_label_dict.values(), id_label_dict.keys()))

print("id2label:", id2label)
print("label2id:", label2id)

id2label: {0: 'anger', 1: 'joy', 2: 'optimism', 3: 'sadness'}
label2id: {'anger': 0, 'joy': 1, 'optimism': 2, 'sadness': 3}


In [14]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Pretrained DistilBERT with a classification head sized to the number of rows in the
# label mapping file; the id/label dictionaries are handed to the model alongside it.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=df_labels.shape[0], id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Fine-tuning configuration: evaluate and save a checkpoint after every epoch.
training_args = TrainingArguments(
    output_dir="./checkpoints/koph/bert/",
    # Existing contents of output_dir may be overwritten by this run.
    overwrite_output_dir=True,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=6,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # NOTE(review): no metric_for_best_model is given, so "best" is presumably judged by the
    # library default (eval loss) rather than by the accuracy from compute_metrics — confirm.
    load_best_model_at_end=True,
    push_to_hub=False,
)

# NOTE(review): the tokenizer is handed to the Trainer; presumably this is why the inference
# cell can load a tokenizer from the checkpoint directory — confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_tweetval_train,
    eval_dataset=tokenized_tweetval_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [16]:
# Wall-clock timestamps taken immediately before and after the full fine-tuning run;
# their difference is printed in the following cell.
start_time = time.time()
trainer.train()
end_time = time.time()

  0%|          | 0/1224 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 0.6488389372825623, 'eval_accuracy': 0.7700534759358288, 'eval_runtime': 2.9621, 'eval_samples_per_second': 126.261, 'eval_steps_per_second': 8.102, 'epoch': 1.0}


  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 0.6075677871704102, 'eval_accuracy': 0.7780748663101604, 'eval_runtime': 5.7564, 'eval_samples_per_second': 64.971, 'eval_steps_per_second': 4.169, 'epoch': 2.0}
{'loss': 0.6442, 'grad_norm': 2.8408310413360596, 'learning_rate': 1.1830065359477125e-05, 'epoch': 2.45}


  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 0.6906181573867798, 'eval_accuracy': 0.786096256684492, 'eval_runtime': 3.4063, 'eval_samples_per_second': 109.798, 'eval_steps_per_second': 7.046, 'epoch': 3.0}


  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 0.720759928226471, 'eval_accuracy': 0.7914438502673797, 'eval_runtime': 3.5932, 'eval_samples_per_second': 104.086, 'eval_steps_per_second': 6.679, 'epoch': 4.0}
{'loss': 0.2096, 'grad_norm': 2.1767771244049072, 'learning_rate': 3.6601307189542484e-06, 'epoch': 4.9}


  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 0.7509058117866516, 'eval_accuracy': 0.7994652406417112, 'eval_runtime': 3.4977, 'eval_samples_per_second': 106.926, 'eval_steps_per_second': 6.862, 'epoch': 5.0}


  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 0.791144847869873, 'eval_accuracy': 0.7941176470588235, 'eval_runtime': 3.2777, 'eval_samples_per_second': 114.104, 'eval_steps_per_second': 7.322, 'epoch': 6.0}
{'train_runtime': 691.8917, 'train_samples_per_second': 28.244, 'train_steps_per_second': 1.769, 'train_loss': 0.3701666398765215, 'epoch': 6.0}


In [17]:
# Elapsed seconds between the two timestamps taken around trainer.train().
elapsed_seconds = end_time - start_time
print(f"Training Time Distilbert: {elapsed_seconds}")

Training Time Distilbert: 692.2532165050507


# Inference

In [18]:
def test(model, tokenizer):
    correct = 0
    total = 0
    
    model.to(device)
    with torch.no_grad():
        for _, row in df_test.iterrows():
            sentence = row["text"]
            tag = row["label"]

            inputs = tokenizer(sentence, return_tensors="pt").to(device)
            pred_score = model(**inputs).logits
            predicted_class_id = pred_score.argmax().item()

            if predicted_class_id == tag:
                correct += 1            
            total += 1
    
    print(f'Accuracy of the network on the test data: {100 * correct // total} %')

In [22]:
# NOTE(review): 1224 is the total step count shown by the training progress bar, so this
# looks like the checkpoint written after the final epoch — not necessarily the one that
# `load_best_model_at_end` selected. Confirm this is the model meant for the test score.
checkpoint_path = "./checkpoints/koph/bert/checkpoint-1224"

# Rebinds the notebook-level `tokenizer` and `model` to the versions restored from disk.
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)

# Scores the restored model on df_test and prints the accuracy.
test(model, tokenizer)

Accuracy of the network on the test data: 80 %
