# Assignment 6

In [24]:
import torch
from heapq import nlargest
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import os
import evaluate
from transformers import pipeline
import time
import datetime


# Fix the seed of PyTorch's default random number generator.
torch.manual_seed(42)

<torch._C.Generator at 0x7f1b0666a0b0>

In [2]:
# Use the first CUDA device when PyTorch reports one as available, else the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Tokenizer loaded from the same DistilBERT checkpoint name that the model
# cell passes to from_pretrained.
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert/distilbert-base-uncased"
)

# Emotion class names in id order; the position of a name is its integer id.
labels2id = {
    name: class_id
    for class_id, name in enumerate(("anger", "joy", "optimism", "sadness"))
}
# Inverse lookup (id -> name), derived so the two maps cannot drift apart.
id2labels = {class_id: name for name, class_id in labels2id.items()}

  return torch._C._cuda_getDeviceCount() > 0


In [3]:
# Directory holding the emotion split files. The default is the location this
# notebook was originally run from; set the TWEETEVAL_EMOTION_DIR environment
# variable to point somewhere else without editing the notebook.
EMOTION_DIR = os.environ.get(
    "TWEETEVAL_EMOTION_DIR", "/home/ruwen/tweeteval/datasets/emotion"
)


def _read_lines(filename):
    """Return the lines of ``EMOTION_DIR/filename`` without line terminators.

    The file is decoded as UTF-8 explicitly so that the result does not depend
    on the platform's default locale encoding.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    with open(os.path.join(EMOTION_DIR, filename), encoding="utf-8") as f:
        return f.read().splitlines()


def _read_labels(filename):
    """Return the integer labels stored one per line in ``EMOTION_DIR/filename``.

    Raises:
        ValueError: if a line cannot be parsed as an integer.
    """
    return [int(numeric_string) for numeric_string in _read_lines(filename)]


t_lines = _read_lines("train_text.txt")
test_lines = _read_lines("test_text.txt")
val_lines = _read_lines("val_text.txt")

train_labels = _read_labels("train_labels.txt")
test_labels = _read_labels("test_labels.txt")
val_labels = _read_labels("val_labels.txt")

# Report the size of every split and fail loudly if a text file and its label
# file disagree, since the two are paired purely by line position.
for _split, _texts, _labels in (
    ("Training", t_lines, train_labels),
    ("Validation", val_lines, val_labels),
    ("Test", test_lines, test_labels),
):
    print(f"{_split} lines: {len(_texts)}")
    print(f"{_split} labels: {len(_labels)}")
    if len(_texts) != len(_labels):
        raise ValueError(
            f"{_split} split has {len(_texts)} lines but {len(_labels)} labels"
        )

Training lines: 3257
Training labels: 3257
Validation lines: 374
Validation labels: 374
Test lines: 1421
Test labels: 1421


In [4]:
def _make_records(texts, labels):
    """Pair each text with the label at the same position.

    Args:
        texts: sequence of input strings.
        labels: sequence of labels, one per entry of ``texts``.

    Returns:
        A list of ``{"text": ..., "label": ...}`` dicts in input order.

    Raises:
        ValueError: if the two sequences differ in length, which would
            otherwise pair texts with the wrong labels or drop labels silently.
    """
    if len(texts) != len(labels):
        raise ValueError(
            f"got {len(texts)} texts but {len(labels)} labels"
        )
    return [{"text": text, "label": label} for text, label in zip(texts, labels)]


# Despite the ``_dict`` suffix these are lists of per-example dicts; the names
# are kept because later cells refer to them.
train_dict = _make_records(t_lines, train_labels)
test_dict = _make_records(test_lines, test_labels)
val_dict = _make_records(val_lines, val_labels)

In [5]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the notebook's tokenizer.

    Truncation is switched on; no padding is requested at this stage.

    Args:
        examples: mapping with a ``"text"`` key holding the input passed to
            the tokenizer.

    Returns:
        Whatever the tokenizer returns for that input.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [6]:
# Turn the three record lists into datasets, keyed by split name.
tweets = DatasetDict(
    {
        "train": Dataset.from_list(train_dict),
        "val": Dataset.from_list(val_dict),
        "test": Dataset.from_list(test_dict),
    }
)

# Apply preprocess_function to every split, passing examples in batches.
tweets_tokens = tweets.map(preprocess_function, batched=True)

Map:   0%|          | 0/3257 [00:00<?, ? examples/s]

Map:   0%|          | 0/374 [00:00<?, ? examples/s]

Map:   0%|          | 0/1421 [00:00<?, ? examples/s]

In [8]:
# Padding collator built from the notebook's tokenizer; it is passed to the
# Trainer as `data_collator`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [9]:
# Sequence-classification model on top of the pretrained DistilBERT checkpoint,
# with one output class per entry of the label maps. The label maps are stored
# on the model config so predictions can be reported by name.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=len(labels2id),
    id2label=id2labels,
    label2id=labels2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Accuracy metric loaded through the `evaluate` library; compute_metrics calls
# its `compute` method.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``accuracy`` metric.

    Args:
        eval_pred: a pair ``(predictions, labels)``. The predictions are
            reduced to class ids by taking the arg-max along axis 1.

    Returns:
        The result of ``accuracy.compute`` for the predicted ids and labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [19]:
# Fine-tuning configuration: evaluation and checkpoint saving are both set to
# run once per epoch, and checkpoints are written under `output_dir`.
# NOTE(review): no `metric_for_best_model` is given, so with
# `load_best_model_at_end=True` the "best" checkpoint is presumably chosen by
# validation loss rather than accuracy; confirm against the installed
# transformers version.
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy`; verify before upgrading.
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

# Train on the tokenized "train" split and evaluate on the "val" split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tweets_tokens["train"],
    eval_dataset=tweets_tokens["val"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Time the training run with perf_counter: unlike time.time() it is not
# affected by system clock adjustments, and only the difference `end - start`
# is meaningful.
start = time.perf_counter()
trainer.train()
end = time.perf_counter()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.640598,0.780749
2,No log,0.661333,0.780749
3,0.446100,0.718129,0.775401
4,0.446100,0.721432,0.786096


In [21]:
def test():
    correct = 0
    total = 0

    tokenizer = AutoTokenizer.from_pretrained("./my_awesome_model/checkpoint-408")
    model = AutoModelForSequenceClassification.from_pretrained("./my_awesome_model/checkpoint-408")
    
    with torch.no_grad():
        for entry in test_dict:
            inputs = tokenizer(entry["text"], return_tensors="pt")
            logits = model(**inputs).logits
                
            predicted_class_id = logits.argmax().item()
            model.config.id2label[predicted_class_id]
            
            correct += (predicted_class_id == entry["label"])
            total += 1
    
    print(f'Accuracy of the network on the test data: {100 * correct // total} %')

In [22]:
# Run the checkpoint evaluation defined above; it prints the accuracy.
test()

Accuracy of the network on the test data: 80 %


In [25]:
# Show the interval between the `start` and `end` timestamps as H:MM:SS.
elapsed = datetime.timedelta(seconds=(end - start))
print(f"time: {elapsed}")

time: 0:30:56.849409
