In [1]:
import numpy as np
import time
import pandas as pd

train_tweets = pd.read_parquet('data/train-00000-of-00001.parquet', engine='pyarrow')
val_tweets = pd.read_parquet('data/validation-00000-of-00001.parquet', engine='pyarrow')
test_tweets = pd.read_parquet('data/test-00000-of-00001.parquet', engine='pyarrow')

train_tweets

Unnamed: 0,text,label
0,“Worry is a down payment on a problem you may ...,2
1,My roommate: it's okay that we can't spell bec...,0
2,No but that's so cute. Atsu was probably shy a...,1
3,Rooneys fucking untouchable isn't he? Been fuc...,0
4,it's pretty depressing when u hit pan on ur fa...,3
...,...,...
3252,I get discouraged because I try for 5 fucking ...,3
3253,The @user are in contention and hosting @user ...,3
3254,@user @user @user @user @user as a fellow UP g...,0
3255,You have a #problem? Yes! Can you do #somethin...,0


In [2]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x223e5e720b0>

In [4]:
import datasets
from datasets import Dataset, DatasetDict

train_tweets_ds = Dataset.from_pandas(train_tweets)
val_tweets_ds = Dataset.from_pandas(val_tweets)
test_tweets_ds = Dataset.from_pandas(test_tweets)

In [5]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Assuming that we are on a CUDA machine, this should print a CUDA device:
print(device)

cuda:0


In [6]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [7]:
tweets_ds = DatasetDict()
tweets_ds["train"] = train_tweets_ds
tweets_ds["validation"] = val_tweets_ds
tweets_ds["test"] = test_tweets_ds

tweets_tk = tweets_ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/3257 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map: 100%|██████████| 3257/3257 [00:00<00:00, 35624.67 examples/s]
Map: 100%|██████████| 374/374 [00:00<00:00, 23934.90 examples/s]
Map: 100%|██████████| 1421/1421 [00:00<00:00, 39109.08 examples/s]


In [8]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [9]:
import evaluate

accuracy = evaluate.load("accuracy")

In [10]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [11]:
id2label = {0: "anger", 1: "joy", 2: "optimism", 3: "sadness"}
label2id = {"anger": 0, "joy": 1, "optimism": 2, "sadness": 3}

In [12]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=4, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=8,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model.to(device),
    args=training_args,
    train_dataset=tweets_tk["train"],
    eval_dataset=tweets_tk["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

start_time = time.time()
trainer.train()
end_time = time.time()

                                                  
 12%|█▎        | 204/1632 [00:19<02:01, 11.73it/s]

{'eval_loss': 0.6177459359169006, 'eval_accuracy': 0.7727272727272727, 'eval_runtime': 0.6344, 'eval_samples_per_second': 589.488, 'eval_steps_per_second': 37.828, 'epoch': 1.0}


                                                  
 25%|██▌       | 408/1632 [00:41<02:04,  9.80it/s]

{'eval_loss': 0.6263861656188965, 'eval_accuracy': 0.7807486631016043, 'eval_runtime': 0.777, 'eval_samples_per_second': 481.323, 'eval_steps_per_second': 30.887, 'epoch': 2.0}


 31%|███       | 501/1632 [00:50<01:44, 10.81it/s]

{'loss': 0.6091, 'grad_norm': 5.032498836517334, 'learning_rate': 1.3872549019607844e-05, 'epoch': 2.45}


                                                  
 38%|███▊      | 612/1632 [01:01<01:29, 11.41it/s]

{'eval_loss': 0.6619147658348083, 'eval_accuracy': 0.786096256684492, 'eval_runtime': 0.6467, 'eval_samples_per_second': 578.339, 'eval_steps_per_second': 37.113, 'epoch': 3.0}


                                                  
 50%|█████     | 816/1632 [01:22<01:10, 11.58it/s]

{'eval_loss': 0.7558662295341492, 'eval_accuracy': 0.786096256684492, 'eval_runtime': 0.7374, 'eval_samples_per_second': 507.203, 'eval_steps_per_second': 32.548, 'epoch': 4.0}


 61%|██████▏   | 1002/1632 [01:41<00:57, 10.89it/s]

{'loss': 0.1896, 'grad_norm': 2.3744189739227295, 'learning_rate': 7.745098039215687e-06, 'epoch': 4.9}


                                                   
 62%|██████▎   | 1020/1632 [01:43<00:51, 11.78it/s]

{'eval_loss': 0.8309715986251831, 'eval_accuracy': 0.7780748663101604, 'eval_runtime': 0.6999, 'eval_samples_per_second': 534.325, 'eval_steps_per_second': 34.288, 'epoch': 5.0}


                                                   
 75%|███████▌  | 1224/1632 [02:04<00:41,  9.93it/s]

{'eval_loss': 0.9062590003013611, 'eval_accuracy': 0.7887700534759359, 'eval_runtime': 0.6786, 'eval_samples_per_second': 551.14, 'eval_steps_per_second': 35.367, 'epoch': 6.0}


                                                   
 88%|████████▊ | 1428/1632 [02:25<00:19, 10.31it/s]

{'eval_loss': 0.9350181221961975, 'eval_accuracy': 0.7834224598930482, 'eval_runtime': 0.6669, 'eval_samples_per_second': 560.793, 'eval_steps_per_second': 35.987, 'epoch': 7.0}


 92%|█████████▏| 1502/1632 [02:33<00:11, 11.07it/s]

{'loss': 0.0765, 'grad_norm': 10.823250770568848, 'learning_rate': 1.6176470588235297e-06, 'epoch': 7.35}


                                                   
100%|██████████| 1632/1632 [02:45<00:00, 10.64it/s]

{'eval_loss': 0.9650164842605591, 'eval_accuracy': 0.7807486631016043, 'eval_runtime': 0.6835, 'eval_samples_per_second': 547.203, 'eval_steps_per_second': 35.115, 'epoch': 8.0}


100%|██████████| 1632/1632 [02:47<00:00,  9.77it/s]

{'train_runtime': 167.0312, 'train_samples_per_second': 155.995, 'train_steps_per_second': 9.771, 'train_loss': 0.27123202909441557, 'epoch': 8.0}





In [14]:
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("stevhliu/my_awesome_model")
tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_model")

model.to(device)

def test():
    with torch.no_grad():
        pred = []

        for idx, row in test_tweets.iterrows():
            inputs = tokenizer(row["text"], return_tensors="pt").to(device)
            tag_scores = model(**inputs).logits
            predicted_class = tag_scores.argmax().item()
            pred.append(predicted_class)

        test_tweets["pred"] = pred

    print("Accuracy:", accuracy_score(test_tweets["label"], test_tweets["pred"]))



In [15]:
test()

Accuracy: 0.505981703026038


In [16]:
print("DistillBERT", end_time - start_time)

DistillBERT 167.13501739501953
