In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, matthews_corrcoef
import torch
import tensorflow
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import EarlyStoppingCallback

### Load Data Set

In [2]:
# Read the training, validation and testing CSV files from the working directory.
df_train, df_val, df_test = (
    pd.read_csv(csv_file)
    for csv_file in ('training.csv', 'validation.csv', 'testing.csv')
)

# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,19429,retweet mentionhere: people be like from bae t...,1
1,10662,"i like being teased, do not give me the pussy ...",1
2,19558,retweet mentionhere: you are jealous bitch,1
3,2909,mentionhere cuz people at work are bitches the...,1
4,24378,quote this with your best racist comment i nee...,1


In [3]:
# Pretrained checkpoint name shared by the tokenizer and the classifier
model_name = "vinai/bertweet-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Sequence-classification model with 3 output labels, initialised from the
# checkpoint's TensorFlow weights (from_tf=True).
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    from_tf=True,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
All TF 2.0 model weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassification for predictions without further training.


In [4]:
# ----- 1. Preprocess data -----#
# Pull the texts and labels out of the training and validation frames.

X_train = list(df_train["text"])
y_train = list(df_train["label"])

X_val = list(df_val["text"])
y_val = list(df_val["label"])

# The model config reports max_position_embeddings = 130, i.e. room for 128
# tokens once the two reserved position slots are accounted for. Truncating at
# 512 would let an over-long tweet through and crash the forward pass, so cap
# the length at 128. Sequences that already fit are tokenized exactly as before.
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=128)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=128)

In [5]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style torch dataset over tokenizer output, with optional labels.

    Args:
        encodings: Mapping from feature name (must include ``"input_ids"``) to a
            per-example indexable sequence, e.g. the output of a tokenizer call.
        labels: Optional per-example labels. Leave as ``None`` for unlabeled
            data; items then carry no ``"labels"`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors (plus "labels" if available)."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Compare against None explicitly: a bare truth test raises for numpy
        # arrays / pandas Series holding more than one element.
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the length of the "input_ids" feature."""
        return len(self.encodings["input_ids"])

In [6]:
# Wrap the tokenized inputs and their labels as torch datasets
train_dataset = Dataset(encodings=X_train_tokenized, labels=y_train)
val_dataset = Dataset(encodings=X_val_tokenized, labels=y_val)

In [7]:
# ----- 2. Fine-tune pretrained model -----#
# Define the metrics function the Trainer calls at each evaluation
def compute_metrics(p):
    """Compute classification metrics from a (predictions, labels) pair.

    Args:
        p: Two-element iterable ``(predictions, labels)``. ``predictions`` holds
            per-class scores with the classes on axis 1; ``labels`` holds the
            true class ids.

    Returns:
        dict with keys "accuracy", "precision", "recall" and "f1". Precision,
        recall and F1 are support-weighted averages over the classes.
    """
    pred, labels = p
    # Predicted class = index of the highest score for each example.
    pred = np.argmax(pred, axis=1)

    # zero_division=0 scores an undefined per-class metric (e.g. a class that is
    # never predicted) as 0 without emitting a warning on every evaluation; the
    # values are the same as with the default "warn" setting.
    # NOTE: with weighted averaging, recall comes out equal to accuracy (see the
    # identical Accuracy/Recall columns in the training log).
    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average='weighted', zero_division=0)
    precision = precision_score(y_true=labels, y_pred=pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true=labels, y_pred=pred, average='weighted', zero_division=0)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [8]:
# Define Trainer

args = TrainingArguments(
    # Raw string: the Windows path is full of backslashes, and sequences such
    # as "\C" or "\P" are invalid escapes that newer Python versions warn about.
    # The resulting path value is unchanged.
    output_dir=r"D:\COS 802\Project\Bertweet2 Training\output",
    evaluation_strategy="steps",
    eval_steps=100,  # run validation every 100 optimisation steps
    logging_steps=1000,
    per_device_train_batch_size=16*2,
    per_device_eval_batch_size=16*2,
    num_train_epochs=2,
    seed=0,
    load_best_model_at_end=True,
    report_to="wandb",  # enable logging to W&B
    run_name="bertweet2"  # name of the W&B run (optional)
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # Early stopping is currently disabled:
    #callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [9]:
# Fine-tune the pre-trained model; the returned training summary is shown as
# the cell output.
train_result = trainer.train()
train_result

***** Running training *****
  Num examples = 17348
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2170
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: jesseparvess-elucidate (use `wandb login --relogin` to force relogin)
wandb: wandb version 0.12.6 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,0.624,0.43844,0.870056,0.818656,0.870056,0.843293
100,0.4406,0.399182,0.857143,0.841575,0.857143,0.840444
150,0.3838,0.390582,0.894808,0.844091,0.894808,0.868698
200,0.3329,0.315801,0.9053,0.854657,0.9053,0.879117
250,0.3524,0.343011,0.905569,0.857128,0.905569,0.879952
300,0.3869,0.319704,0.903417,0.851993,0.903417,0.876949
350,0.3233,0.312801,0.904493,0.853048,0.904493,0.878012
400,0.3269,0.411967,0.892117,0.897648,0.892117,0.864852
450,0.3697,0.32351,0.905031,0.853448,0.905031,0.878482
500,0.3044,0.322675,0.900995,0.849126,0.900995,0.874155


***** Running Evaluation *****
  Num examples = 3717
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to D:\COS 802\Project\Bertweet Training\output\checkpoint-50
Configuration saved in D:\COS 802\Project\Bertweet Training\output\checkpoint-50\config.json
Model weights saved in D:\COS 802\Project\Bertweet Training\output\checkpoint-50\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3717
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to D:\COS 802\Project\Bertweet Training\output\checkpoint-100
Configuration saved in D:\COS 802\Project\Bertweet Training\output\checkpoint-100\config.json
Model weights saved in D:\COS 802\Project\Bertweet Training\output\checkpoint-100\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3717
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to D:\COS 802\Project\Bertweet Training\o

***** Running Evaluation *****
  Num examples = 3717
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to D:\COS 802\Project\Bertweet Training\output\checkpoint-800
Configuration saved in D:\COS 802\Project\Bertweet Training\output\checkpoint-800\config.json
Model weights saved in D:\COS 802\Project\Bertweet Training\output\checkpoint-800\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3717
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to D:\COS 802\Project\Bertweet Training\output\checkpoint-850
Configuration saved in D:\COS 802\Project\Bertweet Training\output\checkpoint-850\config.json
Model weights saved in D:\COS 802\Project\Bertweet Training\output\checkpoint-850\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3717
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to D:\COS 802\Project\Bertweet Trainin

Model weights saved in D:\COS 802\Project\Bertweet Training\output\checkpoint-1800\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3717
  Batch size = 16
Saving model checkpoint to D:\COS 802\Project\Bertweet Training\output\checkpoint-1850
Configuration saved in D:\COS 802\Project\Bertweet Training\output\checkpoint-1850\config.json
Model weights saved in D:\COS 802\Project\Bertweet Training\output\checkpoint-1850\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3717
  Batch size = 16
Saving model checkpoint to D:\COS 802\Project\Bertweet Training\output\checkpoint-1900
Configuration saved in D:\COS 802\Project\Bertweet Training\output\checkpoint-1900\config.json
Model weights saved in D:\COS 802\Project\Bertweet Training\output\checkpoint-1900\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3717
  Batch size = 16
Saving model checkpoint to D:\COS 802\Project\Bertweet Training\output\checkpoint-1950
Configuration saved in D:\COS 802

TrainOutput(global_step=2170, training_loss=0.2929399650767102, metrics={'train_runtime': 29135.085, 'train_samples_per_second': 1.191, 'train_steps_per_second': 0.074, 'total_flos': 1212443073479232.0, 'train_loss': 0.2929399650767102, 'epoch': 2.0})

In [10]:
# ----- 3. Predict -----#
# Load test data

X_test = list(df_test['text'])
y_test = list(df_test['label'])

# The model config reports max_position_embeddings = 130, i.e. room for 128
# tokens once the two reserved position slots are accounted for. Cap the length
# at 128 so an over-long tweet is truncated instead of crashing the forward pass.
X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=128)


In [11]:
# Wrap the tokenized test inputs as a torch dataset; no labels are attached,
# so the items carry input features only.
test_dataset = Dataset(encodings=X_test_tokenized, labels=None)

In [13]:
# Load trained model
# Raw string: keeps the Windows backslashes literal instead of relying on
# invalid escape sequences ("\C", "\P", "\c", ...) being passed through.
# NOTE(review): this directory ("Bertweet Training") differs from the output_dir
# given to TrainingArguments ("Bertweet2 Training") - confirm this checkpoint is
# the one intended for evaluation.
model_path = r"D:\COS 802\Project\Bertweet Training\output\checkpoint-1850"
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=3)

loading configuration file D:\COS 802\Project\Bertweet Training\output\checkpoint-1850\config.json
Model config RobertaConfig {
  "_name_or_path": "vinai/bertweet-base",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 130,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "tokenizer_class": "BertweetTokenizer",
  "torch_dtype": "float32",
  "tra

In [14]:
# Trainer used only for inference: no TrainingArguments are supplied, so the
# library defaults apply.
test_trainer = Trainer(model=model)

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [15]:
# Make prediction
# Read the named `predictions` field instead of unpacking into `_`: assigning
# to `_` in a notebook shadows IPython's "last output" variable.
prediction_output = test_trainer.predict(test_dataset)
raw_pred = prediction_output.predictions

# Preprocess raw predictions: take the highest-scoring class for each example
y_pred = np.argmax(raw_pred, axis=1)

***** Running Prediction *****
  Num examples = 3718
  Batch size = 8


In [16]:
from sklearn.metrics import confusion_matrix, classification_report

In [17]:
# Rows are the true classes, columns the predicted classes
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[  78,  113,   22],
       [  65, 2776,   64],
       [  14,   28,  558]], dtype=int64)

In [18]:
# Per-class precision / recall / F1 on the test set
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.50      0.37      0.42       213
           1       0.95      0.96      0.95      2905
           2       0.87      0.93      0.90       600

    accuracy                           0.92      3718
   macro avg       0.77      0.75      0.76      3718
weighted avg       0.91      0.92      0.91      3718

