In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, matthews_corrcoef
import torch
from transformers import TrainingArguments, Trainer
from transformers import AlbertTokenizer, AlbertForSequenceClassification
from transformers import EarlyStoppingCallback

### Load Data Set

In [2]:
df_train = pd.read_csv('training.csv')
df_val = pd.read_csv('validation.csv')
df_test = pd.read_csv('testing.csv')

df_train.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,19429,retweet mentionhere: people be like from bae t...,1
1,10662,"i like being teased, do not give me the pussy ...",1
2,19558,retweet mentionhere: you are jealous bitch,1
3,2909,mentionhere cuz people at work are bitches the...,1
4,24378,quote this with your best racist comment i nee...,1


In [3]:
# Define pretrained tokenizer and model

model_name = 'albert-base-v2'

tokenizer = AlbertTokenizer.from_pretrained(model_name)

model = AlbertForSequenceClassification.from_pretrained(model_name, num_labels=3)

Downloading:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.decoder.weight', 'predictions.decoder.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.bias', 'predictions.bias', 'predictions.dense.weight']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You sho

In [4]:
# ----- 1. Preprocess data -----#
# Preprocess data

X_train = list(df_train["text"])
y_train = list(df_train["label"])

X_val = list(df_val["text"])
y_val = list(df_val["label"])

X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

In [5]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [6]:
train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)

In [7]:
# ----- 2. Fine-tune pretrained model -----#
# Define Trainer parameters
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average='weighted')
    precision = precision_score(y_true=labels, y_pred=pred, average = 'weighted')
    f1 = f1_score(y_true=labels, y_pred=pred, average = 'weighted')

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1, }

In [8]:
# Define Trainer

args = TrainingArguments(
    output_dir="D:\COS 802\Project\Albert Training\output",
    evaluation_strategy="steps",
    eval_steps=50,
    logging_steps = 50,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    seed=0,
    load_best_model_at_end=True,
    report_to="wandb",  # enable logging to W&B
    run_name="albert"  # name of the W&B run (optional)
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    #callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [9]:
# Train pre-trained model
trainer.train()

***** Running training *****
  Num examples = 17348
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2170
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: jesseparvess-elucidate (use `wandb login --relogin` to force relogin)
wandb: wandb version 0.12.6 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,0.5744,0.398794,0.865483,0.813621,0.865483,0.838338
100,0.4505,0.379816,0.883508,0.837191,0.883508,0.859102
150,0.4126,0.370546,0.890772,0.841626,0.890772,0.865332
200,0.3588,0.356711,0.88297,0.845184,0.88297,0.860633
250,0.4064,0.340371,0.897229,0.849123,0.897229,0.87199
300,0.3734,0.331706,0.896691,0.84844,0.896691,0.871372
350,0.3752,0.351204,0.889427,0.848142,0.889427,0.865898
400,0.3522,0.340253,0.891041,0.847495,0.891041,0.867024
450,0.4446,0.389855,0.889158,0.83788,0.889158,0.862718
500,0.3642,0.375386,0.884853,0.833029,0.884853,0.857651


***** Running Evaluation *****
  Num examples = 3717
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to D:\COS 802\Project\Albert Training\output\checkpoint-50
Configuration saved in D:\COS 802\Project\Albert Training\output\checkpoint-50\config.json
Model weights saved in D:\COS 802\Project\Albert Training\output\checkpoint-50\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3717
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to D:\COS 802\Project\Albert Training\output\checkpoint-100
Configuration saved in D:\COS 802\Project\Albert Training\output\checkpoint-100\config.json
Model weights saved in D:\COS 802\Project\Albert Training\output\checkpoint-100\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3717
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to D:\COS 802\Project\Albert Training\output\checkpoi

  Num examples = 3717
  Batch size = 16
wandb: 500 encountered ({"error":"Error 1135: Can't create a new thread (errno 11); if you are not out of available memory, you can consult the manual for a possible OS-dependent bug"}), retrying request
wandb: Network error resolved after 0:11:31.431235, resuming normal operation.
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to D:\COS 802\Project\Albert Training\output\checkpoint-700
Configuration saved in D:\COS 802\Project\Albert Training\output\checkpoint-700\config.json
Model weights saved in D:\COS 802\Project\Albert Training\output\checkpoint-700\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3717
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to D:\COS 802\Project\Albert Training\output\checkpoint-750
Configuration saved in D:\COS 802\Project\Albert Training\output\checkpoint-750\config.json
Model weights saved in D:\COS 802\Project\Albe

Saving model checkpoint to D:\COS 802\Project\Albert Training\output\checkpoint-1450
Configuration saved in D:\COS 802\Project\Albert Training\output\checkpoint-1450\config.json
Model weights saved in D:\COS 802\Project\Albert Training\output\checkpoint-1450\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3717
  Batch size = 16
Saving model checkpoint to D:\COS 802\Project\Albert Training\output\checkpoint-1500
Configuration saved in D:\COS 802\Project\Albert Training\output\checkpoint-1500\config.json
Model weights saved in D:\COS 802\Project\Albert Training\output\checkpoint-1500\pytorch_model.bin
wandb: Network error resolved after 0:02:10.994996, resuming normal operation.
***** Running Evaluation *****
  Num examples = 3717
  Batch size = 16
Saving model checkpoint to D:\COS 802\Project\Albert Training\output\checkpoint-1550
Configuration saved in D:\COS 802\Project\Albert Training\output\checkpoint-1550\config.json
Model weights saved in D:\COS 802\Project\Alber

TrainOutput(global_step=2170, training_loss=0.3365783522205968, metrics={'train_runtime': 80334.0793, 'train_samples_per_second': 0.432, 'train_steps_per_second': 0.027, 'total_flos': 163582618831824.0, 'train_loss': 0.3365783522205968, 'epoch': 2.0})

In [11]:
# ----- 3. Predict -----#
# Load test data

X_test = list(df_test['text'])
y_test = list(df_test['label'])

X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)


In [12]:
# Create torch dataset
test_dataset = Dataset(X_test_tokenized)

In [13]:
# Load trained model
model_path = "D:\COS 802\Project\Albert Training\output\checkpoint-1350"
model = AlbertForSequenceClassification.from_pretrained(model_path, num_labels=3)

loading configuration file D:\COS 802\Project\Albert Training\output\checkpoint-1350\config.json
Model config AlbertConfig {
  "_name_or_path": "albert-base-v2",
  "architectures": [
    "AlbertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "down_scale_factor": 1,
  "embedding_size": 128,
  "eos_token_id": 3,
  "gap_size": 0,
  "hidden_act": "gelu_new",
  "hidden_dropout_prob": 0,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "albert",
  "net_structure_type": 0,
  "num_attention_heads": 12,
  "num_hidden_groups": 1,
  "num_hidden_layers": 12,
  "num_memory_blocks": 0,
  "pad_token_id": 0,
  "position_embedding_type": "absolu

In [14]:
# Define test trainer
test_trainer = Trainer(model)

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [15]:
# Make prediction
raw_pred, _, _ = test_trainer.predict(test_dataset)

# Preprocess raw predictions
y_pred = np.argmax(raw_pred, axis=1)

***** Running Prediction *****
  Num examples = 3718
  Batch size = 8


wandb: Network error resolved after 0:00:40.303726, resuming normal operation.
wandb: Network error resolved after 0:00:50.173427, resuming normal operation.


In [16]:
from sklearn.metrics import confusion_matrix, classification_report

In [17]:
confusion_matrix(y_test, y_pred)

array([[  38,  132,   43],
       [  42, 2755,  108],
       [   5,   29,  566]], dtype=int64)

In [18]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.45      0.18      0.26       213
           1       0.94      0.95      0.95      2905
           2       0.79      0.94      0.86       600

    accuracy                           0.90      3718
   macro avg       0.73      0.69      0.69      3718
weighted avg       0.89      0.90      0.89      3718

