In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, matthews_corrcoef
import torch
from transformers import TrainingArguments, Trainer
from transformers import SqueezeBertTokenizer, SqueezeBertForSequenceClassification
from transformers import EarlyStoppingCallback

### Load Data Set

In [2]:
# Read the pre-split train / validation / test CSV files in one pass.
df_train, df_val, df_test = (
    pd.read_csv(csv_name)
    for csv_name in ('training.csv', 'validation.csv', 'testing.csv')
)

# Show the first rows of the training split as the cell output.
df_train.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,19429,retweet mentionhere: people be like from bae t...,1
1,10662,"i like being teased, do not give me the pussy ...",1
2,19558,retweet mentionhere: you are jealous bitch,1
3,2909,mentionhere cuz people at work are bitches the...,1
4,24378,quote this with your best racist comment i nee...,1


In [3]:
# Pretrained checkpoint shared by the tokenizer and the classifier.
model_name = 'squeezebert/squeezebert-uncased'

# Tokenizer matching the checkpoint's vocabulary.
tokenizer = SqueezeBertTokenizer.from_pretrained(model_name)

# Sequence-classification model with a 3-way output layer on top of the
# pretrained encoder.
model = SqueezeBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/500 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/103M [00:00<?, ?B/s]

Some weights of the model checkpoint at squeezebert/squeezebert-uncased were not used when initializing SqueezeBertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing SqueezeBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing SqueezeBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of SqueezeBertForSequenceClassification were no

In [4]:
# ----- 1. Preprocess data -----#

# Plain Python lists of texts and labels for the training split ...
X_train, y_train = list(df_train["text"]), list(df_train["label"])

# ... and for the validation split.
X_val, y_val = list(df_val["text"]), list(df_val["label"])

# Tokenize each split: pad to the longest sequence in the split and
# truncate anything longer than 512 tokens.
X_train_tokenized = tokenizer(
    X_train,
    padding=True,
    truncation=True,
    max_length=512,
)
X_val_tokenized = tokenizer(
    X_val,
    padding=True,
    truncation=True,
    max_length=512,
)

In [5]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings and optional labels.

    Args:
        encodings: Mapping from feature name (must include ``"input_ids"``)
            to a per-example indexable sequence, e.g. the tokenizer output.
        labels: Optional per-example labels, indexable by example position.
            Leave as ``None`` for unlabeled data (e.g. at prediction time).
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict holds one tensor per encoding key, plus a ``"labels"``
        tensor when labels were provided.
        """
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Compare against None explicitly: a truthiness test raises for
        # multi-element numpy arrays / tensors, which are valid label containers.
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the ``input_ids`` encoding."""
        return len(self.encodings["input_ids"])

In [6]:
# Wrap the tokenized splits and their labels for use by the Trainer.
train_dataset = Dataset(encodings=X_train_tokenized, labels=y_train)
val_dataset = Dataset(encodings=X_val_tokenized, labels=y_val)

In [7]:
# ----- 2. Fine-tune pretrained model -----#
# Metric function evaluated by the Trainer on the validation set
def compute_metrics(p):
    """Compute accuracy and class-weighted precision / recall / F1.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-class
            scores along axis 1; ``labels`` holds the true class ids.

    Returns:
        Dict with keys ``"accuracy"``, ``"precision"``, ``"recall"``, ``"f1"``.
    """
    pred, labels = p
    # Turn per-class scores into predicted class ids.
    pred = np.argmax(pred, axis=1)

    # zero_division=0 yields the same values as sklearn's default ("warn")
    # but without an UndefinedMetricWarning on every evaluation in which some
    # class is never predicted.
    weighted = {"average": "weighted", "zero_division": 0}

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, **weighted)
    precision = precision_score(y_true=labels, y_pred=pred, **weighted)
    f1 = f1_score(y_true=labels, y_pred=pred, **weighted)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [8]:
# Define Trainer

args = TrainingArguments(
    # Raw string: the Windows path is full of backslashes that a normal string
    # literal would parse as (invalid, deprecated) escape sequences.
    output_dir=r"D:\COS 802\Project\Squeezebert Training\output",
    evaluation_strategy="steps",  # evaluate on a step schedule ...
    eval_steps=50,                # ... every 50 optimization steps
    logging_steps=50,             # log the training loss at the same cadence
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    seed=0,
    # NOTE(review): no metric_for_best_model is set, so the library default
    # decides which checkpoint counts as "best" — confirm that is intended.
    load_best_model_at_end=True,
    report_to="wandb",  # enable logging to W&B
    run_name="squeezebert"  # name of the W&B run (optional)
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # Early stopping is currently disabled; uncomment to enable it.
    #callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [9]:
# Fine-tune the pretrained model with the Trainer configured above.
# As the last expression of the cell, the returned training summary
# (global step, training loss, runtime metrics) is shown as the cell output.
trainer.train()

***** Running training *****
  Num examples = 17348
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2170
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: jesseparvess-elucidate (use `wandb login --relogin` to force relogin)
wandb: wandb version 0.12.6 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,0.6128,0.426848,0.865752,0.813399,0.865752,0.836852
100,0.4026,0.402064,0.883777,0.831944,0.883777,0.856238
150,0.3696,0.345256,0.89131,0.874826,0.89131,0.868341
200,0.354,0.339525,0.88593,0.880625,0.88593,0.875107
250,0.3761,0.347792,0.893462,0.841586,0.893462,0.86624
300,0.3482,0.32653,0.902072,0.851909,0.902072,0.876087
350,0.3227,0.297122,0.90261,0.853196,0.90261,0.876822
400,0.2957,0.293415,0.904762,0.893279,0.904762,0.879759
450,0.3291,0.293061,0.905569,0.891782,0.905569,0.890381
500,0.2957,0.324684,0.89427,0.842544,0.89427,0.866784


***** Running Evaluation *****
  Num examples = 3717
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to D:\COS 802\Project\Squeezebert Training\output\checkpoint-50
Configuration saved in D:\COS 802\Project\Squeezebert Training\output\checkpoint-50\config.json
Model weights saved in D:\COS 802\Project\Squeezebert Training\output\checkpoint-50\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3717
  Batch size = 16
wandb: Network error resolved after 0:02:39.692465, resuming normal operation.
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to D:\COS 802\Project\Squeezebert Training\output\checkpoint-100
Configuration saved in D:\COS 802\Project\Squeezebert Training\output\checkpoint-100\config.json
Model weights saved in D:\COS 802\Project\Squeezebert Training\output\checkpoint-100\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3717
  Batch size = 16
Saving model checkpoint

Model weights saved in D:\COS 802\Project\Squeezebert Training\output\checkpoint-900\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3717
  Batch size = 16
Saving model checkpoint to D:\COS 802\Project\Squeezebert Training\output\checkpoint-950
Configuration saved in D:\COS 802\Project\Squeezebert Training\output\checkpoint-950\config.json
Model weights saved in D:\COS 802\Project\Squeezebert Training\output\checkpoint-950\pytorch_model.bin
wandb: Network error resolved after 0:27:02.118406, resuming normal operation.
***** Running Evaluation *****
  Num examples = 3717
  Batch size = 16
wandb: Network error resolved after 0:00:59.013431, resuming normal operation.
Saving model checkpoint to D:\COS 802\Project\Squeezebert Training\output\checkpoint-1000
Configuration saved in D:\COS 802\Project\Squeezebert Training\output\checkpoint-1000\config.json
Model weights saved in D:\COS 802\Project\Squeezebert Training\output\checkpoint-1000\pytorch_model.bin
***** Running Ev

Model weights saved in D:\COS 802\Project\Squeezebert Training\output\checkpoint-2000\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3717
  Batch size = 16
Saving model checkpoint to D:\COS 802\Project\Squeezebert Training\output\checkpoint-2050
Configuration saved in D:\COS 802\Project\Squeezebert Training\output\checkpoint-2050\config.json
Model weights saved in D:\COS 802\Project\Squeezebert Training\output\checkpoint-2050\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3717
  Batch size = 16
Saving model checkpoint to D:\COS 802\Project\Squeezebert Training\output\checkpoint-2100
Configuration saved in D:\COS 802\Project\Squeezebert Training\output\checkpoint-2100\config.json
Model weights saved in D:\COS 802\Project\Squeezebert Training\output\checkpoint-2100\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3717
  Batch size = 16
Saving model checkpoint to D:\COS 802\Project\Squeezebert Training\output\checkpoint-2150
Configura

TrainOutput(global_step=2170, training_loss=0.28176188073399977, metrics={'train_runtime': 68384.5318, 'train_samples_per_second': 0.507, 'train_steps_per_second': 0.032, 'total_flos': 391513748482512.0, 'train_loss': 0.28176188073399977, 'epoch': 2.0})

In [10]:
# ----- 3. Predict -----#

# Texts and gold labels of the held-out test split as plain Python lists.
X_test, y_test = list(df_test['text']), list(df_test['label'])

# Tokenize with the same settings used for the train/validation splits.
X_test_tokenized = tokenizer(
    X_test,
    padding=True,
    truncation=True,
    max_length=512,
)


In [11]:
# Wrap the tokenized test split; no labels are attached because this
# dataset is only used for prediction.
test_dataset = Dataset(encodings=X_test_tokenized)

In [20]:
# Load trained model
# Raw string so the backslashes of the Windows path are taken literally
# instead of being parsed as (invalid, deprecated) escape sequences.
# NOTE(review): the checkpoint step (1850) is hard-coded — confirm it is the
# checkpoint that should be evaluated.
model_path = r"D:\COS 802\Project\Squeezebert Training\output\checkpoint-1850"
model = SqueezeBertForSequenceClassification.from_pretrained(model_path, num_labels=3)

loading configuration file D:\COS 802\Project\Squeezebert Training\output\checkpoint-1850\config.json
Model config SqueezeBertConfig {
  "_name_or_path": "squeezebert/squeezebert-uncased",
  "architectures": [
    "SqueezeBertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_groups": 4,
  "intermediate_size": 3072,
  "k_groups": 4,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "squeezebert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_groups": 4,
  "pad_token_id": 0,
  "post_attention_groups": 1,
  "problem_type": "single_label_classification",
  "q_groups": 4,
  "torch_dtype": "float32",
  "transformers_version": "4.1

In [21]:
# Inference-only Trainer around the reloaded model. No TrainingArguments
# are passed, so the library defaults apply.
test_trainer = Trainer(model=model)

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [22]:
# Make prediction
# Read the scores from the `predictions` field instead of unpacking the
# unused values into `_`: assigning to `_` in a notebook rebinds IPython's
# "last output" variable.
raw_pred = test_trainer.predict(test_dataset).predictions

# Preprocess raw predictions: pick the highest-scoring class per example
y_pred = np.argmax(raw_pred, axis=1)

***** Running Prediction *****
  Num examples = 3718
  Batch size = 8


In [23]:
from sklearn.metrics import confusion_matrix, classification_report

In [24]:
# Confusion matrix on the test split: rows are true labels, columns are
# predicted labels (each row sums to that class's support in the report below).
confusion_matrix(y_test, y_pred)

array([[  86,  105,   22],
       [  84, 2748,   73],
       [   8,   47,  545]], dtype=int64)

In [25]:
# Per-class precision / recall / F1 plus overall averages on the test split.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.48      0.40      0.44       213
           1       0.95      0.95      0.95      2905
           2       0.85      0.91      0.88       600

    accuracy                           0.91      3718
   macro avg       0.76      0.75      0.76      3718
weighted avg       0.91      0.91      0.91      3718

