In [95]:
from datasets import load_dataset
data_files = {'train': 'data/data_train1.csv', 'test': 'data/data_test1.csv', "validation" : 'data/data_val1.csv'}
raw_datasets = load_dataset("csv", data_files=data_files)
raw_datasets

Downloading and preparing dataset csv/default to /Users/emiliagenadieva/.cache/huggingface/datasets/csv/default-4d3dd67094a5021c/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /Users/emiliagenadieva/.cache/huggingface/datasets/csv/default-4d3dd67094a5021c/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['overview', 'Comedy', 'Adventure', 'Drama', 'Horror', 'Action', 'Documentary', 'Thriller', 'Crime', 'Western', 'Romance', 'TV Movie'],
        num_rows: 1058
    })
    test: Dataset({
        features: ['overview', 'Comedy', 'Adventure', 'Drama', 'Horror', 'Action', 'Documentary', 'Thriller', 'Crime', 'Western', 'Romance', 'TV Movie'],
        num_rows: 98
    })
    validation: Dataset({
        features: ['overview', 'Comedy', 'Adventure', 'Drama', 'Horror', 'Action', 'Documentary', 'Thriller', 'Crime', 'Western', 'Romance', 'TV Movie'],
        num_rows: 106
    })
})

In [98]:
labels = [label for label in raw_datasets['train'].features.keys() if label not in ['overview']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
labels

['Comedy',
 'Adventure',
 'Drama',
 'Horror',
 'Action',
 'Documentary',
 'Thriller',
 'Crime',
 'Western',
 'Romance',
 'TV Movie']

In [101]:

from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_data(examples):
  # take a batch of texts
  text = examples["overview"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()
  
  return encoding

In [102]:
encoded_dataset = raw_datasets.map(preprocess_data, batched=True, remove_columns=raw_datasets['train'].column_names)

Map:   0%|          | 0/1058 [00:00<?, ? examples/s]

Map:   0%|          | 0/98 [00:00<?, ? examples/s]

Map:   0%|          | 0/106 [00:00<?, ? examples/s]

In [103]:
example = encoded_dataset['train'][0]
print(example.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])


In [104]:
tokenizer.decode(example['input_ids'])

'[CLS] stifle edwardian london wendy darling mesmerize brother night bedtime tale swordplay swashbuckling fearsome captain hook child hero great story peter pan fly nursery night lead moonlit rooftop galaxy star lush jungle neverland [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [105]:
[id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0]

['Adventure']

In [106]:
encoded_dataset.set_format("torch")

In [107]:

from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [108]:
batch_size = 8
metric_name = "f1"

In [109]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"bert-finetuned-sem_eval-english",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

In [110]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [111]:
#forward pass
outputs = model(input_ids=encoded_dataset['train']['input_ids'][0].unsqueeze(0), labels=encoded_dataset['train'][0]['labels'].unsqueeze(0))
outputs
     

SequenceClassifierOutput(loss=tensor(0.7028, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[-0.4558, -0.5435, -0.0540, -0.2737,  0.2828,  0.2156, -0.5558, -0.0456,
          0.2179,  0.1460, -0.0798]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [112]:

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [113]:
trainer.train()



  0%|          | 0/665 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.2549687922000885, 'eval_f1': 0.0, 'eval_roc_auc': 0.5, 'eval_accuracy': 0.0, 'eval_runtime': 5.4713, 'eval_samples_per_second': 19.374, 'eval_steps_per_second': 2.559, 'epoch': 1.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.22990970313549042, 'eval_f1': 0.10619469026548672, 'eval_roc_auc': 0.5278301886792452, 'eval_accuracy': 0.05660377358490566, 'eval_runtime': 5.3127, 'eval_samples_per_second': 19.952, 'eval_steps_per_second': 2.635, 'epoch': 2.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.21550586819648743, 'eval_f1': 0.3841059602649007, 'eval_roc_auc': 0.629245283018868, 'eval_accuracy': 0.27358490566037735, 'eval_runtime': 5.2839, 'eval_samples_per_second': 20.061, 'eval_steps_per_second': 2.65, 'epoch': 3.0}
{'loss': 0.2476, 'learning_rate': 4.962406015037594e-06, 'epoch': 3.76}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.21064558625221252, 'eval_f1': 0.48, 'eval_roc_auc': 0.685377358490566, 'eval_accuracy': 0.39622641509433965, 'eval_runtime': 5.2508, 'eval_samples_per_second': 20.187, 'eval_steps_per_second': 2.666, 'epoch': 4.0}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.2106993943452835, 'eval_f1': 0.49162011173184356, 'eval_roc_auc': 0.6938679245283018, 'eval_accuracy': 0.41509433962264153, 'eval_runtime': 5.2414, 'eval_samples_per_second': 20.224, 'eval_steps_per_second': 2.671, 'epoch': 5.0}
{'train_runtime': 1092.2013, 'train_samples_per_second': 4.843, 'train_steps_per_second': 0.609, 'train_loss': 0.22746007245286068, 'epoch': 5.0}


TrainOutput(global_step=665, training_loss=0.22746007245286068, metrics={'train_runtime': 1092.2013, 'train_samples_per_second': 4.843, 'train_steps_per_second': 0.609, 'train_loss': 0.22746007245286068, 'epoch': 5.0})

In [114]:
trainer.evaluate()

  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.2106993943452835,
 'eval_f1': 0.49162011173184356,
 'eval_roc_auc': 0.6938679245283018,
 'eval_accuracy': 0.41509433962264153,
 'eval_runtime': 5.4448,
 'eval_samples_per_second': 19.468,
 'eval_steps_per_second': 2.571,
 'epoch': 5.0}

In [144]:
#text = "A successful artist looks back with loving memories on the summer of his defining year, 1974. A talented but troubled 18-year-old aspiring artist befriends a brilliant elderly alcoholic painter who has turned his back on not only art but life. The two form what appears to be at first a tenuous relationship. The kid wants to learn all the secrets the master has locked away inside his head and heart. Time has not been kind to the old master. His life appears pointless to him until the kid rekindles his interest in his work and ultimately gives him the will to live. Together, they give one another a priceless gift. The kid learns to see the world through the master's eyes. And the master learns to see life through the eyes of innocence again. This story is based on a real life experience."
import tensorflow as tf

text = '"In a desperate search to create a follow-up to Joe Swanberg 2011 film Uncle Kent, Kent Osborne travels to a comic book convention in San Diego where he loses his mind and confronts the end of the world.'
#text = 'Toward the end of World War II, middle-aged soldier Keita is entrusted with a postcard from a comrade who is sure he will die in battle. After the war ends, Keita visits his comrade wife Yuko and bears witness to the tragic life she has led. This year Oscar entry from Japan finds SHINDO in top form and his 49th and reportedly last film as fresh and poignant as ever.'
encoding = tokenizer(text, return_tensors="pt")
encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

outputs = trainer.model(**encoding)

logits = outputs.logits
probs = logits.softmax(dim=-1).detach().cpu().flatten().numpy().tolist()
print(probs)
     

[0.2469692826271057, 0.04617377370595932, 0.14979422092437744, 0.14409200847148895, 0.13373678922653198, 0.045619308948516846, 0.06129215285181999, 0.07655453681945801, 0.025626417249441147, 0.03802759572863579, 0.03211389482021332]


In [145]:
print(probs)

[0.2469692826271057, 0.04617377370595932, 0.14979422092437744, 0.14409200847148895, 0.13373678922653198, 0.045619308948516846, 0.06129215285181999, 0.07655453681945801, 0.025626417249441147, 0.03802759572863579, 0.03211389482021332]


In [146]:
ids = np.argmax(probs, axis=-1)
print('Label',labels[ids])

Label Comedy


In [140]:
model.config.id2label[2]

'Drama'

# Conclusion
By traing a model, one needs to first preprocess the data in the format, 
with which the model will easily process the data.</br>
This is an example of single label classification.
</br>
First the data is filtered for only relevant labels. 
data = data[data['genre'].isin(['Comedy',
 'Horror',
 'Drama',
 'Crime',
 'Documentary',
 'TV Movie',
 'Romance',
 'Action',
 'Adventure',
 'Western',
 'Thriller'])].reindex()