# ML Pipeline Preparation - Transformer Version
try out transformer based model for our classification task
this is based on https://github.com/NielsRogge/Transformers-Tutorials/blob/master/BERT/Fine_tuning_BERT_(and_friends)_for_multi_label_text_classification.ipynb

## Approach
As transformer are current state of the art for many nlp tasks I wanted to try this out for our multi-label-classification task. 

This model I based on pretrained https://huggingface.co/bert-base-uncased model.
I gave it only one shot as training is quite expensive and took on my laptop with one RTX 3080 GPU more than XX hours.

Most of the code is taken/adopted from other sources which I linked here in this notebook. Only the function train_test_val_split which takes a pandas Dataframe as input and returns a DatasetDict has been implemented by me.

In [27]:
# import libraries
import pandas as pd
from pandas import DataFrame as DataFrame
import numpy as np
from sqlalchemy import create_engine
import pickle

import numpy as np

from transformers import AutoTokenizer, AutoModelForMaskedLM
from sklearn.model_selection import train_test_split
import datasets
from datasets import Dataset
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch





In [28]:
# load data from database
engine = create_engine('sqlite:///data/DisasterResponse.db')
df = pd.read_sql_query('select * from CleanedData', engine)
df.head(3)

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# make categorie values boolean

categories = df.columns[4:]
for column in categories:
    df[column] = df[column].astype(bool)

df.head()


Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,True,False,False,True,False,False,...,False,False,True,False,True,False,False,False,False,False
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,True,True,False,True,False,True,...,False,False,False,False,False,False,False,False,False,False
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [30]:
# what is max, min and avg lenght of message?
len(df.max()['message'])

  len(df.max()['message'])


98

In [31]:
# drop columns
df_cleaned = df.drop( ['original', 'genre'], axis = 1)
df_cleaned.head(1)

Unnamed: 0,id,message,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [32]:
# rows with nan
df_nan=df_cleaned[df_cleaned.isna().any(axis=1)]
df_nan.head(3)

Unnamed: 0,id,message,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report


In [33]:
# NaN values
print(df_cleaned.shape)
df_cleaned.dropna(inplace=True)
print(df_cleaned.shape)

(26216, 38)
(26216, 38)


In [34]:
df_cleaned[df_cleaned.columns[4:]].columns

Index(['offer', 'aid_related', 'medical_help', 'medical_products',
       'search_and_rescue', 'security', 'military', 'child_alone', 'water',
       'food', 'shelter', 'clothing', 'money', 'missing_people', 'refugees',
       'death', 'other_aid', 'infrastructure_related', 'transport',
       'buildings', 'electricity', 'tools', 'hospitals', 'shops',
       'aid_centers', 'other_infrastructure', 'weather_related', 'floods',
       'storm', 'fire', 'earthquake', 'cold', 'other_weather',
       'direct_report'],
      dtype='object')

In [35]:
train_size = int(0.8 * len(df_cleaned))
test_size = int(0.1 * len(df_cleaned))
val_size = len(df_cleaned) - train_size - test_size
random_seed = 43
df_randomized = df_cleaned.sample(frac=1, random_state=random_seed)
validate, test, train = np.split(df_cleaned, [test_size, test_size + val_size])

In [36]:
print(f'df: {df.shape}')
print(f'test: {test.shape}')
print(f'train: {train.shape}')
print(f'validate: {validate.shape}')

df: (26216, 40)
test: (2623, 38)
train: (20972, 38)
validate: (2621, 38)


## Train, Test and Validation split into Dataset

In [37]:
def train_test_val_split(df: DataFrame, train_perc = 0.8, test_perc = 0.1, random_seed = 42, max_size=None, drop_cats=None):
    ''' returns a randomized split of df into train, test and validation DataSet(s)

    INPUT:
        df            - DataFrame
        train_perc    - percentage of data used for training
        test_perd     - percentage of data used for testing
        random_seed   - random seed
        max_size      - if set, only max_size entries from df will be used
        
    
    OUTPUT:
        data_set_dict - DatasetDict with train, test and validation data
    ''' 
    
    if drop_cats:
        df = df.drop(df.columns[-drop_cats:],axis=1)
        
    df_randomized = df.sample(frac=1, random_state=random_seed)
    
    if max_size:
        df_randomized= df_randomized[:max_size]
    
    train_size = int(train_perc * len(df_randomized))
    test_size = int(test_perc * len(df_randomized))
    val_size = len(df_randomized) - train_size - test_size
    
    df_validate, df_test, df_train = np.split(df_randomized, [test_size, test_size + val_size])

    train_dataset = Dataset.from_dict(df_train)
    test_dataset  = Dataset.from_dict(df_test)
    val_dataset   = Dataset.from_dict(df_validate)
    my_dataset_dict = datasets.DatasetDict({"train":train_dataset,"test":test_dataset, "validation": val_dataset})

    return my_dataset_dict



In [38]:
df_cleaned.shape

(26216, 38)

In [39]:
# Create Dataset with trainint/test and validation data

# dataset = train_test_val_split(df_cleaned, max_size=1000, drop_cats=30)
dataset = train_test_val_split(df_cleaned)
dataset



DatasetDict({
    train: Dataset({
        features: ['id', 'message', 'related', 'request', 'offer', 'aid_related', 'medical_help', 'medical_products', 'search_and_rescue', 'security', 'military', 'child_alone', 'water', 'food', 'shelter', 'clothing', 'money', 'missing_people', 'refugees', 'death', 'other_aid', 'infrastructure_related', 'transport', 'buildings', 'electricity', 'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure', 'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold', 'other_weather', 'direct_report'],
        num_rows: 20972
    })
    test: Dataset({
        features: ['id', 'message', 'related', 'request', 'offer', 'aid_related', 'medical_help', 'medical_products', 'search_and_rescue', 'security', 'military', 'child_alone', 'water', 'food', 'shelter', 'clothing', 'money', 'missing_people', 'refugees', 'death', 'other_aid', 'infrastructure_related', 'transport', 'buildings', 'electricity', 'tools', 'hospitals', 'shops', 'aid_centers', 'ot

In [40]:
example = dataset['train'][0]
example

{'id': 27657,
 'message': '**Provisional conclusions and recommendations** At the end of his visit the Special Rapporteur said: "Burkina Faso has so far escaped the threat of terrorist attack, the spread of armed conflict across its borders, and the religious intolerance, radicalization and violent extremism among its population.',
 'related': True,
 'request': False,
 'offer': False,
 'aid_related': True,
 'medical_help': False,
 'medical_products': False,
 'search_and_rescue': False,
 'security': False,
 'military': True,
 'child_alone': False,
 'water': False,
 'food': False,
 'shelter': False,
 'clothing': False,
 'money': False,
 'missing_people': False,
 'refugees': False,
 'death': False,
 'other_aid': False,
 'infrastructure_related': False,
 'transport': False,
 'buildings': False,
 'electricity': False,
 'tools': False,
 'hospitals': False,
 'shops': False,
 'aid_centers': False,
 'other_infrastructure': False,
 'weather_related': False,
 'floods': False,
 'storm': False,
 'f

In [41]:
labels = [label for label in dataset['train'].features.keys() if label not in ['id', 'message']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
labels

['related',
 'request',
 'offer',
 'aid_related',
 'medical_help',
 'medical_products',
 'search_and_rescue',
 'security',
 'military',
 'child_alone',
 'water',
 'food',
 'shelter',
 'clothing',
 'money',
 'missing_people',
 'refugees',
 'death',
 'other_aid',
 'infrastructure_related',
 'transport',
 'buildings',
 'electricity',
 'tools',
 'hospitals',
 'shops',
 'aid_centers',
 'other_infrastructure',
 'weather_related',
 'floods',
 'storm',
 'fire',
 'earthquake',
 'cold',
 'other_weather',
 'direct_report']

## Encode the DataSet for training
Models like BERT don't allow for text as direct input. We have to tokenize the messages.
Also the labels have to be of certain format for multi-label text classification.

In [16]:
# pretrained models
BERT_SINGLE = "bert-base-uncased"
BERT_MULTI  = "bert-base-multilingual-cased"
MODEL_PRET  = BERT_MULTI

In [17]:
from transformers import AutoTokenizer
import numpy as np

# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PRET)

def preprocess_data(examples):
  ''' - tokenize text using AutoTokenizer API. 
      - provide labels (). For multi-label text classification, 
              this is a matrix of shape (batch_size, num_labels)
              will be a tensor of floats rather integers 
              (see: https://discuss.pytorch.org/t/multi-label-binary-classification-result-type-float-cant-be-cast-to-the-desired-output-type-long/117915)
    
    INPUT:
        examples      - DataSet
    
    OUTPUT:
        encoding - encoding of text (from column 'message') and labels
    ''' 

  # print(f'examples: {examples}')
  # take a batch of texts
  text = examples['message']
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=256)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()
  
  return encoding

100%|██████████| 21/21 [00:01<00:00, 11.80ba/s]
100%|██████████| 3/3 [00:00<00:00, 16.67ba/s]
100%|██████████| 3/3 [00:00<00:00, 16.05ba/s]


In [18]:
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 20972
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2623
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2621
    })
})

In [19]:
example = encoded_dataset['train'][0]
print(example.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])


In [20]:
encoded_dataset['train'][0]['labels'][0]

1.0

In [21]:
print(example.values())

dict_values([[101, 1008, 1008, 10864, 15306, 1998, 11433, 1008, 1008, 2012, 1996, 2203, 1997, 2010, 3942, 1996, 2569, 9680, 6442, 11236, 2056, 1024, 1000, 23089, 22773, 2038, 2061, 2521, 6376, 1996, 5081, 1997, 9452, 2886, 1010, 1996, 3659, 1997, 4273, 4736, 2408, 2049, 6645, 1010, 1998, 1996, 3412, 2046, 3917, 6651, 1010, 7490, 3989, 1998, 6355, 4654, 7913, 26725, 2426, 2049, 2313, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1,

In [22]:
[id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0]

['related', 'aid_related', 'military']

In [23]:
encoded_dataset.set_format("torch")

## Build the model
We construct a model which is based on pretrained bert-base-uncased model (see https://huggingface.co/bert-base-uncased).

In [24]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(MODEL_PRET, 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [25]:
batch_size = 8
metric_name = "f1"

## Train the model
All training hyperparameters can be found here: https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments

In [26]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"bert-finetuned-sem_eval-english",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

In [27]:

def multi_label_metrics(predictions, labels, threshold=0.5):
    ''' - compute metrics while training 
        - code taken from: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
        - for details refer to this above url
     
    
    OUTPUT:
        metrics     - dictionary with the desired metric values
    ''' 
    # print(f'predictions: {predictions}')
    # print(f'labels: {labels}')
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    # print(f'y_true: {y_true}')
    # print(f'y_pred: {y_pred}')
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred, normalize=True)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    ''' - compute multi-label metrics while training 
        - code taken from: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
        - for details refer to this above url
     
    
    OUTPUT:
        result   - dictionary with the desired metric values
    ''' 
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [28]:
encoded_dataset['train'][0]['labels'].type()

'torch.FloatTensor'

In [29]:
encoded_dataset['train']['input_ids'][0]

tensor([  101,  1008,  1008, 10864, 15306,  1998, 11433,  1008,  1008,  2012,
         1996,  2203,  1997,  2010,  3942,  1996,  2569,  9680,  6442, 11236,
         2056,  1024,  1000, 23089, 22773,  2038,  2061,  2521,  6376,  1996,
         5081,  1997,  9452,  2886,  1010,  1996,  3659,  1997,  4273,  4736,
         2408,  2049,  6645,  1010,  1998,  1996,  3412,  2046,  3917,  6651,
         1010,  7490,  3989,  1998,  6355,  4654,  7913, 26725,  2426,  2049,
         2313,  1012,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [30]:
encoded_dataset['train'][0]['labels']

tensor([1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [31]:
outputs = model(input_ids=encoded_dataset['train']['input_ids'][0].unsqueeze(0), labels=encoded_dataset['train'][0]['labels'].unsqueeze(0))
outputs

SequenceClassifierOutput(loss=tensor(0.7102, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[ 0.2376, -0.0304,  0.0270, -0.3140, -0.3453,  0.0355, -0.4119, -0.1333,
          0.0994,  0.5138,  0.1859, -0.0223, -0.3160, -0.1217,  0.1646, -0.4515,
          0.1971, -0.0786, -0.0135,  0.1819,  0.6735, -0.2587,  0.0996,  0.5891,
         -0.1681, -0.1149,  0.6752,  0.9858, -0.2200, -0.5146, -0.1499, -0.4692,
          0.4473, -0.6613, -0.6028,  0.2837]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [32]:
## Train

In [33]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [34]:
trainer.train()

***** Running training *****
  Num examples = 20972
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 13110
  Number of trainable parameters = 109509924
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.1442,0.140213,0.686292,0.789811,0.309424
2,0.1231,0.133238,0.704739,0.799871,0.361312
3,0.107,0.133185,0.725353,0.826662,0.354063
4,0.0936,0.137308,0.72122,0.824603,0.362839
5,0.0848,0.138986,0.720162,0.823474,0.363602


***** Running Evaluation *****
  Num examples = 2621
  Batch size = 8
Saving model checkpoint to bert-finetuned-sem_eval-english/checkpoint-2622
Configuration saved in bert-finetuned-sem_eval-english/checkpoint-2622/config.json
Model weights saved in bert-finetuned-sem_eval-english/checkpoint-2622/pytorch_model.bin
tokenizer config file saved in bert-finetuned-sem_eval-english/checkpoint-2622/tokenizer_config.json
Special tokens file saved in bert-finetuned-sem_eval-english/checkpoint-2622/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2621
  Batch size = 8
Saving model checkpoint to bert-finetuned-sem_eval-english/checkpoint-5244
Configuration saved in bert-finetuned-sem_eval-english/checkpoint-5244/config.json
Model weights saved in bert-finetuned-sem_eval-english/checkpoint-5244/pytorch_model.bin
tokenizer config file saved in bert-finetuned-sem_eval-english/checkpoint-5244/tokenizer_config.json
Special tokens file saved in bert-finetuned-sem_eval-english/ch

TrainOutput(global_step=13110, training_loss=0.11783118629892031, metrics={'train_runtime': 49292.3256, 'train_samples_per_second': 2.127, 'train_steps_per_second': 0.266, 'total_flos': 6899561918484480.0, 'train_loss': 0.11783118629892031, 'epoch': 5.0})

## Evaluate Model


In [40]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 2621
  Batch size = 8


{'eval_loss': 0.1331845372915268,
 'eval_f1': 0.7253534892812927,
 'eval_roc_auc': 0.8266616602450627,
 'eval_accuracy': 0.3540633346051126,
 'eval_runtime': 242.8744,
 'eval_samples_per_second': 10.792,
 'eval_steps_per_second': 1.35,
 'epoch': 5.0}

### Test Model on Single Data

In [45]:
# text = "There is heavy rain and earth is shaking - need help!"
text = "My family got lost and I have no food or even water"

encoding = tokenizer(text, return_tensors="pt")
encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

outputs = trainer.model(**encoding)

In [46]:
# logits is a tensor containing the scores for each label
logits = outputs.logits
logits.shape

torch.Size([1, 36])

In [47]:
# apply sigmoid + threshold
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(logits.squeeze().cpu())
predictions = np.zeros(probs.shape)
predictions[np.where(probs >= 0.5)] = 1
# turn predicted id's into actual label names
predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
print(predicted_labels)

['related', 'request', 'aid_related', 'water', 'food', 'direct_report']


## Export model for reuse
save the model, so we can reload it using from_pretrained()

In [44]:
trainer.save_model('transormer_model')

Saving model checkpoint to transormer_model
Configuration saved in transormer_model/config.json
Model weights saved in transormer_model/pytorch_model.bin
tokenizer config file saved in transormer_model/tokenizer_config.json
Special tokens file saved in transormer_model/special_tokens_map.json


In [7]:
# Check loading/using the model
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import torch
import numpy as np
model = AutoModelForSequenceClassification.from_pretrained("./transormer_multi_model/")
tokenizer = AutoTokenizer.from_pretrained("./transormer_multi_model/")


In [2]:
def eval_message(model, tokenizer, msg):
    ''' Loads transformer based tokenizer and model. Evaluates input on this.
        INPUT:
            model     - trained transformer model
            tokenizer - tokenizer for encoding the input message
            msg       - single text message to be evaluated

        
        OUTPUT:
            list of predicted categories
    '''
    text = msg

    encoding = tokenizer(text, return_tensors="pt")
    encoding = {k: v.to(model.device) for k,v in encoding.items()}

    outputs = model(**encoding)
    # logits is a tensor containing the scores for each label
    logits = outputs.logits
    logits.shape
    # apply sigmoid + threshold
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(logits.squeeze().cpu())
    predictions = np.zeros(probs.shape)
    predictions[np.where(probs >= 0.5)] = 1
    # turn predicted id's into actual label names
    predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
    return predicted_labels

In [3]:
id2label = {0: 'related',
 1: 'request',
 2: 'offer',
 3: 'aid_related',
 4: 'medical_help',
 5: 'medical_products',
 6: 'search_and_rescue',
 7: 'security',
 8: 'military',
 9: 'child_alone',
 10: 'water',
 11: 'food',
 12: 'shelter',
 13: 'clothing',
 14: 'money',
 15: 'missing_people',
 16: 'refugees',
 17: 'death',
 18: 'other_aid',
 19: 'infrastructure_related',
 20: 'transport',
 21: 'buildings',
 22: 'electricity',
 23: 'tools',
 24: 'hospitals',
 25: 'shops',
 26: 'aid_centers',
 27: 'other_infrastructure',
 28: 'weather_related',
 29: 'floods',
 30: 'storm',
 31: 'fire',
 32: 'earthquake',
 33: 'cold',
 34: 'other_weather',
 35: 'direct_report'}

In [8]:
eval_message(model, tokenizer, 'Hier ist eine Flut und die Wassermassen reißen Häuser ein')

['related', 'weather_related']

: 