# Model Hugging Face

Fine-tune a DistilBERT model (a distilled version of BERT) from Hugging Face for binary tweet classification.

# 1. Imports

Import the packages needed to build and train the model.

In [1]:
import os

import numpy as np
import pandas as pd

from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, ConfusionMatrixDisplay, confusion_matrix

from huggingface_hub import notebook_login

import matplotlib.pyplot as plt

In [2]:
# Package options: let pandas display up to 500 columns when rendering a DataFrame

pd.set_option('display.max_columns', 500)

## 1.2 Options

In [3]:
# Directory holding the input CSV files (train.csv and test.csv are read from here)
path_data = '../input/nlp-getting-started/'

## 1.3 Datasets

Load the dataset and split it into train and validation sets in order to monitor how well the model learns.
The dataset is split using the `stratify` option, as the dataset is highly unbalanced; this keeps the class proportions the same in both splits.

In [4]:
# Read the training data, using the first CSV column as the index
df = pd.read_csv(os.path.join(path_data, 'train.csv'), index_col=0)
# Detach the target column so that df only holds the features
target = df.pop('target')

# Stratified 85/15 train/validation split with a fixed seed
df_train, df_val, y_train, y_val = train_test_split(
    df,
    target,
    test_size=0.15,
    stratify=target,
    random_state=42,
)
# Attach each split's target values under the column name 'label'
df_train = df_train.assign(label=y_train)
df_val = df_val.assign(label=y_val)

The DataFrames are then converted into Hugging Face datasets to follow their framework.

In [5]:
# Wrap each pandas split in a Hugging Face Dataset
tds = Dataset.from_pandas(df_train)
vds = Dataset.from_pandas(df_val)

# Group both splits under named keys in a single DatasetDict
disaster_tweets = DatasetDict({'train': tds, 'validation': vds})

Load the test set directly into a Hugging Face dataset, as we won't need to split it.

In [6]:
# Load the test set directly into a Hugging Face dataset, exposed as a single 'test' split
disaster_tweets_test = load_dataset(
    'csv',
    data_files={'test': os.path.join(path_data, 'test.csv')},
)

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-329d94136d348b63/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-329d94136d348b63/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
# Show the splits, their features and row counts
disaster_tweets

DatasetDict({
    train: Dataset({
        features: ['keyword', 'location', 'text', 'label', 'id'],
        num_rows: 6471
    })
    validation: Dataset({
        features: ['keyword', 'location', 'text', 'label', 'id'],
        num_rows: 1142
    })
})

In [8]:
# Peek at the first five training examples
disaster_tweets['train'][:5]

{'keyword': ['explode',
  'emergency',
  'emergency%20services',
  'body%20bag',
  'flattened'],
 'location': ['Yamaku Academy, Class 3-4',
  'Phoenix',
  'Auckland',
  None,
  'somewhere too cold for me'],
 'text': ['KS except every character is Shizune.\nThe world would explode.',
  'God forbid anyone in my family knows how to answer a phone. I need new emergency contacts.',
  'Emergency services unsure how to cope with loss of paging network http://t.co/UXqKIeqDyf',
  "Nuu that FAM?? fwt I'm Leave You In a Body bag??",
  "@GrabakaHitman @Izi_Garcia when he flattened machida...did he lose that fight..nope he lost fights to guys he shouldn't of lost to also"],
 'label': [0, 0, 0, 0, 1],
 'id': [4875, 4511, 4611, 1421, 5528]}

In [9]:
# Show the test split, its features and row count
disaster_tweets_test

DatasetDict({
    test: Dataset({
        features: ['id', 'keyword', 'location', 'text'],
        num_rows: 3263
    })
})

# 2. Model

In [10]:
def compute_metrics(pred) -> dict:
    """Score predictions with accuracy and weighted F1.

    Args:
        pred: Object exposing ``label_ids`` (the reference labels) and
            ``predictions`` (per-class scores; the predicted class is the
            argmax over the last axis). Presumably the evaluation output
            that the Trainer passes to its ``compute_metrics`` callback;
            confirm against the transformers documentation.

    Returns:
        A dict with the keys ``'accuracy'`` and ``'f1'``.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(y_true, y_hat),
        'f1': f1_score(y_true, y_hat, average='weighted'),
    }

In [11]:
def plot_confusion_matrix(y_preds, y_true, labels):
    """Plot a row-normalized confusion matrix of predictions against ground truth.

    Args:
        y_preds: Predicted class labels.
        y_true: Ground-truth class labels.
        labels: Class labels; used both to select/order the rows and columns
            of the matrix and as the tick labels of the plot.
    """
    # scikit-learn's signature is confusion_matrix(y_true, y_pred): the ground
    # truth comes first. Passing the predictions first transposes the matrix
    # relative to the "True label" / "Predicted label" axes drawn by
    # ConfusionMatrixDisplay.
    # normalize='true' scales each row (true class) to sum to 1, which is what
    # the title and the '.2f' value format below advertise.
    cm = confusion_matrix(y_true, y_preds, labels=labels, normalize='true')
    _, ax = plt.subplots(figsize=(6, 6))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot(cmap='Blues', values_format='.2f', ax=ax, colorbar=False)
    plt.title('Normalized confusion matrix')
    plt.show()

## Tokenizer

First, we load a tokenizer to split the tweets into tokens, which is required before the text can be fed to the model. The DistilBERT base uncased tokenizer (`distilbert-base-uncased`) is used.

In [12]:
# Checkpoint name, used for the tokenizer here and for the model later on
model_ckpt = 'distilbert-base-uncased'
# Load the tokenizer that belongs to this checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [13]:
def tokenize(batch):
    """Tokenize the ``'text'`` field of a batch with the notebook's tokenizer.

    Both padding and truncation are enabled.
    """
    texts = batch['text']
    return tokenizer(texts, padding=True, truncation=True)

# batched=True with batch_size=None hands each split to tokenize() as a single batch
disaster_tweets_encoded = disaster_tweets.map(
    tokenize,
    batched=True,
    batch_size=None,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

## Loading a pretrained model

In [14]:
# Two output classes
num_labels = 2
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the checkpoint with a sequence-classification head, then move it to the device
model = (
    AutoModelForSequenceClassification
    .from_pretrained(model_ckpt, num_labels=num_labels)
    .to(device)
)

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classi

In [15]:
batch_size = 64
# Number of full batches in the training split, so the loss is logged about once per epoch
logging_steps = len(disaster_tweets_encoded['train']) // batch_size
model_name = f"{model_ckpt}-finetuned-disaster"

training_args = TrainingArguments(
    # Where checkpoints are written
    output_dir=model_name,
    # Optimisation
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate on the validation split at the end of every epoch
    evaluation_strategy='epoch',
    # Logging and reporting
    logging_steps=logging_steps,
    log_level='error',
    disable_tqdm=False,
    report_to='none',
)

In [16]:
# Wire the model, data splits, tokenizer and metric function into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=disaster_tweets_encoded['train'],
    eval_dataset=disaster_tweets_encoded['validation'],
    compute_metrics=compute_metrics,
)

# The trailing semicolon keeps the notebook from echoing train()'s return value
trainer.train();



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4774,0.408195,0.822242,0.822954
2,0.3681,0.380726,0.847636,0.846553


In [17]:
# Tokenize the test split with the same function used for train/validation
disaster_tweets_test_encoded = disaster_tweets_test.map(
    tokenize,
    batched=True,
    batch_size=None,
)

# Run inference on the test split; .predictions holds the per-class scores
proba_prediction_test = trainer.predict(disaster_tweets_test_encoded['test'])

# Submission frame: tweet id plus the predicted class (argmax over the last axis)
prediction_test = pd.DataFrame({
    'id': disaster_tweets_test['test']['id'],
    'target': proba_prediction_test.predictions.argmax(axis=-1),
})
prediction_test.to_csv('submission.csv', index=False)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [18]:
# Display the submission frame that was written to submission.csv
prediction_test

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,0
3259,10865,1
3260,10868,1
3261,10874,1
