In [1]:
! pip install transformers

Collecting transformers
  Downloading transformers-4.9.2-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 4.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 42.1 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 35.5 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 44.5 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully u

In [2]:

import transformers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

import torch
import torch.nn as nn
from torch import optim
from tqdm import tqdm


from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics

import random
import collections
import pandas as pd
import numpy as np
import re

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
PAD_TOKEN, PAD_INDEX = '[PAD]', 0
UNK_TOKEN, UNK_INDEX = '[UNK]', 1
MASK_TOKEN, MASK_INDEX = '[MASK]', 2
CLS_TOKEN, CLS_INDEX = '[CLS]', 3
SEP_TOKEN, SEP_INDEX = '[SEP]', 4

In [32]:
#Squad1.1 (Question answering dataset)
# train_sqd = pd.read_json('/content/drive/MyDrive/squad1.1/train-v1.1.json', lines = True)
# test_sqd = pd.read_json('/content/drive/MyDrive/squad1.1/dev-v1.1.json', lines = True)

                                                data  version
0  [{'title': 'Super_Bowl_50', 'paragraphs': [{'c...      1.1


Preprocessing

In [None]:
def tokenize(text: str, lower: bool, **_):  # token_min_len: int, token_max_len: int,
  if lower:
      text = text.lower()
  return text.split()

In [None]:
from tqdm import tqdm

from random import random, randint

class IndexedCorpus:
    def __init__(self, data_path, dictionary, dataset_limit=None):
        self.indexed_documents = []
        with open(data_path) as file:
            for document in tqdm(file):
                indexed_document = []
                for sentence in document.split('|'):
                    indexed_sentence = []
                    for token in sentence.strip().split():
                        indexed_token = dictionary.token_to_index(token)
                        indexed_sentence.append(indexed_token)
                    if len(indexed_sentence) < 1:
                        continue
                    indexed_document.append(indexed_sentence)
                if len(indexed_document) < 2:
                    continue
                self.indexed_documents.append(indexed_document)

                if dataset_limit is not None and len(self.indexed_documents) >= dataset_limit:
                    break

    def __getitem__(self, item):
        return self.indexed_documents[item]

    def __len__(self):
        return len(self.indexed_documents)

In [None]:
class MaskedCorpus:
    def __init__(self, data_path, dictionary, dataset_limit=None):
        source_corpus = IndexedCorpus(data_path, dictionary, dataset_limit=dataset_limit)

        self.sentences_count = 0
        self.masked_documents = []
        for indexed_document in source_corpus:
            masked_document = MaskedDocument(indexed_document, vocabulary_size=len(dictionary))
            self.masked_documents.append(masked_document)

            self.sentences_count += len(masked_document)

    def __getitem__(self, item):
        return self.masked_documents[item]

    def __len__(self):
        return len(self.masked_documents)

In [None]:
def Masked_Document(sentences, vocabulary_size):
  THRESHOLD = 0.15

  sentence =sentences[item]

  masked_sentence = []
  target_sentence = []

  for token_index in sentence:
      r = random()
      if r < THRESHOLD:  # we mask 15% of all tokens in each sequence at random.
          if r < THRESHOLD * 0.8:  # 80% of the time: Replace the word with the [MASK] token
              masked_sentence.append(MASK_INDEX)
              target_sentence.append(token_index)
          elif r < THRESHOLD * 0.9:  # 10% of the time: Replace the word with a random word
              random_token_index = randint(5, vocabulary_size-1)
              masked_sentence.append(random_token_index)
              target_sentence.append(token_index)
          else:  # 10% of the time: Keep the word unchanged
              masked_sentence.append(token_index)
              target_sentence.append(token_index)
      else:
          masked_sentence.append(token_index)
          target_sentence.append(PAD_INDEX)

  return masked_sentence, target_sentence

In [None]:
### Predicting next sentence######
class PairedDataset:

  def __init__(self, data_path, dictionary, dataset_limit=None):
    self.source_corpus = MaskedCorpus(data_path, dictionary, dataset_limit=dataset_limit)
    self.dataset_size = self.source_corpus.sentences_count
    self.corpus_size = len(self.source_corpus)

  def __getitem__(self, item):

    document_index = randint(0, self.corpus_size-1)
    document = self.source_corpus[document_index]
    sentence_index = randint(0, len(document) - 2)
    A_masked_sentence, A_target_sentence = document[sentence_index]

    if random() < 0.5:  # 50% of the time B is the actual next sentence that follows A
        B_masked_sentence, B_target_sentence = document[sentence_index + 1]
        is_next = 1
    else:  # 50% of the time it is a random sentence from the corpus
        random_document_index = randint(0, self.corpus_size-1)
        random_document = self.source_corpus[random_document_index]
        random_sentence_index = randint(0, len(random_document)-1)
        B_masked_sentence, B_target_sentence = random_document[random_sentence_index]
        is_next = 0

    sequence = [CLS_INDEX] + A_masked_sentence + [SEP_INDEX] + B_masked_sentence + [SEP_INDEX]

    # segment : [0,0,0,0,0,1,1,1,1,1,1,1])
    segment = [0] + [0] * len(A_masked_sentence) + [0] + [1] * len(B_masked_sentence) + [1]

    target = [PAD_INDEX] + A_target_sentence + [PAD_INDEX] + B_target_sentence + [PAD_INDEX]

    return (sequence, segment), (target, is_next)

  def __len__(self):
    return self.dataset_size

In [6]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l[K     |▎                               | 10 kB 33.6 MB/s eta 0:00:01[K     |▌                               | 20 kB 8.9 MB/s eta 0:00:01[K     |▉                               | 30 kB 7.8 MB/s eta 0:00:01[K     |█                               | 40 kB 7.4 MB/s eta 0:00:01[K     |█▍                              | 51 kB 4.1 MB/s eta 0:00:01[K     |█▋                              | 61 kB 4.3 MB/s eta 0:00:01[K     |██                              | 71 kB 4.6 MB/s eta 0:00:01[K     |██▏                             | 81 kB 5.2 MB/s eta 0:00:01[K     |██▍                             | 92 kB 3.9 MB/s eta 0:00:01[K     |██▊                             | 102 kB 4.2 MB/s eta 0:00:01[K     |███                             | 112 kB 4.2 MB/s eta 0:00:01[K     |███▎                            | 122 kB 4.2 MB/s eta 0:00:01[K     |███▌         

Optimizers

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(),betas=(0.9, 0.999), weight_decay=0.01, lr=5e-5)

Dataloader

In [34]:
class DATALoader:
  def __init__(self, data, target, max_length, tokenizer):
    self.data = data
    self.target = target #make sure to convert the target into numerical values
    self.tokenizer = tokenizer
    self.max_length = max_length
      
  def __len__(self):
    return len(self.data)
  
  def __getitem__(self, item):
    data = str(self.data[item])
    data = " ".join(data.split())
    
    inputs = self.tokeniser.encode_plus(
        data, 
        None,
        add_special_tokens=True,
        max_length = self.max_length,
        pad_to_max_length=True
        
    )
      
    ids = inputs["input_ids"]
    mask = inputs['attention_mask']
    token_type_ids = inputs["token_type_ids"]


    padding_length = self.max_length - len(ids)
    ids = ids + ([0] * padding_length)
    mask = mask + ([0] * padding_length)
    token_type_ids = token_type_ids + ([0] * padding_length)

    return {
        'ids': torch.tensor(ids, dtype=torch.long),
        'mask': torch.tensor(mask, dtype=torch.long),
        'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
        'targets': torch.tensor(self.target[item], dtype=torch.long)
    }
    
    
def train_func(data_loader, model, optimizer, device):
  model.to(device)
  model.train()
  
  for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
    ids = d["ids"]
    token_type_ids = d["token_type_ids"]
    mask = d["mask"]
    targets = d["targets"]
    
    ids = ids.to(device, dtype=torch.long)
    token_type_ids = token_type_ids.to(device, dtype=torch.long)
    mask = mask.to(device, dtype=torch.long)
    targets = targets.to(device, dtype=torch.float)
    
    optimizer.zero_grad()
    output = model(
        ids=ids,
        mask = mask,
        token_type_ids = token_type_ids
    )
    
    
    loss = criterion
    loss.backward()
    
    optimizer.step()

        
        
def eval_func(data_loader, model, device):
  model.eval()
  
  fin_targets = []
  fin_output = []
  
  with torch.no_grad():
    for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
      ids = d["ids"]
      token_type_ids = d["token_type_ids"]
      mask = d["mask"]
      targets = d["targets"]

      ids = ids.to(device, dtype=torch.long)
      token_type_ids = token_type_ids.to(device, dtype=torch.long)
      mask = mask.to(device, dtype=torch.long)
      targets = targets.to(device, dtype=torch.long)


      output = model(
          ids=ids,
          masks = mask,
          token_type_ids = token_type_ids
      )
  
      fin_targets.extend(targets.cpu().detach().numpy().to_list())
      fin_targets.extend(torch.sigmoid(output).cpu().detach().numpy().to_list())
          
  return fin_output, fin_targets

In [None]:
BERT model implementation

Padding mask

In [5]:
def pad_masking(x):
  # x: (batch_size, seq_len)
  padded_positions = x == PAD_INDEX
  return padded_positions.unsqueeze(1)

Embedding

In [6]:
import torch
from torch import nn


class PositionalEmbedding(nn.Module):

  def __init__(self, max_len, hidden_size):
    """max_len: maximum lenght of sequence"""
    super(PositionalEmbedding, self).__init__()
    self.positional_embedding = nn.Embedding(max_len, hidden_size)
    positions = torch.arange(0, max_len)
    self.register_buffer('positions', positions)

  def forward(self, sequence):
    batch_size, seq_len = sequence.size()
    positions = self.positions[:seq_len].unsqueeze(0).repeat(batch_size, 1)
    return self.positional_embedding(positions)

class SegmentEmbedding(nn.Module):

  def __init__(self, hidden_size, n_segments=2):
    super(SegmentEmbedding, self).__init__()
    self.segment_embedding = nn.Embedding(n_segments, hidden_size)

  def forward(self, segments):
    """segments: (batch_size, seq_len)"""
    return self.segment_embedding(segments)  # (batch_size, seq_len, hidden_size)

Gelu activation function

In [7]:
import torch
from torch import nn

import math

class GELU(nn.Module):

  def forward(self, x):
    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))

Transformers

In [8]:
import torch
from torch import nn
import numpy as np


class TransformerEncoder(nn.Module):

  def __init__(self, layers_count, d_model, heads_count, d_ff, dropout_prob):
    super(TransformerEncoder, self).__init__()
    """layers_count:Number transformers blocks.
    heads_count: Number of attention heads
    d_model : size of the encoder layers
    d_ff: size of feed forward layer in the transformer encoder
    dropoutprob: dropout probability
    """
    self.d_model = d_model
    self.encoder_layers = nn.ModuleList(
        [TransformerEncoderLayer(d_model, heads_count, d_ff, dropout_prob) for _ in range(layers_count)]
    )

  def forward(self, x, mask):
    """Transformer bidirectional encoder
    args:
        sources: embedded_sequence, (batch_size, seq_len, embed_size)
    """
    for encoder_layer in self.encoder_layers:
        x = encoder_layer(x, mask)

    return x


class TransformerEncoderLayer(nn.Module):

  def __init__(self, d_model, heads_count, d_ff, dropout_prob):
    super(TransformerEncoderLayer, self).__init__()

    self.self_attention_layer = Sublayer(MultiHeadAttention(heads_count, d_model, dropout_prob), d_model)
    self.pointwise_feedforward_layer = Sublayer(PointwiseFeedForwardNetwork(d_ff, d_model, dropout_prob), d_model)
    self.dropout = nn.Dropout(dropout_prob)

  def forward(self, x, mask):
    # x: (batch_size, seq_len, d_model)

    x = self.self_attention_layer(x, x, x, mask)
    x = self.dropout(x)
    x = self.pointwise_feedforward_layer(x)

    return x


class Sublayer(nn.Module):####Residual connection

  def __init__(self, sublayer, d_model):
    super(Sublayer, self).__init__()

    self.sublayer = sublayer
    self.layer_normalization = LayerNormalization(d_model)

  def forward(self, *args):
    x = args[0]
    x = self.sublayer(*args) + x
    return self.layer_normalization(x)


class LayerNormalization(nn.Module):#Layer normalisation

  def __init__(self, features_count, epsilon=1e-6):
      super(LayerNormalization, self).__init__()

      self.w = nn.Parameter(torch.ones(features_count))
      self.bias = nn.Parameter(torch.zeros(features_count))
      self.epsilon = epsilon

  def forward(self, x):

    mean = x.mean(dim=-1, keepdim=True)
    std = x.std(dim=-1, keepdim=True)

    return self.w * (x - mean) / (std + self.epsilon) + self.bias


class MultiHeadAttention(nn.Module):

  def __init__(self, heads_count, d_model, dropout_prob, mode='self-attention'):
    super(MultiHeadAttention, self).__init__()

    assert d_model % heads_count == 0
    assert mode in ('self-attention', 'memory-attention')

    self.d_head = d_model // heads_count
    self.heads_count = heads_count
    self.mode = mode
    self.query_projection = nn.Linear(d_model, heads_count * self.d_head)
    self.key_projection = nn.Linear(d_model, heads_count * self.d_head)
    self.value_projection = nn.Linear(d_model, heads_count * self.d_head)
    self.final_projection = nn.Linear(d_model, heads_count * self.d_head)
    self.dropout = nn.Dropout(dropout_prob)
    self.softmax = nn.Softmax(dim=3)

    self.attention = None
    # For cache
    self.key_projected = None
    self.value_projected = None

  def forward(self, query, key, value, mask=None, layer_cache=None):
    """
    Args:
        query: (batch_size, query_len, model_dim)
        key: (batch_size, key_len, model_dim)
        value: (batch_size, value_len, model_dim)
        mask: (batch_size, query_len, key_len)
    """
    batch_size, query_len, d_model = query.size()

    d_head = d_model // self.heads_count

    query_projected = self.query_projection(query)

    if layer_cache is None or layer_cache[self.mode] is None:  
      key_projected = self.key_projection(key)
      value_projected = self.value_projection(value)
    else:  # Use cache
      if self.mode == 'self-attention':
        key_projected = self.key_projection(key)
        value_projected = self.value_projection(value)

        key_projected = torch.cat([key_projected, layer_cache[self.mode]['key_projected']], dim=1)
        value_projected = torch.cat([value_projected, layer_cache[self.mode]['value_projected']], dim=1)
      elif self.mode == 'memory-attention':
        key_projected = layer_cache[self.mode]['key_projected']
        value_projected = layer_cache[self.mode]['value_projected']

      self.key_projected = key_projected
      self.value_projected = value_projected

      batch_size, key_len, d_model = key_projected.size()
      batch_size, value_len, d_model = value_projected.size()

      query_heads = query_projected.view(batch_size, query_len, self.heads_count, d_head).transpose(1, 2)  # (batch_size, heads_count, query_len, d_head)
      key_heads = key_projected.view(batch_size, key_len, self.heads_count, d_head).transpose(1, 2)  # (batch_size, heads_count, key_len, d_head)
      value_heads = value_projected.view(batch_size, value_len, self.heads_count, d_head).transpose(1, 2)  # (batch_size, heads_count, value_len, d_head)

      attention_weights = self.scaled_dot_product(query_heads, key_heads)  # (batch_size, heads_count, query_len, key_len)

      if mask is not None:
        mask_expanded = mask.unsqueeze(1).expand_as(attention_weights)
        attention_weights = attention_weights.masked_fill(mask_expanded, -1e18)

      self.attention = self.softmax(attention_weights) 
      attention_dropped = self.dropout(self.attention)
      context_heads = torch.matmul(attention_dropped, value_heads)  # (batch_size, heads_count, query_len, d_head)
      context_sequence = context_heads.transpose(1, 2).contiguous()  # (batch_size, query_len, heads_count, d_head)
      context = context_sequence.view(batch_size, query_len, d_model)  # (batch_size, query_len, d_model)
      final_output = self.final_projection(context)

      return final_output

    def scaled_dot_product(self, query_heads, key_heads):
      """
      Args:
            query_heads: (batch_size, heads_count, query_len, d_head)
            key_heads: (batch_size, heads_count, key_len, d_head)
      """
      key_heads_transposed = key_heads.transpose(2, 3)
      dot_product = torch.matmul(query_heads, key_heads_transposed)  # (batch_size, heads_count, query_len, key_len)
      attention_weights = dot_product / np.sqrt(self.d_head)
      return attention_weights


class PointwiseFeedForwardNetwork(nn.Module):

  def __init__(self, d_ff, d_model, dropout_prob):
    super(PointwiseFeedForwardNetwork, self).__init__()

    self.feed_forward = nn.Sequential(
        nn.Linear(d_model, d_ff),
        nn.Dropout(dropout_prob),
        GELU(),
        nn.Linear(d_ff, d_model),
        nn.Dropout(dropout_prob),
    )

  def forward(self, x):
    """
    Args:
          x: (batch_size, seq_len, d_model)
    """
    return self.feed_forward(x)

Bert model

In [9]:
from torch import nn

class BERT(nn.Module):

  def __init__(self, encoder, token_embedding, positional_embedding, segment_embedding, hidden_size, vocabulary_size, n_classes):
    super(BERT, self).__init__()
    """
    vocab_size,
    hidden_size=768,
    hidden_size=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_length=512,
    type_vocab_size=16,
    initializer_range=0.02
    """
    self.encoder = encoder
    self.token_embedding = token_embedding
    self.positional_embedding = positional_embedding
    self.segment_embedding = segment_embedding
    self.token_prediction_layer = nn.Linear(hidden_size, vocabulary_size)
    self.classification_layer = nn.Linear(hidden_size, n_classes)

  def forward(self, inputs):
    sequence, segment = inputs
    token_embedded = self.token_embedding(sequence)
    positional_embedded = self.positional_embedding(sequence)
    segment_embedded = self.segment_embedding(segment)
    embedded_sources = token_embedded + positional_embedded + segment_embedded

    mask = pad_masking(sequence)
    encoded_sources = self.encoder(embedded_sources, mask)
    token_predictions = self.token_prediction_layer(encoded_sources)
    classification_embedding = encoded_sources[:, 0, :]
    classification_output = self.classification_layer(classification_embedding)
    return token_predictions, classification_output


def build_model(layers_count, hidden_size, heads_count, d_ff, dropout_prob, max_len, vocabulary_size):
  token_embedding = nn.Embedding(num_embeddings=vocabulary_size, embedding_dim=hidden_size)
  positional_embedding = PositionalEmbedding(max_len=max_len, hidden_size=hidden_size)
  segment_embedding = SegmentEmbedding(hidden_size=hidden_size)

  encoder = TransformerEncoder(
      layers_count=layers_count,
      d_model=hidden_size,
      heads_count=heads_count,
      d_ff=d_ff,
      dropout_prob=dropout_prob)

  bert = BERT(
      encoder=encoder,
      token_embedding=token_embedding,
      positional_embedding=positional_embedding,
      segment_embedding=segment_embedding,
      hidden_size=hidden_size,
      vocabulary_size=vocabulary_size,
      n_classes=2)

  return bert


class FineTuneModel(nn.Module):

  def __init__(self, pretrained_model, hidden_size, num_classes):
    super(FineTuneModel, self).__init__()

    self.pretrained_model = pretrained_model

    new_classification_layer = nn.Linear(hidden_size, num_classes)
    self.pretrained_model.classification_layer = new_classification_layer

  def forward(self, inputs):
    sequence, segment = inputs
    token_predictions, classification_outputs = self.pretrained_model((sequence, segment))
    return classification_outputs



Test the model

In [10]:
import torch

def test_encoder():
  model = build_model(hidden_size=512, layers_count=6, heads_count=8, d_ff=1024, dropout_prob=0.1, max_len=512,
                      vocabulary_size=100)

  example_sequence = torch.tensor([[1, 2, 3, 4, 5], [2, 1, 3, 0, 0]])
  example_segment = torch.tensor([[0, 0, 1, 1, 1], [0, 0, 0, 1, 1]])

  token_predictions, classification_output = model((example_sequence, example_segment))

  batch_size, seq_len, target_vocabulary_size = 2, 5, 100
  assert token_predictions.size() == (batch_size, seq_len, target_vocabulary_size)

In [None]:
from sklearn import preprocessing
def run():
  df = pd.read_json('/content/drive/MyDrive/News_Category_Dataset_v2.json', lines = True)
  data = pd.DataFrame({
      'text' : df['headline'] + df['short_description'],
      'label' : df['category']
  })


  encoder = preprocessing.LabelEncoder()
  data['label'] = encoder.fit_transform(data['label'])

  df_train, df_valid = train_test_split(data, test_size = 0.1, random_state=23, stratify=data.label.values)

  df_train = df_train.reset_index(drop=True)
  df_valid = df_valid.reset_index(drop=True)

  train_dataset = DATALoader(
      data=df_train.text.values,
      target=df_train.label.values,
      max_length=512
  )

  train_data_loader = torch.utils.data.DataLoader(
      train_dataset, 
      batch_size=8,
      num_workers=4,
  )

  val_dataset = DATALoader(
      data=df_valid.text.values,
      target=df_valid.label.values,
      max_length=512
  )

  val_data_loader = torch.utils.data.DataLoader(
      val_dataset, 
      batch_size=4,
      num_workers=1,
  )

  device = torch.device("cuda")
  model = FineTuneModel(BERT, hidden_size=512, num_classes=2)
  optimizer = optim.AdamW(model.parameters(),betas=(0.9, 0.999), weight_decay=0.01, lr=5e-5)

  best_accuracy = 0
  for epoch in range(5):
      train_func(data_loader=train_data_loader, model=model, optimizer=optimizer, device=device)
      outputs, targets = eval_func(data_loader=train_data_loader, model=model, device=device)
      outputs = np.array(outputs) >= 0.5
      accuracy = metrics.accuracy_score()
      print(f"Accuracy Score: {accuracy}")

      if accuracy>best_accuracy:
          torch.save(model.state_dict(), "model.bin")
          best_accuracy = accuracy

Fine tune pretrained BERT  model on different NLP tasks

Fine tuning on question answering tasks

In [56]:
from transformers import pipeline
question_answerer = pipeline("question-answering")
context = r"""
Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
a model on a SQuAD task, you may leverage the examples/pytorch/question-answering/run_squad.py script."""

Downloading:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/261M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [60]:
result = question_answerer(question="What is extractive question answering?", context=context)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")
result = question_answerer(question="What is a good example of a question answering dataset?", context=context)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

Answer: 'the task of extracting an answer from a text given a question', score: 0.6177, start: 34, end: 95
Answer: 'SQuAD dataset', score: 0.5152, start: 147, end: 160


In [58]:

from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
text = r"""
🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
TensorFlow 2.0 and PyTorch.
"""
questions = [
    "How many pretrained models are available in 🤗 Transformers?",
    "What does 🤗 Transformers provide?",
    "🤗 Transformers provides interoperability between which frameworks?",
]
for question in questions:
    inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="pt")
    input_ids = inputs["input_ids"].tolist()[0]

    outputs = model(**inputs)
    answer_start_scores = outputs.start_logits
    answer_end_scores = outputs.end_logits

    answer_start = torch.argmax(
        answer_start_scores
    )  # Get the most likely beginning of answer with the argmax of the score
    answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of answer with the argmax of the score

    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

    print(f"Question: {question}")
    print(f"Answer: {answer}")

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Question: How many pretrained models are available in 🤗 Transformers?
Answer: over 32 +
Question: What does 🤗 Transformers provide?
Answer: general - purpose architectures
Question: 🤗 Transformers provides interoperability between which frameworks?
Answer: tensorflow 2. 0 and pytorch


Language modelling

In [61]:
from transformers import pipeline
unmasker = pipeline("fill-mask")

Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/331M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [62]:
from pprint import pprint
pprint(unmasker(f"HuggingFace is creating a {unmasker.tokenizer.mask_token} that the community uses to solve NLP tasks."))

[{'score': 0.17927460372447968,
  'sequence': 'HuggingFace is creating a tool that the community uses to solve '
              'NLP tasks.',
  'token': 3944,
  'token_str': ' tool'},
 {'score': 0.1134939044713974,
  'sequence': 'HuggingFace is creating a framework that the community uses to '
              'solve NLP tasks.',
  'token': 7208,
  'token_str': ' framework'},
 {'score': 0.05243545398116112,
  'sequence': 'HuggingFace is creating a library that the community uses to '
              'solve NLP tasks.',
  'token': 5560,
  'token_str': ' library'},
 {'score': 0.03493543714284897,
  'sequence': 'HuggingFace is creating a database that the community uses to '
              'solve NLP tasks.',
  'token': 8503,
  'token_str': ' database'},
 {'score': 0.02860247902572155,
  'sequence': 'HuggingFace is creating a prototype that the community uses to '
              'solve NLP tasks.',
  'token': 17715,
  'token_str': ' prototype'}]


In [63]:

from transformers import AutoModelWithLMHead, AutoTokenizer
import torch
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
model = AutoModelWithLMHead.from_pretrained("distilbert-base-cased")
sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."
input = tokenizer.encode(sequence, return_tensors="pt")
mask_token_index = torch.where(input == tokenizer.mask_token_id)[1]
token_logits = model(input).logits
mask_token_logits = token_logits[0, mask_token_index, :]
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]



Downloading:   0%|          | 0.00/263M [00:00<?, ?B/s]

In [64]:
for token in top_5_tokens:
    print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))

Distilled models are smaller than the models they mimic. Using them instead of the large versions would help reduce our carbon footprint.
Distilled models are smaller than the models they mimic. Using them instead of the large versions would help increase our carbon footprint.
Distilled models are smaller than the models they mimic. Using them instead of the large versions would help decrease our carbon footprint.
Distilled models are smaller than the models they mimic. Using them instead of the large versions would help offset our carbon footprint.
Distilled models are smaller than the models they mimic. Using them instead of the large versions would help improve our carbon footprint.


Name entity Recognition

In [65]:
from transformers import pipeline
ner_pipe = pipeline("ner")
sequence = """Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO,
therefore very close to the Manhattan Bridge which is visible from the window."""

Downloading:   0%|          | 0.00/998 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

In [66]:
print(ner_pipe(sequence))

[{'entity': 'I-ORG', 'score': 0.9995786, 'index': 1, 'word': 'Hu', 'start': 0, 'end': 2}, {'entity': 'I-ORG', 'score': 0.9909764, 'index': 2, 'word': '##gging', 'start': 2, 'end': 7}, {'entity': 'I-ORG', 'score': 0.9982225, 'index': 3, 'word': 'Face', 'start': 8, 'end': 12}, {'entity': 'I-ORG', 'score': 0.99948806, 'index': 4, 'word': 'Inc', 'start': 13, 'end': 16}, {'entity': 'I-LOC', 'score': 0.9994345, 'index': 11, 'word': 'New', 'start': 40, 'end': 43}, {'entity': 'I-LOC', 'score': 0.9993196, 'index': 12, 'word': 'York', 'start': 44, 'end': 48}, {'entity': 'I-LOC', 'score': 0.9993794, 'index': 13, 'word': 'City', 'start': 49, 'end': 53}, {'entity': 'I-LOC', 'score': 0.98625827, 'index': 19, 'word': 'D', 'start': 79, 'end': 80}, {'entity': 'I-LOC', 'score': 0.9514269, 'index': 20, 'word': '##UM', 'start': 80, 'end': 82}, {'entity': 'I-LOC', 'score': 0.933659, 'index': 21, 'word': '##BO', 'start': 82, 'end': 84}, {'entity': 'I-LOC', 'score': 0.9761654, 'index': 28, 'word': 'Manhattan

In [67]:

from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch
model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
label_list = [
    "O",       # Outside of a named entity
    "B-MISC",  # Beginning of a miscellaneous entity right after another miscellaneous entity
    "I-MISC",  # Miscellaneous entity
    "B-PER",   # Beginning of a person's name right after another person's name
    "I-PER",   # Person's name
    "B-ORG",   # Beginning of an organisation right after another organisation
    "I-ORG",   # Organisation
    "B-LOC",   # Beginning of a location right after another location
    "I-LOC"    # Location
]
sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
           "close to the Manhattan Bridge."
# Bit of a hack to get the tokens with the special tokens
tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
inputs = tokenizer.encode(sequence, return_tensors="pt")
outputs = model(inputs).logits
predictions = torch.argmax(outputs, dim=2)

In [68]:
for token, prediction in zip(tokens, predictions[0].numpy()):
    print((token, model.config.id2label[prediction]))

('[CLS]', 'O')
('Hu', 'I-ORG')
('##gging', 'I-ORG')
('Face', 'I-ORG')
('Inc', 'I-ORG')
('.', 'O')
('is', 'O')
('a', 'O')
('company', 'O')
('based', 'O')
('in', 'O')
('New', 'I-LOC')
('York', 'I-LOC')
('City', 'I-LOC')
('.', 'O')
('Its', 'O')
('headquarters', 'O')
('are', 'O')
('in', 'O')
('D', 'I-LOC')
('##UM', 'I-LOC')
('##BO', 'I-LOC')
(',', 'O')
('therefore', 'O')
('very', 'O')
('##c', 'O')
('##lose', 'O')
('to', 'O')
('the', 'O')
('Manhattan', 'I-LOC')
('Bridge', 'I-LOC')
('.', 'O')
('[SEP]', 'O')
