# Preparation

## Pip Install

In [None]:
!pip install transformers



In [None]:
!pip install tokenizers



In [None]:
!pip install Sentencepiece



## Basic Modules

In [None]:
import numpy as np
from transformers import pipeline

## Load Data


In [None]:
!wget https://raw.githubusercontent.com/dscape/spell/master/test/resources/big.txt

--2021-05-05 03:18:28--  https://raw.githubusercontent.com/dscape/spell/master/test/resources/big.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6488666 (6.2M) [text/plain]
Saving to: ‘big.txt.2’


2021-05-05 03:18:29 (14.4 MB/s) - ‘big.txt.2’ saved [6488666/6488666]



In [None]:
# Short English sentence used throughout the tokenization examples
sample_message = "Simple input to be tokenized and processed"

# Same sentence with a few extra words, used to demonstrate padding
longer_sample_message = sample_message + " so that it can be analysed"

# German sentence for the community-provided German BERT example
sample_msg_ger = "Hugging Face ist eine französische Firma mit Sitz in New-York."

# Short self-introduction used by the pipeline examples (NER, QA, translation, ...)
sample_self_intro = "My Name is Alvin. I work at Naluri in Malaysia and I love sashimi"

In [None]:
# Longer multi-sentence sample text (opening paragraph of a Sherlock Holmes story).
# Extracted from http://www.gutenberg.org/files/1661/1661-h/1661-h.htm
# NOTE(review): this variable is not referenced by any of the visible cells.
sample_paragraph = 'To Sherlock Holmes she is always the woman. I have seldom heard him mention her under any other name. In his eyes she eclipses and predominates the whole of her sex. It was not that he felt any emotion akin to love for Irene Adler. All emotions, and that one particularly, were abhorrent to his cold, precise but admirably balanced mind. He was, I take it, the most perfect reasoning and observing machine that the world has seen, but as a lover he would have placed himself in a false position. He never spoke of the softer passions, save with a gibe and a sneer. They were admirable things for the observer—excellent for drawing the veil from men’s motives and actions. But for the trained reasoner to admit such intrusions into his own delicate and finely adjusted temperament was to introduce a distracting factor which might throw a doubt upon all his mental results. Grit in a sensitive instrument, or a crack in one of his own high-power lenses, would not be more disturbing than a strong emotion in a nature such as his. And yet there was but one woman to him, and that woman was the late Irene Adler, of dubious and questionable memory.'

# Tokenization

In [None]:
from tokenizers import Tokenizer

from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.models import BPE                                   # Byte-Pair Encoding
from tokenizers.normalizers import Lowercase, NFKC, Sequence        # lower-casing and unicode-normalization; Sequence combines multiple Normalizer and execute in order
from tokenizers.pre_tokenizers import ByteLevel

from tokenizers.trainers import BpeTrainer                          # to train with byte-pair encoder

In [None]:
# Assemble the tokenizer piece by piece: model -> normalizer -> pre-tokenizer -> decoder

# Start from an empty (untrained) Byte-Pair Encoding model
tokenizer = Tokenizer(BPE())

# Normalizers run in the listed order: unicode normalization (NFKC) first, then lower-casing
normalization_steps = [NFKC(), Lowercase()]
tokenizer.normalizer = Sequence(normalization_steps)

# The pre-tokenizer converts inputs to a byte-level representation ...
tokenizer.pre_tokenizer = ByteLevel()
# ... and the matching decoder turns byte-level tokens back into readable text
tokenizer.decoder = ByteLevelDecoder()

In [None]:
# Configure BPE training: target vocabulary size, progress bar, and the byte-level
# alphabet as the initial set of symbols
trainer = BpeTrainer(
    vocab_size=30000,
    show_progress=True,
    initial_alphabet=ByteLevel.alphabet(),
)

# Train on the downloaded corpus
corpus_files = ["big.txt"]
tokenizer.train(files=corpus_files, trainer=trainer)

# Display the resulting vocabulary size (30000 in the recorded run)
tokenizer.get_vocab_size()

30000

In [None]:
# tokenizer.model.save('.')                             # Export tokenization model - 2 files
# tokenizer.model = BPE('vocab.json', 'merges.txt')     # Import saved tokenization model locally

In [None]:
# Try the freshly trained tokenizer on a sample sentence
encoding = tokenizer.encode(sample_message)

# Show the input followed by the produced tokens, one per line
print(sample_message, encoding.tokens, sep="\n")
# Note that 'in-put', 'token-ized' and 'process-ed' are each split into simpler tokens

Simple input to be tokenized and processed
['Ġsimple', 'Ġin', 'put', 'Ġto', 'Ġbe', 'Ġtoken', 'ized', 'Ġand', 'Ġprocess', 'ed']


In [None]:
# Map the token ids back to text. The result is lower-cased because the normalizer
# includes Lowercase(); in the recorded run it also starts with a space
# (presumably added by the byte-level pre-tokenizer — TODO confirm).
tokenizer.decode(encoding.ids)

' simple input to be tokenized and processed'

# Transformers

In [None]:
# RNNs (neural networks that process a sequence step by step) were the common choice before Transformers.
# They suit tasks with sequential dependencies, but struggle with long-range dependencies and parallelize poorly.

# A Transformer relies on attention instead of sequential processing,
# hence it can look at every position in a sequence at the same time.
# More details : https://nlp.seas.harvard.edu/2018/04/03/attention.html#encoder-and-decoder-stacks

# BERT - Bidirectional Encoder Representations from Transformers
# More details : https://arxiv.org/abs/1810.04805

In [None]:
import torch
from transformers import AutoModel, AutoTokenizer   # To Import pre-trained model from HuggingFace
from transformers import BertTokenizer
from transformers import TFBertModel, BertModel     # BertModel for TensorFlow and Pytorch (default)
from transformers import DistilBertModel            # distilled from large-scale language model, much faster & lighter with 97% of BERT's performance; More info: https://medium.com/huggingface/distilbert-8cf3380435b5

# Disable gradient tracking globally: this notebook only runs inference, so no
# computational graph needs to be recorded, which keeps memory usage down.
# The trailing ';' suppresses the cell output.
torch.set_grad_enabled(False);

In [None]:
# Identifier of the pre-trained checkpoint loaded by AutoModel / AutoTokenizer below
MODEL_NAME = "bert-base-cased"

In [None]:
# Load the pre-trained model and its matching tokenizer for MODEL_NAME
# (the progress bars in the recorded output show files being downloaded).
# Note: this rebinds `tokenizer`, so the BPE tokenizer trained in the Tokenization
# section is no longer reachable under that name.
model = AutoModel.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




In [None]:
# Encode a pair of sentences/sequences (here: the same sentence twice) as PyTorch tensors
tokens_pt = tokenizer(
    sample_message,
    sample_message,
    return_tensors="pt",
)

# Show the raw input followed by the encoded result
print(sample_message, tokens_pt, sep="\n")
# token_type_ids: map each token to the sentence/sequence it belongs to (0 = first, 1 = second)
# attention_mask: "mask" padded values; 0 = padded = model will not attend

Simple input to be tokenized and processed
{'input_ids': tensor([[  101, 16896,  7758,  1106,  1129, 22559,  2200,  1105, 14659,   102,
         16896,  7758,  1106,  1129, 22559,  2200,  1105, 14659,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [None]:
# Run the encoded pair through the model
outputs = model(**tokens_pt)

# last_hidden_state: one representation per input token, shape (1, NB_TOKENS, REPRESENTATION_SIZE);
#                    best with Named Entity Recognition / Question-Answering.
# pooler_output:     one aggregated representation for the whole input, shape (1, REPRESENTATION_SIZE);
#                    best with Sentiment-Analysis / Information Retrieval (doesn't require token level info).
last_hidden_state, pooler_output = outputs.last_hidden_state, outputs.pooler_output

for representation in (last_hidden_state, pooler_output):
    print(representation.shape)

torch.Size([1, 19, 768])
torch.Size([1, 768])


In [None]:
# Encode the same pair again, this time without return_tensors: the result holds plain Python lists
tokens = tokenizer(sample_message, sample_message)
print(tokens)

# Translate the ids back to their string tokens (special tokens such as [CLS] / [SEP] included)
token_strings = tokenizer.convert_ids_to_tokens(tokens['input_ids'])
print(token_strings)

{'input_ids': [101, 16896, 7758, 1106, 1129, 22559, 2200, 1105, 14659, 102, 16896, 7758, 1106, 1129, 22559, 2200, 1105, 14659, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['[CLS]', 'Simple', 'input', 'to', 'be', 'token', '##ized', 'and', 'processed', '[SEP]', 'Simple', 'input', 'to', 'be', 'token', '##ized', 'and', 'processed', '[SEP]']


In [None]:
# Padding - shorter sequences are filled with [PAD] tokens so that every sequence in the batch has the same length
batch = [sample_message, longer_sample_message]
tokens = tokenizer(batch, padding=True)  # the first sentence gets [PAD] tokens to match the second one's length

for position, message in enumerate(batch):
    if position > 0:
        print()  # blank line between the two examples
    print(message)
    print(tokens[position])
    # tokens from index to string format
    print(tokenizer.convert_ids_to_tokens(tokens['input_ids'][position]))

Simple input to be tokenized and processed
Encoding(num_tokens=18, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
['[CLS]', 'Simple', 'input', 'to', 'be', 'token', '##ized', 'and', 'processed', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']

Simple input to be tokenized and processed so that it can be analysed
Encoding(num_tokens=18, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
['[CLS]', 'Simple', 'input', 'to', 'be', 'token', '##ized', 'and', 'processed', 'so', 'that', 'it', 'can', 'be', 'anal', '##ys', '##ed', '[SEP]']


In [None]:
# # TensorFlow
# model_tf = TFBertModel.from_pretrained('bert-base-cased')
# input_tf = tokenizer("This is a sample input", return_tensors="tf")

# # PyTorch
# model_pt = BertModel.from_pretrained('bert-base-cased')
# input_pt = tokenizer("This is a sample input", return_tensors="pt")


# # Compare outputs
# output_tf, output_pt = model_tf(input_tf), model_pt(**input_pt)
# for name in ["last_hidden_state", "pooler_output"]:
#     print("{} differences: {:.5}".format(name, (output_tf[name].numpy() - output_pt[name].numpy()).sum()))

In [None]:
# Distilled BERT Modeling: compare one forward pass of DistilBERT against BERT on the same input.
model_dbert = DistilBertModel.from_pretrained('distilbert-base-cased')
model_bert = BertModel.from_pretrained('bert-base-cased')
# NOTE(review): `tokenizer` is whatever an earlier cell bound to that name (the bert-base-cased
# tokenizer when the notebook is run top to bottom); assumes its vocabulary also fits DistilBERT — TODO confirm.
tokens = tokenizer(longer_sample_message, return_tensors="pt")

# Only the input ids are passed; each %time measures a single call.
%time _ = model_dbert(tokens['input_ids'])
%time _ = model_bert(tokens['input_ids'])

# distilled BERT almost halved the CPU time (58.5 ms vs 110 ms total in the recorded run)
# NOTE(review): a single %time run is noisy; consider %timeit before drawing conclusions.

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=411.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=263273408.0, style=ProgressStyle(descri…


CPU times: user 55.7 ms, sys: 2.74 ms, total: 58.5 ms
Wall time: 76.3 ms
CPU times: user 110 ms, sys: 0 ns, total: 110 ms
Wall time: 106 ms


In [None]:
# Example on German language with a "Community Provided Model"
german_checkpoint = "dbmdz/bert-base-german-cased"
model_bert_ger = BertModel.from_pretrained(german_checkpoint)
tokenizer_ger = BertTokenizer.from_pretrained(german_checkpoint)

# Encode the German sample sentence as PyTorch tensors
tokens_ger = tokenizer_ger(sample_msg_ger, return_tensors="pt")

print(sample_msg_ger)
print(tokens_ger)

# Convert the ids of the first (and only) sequence back to string tokens
first_sequence_ids = tokens_ger['input_ids'].tolist()[0]
print(tokenizer_ger.convert_ids_to_tokens(first_sequence_ids))

Hugging Face ist eine französische Firma mit Sitz in New-York.
{'input_ids': tensor([[  102, 12272,  9355,  5746, 30881,   215,   261,  5945,  4118,   212,
          2414,   153,  1942,   232,  3532,   566,   103]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
['[CLS]', 'Hug', '##ging', 'Fac', '##e', 'ist', 'eine', 'französische', 'Firma', 'mit', 'Sitz', 'in', 'New', '-', 'York', '.', '[SEP]']


# Pipelines

Source: [link](https://huggingface.co/transformers/main_classes/pipelines.html?highlight=pipeline#the-pipeline-abstraction)<br><br>
Downstream-tasks, including: 

- ***Sentiment Analysis***
- ***Named Entity Recognition***
- ***Question-Answering***
- ***Mask-Filling***
- ***Summarization***
- ***Translation***
- ***Feature Extraction***

Pipelines encapsulate tokenization, inference & decoding.

API structure:

```python
from transformers import pipeline

# Using default model and tokenizer for the task
pipeline("<task-name>")

# Using a user-specified model
pipeline("<task-name>", model="<model_name>")

# Using custom model/tokenizer as str
pipeline('<task-name>', model='<model name>', tokenizer='<tokenizer_name>')
```

## Sentiment Analysis

In [None]:
# Default sentiment-analysis pipeline; each call returns a list with a label and a score
model = pipeline('sentiment-analysis')

# One sentence with a negated negative verb, one plainly negative sentence
for sentence in (r"I don't hate speaking with him", r"His attitude is horrible"):
    print(model(sentence))

[{'label': 'POSITIVE', 'score': 0.9988293051719666}]
[{'label': 'NEGATIVE', 'score': 0.9996873736381531}]


## Named Entity Recognition

In [None]:
# Default named-entity-recognition pipeline.
# Entities are reported per word piece: in the recorded output 'Naluri' appears as
# three separate I-ORG entries ('Na', '##lu', '##ri').
model = pipeline('ner')
print(sample_self_intro)
model(sample_self_intro)

My Name is Alvin. I work at Naluri in Malaysia and I love sashimi


[{'end': 16,
  'entity': 'I-PER',
  'index': 4,
  'score': 0.9981746673583984,
  'start': 11,
  'word': 'Alvin'},
 {'end': 30,
  'entity': 'I-ORG',
  'index': 9,
  'score': 0.9940113425254822,
  'start': 28,
  'word': 'Na'},
 {'end': 32,
  'entity': 'I-ORG',
  'index': 10,
  'score': 0.9317202568054199,
  'start': 30,
  'word': '##lu'},
 {'end': 34,
  'entity': 'I-ORG',
  'index': 11,
  'score': 0.9865034818649292,
  'start': 32,
  'word': '##ri'},
 {'end': 46,
  'entity': 'I-LOC',
  'index': 13,
  'score': 0.9998395442962646,
  'start': 38,
  'word': 'Malaysia'},
 {'end': 63,
  'entity': 'I-MISC',
  'index': 18,
  'score': 0.7182899117469788,
  'start': 60,
  'word': '##shi'}]

## Question Answering

In [None]:
# Default question-answering pipeline: extracts the answer span from `context`.
# The result holds the answer text, its start/end character offsets and a score.
model = pipeline('question-answering')
print(sample_self_intro)
model(context=sample_self_intro, question='Where Alvin works ?')

My Name is Alvin. I work at Naluri in Malaysia and I love sashimi


{'answer': 'Naluri', 'end': 34, 'score': 0.527121365070343, 'start': 28}

## Mask Filling

In [None]:
# Default fill-mask pipeline: proposes candidates for the masked position
model = pipeline('fill-mask')

# Build the input with the mask token of the pipeline's own tokenizer
masked_sentence = f'Alvin is feeling {model.tokenizer.mask_token} this morning'
model(masked_sentence)

[{'score': 0.1295562982559204,
  'sequence': 'Alvin is feeling better this morning',
  'token': 357,
  'token_str': ' better'},
 {'score': 0.06868993490934372,
  'sequence': 'Alvin is feeling great this morning',
  'token': 372,
  'token_str': ' great'},
 {'score': 0.05671766772866249,
  'sequence': 'Alvin is feeling fine this morning',
  'token': 2051,
  'token_str': ' fine'},
 {'score': 0.053356777876615524,
  'sequence': 'Alvin is feeling good this morning',
  'token': 205,
  'token_str': ' good'},
 {'score': 0.03130243346095085,
  'sequence': 'Alvin is feeling OK this morning',
  'token': 4954,
  'token_str': ' OK'}]

## Summarization

In [None]:
# Text to be summarized (a short biography of the fictional detective Sherlock Holmes)
sample_biography = r"Sherlock Holmes is a fictional detective of the late 19th and early 20th centuries, who first appeared in publication in 1887. He was devised by British author and physician Sir Arthur Conan Doyle. A brilliant London-based detective, Holmes is famous for his prowess at using logic and astute observation to solve cases. He is perhaps the most famous fictional detective, and indeed one of the best known and most universally recognizable literary characters. Sir Arthur Conan Doyle wrote four novels and fifty-six short-stories featuring his creation. Almost all were narrated by Holmes' friend and biographer, Dr John H. Watson, with the exception of two narrated by Holmes himself and two more written in the third person. The stories first appeared in magazine serialization, notably in The Strand Magazine, over a period of forty years. This was a common form of publication at the time: Charles Dickens' works were issued in a similar fashion. The stories cover a period from around 1878 up to 1903, with a final case in 1914. They are read as much for their characterization and the stylised late-Victorian era in which they take place as for the mysteries themselves. More actors have portrayed Sherlock Holmes than any other character, and by 1964, according to a report in The Times, the worldwide sales of the stories were running second only to the Bible"
# Default summarization pipeline
model = pipeline('summarization')
# min_length / max_length bound the length of the generated summary
# (presumably counted in tokens rather than characters — TODO confirm)
model(sample_biography, min_length=5, max_length=30)

[{'summary_text': ' Sherlock Holmes is a fictional detective of the late 19th and early 20th centuries . He was devised by British author Sir Arthur Conan Doyle'}]

## Translation
Packages: https://huggingface.co/models?pipeline_tag=translation <br>
More about Helsinki-NLP Packages : https://github.com/Helsinki-NLP/Opus-MT

### Eng to Ger

In [None]:
# English to German (via pipeline): the task name alone selects a default model
model = pipeline('translation_en_to_de')
print(sample_self_intro)
# Returns a list with one dict per input, holding the 'translation_text'
model(sample_self_intro)

My Name is Alvin. I work at Naluri in Malaysia and I love sashimi


[{'translation_text': 'Mein Name ist Alvin, ich arbeite bei Naluri in Malaysia und liebe Sashimi.'}]

### Eng to Chi

In [None]:
# English to Chinese: build the name of the Helsinki-NLP checkpoint for the language pair
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# NOTE(review): MarianTokenizer / MarianMTModel are imported but not referenced by name in the visible cells
from transformers import MarianTokenizer, MarianMTModel

src_lang = 'en'     # English
tgt_lang = 'zh'     # Chinese

# Checkpoint name follows the pattern Helsinki-NLP/opus-mt-<source>-<target>
pretrained_model = f'Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}'

**When default model pipeline is not available**

In [None]:
# Translation pipeline with an explicitly specified model (the checkpoint name built in `pretrained_model`)
model = pipeline("translation_en_to_zh", model=pretrained_model)
model(sample_self_intro)

[{'translation_text': '我叫阿尔文,我在马来西亚的纳鲁里工作,我喜欢生鱼鱼'}]

**Manual way when pipeline is not available**

In [None]:
# Load tokenizer and seq2seq model directly instead of going through a pipeline.
# Note: this rebinds both `tokenizer` and `model`.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model)
# Show which concrete tokenizer class AutoTokenizer picked (a MarianTokenizer in the recorded run)
type(tokenizer)

transformers.models.marian.tokenization_marian.MarianTokenizer

In [None]:
# Manual translation: tokenize -> generate -> decode

# Encode a batch containing the single input sentence as PyTorch tensors
tokens = tokenizer([sample_self_intro], return_tensors='pt')

# Generate the target-language token ids
translation = model.generate(**tokens)

# Decode the whole batch, dropping special tokens, and keep the first (only) result
decoded_batch = tokenizer.batch_decode(translation, skip_special_tokens=True)
translated_text = decoded_batch[0]
translated_text

'我叫阿尔文,我在马来西亚的纳鲁里工作,我喜欢生鱼鱼'

## Text Generation

In [None]:
# Default text-generation pipeline: continues the given prompt.
# The returned 'generated_text' starts with the prompt itself (see recorded output).
model = pipeline("text-generation")
model(sample_self_intro)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'My Name is Alvin. I work at Naluri in Malaysia and I love sashimi."\n\nIn January his father, the late actor Ananth Kumar Jhayar, visited his home in New Delhi and asked Ananth to show'}]

## Feature Extraction

In [None]:
# Default feature-extraction pipeline: returns the hidden-state vectors for each input
model = pipeline('feature-extraction')
output = model([sample_self_intro, sample_message])
# NOTE(review): converting to one regular array relies on both inputs yielding the same
# number of token vectors ((2, 21, 768) in the recorded run) — confirm the pipeline pads the batch.
np.array(output).shape   # (Samples, Tokens, Vector Size)

(2, 21, 768)

## Zero Shot Classification
Gentle Intro : [link](https://huggingface.co/facebook/bart-large-mnli) <br>
More about Zero-Shot Learning : [link](https://en.wikipedia.org/wiki/Zero-shot_learning)

In [None]:
# Default zero-shot-classification pipeline: scores arbitrary candidate labels for a text
model = pipeline("zero-shot-classification")
# Text to classify (news paragraph about a theory of how Jupiter's Galilean moons formed)
paragraph = 'A new model offers an explanation for how the Galilean satellites formed around the solar system’s largest world. Konstantin Batygin did not set out to solve one of the solar system’s most puzzling mysteries when he went for a run up a hill in Nice, France. Dr. Batygin, a Caltech researcher, best known for his contributions to the search for the solar system’s missing “Planet Nine,” spotted a beer bottle. At a steep, 20 degree grade, he wondered why it wasn’t rolling down the hill. He realized there was a breeze at his back holding the bottle in place. Then he had a thought that would only pop into the mind of a theoretical astrophysicist: “Oh! This is how Europa formed.” Europa is one of Jupiter’s four large Galilean moons. And in a paper published Monday in the Astrophysical Journal, Dr. Batygin and a co-author, Alessandro Morbidelli, a planetary scientist at the Côte d’Azur Observatory in France, present a theory explaining how some moons form around gas giants like Jupiter and Saturn, suggesting that millimeter-sized grains of hail produced during the solar system’s formation became trapped around these massive worlds, taking shape one at a time into the potentially habitable moons we know today.'
# Labels the paragraph is scored against
label_candidates = ['space & cosmos', 'scientific discovery', 'microbiology', 'robots', 'archeology']
# The result lists the labels sorted by descending score; with multi_label=False the
# recorded scores add up to ~1 across the candidates.
model(paragraph, label_candidates, multi_label=False)

{'labels': ['scientific discovery',
  'space & cosmos',
  'archeology',
  'microbiology',
  'robots'],
 'scores': [0.6726154685020447,
  0.2543172240257263,
  0.028822265565395355,
  0.02705550193786621,
  0.017189569771289825],
 'sequence': 'A new model offers an explanation for how the Galilean satellites formed around the solar system’s largest world. Konstantin Batygin did not set out to solve one of the solar system’s most puzzling mysteries when he went for a run up a hill in Nice, France. Dr. Batygin, a Caltech researcher, best known for his contributions to the search for the solar system’s missing “Planet Nine,” spotted a beer bottle. At a steep, 20 degree grade, he wondered why it wasn’t rolling down the hill. He realized there was a breeze at his back holding the bottle in place. Then he had a thought that would only pop into the mind of a theoretical astrophysicist: “Oh! This is how Europa formed.” Europa is one of Jupiter’s four large Galilean moons. And in a paper publish

## Conversational
More details : [link](https://medium.com/huggingface/how-to-build-a-state-of-the-art-conversational-ai-with-transfer-learning-2d818ac26313)

In [None]:
# Conversation holds the dialogue state; the conversational pipeline produces the bot replies.
# NOTE(review): availability of `Conversation` / the "conversational" task depends on the
# installed transformers version — confirm against the version in use.
from transformers import Conversation
model = pipeline("conversational")

In [None]:
# Initiate a conversation with a first user message and let the pipeline answer it.
# The displayed result is the conversation including the bot's reply.
conversation = Conversation("Tell me a joke.")
model([conversation])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Conversation id: 687c2efc-faf0-480e-9262-d04bad416403 
user >> Tell me a joke. 
bot >> What's the heaviest soup in Asia? One ton. 

In [None]:
# Initiate another, independent conversation (it gets its own conversation id)
conversation_2 = Conversation("What is your name?")
model([conversation_2])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Conversation id: 8de1efc5-d273-44fe-8c0d-86a14ab2c5c4 
user >> What is your name? 
bot >> I'm not sure, but I think it's a reference to the movie The Big Lebowski. 

In [None]:
# Append a new user message to the existing conversation; the earlier turns are kept
# and shown again in the result.
conversation_2.add_user_input("How are you doing today?")
model([conversation_2])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Conversation id: 8de1efc5-d273-44fe-8c0d-86a14ab2c5c4 
user >> What is your name? 
bot >> I'm not sure, but I think it's a reference to the movie The Big Lebowski. 
user >> How are you doing today? 
bot >> I'm doing well, how are you? 

In [None]:
# Overwrite bot's answer: add a user message and supply the bot's reply manually
conversation_2.add_user_input("Where do you live?")
conversation_2.append_response('I  live in Malaysia')  # to overwrite bot's answer
# NOTE(review): the pipeline is still invoked after the manual reply; the recorded transcript
# shows only the manual reply for this turn — confirm whether an additional generated
# response is stored but not displayed.
model([conversation_2]) 

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Conversation id: 8de1efc5-d273-44fe-8c0d-86a14ab2c5c4 
user >> What is your name? 
bot >> I'm not sure, but I think it's a reference to the movie The Big Lebowski. 
user >> How are you doing today? 
bot >> I'm doing well, how are you? 
user >> Where do you live? 
bot >> I  live in Malaysia 

## Text2Text Generator

In [None]:
# Default text2text-generation pipeline; the task is expressed inside the input text itself
# (here a "question: ... context: ..." prompt).
text2text_generator = pipeline("text2text-generation")
text2text_generator("question: What is 42 ? context: 42 is the answer to life, the universe and everything")

[{'generated_text': 'the answer to life, the universe and everything'}]