## HuggingFace Transformers

Open source, founded 2016

The `transformers` package includes many pretrained models and a simple API for using them

In [1]:
#!pip install transformers
#need backwards-compatible keras version -- current (v3) does not work with transformers
#!pip install tf-keras

In [2]:
from transformers import pipeline
# need tensorflow/pytorch for this to work

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# pipeline() is told which task to run via its task name
# no model is named here, so a default checkpoint is chosen and a warning
# recommending an explicit model and revision is emitted (see below)
sentiment_classifier = pipeline(task="sentiment-analysis")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.





In [4]:
# supply data to the pipeline by calling it directly on a string
# the result is a list holding one dict with the predicted 'label' and its 'score'
sentiment_classifier("I'm so excited to be learning about large language models")

[{'label': 'POSITIVE', 'score': 0.9997096657752991}]

In [5]:
# named entity recognition pipeline, this time naming the checkpoint
# explicitly (a model fine-tuned for NER) instead of relying on the default
ner = pipeline(task="ner", model="dslim/bert-base-NER")

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# run the NER pipeline on an example sentence; the result is indexed by position in the next cell
ner_result = ner("Her name is Anna and she works in New York City for Morgan Stanley")

In [7]:
# ner_result is indexed by position; each element is a dictionary describing one recognized token
# the dictionary has the following items:
#  entity: the predicted entity tag (e.g. 'B-ORG')
#  score: the model's score for that tag (presumably a confidence/probability -- confirm in the pipeline docs)
#  index: position of the token in the input; counts from 1 over the words here ('Morgan' -> 13)
#  word: the text of the token
#  start and end: character offsets into the input string, starting at 0, end exclusive (52-58 for 'Morgan')
ner_result[4]

{'entity': 'B-ORG',
 'score': 0.9957462,
 'index': 13,
 'word': 'Morgan',
 'start': 52,
 'end': 58}

In [8]:
# zero-shot: the model can complete the task without additional training
# here we build a zero-shot text classifier on top of an NLI checkpoint
zeroshot_classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

In [9]:
# text to classify, and the labels the zero-shot classifier may choose from
sequence_to_classify = "one day I will see the world"
candidate_labels = [
    "travel",
    "cooking",
    "dancing",
]

In [10]:
# the output echoes the sequence and lists the labels with their scores;
# scores are probabilities for each label, and the labels come back ordered
# from highest to lowest score rather than in the order supplied
zeroshot_classifier(sequence_to_classify, candidate_labels=candidate_labels)

{'sequence': 'one day I will see the world',
 'labels': ['travel', 'dancing', 'cooking'],
 'scores': [0.9938650727272034, 0.0032738044392317533, 0.0028610345907509327]}

## Pre-trained Tokenizers

In [11]:
from transformers import AutoTokenizer

In [12]:
# checkpoint name used to load a tokenizer in the next cell
# note: `model` is only a string here; it is rebound to a loaded model object later in the notebook
model = "bert-base-uncased"

In [13]:
# load the pretrained tokenizer for the checkpoint named in `model`
tokenizer = AutoTokenizer.from_pretrained(model)

In [14]:
# example sentence reused by every tokenizer/model cell below
sentence = "I'm so excited to be learning about large language models"

In [15]:
# calling the tokenizer on a string returns a dict-like encoding, not just ids:
# it holds 'input_ids', 'token_type_ids' and 'attention_mask' (see printed output)
input_ids = tokenizer(sentence)
print(input_ids)

{'input_ids': [101, 1045, 1005, 1049, 2061, 7568, 2000, 2022, 4083, 2055, 2312, 2653, 4275, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [16]:
# tokenize() splits the sentence into string tokens (no integer ids yet)
tokens = tokenizer.tokenize(sentence)

In [17]:
# the tokens are lowercased and the apostrophe is split out as its own token;
# no [CLS]/[SEP] entries appear in this list
print(tokens)

['i', "'", 'm', 'so', 'excited', 'to', 'be', 'learning', 'about', 'large', 'language', 'models']


In [18]:
# map each string token to its integer vocabulary id
token_ids = tokenizer.convert_tokens_to_ids(tokens)

In [19]:
# same ids as the 'input_ids' printed earlier, minus the leading 101 and trailing 102
print(token_ids)

[1045, 1005, 1049, 2061, 7568, 2000, 2022, 4083, 2055, 2312, 2653, 4275]


In [20]:
# decode() turns the ids back into text; the result comes back lowercased
# compared with the original `sentence`
decoded_ids = tokenizer.decode(token_ids)
print(decoded_ids)

i'm so excited to be learning about large language models


In [21]:
# id 101 is the extra id at the start of the encoded sequence -- decode it to see which special token it is
tokenizer.decode(101)

'[CLS]'

In [22]:
# id 102 is the extra id at the end of the encoded sequence -- decode it to see which special token it is
tokenizer.decode(102)

'[SEP]'

In [23]:
# a second checkpoint name, used to compare a different tokenizer on the same sentence
model2 = "xlnet-base-cased"

In [24]:
# load the pretrained tokenizer for the checkpoint named in `model2`
tokenizer2 = AutoTokenizer.from_pretrained(model2)

In [25]:
# encode the same sentence with the second tokenizer
# note: this rebinds `input_ids`, replacing the encoding produced by `tokenizer` earlier
input_ids = tokenizer2(sentence)

In [26]:
# with this tokenizer the two extra special ids (4 and 3) both sit at the end of the sequence
print(input_ids)

{'input_ids': [35, 26, 98, 102, 5564, 22, 39, 1899, 75, 392, 1243, 2626, 4, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [27]:
# string tokens from the second tokenizer: case is kept, and tokens that
# begin a word carry a leading '▁' marker (see printed output)
tokens = tokenizer2.tokenize(sentence)
print(tokens)

['▁I', "'", 'm', '▁so', '▁excited', '▁to', '▁be', '▁learning', '▁about', '▁large', '▁language', '▁models']


In [28]:
# map the tokens to ids; these match the 'input_ids' printed above minus the trailing 4 and 3
token_ids = tokenizer2.convert_tokens_to_ids(tokens)
print(token_ids)

[35, 26, 98, 102, 5564, 22, 39, 1899, 75, 392, 1243, 2626]


In [29]:
# id 4 is the second-to-last id in the encoded sequence -- decode it to see which special token it is
tokenizer2.decode(4)

'<sep>'

In [30]:
# id 3 is the last id in the encoded sequence -- decode it to see which special token it is
tokenizer2.decode(3)

'<cls>'

## HuggingFace and PyTorch/TensorFlow

In [31]:
#!pip install torch

In [32]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [33]:
# recap the example sentence and the most recent encoding
# note: `input_ids` still holds the output of tokenizer2 (xlnet-base-cased) from the
# previous section; it is not the input fed to the model used below
print(sentence)
print(input_ids)

I'm so excited to be learning about large language models
{'input_ids': [35, 26, 98, 102, 5564, 22, 39, 1899, 75, 392, 1243, 2626, 4, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [34]:
# load the tokenizer for the sentiment checkpoint
# note: this rebinds `tokenizer`, which previously held the bert-base-uncased tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

In [35]:
# return_tensors="pt" asks for PyTorch tensors instead of plain Python lists;
# the printed encoding has a leading batch dimension of one row
input_ids_pt = tokenizer(
    sentence,
    return_tensors="pt",
)
print(input_ids_pt)

{'input_ids': tensor([[ 101, 1045, 1005, 1049, 2061, 7568, 2000, 2022, 4083, 2055, 2312, 2653,
         4275,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [36]:
# load the sequence-classification model from the same checkpoint name used for the tokenizer
# note: this rebinds `model`, which earlier in the notebook held a checkpoint-name string
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [37]:
# sentiment classification by calling the model directly (no pipeline)

# inference only, so gradient tracking is switched off
with torch.no_grad():
    logits = model(**input_ids_pt).logits

# take the argmax over the class dimension (last axis) rather than over the
# flattened tensor, so the index is always a class id; .item() needs a
# single-sentence batch, which is what input_ids_pt holds here, and raises
# instead of silently returning a wrong index if more rows are ever passed
predicted_class_id = logits.argmax(dim=-1).item()

# translate the class id into its human-readable label
model.config.id2label[predicted_class_id]

'POSITIVE'

## Saving and loading models

In [38]:
# local directory (relative path) used for saving and reloading the tokenizer and model below
model_directory = "my_saved_models"

In [39]:
# write the tokenizer files into model_directory; the call returns the paths of the files it wrote
tokenizer.save_pretrained(model_directory)

('my_saved_models\\tokenizer_config.json',
 'my_saved_models\\special_tokens_map.json',
 'my_saved_models\\vocab.txt',
 'my_saved_models\\added_tokens.json',
 'my_saved_models\\tokenizer.json')

In [40]:
# save the model into the same directory as the tokenizer
model.save_pretrained(model_directory)

In [41]:
# reload the tokenizer from the local directory instead of a hub checkpoint name
my_tokenizer = AutoTokenizer.from_pretrained(model_directory)

In [42]:
# reload the sequence-classification model from the same local directory
my_model = AutoModelForSequenceClassification.from_pretrained(model_directory)