# HuggingFace Transformers

In [1]:
from transformers import pipeline

In [2]:
sentiment_classifier = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use mps:0


In [3]:
sentiment_classifier("I'm so excited to be learning about large language models")

[{'label': 'POSITIVE', 'score': 0.9997096657752991}]

In [4]:
ner = pipeline("ner", model="dslim/bert-base-NER")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use mps:0


In [5]:
ner("Her name is Anna and she works in New York City for Morgan Stanley")

[{'entity': 'B-PER',
  'score': 0.9954881,
  'index': 4,
  'word': 'Anna',
  'start': 12,
  'end': 16},
 {'entity': 'B-LOC',
  'score': 0.99960667,
  'index': 9,
  'word': 'New',
  'start': 34,
  'end': 37},
 {'entity': 'I-LOC',
  'score': 0.9993955,
  'index': 10,
  'word': 'York',
  'start': 38,
  'end': 42},
 {'entity': 'I-LOC',
  'score': 0.9995803,
  'index': 11,
  'word': 'City',
  'start': 43,
  'end': 47},
 {'entity': 'B-ORG',
  'score': 0.9957462,
  'index': 13,
  'word': 'Morgan',
  'start': 52,
  'end': 58},
 {'entity': 'I-ORG',
  'score': 0.9979346,
  'index': 14,
  'word': 'Stanley',
  'start': 59,
  'end': 66}]

In [6]:
zeroshot_classification = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use mps:0


In [7]:
sequence_to_classify = "one day I will see the world"
candidate_labels = ['travel', 'cooking', 'dancing']

In [8]:
zeroshot_classification(sequence_to_classify, candidate_labels)

{'sequence': 'one day I will see the world',
 'labels': ['travel', 'dancing', 'cooking'],
 'scores': [0.9938650727272034, 0.003273784415796399, 0.0028610362205654383]}

In [11]:
# testing on another sentence
sentence2 = "I wanna be the CEO of a big-wig firm"
labels = ['travel', 'cooking', 'dancing', 'ambition', 'love', 'work', 'love for work']
zeroshot_classification(sentence2, labels)

{'sequence': 'I wanna be the CEO of a big-wig firm',
 'labels': ['ambition',
  'work',
  'love for work',
  'travel',
  'cooking',
  'dancing',
  'love'],
 'scores': [0.8262977004051208,
  0.08831699937582016,
  0.07176921516656876,
  0.004037776030600071,
  0.0032555668149143457,
  0.0031692718621343374,
  0.0031534992158412933]}

# Pre-trained tokenizers

## bert-base-uncased Model

In [16]:
from transformers import AutoTokenizer

In [19]:
model = "bert-base-uncased"

In [20]:
tokenizer = AutoTokenizer.from_pretrained(model)

In [21]:
sentence = "I'm so excited to learn about large language models"

In [22]:
input_ids = tokenizer(sentence)
print(input_ids)

{'input_ids': [101, 1045, 1005, 1049, 2061, 7568, 2000, 4553, 2055, 2312, 2653, 4275, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [23]:
tokens = tokenizer.tokenize(sentence)
print(tokens)

['i', "'", 'm', 'so', 'excited', 'to', 'learn', 'about', 'large', 'language', 'models']


In [24]:
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(token_ids)

[1045, 1005, 1049, 2061, 7568, 2000, 4553, 2055, 2312, 2653, 4275]


In [25]:
decoded_ids = tokenizer.decode(token_ids)
print(decoded_ids)

i ' m so excited to learn about large language models


In [27]:
tokenizer.decode(101)

'[CLS]'

In [28]:
tokenizer.decode(102)

'[SEP]'

## xlnet-base-cased model

In [29]:
model2 = "xlnet-base-cased"

In [30]:
tokenizer2 = AutoTokenizer.from_pretrained(model2)

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

In [31]:
input_ids = tokenizer2(sentence)
print(input_ids)

{'input_ids': [35, 26, 98, 102, 5564, 22, 1184, 75, 392, 1243, 2626, 4, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [32]:
tokens = tokenizer2.tokenize(sentence)
print(tokens)

['▁I', "'", 'm', '▁so', '▁excited', '▁to', '▁learn', '▁about', '▁large', '▁language', '▁models']


In [33]:
decoded_ids = tokenizer2.decode(token_ids)
print(decoded_ids)

Court Clinton includes broketie entered thank www Soviet super Labor


In [35]:
token_ids = tokenizer2.convert_tokens_to_ids(tokens)
print(token_ids)

[35, 26, 98, 102, 5564, 22, 1184, 75, 392, 1243, 2626]


In [36]:
decoded_ids = tokenizer2.decode(token_ids)
print(decoded_ids)

I'm so excited to learn about large language models


In [37]:
tokenizer2.decode(4)

'<sep>'

In [38]:
tokenizer2.decode(3)

'<cls>'

## CLS = classification
## SEP = separator

- cls is placed at the beginning and sep is placed at the last
- They are used in classification and sentence pair classification task

## MASK
- Used in tasks related to masked language modelling or text generation with a blank to fill in
- example - "Hello I'm a [MASK] model."
    - This can give multiple results for mask like (fashion model, super model, new model, etc)

## Task specific tokens
- We may need custom tokens like [SOURCE] and [TARGET] to help guide the model's behaviour during translation

## Special tokens for padding and truncation
- When multiple sentences off varying lengths are fed into the model, special tokens for padding may be needed

# HuggingFace and Pytorch/Tensorflow

In [39]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [40]:
print(sentence)
print(input_ids)

I'm so excited to learn about large language models
{'input_ids': [35, 26, 98, 102, 5564, 22, 1184, 75, 392, 1243, 2626, 4, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [41]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [42]:
input_ids_pt = tokenizer(sentence, return_tensors = "pt")
print(input_ids_pt)

{'input_ids': tensor([[ 101, 1045, 1005, 1049, 2061, 7568, 2000, 4553, 2055, 2312, 2653, 4275,
          102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [43]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [44]:
with torch.no_grad():
    logits = model(**input_ids_pt).logits

predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

'POSITIVE'

# Saving and loading models