# HuggingFace Transformers

In [1]:
from transformers import pipeline

In [2]:
sentiment_classifier = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use mps:0


In [3]:
sentiment_classifier("I'm so excited to be learning about large language models!")

[{'label': 'POSITIVE', 'score': 0.9997439980506897}]

In [4]:
ner = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


In [5]:
ner("My name is Frazer and I'm a data scientist, I work at Meta but found out this week that I'm being made redundant")

[{'entity_group': 'PER',
  'score': 0.99543214,
  'word': 'Fr',
  'start': 11,
  'end': 13},
 {'entity_group': 'PER',
  'score': 0.35155112,
  'word': '##azer',
  'start': 13,
  'end': 17},
 {'entity_group': 'ORG',
  'score': 0.99743557,
  'word': 'Meta',
  'start': 54,
  'end': 58}]

# Zero Shot classification 
Performing a task without any additional training

In [6]:
zeroshot_calssifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

Device set to use mps:0


In [7]:
sequence_to_classify = "one day I will see the world"
candidate_labels = ["travel", "cooking", "dancing"]

In [8]:
zeroshot_calssifier(sequence_to_classify, candidate_labels) 

{'sequence': 'one day I will see the world',
 'labels': ['travel', 'dancing', 'cooking'],
 'scores': [0.9938650727272034, 0.003273779060691595, 0.00286103505641222]}

# Pre-Trained Tokenizers

In [4]:
from transformers import AutoTokenizer

In [5]:
model = "bert-base-uncased"

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model)

In [7]:
sentence = "I am so excited to be learning about large language models!"

In [8]:
input_ids = tokenizer(sentence)
print(input_ids)

{'input_ids': [101, 1045, 2572, 2061, 7568, 2000, 2022, 4083, 2055, 2312, 2653, 4275, 999, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [8]:
tokens = tokenizer.tokenize(sentence)
print(tokens)


['i', 'am', 'so', 'excited', 'to', 'be', 'learning', 'about', 'large', 'language', 'models', '!']


In [10]:
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(token_ids)

[1045, 2572, 2061, 7568, 2000, 2022, 4083, 2055, 2312, 2653, 4275, 999]


In [12]:
decoded_ids = tokenizer.decode(token_ids)
print(decoded_ids)

i am so excited to be learning about large language models!


In [13]:
tokenizer.decode(101)

'[CLS]'

In [14]:
tokenizer.decode(102)

'[SEP]'

In [5]:
model2 = "xlnet-base-cased"

In [6]:
tokenizer2 = AutoTokenizer.from_pretrained(model2)

In [9]:
input_ids2 = tokenizer2(sentence)
print(input_ids2)

{'input_ids': [35, 569, 102, 5564, 22, 39, 1899, 75, 392, 1243, 2626, 136, 4, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [10]:
tokens2 = tokenizer2.tokenize(sentence)
print(tokens2)

['▁I', '▁am', '▁so', '▁excited', '▁to', '▁be', '▁learning', '▁about', '▁large', '▁language', '▁models', '!']


In [11]:
token_ids2 = tokenizer2.convert_tokens_to_ids(tokens2)
print(token_ids2)

[35, 569, 102, 5564, 22, 39, 1899, 75, 392, 1243, 2626, 136]


In [12]:
tokenizer2.decode(4)

'<sep>'

In [13]:
tokenizer2.decode(3)

'<cls>'

# Huggingface and PyTorch/Tensorflow

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [10]:
print(sentence)
print(input_ids)

I am so excited to be learning about large language models!
{'input_ids': [101, 1045, 2572, 2061, 7568, 2000, 2022, 4083, 2055, 2312, 2653, 4275, 999, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [11]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

In [12]:
input_ids = tokenizer(sentence, return_tensors="pt")
print(input_ids)

{'input_ids': tensor([[ 101, 1045, 2572, 2061, 7568, 2000, 2022, 4083, 2055, 2312, 2653, 4275,
          999,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [13]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

In [14]:
with torch.no_grad():
    logits = model(**input_ids).logits

In [15]:
predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

'POSITIVE'

# Saving and loading models

In [16]:
model_directory = "my_saved_models"

In [17]:
tokenizer.save_pretrained(model_directory)

('my_saved_models/tokenizer_config.json',
 'my_saved_models/special_tokens_map.json',
 'my_saved_models/vocab.txt',
 'my_saved_models/added_tokens.json',
 'my_saved_models/tokenizer.json')

In [18]:
model.save_pretrained(model_directory)

In [19]:
my_tokenizer = AutoTokenizer.from_pretrained(model_directory)

In [20]:
my_model = AutoModelForSequenceClassification.from_pretrained(model_directory)