## HuggingFace Transformers

In [2]:
# Importing dependencies 
from transformers import pipeline

In [4]:
# Not specifying model
sentiment_classifier = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


In [6]:
sentiment_classifier("I'm so excited to be learning about large language models")


[{'label': 'POSITIVE', 'score': 0.9997096657752991}]

In [None]:
# Try specifying a model
ner = pipeline("ner", model = "dslim/bert-base-NER")

In [8]:
ner("Her name is Anna and she works in New York for Morgan Stanley")

[{'entity': 'B-PER',
  'score': 0.9955788,
  'index': 4,
  'word': 'Anna',
  'start': 12,
  'end': 16},
 {'entity': 'B-LOC',
  'score': 0.9992231,
  'index': 9,
  'word': 'New',
  'start': 34,
  'end': 37},
 {'entity': 'I-LOC',
  'score': 0.99939775,
  'index': 10,
  'word': 'York',
  'start': 38,
  'end': 42},
 {'entity': 'B-ORG',
  'score': 0.99729913,
  'index': 12,
  'word': 'Morgan',
  'start': 47,
  'end': 53},
 {'entity': 'I-ORG',
  'score': 0.99829406,
  'index': 13,
  'word': 'Stanley',
  'start': 54,
  'end': 61}]

In [11]:
# Zero shot classification import dependencies
from transformers import BartTokenizer, BartModel

# Call tokenizer
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
model = BartModel.from_pretrained('facebook/bart-large')

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

In [14]:
# Zero shot classification
zeroshot_classifier = pipeline("zero-shot-classification", model = "facebook/bart-large")

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.


In [15]:
sequence_to_classify = "one day I will see the world"
candidate_labels = ['travel', 'cooking', 'dancing']

In [16]:
# Running without suggested training, still correctly classifies though by a narrow margin
zeroshot_classifier(sequence_to_classify, candidate_labels)

{'sequence': 'one day I will see the world',
 'labels': ['travel', 'cooking', 'dancing'],
 'scores': [0.3451612889766693, 0.3365514278411865, 0.3182872533798218]}

## Pre-trained Tokenizers

In [17]:
# Import dependencies
from transformers import AutoTokenizer

In [18]:
model = "bert-base-uncased"

In [20]:
# Call tokenizer
tokenizer = AutoTokenizer.from_pretrained(model)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [21]:
sentence = "I'm so excited to be learning about large language models"

In [22]:
# Create input ID's then run tokenizer over sentence
input_ids = tokenizer(sentence)
print(input_ids)

{'input_ids': [101, 1045, 1005, 1049, 2061, 7568, 2000, 2022, 4083, 2055, 2312, 2653, 4275, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [23]:
# Breaking sentence into tokens call
tokens = tokenizer.tokenize(sentence)
print(tokens)

['i', "'", 'm', 'so', 'excited', 'to', 'be', 'learning', 'about', 'large', 'language', 'models']


In [25]:
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(token_ids)

[1045, 1005, 1049, 2061, 7568, 2000, 2022, 4083, 2055, 2312, 2653, 4275]


In [26]:
# Decode ID's
decoded_ids = tokenizer.decode(token_ids)
print(decoded_ids)

i ' m so excited to be learning about large language models


In [27]:
# Decode specific input ID's (special token)
tokenizer.decode(101)

'[CLS]'

In [28]:
# Decode specific input ID's (special token)
tokenizer.decode(102)

'[SEP]'

In [29]:
model2 = "xlnet-base-cased"

In [30]:
tokenizer2 = AutoTokenizer.from_pretrained(model2)

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

In [31]:
input_ids = tokenizer2(sentence)
print(input_ids)

{'input_ids': [35, 26, 98, 102, 5564, 22, 39, 1899, 75, 392, 1243, 2626, 4, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [32]:
# Run tokenizer2 over sentence
tokens = tokenizer2.tokenize(sentence)
print(tokens)

['▁I', "'", 'm', '▁so', '▁excited', '▁to', '▁be', '▁learning', '▁about', '▁large', '▁language', '▁models']


In [33]:
token_ids = tokenizer2.convert_tokens_to_ids(tokens)
print (token_ids)

[35, 26, 98, 102, 5564, 22, 39, 1899, 75, 392, 1243, 2626]


In [34]:
# Decode special token
tokenizer2.decode(4)

'<sep>'

In [35]:
# Decode special token
tokenizer2.decode(3)

'<cls>'

## Huggingface and Pytorch/Tensorflow

In [37]:
# Import dependencies
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [38]:
print(sentence)
print(input_ids)

I'm so excited to be learning about large language models
{'input_ids': [35, 26, 98, 102, 5564, 22, 39, 1899, 75, 392, 1243, 2626, 4, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [39]:
# Specify tokens to be tensors
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [42]:
input_ids_pt = tokenizer(sentence, return_tensors = "pt")
print(input_ids_pt)

{'input_ids': tensor([[ 101, 1045, 1005, 1049, 2061, 7568, 2000, 2022, 4083, 2055, 2312, 2653,
         4275,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [43]:
# Create model
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [46]:
# Look at dictionary mapping from classification to numeric scores
with torch.no_grad():
    logits = model(**input_ids_pt).logits
    
predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

'POSITIVE'

## Saving and loading models

In [47]:
model_directory = "my_saved_models"

In [48]:
tokenizer.save_pretrained(model_directory)

('my_saved_models/tokenizer_config.json',
 'my_saved_models/special_tokens_map.json',
 'my_saved_models/vocab.txt',
 'my_saved_models/added_tokens.json',
 'my_saved_models/tokenizer.json')

In [49]:
model.save_pretrained(model_directory)

In [50]:
my_tokenizer = AutoTokenizer.from_pretrained(model_directory)

In [51]:
my_model = AutoModelForSequenceClassification.from_pretrained(model_directory)