In [2]:
from pprint import pprint
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import AutoModel, AutoModelForSequenceClassification
from transformers import BertModel
import torch
from torch.nn import functional as F

torch.set_printoptions(sci_mode=False)

### Using Pipeline

In [3]:
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]

classifier = pipeline("sentiment-analysis")
classifier(raw_inputs)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


[{'label': 'POSITIVE', 'score': 0.9598046541213989},
 {'label': 'NEGATIVE', 'score': 0.9994558691978455}]

### Without Pipeline

In [4]:
raw_inputs = [
    "Once upon a time lived a majestic fairy king atop the mountains of the world.",
    "Every universe withholds a corner of darkness.",
]

# Tokenize inputs - returns tensor
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")

# Forward pass - returns logits
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs = model(**inputs)
print(outputs.logits.shape, outputs.logits, "\n")

# Decode outputs
predictions = F.softmax(outputs.logits, dim=-1)
print(predictions, "\n")
print(model.config.id2label, "\n")

for prediction in predictions:
	print(model.config.id2label[prediction.argmax().item()])

torch.Size([2, 2]) tensor([[-4.0055,  4.3060],
        [ 2.2051, -1.8741]], grad_fn=<AddmmBackward0>) 

tensor([[    0.0002,     0.9998],
        [    0.9834,     0.0166]], grad_fn=<SoftmaxBackward0>) 

{0: 'NEGATIVE', 1: 'POSITIVE'} 

POSITIVE
NEGATIVE


### Load / Save Model

In [5]:
from transformers import BertModel

# Get pretrained model
model = BertModel.from_pretrained("bert-base-cased")

# Save to computer
model.save_pretrained("./Models")

# Load from computer
model = AutoModel.from_pretrained("./Models")

### Push to / Load from Hub

In [6]:
model.push_to_hub("test-model")

README.md: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/DarkEmbers/test-model/commit/140becc969e8c2d6988df09d07cbcdfedc30ec16', commit_message='Upload model', commit_description='', oid='140becc969e8c2d6988df09d07cbcdfedc30ec16', pr_url=None, repo_url=RepoUrl('https://huggingface.co/DarkEmbers/test-model', endpoint='https://huggingface.co', repo_type='model', repo_id='DarkEmbers/test-model'), pr_revision=None, pr_num=None)

In [7]:
model = AutoModel.from_pretrained("DarkEmbers/test-model")

### Encode / Decode text

In [8]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
# We pad the sentences so they have the same length
# Tensors need to be rectangular
# Use truncate to shorten sentences that are too long for the model
encoded_input = tokenizer(
    ["How are you?", "I'm fine, thank you!"], 
	padding=True, 
	truncation=True,
	# max_length=8,
	return_tensors="pt"
)

for key, value in encoded_input.items():
	print(key, value)

input_ids tensor([[ 101, 1731, 1132, 1128,  136,  102,    0,    0,    0,    0],
        [ 101,  146,  112,  182, 2503,  117, 6243, 1128,  106,  102]])
token_type_ids tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
attention_mask tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


In [9]:
tokenizer.decode(encoded_input["input_ids"][1])

"[CLS] I ' m fine, thank you! [SEP]"

### Tokenizer Pipeline

In [10]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
tokens = tokenizer.tokenize("Don't you love transformers?")
print(tokens)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

tokenizer.decode(ids)

['Don', "'", 't', 'you', 'love', 'transform', '##ers', '?']
[1790, 112, 189, 1128, 1567, 11303, 1468, 136]


"Don ' t you love transformers?"

In [11]:
# Feeding tokenizer ids to model
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

input_ids = torch.tensor([ids])
outputs = model(input_ids)
# Softmax to get probabilities
preds = F.softmax(outputs.logits, dim=-1)

for prediction in preds:
	print(model.config.id2label[prediction.argmax().item()])

POSITIVE
