###### Credits to hugging face documentation

In [9]:
import logging
logging.getLogger("transformers").setLevel(logging.ERROR) # This blocks all warnings

# Pipeline

The pipeline() is the easiest and fastest way to use a pretrained model for inference.
Start by creating an instance of pipeline()
The pipeline() downloads and caches a default pretrained model and tokenizer for sentiment analysis.
The pipeline() can accommodate any model from the Hub, making it easy to adapt the pipeline() for other use-cases.  

In [2]:
from transformers import pipeline
classifier = pipeline("sentiment-analysis","distilbert-base-uncased-finetuned-sst-2-english")
classifier(["I don't think anybody hates cows.","I don't think anybody hate cows."]) # huh!

[{'label': 'NEGATIVE', 'score': 0.9911612272262573},
 {'label': 'POSITIVE', 'score': 0.9829936027526855}]

In [10]:
model = pipeline('text-generation', "gpt2")
print(model("Last night, I saw a cow")[0])

{'generated_text': 'Last night, I saw a cow being raised on a rock and I\'m like, \'Wow, this is so cute!\'"\n\nThe family said there were no injuries and he was treated for his injured neck.\n\n"He got an MR'}


# Automodel

An AutoClass is a shortcut that automatically retrieves the architecture of a pretrained model from its name or path

In [11]:
from transformers import AutoModel
gpt_model = AutoModel.from_pretrained('gpt2')
print(type(gpt_model))

<class 'transformers.models.gpt2.modeling_gpt2.GPT2Model'>


In [15]:
# 
from transformers import AutoModelForSequenceClassification, tokenizer
pt_batch = tokenizer(
    ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."],
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)
pt_model = AutoModelForSequenceClassification.from_pretrained('gpt2')
print(type(pt_model))
pt_outputs = pt_model(**pt_batch)


ImportError: cannot import name 'tokenizer' from 'transformers' (/home/arjun/NewPytorchEnv/lib/python3.10/site-packages/transformers/__init__.py)

# AutoTokenizer
A tokenizer is responsible for preprocessing text into an array of numbers as inputs to a model.
The most important thing to remember is you need to instantiate a tokenizer with the same model name to ensure you’re using the same tokenization rules a model was pretrained with. 

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('gpt2')
print(tokenizer)

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)


In [6]:
encoding = tokenizer("I am Johnathan Wick")
print(encoding)

{'input_ids': [40, 716, 1757, 6696, 36029], 'attention_mask': [1, 1, 1, 1, 1]}
