<a href="https://colab.research.google.com/github/AmirJlr/LLMs/blob/master/01_Intro_to_transformers_library.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q transformers

In [None]:
import torch
from transformers import pipeline

### Simple `pipeline`

In [None]:
# Build a ready-to-use sentiment classifier. Use the GPU when one is available
# and fall back to the CPU otherwise, so the cell also runs on CPU-only
# runtimes instead of failing on a hard-coded 'cuda' device.
classifier = pipeline(
    "sentiment-analysis",
    device="cuda" if torch.cuda.is_available() else "cpu",
)

# The last expression is displayed as the cell output.
classifier("We are very happy that are madridista!")

### Sentence ==> Tokenize ==> Model

## Use a separate `Tokenizer` and `Model`

In [None]:
from transformers import AutoTokenizer, AutoModel

In [None]:
# Load the tokenizer that belongs to the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Load the model for the same checkpoint; tokenizer and model must come from
# the same checkpoint so the token ids match the model's vocabulary.
model = AutoModel.from_pretrained("bert-base-uncased")

In [None]:
# Encode one example sentence with the tokenizer loaded above.
txt = "We are very happy that are madridista!"
inp = tokenizer(txt, return_tensors="pt") # return PyTorch tensors so the encoding can be passed to the model
# Display the encoding as the cell output.
inp

In [None]:
# Inference only: run the forward pass without autograd so no computation
# graph is built and kept in memory for the returned tensors.
with torch.no_grad():
    out = model(**inp)  # unpack the encoding into keyword arguments

print(out)

## TensorFlow Based

In [None]:
from transformers import AutoTokenizer, TFAutoModel

# Same checkpoint as in the PyTorch section, loaded through the TensorFlow
# model class (presumably requires TensorFlow to be installed — verify).
# NOTE(review): this rebinds `tokenizer` and `model`, replacing the PyTorch
# objects created earlier; re-running the earlier `model(**inp)` cell after
# this one would call the TensorFlow model with PyTorch tensors.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = TFAutoModel.from_pretrained("bert-base-uncased")

# Ask the tokenizer for TensorFlow tensors to match the TensorFlow model.
inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
outputs = model(**inputs)
# Display the model outputs as the cell output.
outputs

## Customize pipeline

In [None]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint fine-tuned for binary sentiment classification (SST-2).
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

# Load the tokenizer and the classification model explicitly instead of
# letting `pipeline` choose its defaults.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Hand both objects to the pipeline so it uses exactly this checkpoint.
classifier = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
)

# A list of sentences yields one result per sentence.
results = classifier(
    [
        "We are very happy that are madridista!",
        "I dont like him.",
    ]
)

for result in results:
    print(result)

## A Deeper Look at the Outputs

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

txt = "We are very happy to show you the Transformers library."

# Step 1: split the sentence into tokens (strings).
tokens = tokenizer.tokenize(txt)
# Step 2: map every token to its id in the vocabulary.
token_ids = tokenizer.convert_tokens_to_ids(tokens)
# Calling the tokenizer directly returns the full encoding rather than a bare
# id list; compare its ids with `token_ids` to spot the special tokens the
# tokenizer adds around the sentence.
input_ids = tokenizer(txt)

# One print call; joining with "\n" reproduces the output of printing each
# item on its own, including the blank separator lines.
print(
    f' Tokens: {tokens}',
    "\n \n",
    f'Token IDs: {token_ids}',
    "\n \n",
    f'Input IDs: {input_ids}',
    sep="\n",
)

In [None]:
# Two sentences of different length to tokenize as one batch.
X_train = [
    "We are very happy to show you the Transformers library.",
    "We hope you don't hate it.",
]

# padding=True pads the shorter sentence so both rows have the same length;
# truncation/max_length cap the sequence length at 512 tokens.
batch = tokenizer(
    X_train,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)

print(batch)

## A Deeper Look at the PyTorch Model

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

# Checkpoint fine-tuned for binary sentiment classification (SST-2).
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

X_train = ["We are very happy to show you the Transformers library.",
            "We hope you don't hate it."]

# Tokenize both sentences as one padded batch of PyTorch tensors.
batch = tokenizer(X_train, padding=True, truncation=True,
                   max_length=512, return_tensors="pt")

print("Input Batch: ", batch)
print("\n \n")

# Only the forward pass needs autograd disabled; everything computed from its
# outputs afterwards carries no gradient information either.
with torch.no_grad():
    outputs = model(**batch)
print("Outputs: ", outputs)
print("\n \n")

# Turn the raw logits into per-class probabilities (softmax over dim 1).
predictions = F.softmax(outputs.logits, dim=1)
print("Predictions: ", predictions)
print("\n \n")

# Index of the most probable class for every sentence. Kept under its own
# name so `labels` below is only ever a list of label strings.
label_ids = torch.argmax(predictions, dim=1)
print("Labels: ", label_ids)
print("\n \n")

# Translate class indices into the label names stored in the model config.
labels = [model.config.id2label[label_id] for label_id in label_ids.tolist()]
print("Final Label: ", labels)