# WK05: Transformer Pipelines

## Setup

This code imports the functions we need to run our inference pipelines.

In [None]:
from PIL import Image
from transformers import pipeline

## Text Completion

Let's use the [GPT2 model](https://huggingface.co/openai-community/gpt2) to create some text completions.

We use a pipeline object to run inference:

In [None]:
# Build a text-generation pipeline backed by the GPT-2 checkpoint.
generator = pipeline(task="text-generation", model="openai-community/gpt2")

Let's define a starter sentence to run our model on:

In [None]:
# Shared prompt (an unfinished sentence) that the pipelines in this notebook are run on.
TOBE: str = "To be or not to be, that is the"

And run the model:

In [None]:
# Pad with the tokenizer's own end-of-sequence id instead of a hard-coded 0:
# id 0 is not guaranteed to be a padding token for a given checkpoint.
result = generator(
  TOBE,
  max_length=64,
  pad_token_id=generator.tokenizer.eos_token_id,
)
print(result)

### Changing the [model](https://huggingface.co/Xenova/llama2.c-stories110M)

is as easy as:

In [None]:
# Rebind `generator` to a text-generation pipeline that uses a different checkpoint.
generator = pipeline(task="text-generation", model="Xenova/llama2.c-stories110M")

Rerun with new model:

In [None]:
# Pad with the tokenizer's own end-of-sequence id instead of a hard-coded 0:
# id 0 is not guaranteed to be a padding token for a given checkpoint.
result = generator(
  TOBE,
  max_length=64,
  pad_token_id=generator.tokenizer.eos_token_id,
)
print(result)

### One last [model](https://huggingface.co/facebook/opt-125m):

In [None]:
# Rebind `generator` to an OPT checkpoint. "facebook/opt-1.3b" is the one
# loaded; the commented-out "facebook/opt-125m" line is kept as an
# alternative that can be swapped in.
generator = pipeline(
  "text-generation",
  # model="facebook/opt-125m"
  model="facebook/opt-1.3b"
)

# Pad with the tokenizer's own end-of-sequence id instead of a hard-coded 0:
# id 0 is not guaranteed to be a padding token for a given checkpoint.
result = generator(
  TOBE,
  max_length=64,
  pad_token_id=generator.tokenizer.eos_token_id,
)
print(result)

## Fill-Mask [model](https://huggingface.co/FacebookAI/xlm-roberta-large)

can be used to get the probabilities/scores of different possible words to complete a sentence:

In [None]:
# Build a fill-mask pipeline backed by the XLM-RoBERTa large checkpoint.
filler = pipeline(task="fill-mask", model="FacebookAI/xlm-roberta-large")

In [None]:
# Append the tokenizer's mask token to the prompt so the pipeline predicts the
# next word. The variable is called `masked_text` (not `input`) so the
# built-in `input()` is not shadowed for the rest of the notebook.
masked_text = f"{TOBE} {filler.tokenizer.mask_token}"
result = filler(masked_text)

In [None]:
# Show the prompt, then each candidate word with its score (rounded to four
# decimals), indented by the prompt's length so candidates sit after it.
print(TOBE)

indent = " " * len(TOBE)
for candidate in result:
  print(indent, f'{candidate["token_str"]}: {round(candidate["score"], 4)}')

## Text Sentiment [Analysis](https://huggingface.co/joeddav/distilbert-base-uncased-go-emotions-student)

Classify the _tone_ of a sentence.

In [None]:
# Build a sentiment-analysis pipeline backed by the GoEmotions student checkpoint.
analyzer = pipeline(
  task="sentiment-analysis",
  model="joeddav/distilbert-base-uncased-go-emotions-student",
)

Run on one example:

In [None]:
# Classify a single sentence and print the pipeline's raw output.
# `text` is reused by later cells.
text = (
  "A wave of elation washed over me, "
  "like sunlight breaking through the clouds"
)
result = analyzer(text)
print(result)

Run on several example phrases:

In [None]:
# Phrases to classify one at a time.
EXAMPLE_TEXTS = [
  "What a wonderful day",
  "OMG my head hurts",
  "What am I doing here?",
]

# Print each phrase next to the label of the first entry the analyzer returns.
for phrase in EXAMPLE_TEXTS:
  result = analyzer(phrase)
  top_label = result[0]["label"]
  print(phrase, "->", top_label)

We can also pass an extra parameter to our pipeline if we want to get scores for all possible sentiments:

In [None]:
# `top_k=None` asks the pipeline for a score for every label. It replaces the
# deprecated `return_all_scores=True` flag.
full_analyzer = pipeline(
  "sentiment-analysis",
  model="joeddav/distilbert-base-uncased-go-emotions-student",
  top_k=None
)

# Echo the sentence, then display the scores for it.
print(text)
result = full_analyzer(text)
display(result)

### Other models

They each have a slightly different set of _emotions_ they were trained to detect.

- [GoEmotion BERT](https://huggingface.co/joeddav/distilbert-base-uncased-go-emotions-student) (27 emotions)
- [Roberta](https://huggingface.co/cardiffnlp/twitter-roberta-base-emotion-multilabel-latest) (11 emotions)
- [Distil BERT](https://huggingface.co/bhadresh-savani/distilbert-base-uncased-emotion) (6 emotions)

In [None]:
# Rebind `full_analyzer` to the Twitter RoBERTa emotion checkpoint.
# `top_k=None` asks for a score for every label; it replaces the deprecated
# `return_all_scores=True` flag.
full_analyzer = pipeline(
  "sentiment-analysis",
  model="cardiffnlp/twitter-roberta-base-emotion-multilabel-latest",
  top_k=None
)

In [None]:
# Echo the sentence, then run the analyzer created in the previous cell on it
# and display the output.
print(text)
result = full_analyzer(text)
display(result)

In [None]:
# Rebind `full_analyzer` to the DistilBERT emotion checkpoint.
# `top_k=None` asks for a score for every label; it replaces the deprecated
# `return_all_scores=True` flag.
full_analyzer = pipeline(
  "sentiment-analysis",
  model="bhadresh-savani/distilbert-base-uncased-emotion",
  top_k=None
)

In [None]:
# Echo the sentence, then run the analyzer created in the previous cell on it
# and display the output.
print(text)
result = full_analyzer(text)
display(result)

## Image [Description](https://huggingface.co/Salesforce/blip-image-captioning-base)

Describe what's in an image.

In [None]:
# Build an image-to-text pipeline backed by the BLIP captioning checkpoint.
img_captioner = pipeline(
  task="image-to-text",
  model="Salesforce/blip-image-captioning-base",
)

A test image:

In [None]:
# Open the test picture, convert it to RGB, and show it.
# `test_image` is reused by the captioning cells below.
opened_image = Image.open("./imgs/GDTM.jpg")
test_image = opened_image.convert("RGB")
display(test_image)

Run inference:

In [None]:
# Caption the test image and print the text of the first returned entry.
result = img_captioner(test_image)
caption = result[0]["generated_text"]
print(caption)

### Other image description models:

- [`VIT`](https://huggingface.co/nlpconnect/vit-gpt2-image-captioning)
- [`LLAVA`](https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf)

In [None]:
# Build an image-to-text pipeline backed by the ViT-GPT2 captioning checkpoint.
vit_captioner = pipeline(
  task="image-to-text",
  model="nlpconnect/vit-gpt2-image-captioning",
)

In [None]:
# Caption the test image and print the text of the first returned entry.
result = vit_captioner(test_image)
caption = result[0]["generated_text"]
print(caption)

In [None]:
# Build an image-text-to-text pipeline backed by the LLaVA interleave checkpoint.
llava_captioner = pipeline(
  task="image-text-to-text",
  model="llava-hf/llava-interleave-qwen-0.5b-hf",
)

In [None]:
# A single user message made of two parts: the test image and a text instruction.
image_part = {"type": "image", "image": test_image}
text_part = {"type": "text", "text": "Describe the image"}
prompt = [{"role": "user", "content": [image_part, text_part]}]

In [None]:
# Run the chat-style prompt through the pipeline and print the text of the
# first returned entry.
result = llava_captioner(
  text=prompt,
  max_new_tokens=32,
  return_full_text=False,
)
reply = result[0]["generated_text"]
print(reply)

## Depth [Prediction](https://huggingface.co/depth-anything/Depth-Anything-V2-Base-hf)

In [None]:
# Build a depth-estimation pipeline backed by the Depth Anything V2 base checkpoint.
depth_estimator = pipeline(
  task="depth-estimation",
  model="depth-anything/Depth-Anything-V2-Base-hf",
)

In [None]:
# Run the depth estimator on the flowers photo and show the "depth" entry of
# its output.
image = Image.open("./imgs/flowers.jpg")
result = depth_estimator(image)
depth_map = result["depth"]
display(depth_map)

## Object [Detection](https://huggingface.co/facebook/detr-resnet-101)

Some models don't work with the pipeline inference object, but the Transformers library still has some consistent-ish interfaces for running these models.

In [None]:
from PIL import Image
from transformers import DetrImageProcessor, DetrForObjectDetection

from WK05_utils import DetrPostProcessor

In [None]:
# Load the DETR image processor and detection model from the same checkpoint.
# Both use the "no_timm" revision — presumably the variant that does not
# need the `timm` package; confirm on the model card.
OBJ_MODEL = "facebook/detr-resnet-101"
detr_processor = DetrImageProcessor.from_pretrained(OBJ_MODEL, revision="no_timm")
detr_model = DetrForObjectDetection.from_pretrained(OBJ_MODEL, revision="no_timm")

In [None]:
import torch

# Convert to RGB so the processor always receives a three-channel image
# (a JPEG can also be grayscale or CMYK); the captioning test image is
# loaded the same way.
image = Image.open("./imgs/people.jpg").convert("RGB")
detr_inputs = detr_processor(images=image, return_tensors="pt")

# Inference only: disable gradient tracking so no autograd graph is built.
with torch.no_grad():
  detr_output = detr_model(**detr_inputs)

In [None]:
# Draw bounding boxes on the image.
# The last argument (0.97) is presumably a confidence cut-off — confirm in WK05_utils.
labels_by_id = detr_model.config.id2label
detr_image = DetrPostProcessor.process_output(
  detr_output,
  image,
  labels_by_id,
  0.97,
)
display(detr_image)

## Image [Segmentation](https://huggingface.co/nvidia/segformer-b1-finetuned-ade-512-512)

Not only detects where objects are, but gives a mask of all pixels that belong to the objects detected.

In [None]:
from PIL import Image
from transformers import SegformerFeatureExtractor, SegformerForSemanticSegmentation

from WK05_utils import SegformerPostProcessor

In [None]:
# `SegformerFeatureExtractor` is deprecated in favour of
# `SegformerImageProcessor`, so the processor is loaded through the
# replacement class. It is imported here so this cell works on its own.
from transformers import SegformerImageProcessor

# Load the image processor and segmentation model from the same checkpoint.
SEG_MODEL = "nvidia/segformer-b1-finetuned-ade-512-512"
seg_processor = SegformerImageProcessor.from_pretrained(SEG_MODEL)
seg_model = SegformerForSemanticSegmentation.from_pretrained(SEG_MODEL)

In [None]:
import torch

# Convert to RGB so the processor always receives a three-channel image
# (a JPEG can also be grayscale or CMYK). `image` is later blended with the
# segment image, which requires both images to share a mode.
image = Image.open("./imgs/street.jpg").convert("RGB")
seg_inputs = seg_processor(images=image, return_tensors="pt")

# Inference only: disable gradient tracking so no autograd graph is built.
with torch.no_grad():
  seg_output = seg_model(**seg_inputs)
print(seg_output)

In [None]:
# Turn the model output into a segment image, blend it with the original
# photo at alpha 0.5, and show both results.
segments_image = SegformerPostProcessor.process_output(seg_output, image)
blended_image = Image.blend(image, segments_image, alpha=0.5)

for picture in (segments_image, blended_image):
  display(picture)

## Multi-Modal Image [Comprehension](https://huggingface.co/openai/clip-vit-large-patch14)

Zero-Shot Classification.

Detects presence of objects that weren't part of the training data.

In [None]:
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

In [None]:
# Load the CLIP processor and model from the same checkpoint name.
CLIP_MODEL = "openai/clip-vit-large-patch14"
clip_processor = CLIPProcessor.from_pretrained(CLIP_MODEL)
clip_model = CLIPModel.from_pretrained(CLIP_MODEL)

In [None]:
# Candidate text labels; they are passed to the CLIP processor together with
# the image, and the best-scoring one is looked up by index afterwards.
LABELS = [
  "cat",
  "dog",
  "bird",
  "fish",
  "aquatic mammal",
  "erinaceinae",
  "vegetation"
]

In [None]:
import torch

# Encode the candidate labels and the image together; padding=True is passed
# because the labels differ in length.
image = Image.open("./imgs/arara.jpg")
clip_inputs = clip_processor(text=LABELS, images=image, return_tensors="pt", padding=True)

# Inference only: disable gradient tracking so no autograd graph is built.
with torch.no_grad():
  clip_output = clip_model(**clip_inputs)

In [None]:
# Only one image was passed, so argmax over dim=1 yields a one-element
# tensor. .item() converts it to a plain int so LABELS is indexed with an
# int rather than a tensor.
label_idx = clip_output.logits_per_image.argmax(dim=1).item()
LABELS[label_idx]