<a href="https://colab.research.google.com/github/E1250/nlp_ref/blob/main/HuggingFace_Crash_Course.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# HuggingFace Crash Course - Patrick Loeber
* Youtube Video - https://www.youtube.com/watch?v=GSt00_-0ncQ
* Fine Tuning - https://huggingface.co/docs/transformers/training

In [3]:
# First you must install Transformers library
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m55.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m71.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [4]:
from transformers import pipeline
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer , AutoModelForSequenceClassification

## Pipelines
Example - Creating Sentiment Analysis

Transformeres Pipelines Webpage - https://huggingface.co/docs/transformers/main_classes/pipelines

Youtube Video - https://www.youtube.com/watch?v=QEaBAZQCtwE

In [5]:
# Sentiment (Feelings) Analysis

model_name = "distilbert-base-uncased-finetuned-sst-2-english" # model name is optional and this is the default

classifier = pipeline("sentiment-analysis", model=model_name)
res = classifier("We are happy to show you the transformers library") # You can use list of string as input

print("The Sentiment of the text is : ",res)

Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Xformers is not installed correctly. If you want to use memorry_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


The Sentiment of the text is :  [{'label': 'POSITIVE', 'score': 0.9998064637184143}]


* Tokenizer Webpage - https://huggingface.co/docs/transformers/main_classes/tokenizer

In [6]:
# Sentiment (Feelings) Analysis

model_name = "distilbert-base-uncased-finetuned-sst-2-english" # model name is optional and this is the default

model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
print("Tokenizer : ",tokenizer)

classifier = pipeline("sentiment-analysis", model=model , tokenizer = tokenizer) # you can use your model and tokenizer here
res = classifier("We are happy to show you the transformers library") # You can use list of string as input
print("The Sentiment of the text is : ",res)


Tokenizer :  DistilBertTokenizerFast(name_or_path='distilbert-base-uncased-finetuned-sst-2-english', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)
The Sentiment of the text is :  [{'label': 'POSITIVE', 'score': 0.9998064637184143}]


In [7]:
tokens = tokenizer.tokenize("We are very happy to show you the Transformers library.")
print("Tokens : ",tokens)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print("Tokens Ids : ",token_ids)
token_ids = tokenizer(tokens)
print("Tokens Ids : ",token_ids)

Tokens :  ['we', 'are', 'very', 'happy', 'to', 'show', 'you', 'the', 'transformers', 'library', '.']
Tokens Ids :  [2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 19081, 3075, 1012]
Tokens Ids :  {'input_ids': [[101, 2057, 102], [101, 2024, 102], [101, 2200, 102], [101, 3407, 102], [101, 2000, 102], [101, 2265, 102], [101, 2017, 102], [101, 1996, 102], [101, 19081, 102], [101, 3075, 102], [101, 1012, 102]], 'attention_mask': [[1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1]]}


In [9]:
X_train = ["We are very happy to show you the Transformers library." , "We hope you don't hate it"]
batch = tokenizer(X_train , padding = True , truncation = True , max_length=512 , return_tensors = 'pt')
print("Batches : ",batch)

Batches :  {'input_ids': tensor([[  101,  2057,  2024,  2200,  3407,  2000,  2265,  2017,  1996, 19081,
          3075,  1012,   102],
        [  101,  2057,  3246,  2017,  2123,  1005,  1056,  5223,  2009,   102,
             0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])}


## PyTorch Classification
 

In [11]:
with torch.no_grad():
  outputs = model(**batch)
  print("Output : ",outputs)

  predictions = F.softmax(outputs.logits , dim = 1)
  print("Predictions : ",predictions)

  labels = torch.argmax(predictions , dim = 1)
  print("Labels Ids : ",labels)

  labels = [model.config.id2label[label_id] for label_id in labels.tolist()]
  print("Labels : ",labels)

Output :  SequenceClassifierOutput(loss=None, logits=tensor([[-4.1329,  4.3811],
        [-0.8004,  0.7992]]), hidden_states=None, attentions=None)
Predictions :  tensor([[2.0060e-04, 9.9980e-01],
        [1.6804e-01, 8.3196e-01]])
Labels :  tensor([1, 1])
Labels :  ['POSITIVE', 'POSITIVE']


## Saving and loading models
you can also load models from HuggingFace Model Hub - https://huggingface.co/models


In [None]:
save_directory = ''

# Saivng
tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)

# Loading
tokenizer = AutoTokenizer.from_pretrained(save_directory)
model = AutoModelForSequenceClassification.from_pretrainez(save_directory)

In [None]:
model_name = "model name from the hugging face"

# Loading
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrainez(model_name)

texts = ["this is a text to be predicted using the model been choosen","Another text"]

batch = tokenizer(texts , padding = True , truncation = True , max_length=512 , return_tensors = 'pt')

with torch.no_grad():
  outputs = model(**batch) # batch unpacking

  label_ids = torch.argmax(predictions , dim = 1)
  print("Labels Ids : ",label_ids)

  labels = [model.config.id2label[label_id] for label_id in label_ids.tolist()]
  print("Labels : ",labels)