# HF Transformers

In [1]:
! pip install transformers datasets -q

^C



[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
from transformers import pipeline

# Build a sentiment-analysis pipeline; no checkpoint is named here,
# so the choice of model is left to the library.
classifier = pipeline(task='sentiment-analysis')

In [None]:
# Score a single sentence; the cell displays whatever the pipeline returns.
sample_sentence = 'We are very happy to show you the 🤗 Transformers library.'
classifier(sample_sentence)

In [None]:
# A list of sentences is scored in one call; one prediction is printed per sentence.
sentences = [
    "We are very happy to show you the 🤗 Transformers library.",
    "We hope you don't hate it.",
]
results = classifier(sentences)
for result in results:
    print(f"label: {result['label']}, with score: {round(result['score'], 4)}")

In [None]:
# Rebuild the classifier on an explicitly named Hub checkpoint.
classifier = pipeline(
    task='sentiment-analysis',
    model="nlptown/bert-base-multilingual-uncased-sentiment",
)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
# Load the checkpoint's tokenizer and model separately, then pass both to the pipeline.
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
classifier = pipeline(
    task='sentiment-analysis',
    model=model,
    tokenizer=tokenizer,
)

In [None]:
# Tokenize one sentence and print the encoding the tokenizer returns.
inputs = tokenizer(
    "We are very happy to show you the 🤗 Transformers library."
)
print(inputs)

In [None]:
# How does the tokenizer process two sentences of different lengths?
#   * padding=True: it looks at the longest sequence in the batch (the first sentence)
#     and pads the shorter sequences to match that length (not the full 512).
#   * truncation=True: it cuts off sequences that exceed the specified max_length.

## PYTORCH CODE
batch_sentences = [
    "We are very happy to show you the 🤗 Transformers library.",
    "We hope you don't hate it.",
]
pt_batch = tokenizer(
    batch_sentences,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)
pt_batch

In [None]:
## PYTORCH CODE
# Same batch as before, but with padding='max_length' instead of padding=True,
# to compare the padded output against the previous cell.
batch_sentences = [
    "We are very happy to show you the 🤗 Transformers library.",
    "We hope you don't hate it.",
]
pt_batch = tokenizer(
    batch_sentences,
    padding='max_length',
    truncation=True,
    max_length=512,
    return_tensors="pt",
)
pt_batch

In [5]:
# Cell 2: Import the required libraries
import torch
from transformers import AutoTokenizer, AutoModel

In [None]:
# Cell 3: Load the pre-trained checkpoint's model and tokenizer
# (this rebinds `model` and `tokenizer` from the earlier cells)
model_name = "bert-base-uncased"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Cell 4: Prepare the input sentence that the next cell tokenizes
text = "Hello, I'm a sentence that needs embedding."

In [None]:
# Cell 5: Tokenize the sentence into PyTorch tensors and inspect the result
inputs = tokenizer(text, return_tensors="pt")
input_ids = inputs["input_ids"]
print("Input IDs shape:", input_ids.shape)
print("Input IDs:", input_ids)
print("Attention mask:", inputs["attention_mask"])

In [None]:
# Token embeddings: feed the token ids through the model's word-embedding layer
input_ids = inputs["input_ids"]
embedding_layers = model.embeddings
token_embeddings = embedding_layers.word_embeddings(input_ids)
print(f"Token embeddings shape: {token_embeddings.shape}")

# Positional embeddings - crucial for transformers.
# Build one index per position (0 .. seq_len-1) with shape (1, seq_len) and look it up.
seq_len = input_ids.shape[1]
position_ids = torch.arange(seq_len).unsqueeze(0)
position_embeddings = embedding_layers.position_embeddings(position_ids)
print(f"Position embeddings shape: {position_embeddings.shape}")

In [None]:
# How the tokenizer handles an out-of-vocabulary (OOV) word:
# tokenize a sentence containing a very unusual word and print the resulting tokens.
text_with_unknown = "This contains a very unusual word like supercalifragilisticexpialidocious!"
tokens = tokenizer.tokenize(text_with_unknown)
print("Tokenized into:", tokens)
print("Notice how unusual words are broken into subwords!")

#### downstream tasks

In [None]:
# Text Classification
classifier = pipeline(task="sentiment-analysis")
review = "I've been waiting for a HuggingFace course my whole life."
result = classifier(review)
print(result)

In [None]:
# Named Entity Recognition (NER)
ner = pipeline(task="ner")
ner_sentence = "Hugging Face was founded in Paris, France."
result = ner(ner_sentence)
print(result)

In [None]:
# Question Answering: the answer is looked for inside the supplied context
qa = pipeline(task="question-answering")
qa_question = "Where was Hugging Face founded?"
qa_context = "Hugging Face was founded in Paris, France."
result = qa(question=qa_question, context=qa_context)
print(result)

In [None]:
# Text Generation
from transformers import set_seed

# do_sample=True draws tokens at random; fix the seed so that
# "Restart Kernel -> Run All" reproduces the same generated text.
set_seed(42)

generator = pipeline("text-generation")
result = generator("Hugging Face is", max_length=50, do_sample=True)
print(result[0]['generated_text'])

In [None]:
# Text Summarization
summarizer = pipeline(task="summarization")
article = """
    America has changed dramatically during recent years. Not only has the number
    of graduates in traditional engineering disciplines such as mechanical, civil,
    electrical, chemical, and aeronautical engineering declined, but in most of
    the premier American universities engineering curricula now concentrate on
    and encourage largely the study of engineering science.
"""
result = summarizer(article)
print(result[0]['summary_text'])

In [None]:
# Translation (English -> French)
translator = pipeline(task="translation_en_to_fr")
english_sentence = "Hugging Face is a technology company based in New York."
result = translator(english_sentence)
print(result[0]['translation_text'])

In [None]:
# Fill-mask (Masked Language Modeling)
unmasker = pipeline("fill-mask")

# The mask placeholder is checkpoint-specific (e.g. "<mask>" vs "[MASK]"), so read it from
# the pipeline's tokenizer instead of hard-coding it; the cell then keeps working if the
# underlying model changes.
mask_token = unmasker.tokenizer.mask_token
result = unmasker(f"Hugging Face is working on {mask_token} models.")

# Each candidate carries the predicted token string and its score.
for res in result:
    print(f"Token: {res['token_str']}, Score: {res['score']:.4f}")

## Let's run inference with a model from the Hub

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
# Load the tokenizer and the causal language model for the same Hub checkpoint
phi3_checkpoint = "microsoft/Phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(phi3_checkpoint)
model = AutoModelForCausalLM.from_pretrained(
    phi3_checkpoint,
    device_map="cuda",       # requires a CUDA-capable GPU
    torch_dtype="auto",
    trust_remote_code=True,  # NOTE(review): permits executing code shipped with the Hub repo
)

In [None]:
# Generation settings used below:
#   return_full_text -> when set to False, the prompt is not returned,
#                       only the output of the model.
#   max_new_tokens   -> the maximum number of tokens the model will generate. Setting a
#                       limit prevents long and unwieldy output, as some models might
#                       continue generating until they reach their context window.
#   do_sample        -> whether the model uses a sampling strategy to choose the next
#                       token. When set to False, the model always selects the next
#                       most probable token.

from transformers import pipeline

# Create a pipeline around the already-loaded model and tokenizer
generator = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,
    max_new_tokens=500,
    do_sample=False,
)

In [None]:
from transformers import pipeline

# Create pipeline (reuses the model and tokenizer loaded earlier)
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,
    max_new_tokens=500,
    do_sample=False
)

# Define system message and user prompt
system_message = """You are a helpful, harmless, and precise AI assistant that provides accurate information and never makes things up."""
user_prompt = "Tell me about quantum computing"

# Format using chat template
messages = [
    {"role": "system", "content": system_message},
    {"role": "user", "content": user_prompt}
]
# add_generation_prompt=True appends the tokens that open the assistant's turn.
# Without it the prompt stops after the user message, so the model is not cued to answer.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

# Generate response (return_full_text=False means only the newly generated text is returned)
response = generator(prompt)
print(response[0]['generated_text'])

In [None]:
# Send a plain string (no chat template applied) straight to the generation pipeline.
raw_question = "what is egypt's capital ?"
response = generator(raw_question)
print(response[0]['generated_text'])

In [None]:
# Display the installed Transformers version so the results above can be tied to a release.
import transformers
transformers.__version__

  from .autonotebook import tqdm as notebook_tqdm


'4.44.0'

#### Using a system message

detailed approach

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

# Define a custom system message
system_message = """You are a helpful, harmless, and precise AI assistant that provides accurate information and never makes things up."""

# Create messages in the chat format
messages = [
    {"role": "system", "content": system_message},
    {"role": "user", "content": "Tell me about quantum computing"}
]

# Format the messages using the model's chat template.
# add_generation_prompt=True appends the tokens that open the assistant's turn.
# Without it the prompt stops after the user message, so the model is not cued to answer.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt)

# Generate response.
# Move the encoded prompt to whichever device the model was loaded on, and pass the whole
# encoding so that generate() receives the attention mask along with the input ids.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(
    **inputs,
    max_new_tokens=500,
    do_sample=False
)

# Decode and print response: the output sequence starts with the prompt tokens,
# so slice them off and decode only what was newly generated.
prompt_length = inputs["input_ids"].shape[1]
response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
print(response)

Too much detail! A pipeline can handle these details for us, without separating each component.

In [None]:
from transformers import pipeline

# Create pipeline (reuses the model and tokenizer loaded earlier)
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,
    max_new_tokens=500,
    do_sample=False
)

# Define system message and user prompt
system_message = """You are a helpful, harmless, and precise AI assistant that provides accurate information and never makes things up."""
user_prompt = "Tell me about quantum computing"

# Format using chat template
messages = [
    {"role": "system", "content": system_message},
    {"role": "user", "content": user_prompt}
]
# add_generation_prompt=True appends the tokens that open the assistant's turn.
# Without it the prompt stops after the user message, so the model is not cued to answer.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

# Generate response (return_full_text=False means only the newly generated text is returned)
response = generator(prompt)
print(response[0]['generated_text'])

# Fine Tuning

In [None]:
# !pip install datasets==3.3.2 -q
# (on Colab, restart the session after installing)

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
import torch
from datasets import load_dataset

from transformers import DataCollatorWithPadding

# Load dataset
dataset = load_dataset("imdb")

# Load tokenizer and model (two output labels for the classification head)
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)


# Tokenize dataset
def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Sequences are truncated but NOT padded here: padding every example to the maximum
    length makes each batch as long as the longest sequence the model accepts. Padding
    is instead applied per batch by the data collator passed to the Trainer.

    Args:
        examples: a batch as supplied by ``dataset.map(..., batched=True)``;
            only its ``"text"`` entry is read.

    Returns:
        The tokenizer's encoding for the batch.
    """
    return tokenizer(examples["text"], truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Dynamic padding: each batch is padded only up to its own longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    logging_dir="./logs",
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
)

# Fine-tune model
trainer.train()