# **Learn Hugging Face Text Classification**


*   Resource notebook: https://www.learnhuggingface.com/notebooks/hugging_face_text_classification_tutorial
*  Setup: https://www.learnhuggingface.com/extras/setup

Note: A GPU is needed.



# IMPORT NECESSARY LIBRARIES

In [None]:
#Install dependences
try:
  import datasets,evaluate,accelerate
  import gradio as gr
except ModuleNotFoundError:
  !pip install -U datasets evaluate accelerate gradio
  import datasets,evaluate,accelerate
  import gradio as gr
import random
import numpy as np
import pandas as pd
import torch
import transformers
print(f'Using transformer version:{transformers.__version__}')
print(f'Using torch version:{torch.__version__}')
print(f'Using Datasets version:{datasets.__version__}')


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting accelerate
  Downloading accelerate-1.2.0-py3-none-any.whl.metadata (19 kB)
Collecting gradio
  Downloading gradio-5.8.0-py3-none-any.whl.metadata (16 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from g

# 3.Getting a dataset
Building food not food text classification: need food not food text dataset

In [None]:
from datasets import load_dataset
ds=load_dataset("mrdbourke/learn_hf_food_not_food_image_captions")
ds

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/11.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/250 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 250
    })
})

In [None]:
ds.column_names

{'train': ['text', 'label']}

In [None]:
ds['train'][0:9]

{'text': ['Creamy cauliflower curry with garlic naan, featuring tender cauliflower in a rich sauce with cream and spices, served with garlic naan bread.',
  'Set of books stacked on a desk',
  'Watching TV together, a family has their dog stretched out on the floor',
  'Wooden dresser with a mirror reflecting the room',
  'Lawn mower stored in a shed',
  'Coffee table cluttered with magazines',
  'Pair of reading glasses left open on a book',
  'Set of muffin tins stacked together',
  'Set of baking sheets stacked in a cabinet'],
 'label': ['food',
  'not_food',
  'not_food',
  'not_food',
  'not_food',
  'not_food',
  'not_food',
  'not_food',
  'not_food']}

In [None]:
#inspect random sample
random_indexes=random.sample(range(len(ds['train'])),5)
print(random_indexes)
random_samples=ds['train'][random_indexes]
print(f'[INFO] Random Samples from dataset \n')
for text,label in zip(random_samples['text'],random_samples['label']):
  print(f'Text: {text} | Label: {label}')

[249, 26, 159, 168, 13]
[INFO] Random Samples from dataset 

Text: Taking a nap on a hammock, a man has his dog snuggled up next to him | Label: not_food
Text: Fresh cherry tomatoes in a basket, sprinkled with sea salt for a savory snack. | Label: food
Text: A close-up of a woman practicing yoga in the living room while her dog mimics her poses | Label: not_food
Text: Rutabaga in a bowl, sprinkled with nutmeg and served with a side of brown sugar for a sweet, comforting dish. | Label: food
Text: Sweet and spicy sushi roll with ingredients like mango and jalapeno. | Label: food


In [None]:
#get unique label values
ds['train'].unique('label')

['food', 'not_food']

In [None]:
#checking the value counts for each label
from collections import Counter
Counter(ds['train']['label'])

Counter({'food': 125, 'not_food': 125})

In [None]:
#turn our dataset into dataframe and get random sample
food_notfood_dataframe=pd.DataFrame(ds['train'])
food_notfood_dataframe.sample(7)

Unnamed: 0,text,label
181,"A steaming bowl of fiery chicken curry, infuse...",food
196,Friends having a barbecue in the backyard whil...,not_food
106,Set of potholders stored in a drawer,not_food
150,Plate of sushi served with pickled ginger and ...,food
65,Set of plates stacked in a cupboard,not_food
29,Hammock swaying between two trees,not_food
218,"Mouthwatering mushroom curry, featuring shiita...",food


In [None]:
food_notfood_dataframe.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
food,125
not_food,125


# Preparing Data For Text Classification
1. We want to tokenize our text and turn our labels into numeric form.
2. Create a train/test split for training and evaluation.


In [None]:
# Hand-written label <-> id mapping, shown for illustration.
# Note: the next cell rebuilds both dictionaries from the dataset and overwrites these.
label2id=dict(not_food=0,food=1)
# Invert label2id so the two lookup tables cannot drift apart.
id2label={idx:name for name,idx in label2id.items()}

In [None]:
#Create the mapping programmatically from ds
# Query the unique label values once and derive both lookup tables from that list.
unique_labels=ds['train'].unique('label')
id2label=dict(enumerate(unique_labels))
label2id={label:idx for idx,label in id2label.items()}


In [None]:
label2id

{'food': 0, 'not_food': 1}

In [None]:
#convert a string label into its numeric id
def map_labels_to_number(example):
  """
  Replace example['label'] with its id from the module-level label2id mapping.

  The example is modified in place and the same object is returned.
  A label that is missing from label2id raises KeyError.
  """
  label_id=label2id[example['label']]
  example['label']=label_id
  return example

In [None]:
example_sample={'text':'This is about my favorite food:honey!','label':'food'}
#test oour function
map_labels_to_number(example_sample)

{'text': 'This is about my favorite food:honey!', 'label': 0}

In [None]:
ds['train']

Dataset({
    features: ['text', 'label'],
    num_rows: 250
})

In [None]:
#to do this for the whole dataset we can map our dataset labels to numbers
# we can do this with dataset.map
# `ds` is rebound here from the DatasetDict to its mapped 'train' split.
# The isinstance guard keeps the cell safe to re-run: on a second run `ds` is
# already the mapped Dataset, so indexing it with "train" (and looking up the
# now-integer labels in label2id) would raise.
if isinstance(ds,datasets.DatasetDict):
  ds=ds["train"].map(map_labels_to_number)
ds[:5]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'text': ['Creamy cauliflower curry with garlic naan, featuring tender cauliflower in a rich sauce with cream and spices, served with garlic naan bread.',
  'Set of books stacked on a desk',
  'Watching TV together, a family has their dog stretched out on the floor',
  'Wooden dresser with a mirror reflecting the room',
  'Lawn mower stored in a shed'],
 'label': [0, 1, 1, 1, 1]}

# Split the Dataset into training and testing
We can split using dataset.Dataset.train_test_split

In [None]:
#split our dataset into train/test split
dataset=ds.train_test_split(test_size=0.2,seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 200
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 50
    })
})

# Tokenizing our text data into numbers

In [None]:
from transformers import AutoTokenizer
tokenizer=AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
tokenizer('I love pizza!')

{'input_ids': [101, 1045, 2293, 10733, 999, 102], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [None]:
#get the length of our tokenizer vocab
len_of_tokenizer_vob=len(tokenizer.vocab)
len_of_tokenizer_vob

30522

In [None]:
#get the max seq length the tokenizer can handle
max_tokenizer_seq_len=tokenizer.model_max_length
max_tokenizer_seq_len

512

In [None]:
#does 'aina' in tokenizer vocab
tokenizer.vocab['aina'] #this shows an error because does not contain word aina

KeyError: 'aina'

In [None]:
#lets check how will it give token id for the words that does not present in vocab like aina
tokenizer('aina')

{'input_ids': [101, 7110, 2050, 102], 'attention_mask': [1, 1, 1, 1]}

In [None]:
tokenizer.convert_ids_to_tokens(tokenizer('aina').input_ids)

['[CLS]', 'ain', '##a', '[SEP]']

In [None]:
#try to tokenize an empji
tokenizer.convert_ids_to_tokens(tokenizer('🍕').input_ids)

['[CLS]', '[UNK]', '[SEP]']

When the tokenizer does not know a word, it breaks the word into subwords that are present in its vocabulary and returns their token IDs, as we saw for the word 'aina' ('ain' + '##a'). If it still cannot represent the input, it returns the unknown token `[UNK]`, as it did for the pizza emoji.

# Make a Preprocessing Function to tokenize Text


In [None]:
def tokenize_text(examples):
  """
  Run the module-level tokenizer over the 'text' field of the given example(s).

  Padding and truncation are both switched on in the tokenizer call, and the
  tokenizer's output is returned unchanged.
  """
  texts=examples['text']
  tokenized=tokenizer(texts,padding=True,truncation=True)
  return tokenized


In [None]:
#map our tokenize_text function to our ds
tokenized_dataset=dataset.map(tokenize_text,batched=True,batch_size=1000)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

**In machine learning it is often faster to do things in batches rather than one at a time, because batching leverages the parallel computation of the hardware.**

In [None]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 200
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 50
    })
})

In [None]:
#inspect the first sample of the tokenized training split, one field at a time
train_sample=tokenized_dataset['train'][0]
# iterating the sample directly yields its keys
for key in train_sample:
  print(f'[INFO] Key: {key}')
  print(f'Train Sample {train_sample[key]}')


[INFO] Key: text
Train Sample Set of headphones placed on a desk
[INFO] Key: label
Train Sample 1
[INFO] Key: input_ids
Train Sample [101, 2275, 1997, 2132, 19093, 2872, 2006, 1037, 4624, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[INFO] Key: attention_mask
Train Sample [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


# SET UP AN EVALUATION METRIC:
What we want to do: use an evaluation metric to get a numerical idea of how our model is performing.
Some common evaluation metrics for classification:


*  Accuracy (how many examples out of 100 did you get correct)
*   Precision
*   Recall
*  F1 score

An evaluation metric is important because some projects may have a threshold you need to meet.


E.g. an insurance claim classification project may require 98% test accuracy.



In [None]:
import evaluate
from typing import Tuple
accuracy_metric=evaluate.load('accuracy')
def compute_accuracy(prediction_and_labels:Tuple[np.ndarray,np.ndarray]):
  """
  Compute the accuracy of a model by comparing predictions and labels.

  Args:
    prediction_and_labels: a pair ``(predictions, labels)``. Predictions may be
      class ids already, or an array with two or more dimensions (presumably
      logits/probabilities shaped (n_samples, n_classes) -- TODO confirm).

  Returns:
    Whatever ``accuracy_metric.compute`` returns; the example output in this
    notebook is a dict such as ``{'accuracy': 1.0}``.
  """
  predictions,labels=prediction_and_labels
  # Arrays with 2+ dimensions are reduced to one class id per row before scoring.
  if len(predictions.shape)>=2:
    predictions=np.argmax(predictions,axis=1)
  return accuracy_metric.compute(predictions=predictions,references=labels)


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
#examples predictions and accuracy store
example_preds_all_correct=np.array([0,0,0,0,0,0,0,0,0,0])
examples_preds_one_incorrect=np.array([0,0,0,1,0,0,0,0,0,0])
example_labels=np.array([0,0,0,0,0,0,0,0,0,0])
#test the funcyion
print(f'Accuracy when all predictions are correct:{compute_accuracy((example_preds_all_correct,example_labels))}')
print(f'Accuracy when one prediction is  incorrect:{compute_accuracy((examples_preds_one_incorrect,example_labels))}')

Accuracy when all predictions are correct:{'accuracy': 1.0}
Accuracy when one prediction is  incorrect:{'accuracy': 0.9}


# Setting Up A model for training
- We are going to use transfer learning
- Transfer learning is a powerful technique, unique to deep learning models, that enables us to use the patterns that one model has learned on another problem for our own problem.

Workflow for training


*   Create and preprocess data ✔
*   Define the model to use for our problem: https://huggingface.co/models or see the task guides in the HF Transformers docs
- Define training arguments for our model with transformers.TrainingArguments
    - These are also known as hyperparameters: settings on your model that you can adjust.
    - Parameters: weights/patterns in the model that get updated automatically.

- Pass the training arguments to an instance of transformers.Trainer
- Train the model by calling Trainer.train()
- Save the model (to our local machine or to the Hugging Face Hub)
- Evaluate the model by making and inspecting predictions on test data
- Turn the model into a shareable demo.



In [None]:
from transformers import AutoModelForSequenceClassification
model=AutoModelForSequenceClassification.from_pretrained(
    'distilbert/distilbert-base-uncased',
    num_labels=2,
    id2label=id2label,
    label2id=label2id
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


Our model is comprised of the following parts:

1. 'embeddings': a learned representation of tokens. Whereas token IDs are a direct mapping from token to number, embeddings are a learned vector representation of each token.
2. 'transformer': our model architecture backbone, which has discovered patterns/relationships in the embeddings.
3. 'classifier': we need to customize this layer to suit our problem.

In [None]:
#OUR MODEL IS EXPECTING DATA IN FORM OF
tokenized_dataset['train'][0] #input id+ attention mask

{'text': 'Set of headphones placed on a desk',
 'label': 1,
 'input_ids': [101,
  2275,
  1997,
  2132,
  19093,
  2872,
  2006,
  1037,
  4624,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]}

# **Count the Parameters in our model.**

Weights/Parameters: small numeric opportunities for a model to learn patterns in data.

In [None]:
def count_params(model):
  """
  Count the parameters of a PyTorch model in a single pass.

  Returns a dict with two integer entries:
    'trainable_dictionary': number of elements in parameters with requires_grad set
    'total_parameters':     number of elements across all parameters
  """
  total_parameters=0
  trainable_parameters=0
  for param in model.parameters():
    size=param.numel()
    total_parameters+=size
    if param.requires_grad:
      trainable_parameters+=size
  return {"trainable_dictionary":trainable_parameters,
          'total_parameters':total_parameters}

In [None]:
count_params(model)

{'trainable_dictionary': 66955010, 'total_parameters': 66955010}

Looks like our model has 67M parameters and **all** are trainable.

Note

- Generally, the more parameters a model has, the more capacity it has to learn.
- For comparison, a model such as Llama 3 8B has 8 billion parameters,
 which is about 120x more than our model.
- If you want the best possible performance, generally more parameters is better.

    - However, more parameters require more compute or time.
    - You will be surprised how well a small model can perform with specific data.

# **Creating a Folder to save our model**

In [None]:
#create model output directory
from pathlib import  Path

#create model directory
model_dir=Path('models')
model_dir.mkdir(exist_ok=True)
#create save model name
model_save_name='learn_hf_food_not_food_text_classifier-distilbert-base-uncased'


In [None]:
model_save_dir=Path(model_dir,model_save_name)
model_save_dir

PosixPath('models/learn_hf_food_not_food_text_classifier-distilbert-base-uncased')

# **Setting Up Training Arguments (hyperparameters) with Training Arguments**

In [None]:
from transformers import TrainingArguments
print(f'[INFO] saved model checkpoint {model_save_dir}')
BATCH_SIZE=32
# Hyperparameters for fine-tuning. Logging, evaluation and checkpoint saving all
# happen once per epoch, and the evaluation and save strategies match each other,
# which load_best_model_at_end=True depends on.
training_args=TrainingArguments(
    output_dir=model_save_dir,
    learning_rate=0.0001,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=10,
    logging_strategy="epoch",
    eval_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=3,
    use_cpu=False,
    seed=42,
    load_best_model_at_end=True,
    report_to='none',
    # Must be a real boolean: the string 'False' is non-empty and therefore truthy,
    # so it would not reliably mean "public repository".
    hub_private_repo=False
)

[INFO] saved model checkpoint models/learn_hf_food_not_food_text_classifier-distilbert-base-uncased


# Pass Training arguments to an instance of transformers.Trainer

In [None]:
from transformers import Trainer
#setup trainer interface
trainer=Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_accuracy
)
trainer

  trainer=Trainer(


<transformers.trainer.Trainer at 0x7e2dba69b310>

#MODEL TRAINING

In [None]:
result=trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.3703,0.053392,1.0
2,0.0264,0.006235,1.0
3,0.0048,0.002232,1.0
4,0.0021,0.001244,1.0
5,0.0013,0.000878,1.0
6,0.001,0.000713,1.0
7,0.0008,0.000627,1.0
8,0.0008,0.000581,1.0
9,0.0008,0.000557,1.0
10,0.0007,0.000549,1.0


In [None]:
result.metrics

{'train_runtime': 529.0942,
 'train_samples_per_second': 3.78,
 'train_steps_per_second': 0.132,
 'total_flos': 18110777160000.0,
 'train_loss': 0.04091630348536585,
 'epoch': 10.0}

In [None]:
#inspect training metrics
for key,value in result.metrics.items():
  print(f'{key}:{value}')

train_runtime:529.0942
train_samples_per_second:3.78
train_steps_per_second:0.132
total_flos:18110777160000.0
train_loss:0.04091630348536585
epoch:10.0


# Save the model for later use

**Note**: if you are saving a model to Google Colab, note that it will disappear from your Colab instance when it disconnects.

In [None]:
###Save
print(f"Saving our model at {model_save_dir}")
trainer.save_model(output_dir=model_save_dir)

#Inspect the model training class


In [None]:
trainer_history_all=trainer.state.log_history
trainer_history_metric=trainer_history_all[:-1]#include everything except last one
trainer_history_training_time=trainer_history_all[-1]
#view the first 3
trainer_history_metric[:3]

In [None]:
trainer_history_all

In [None]:
import pprint
#split the logged history into training records and evaluation records
trainer_history_training_set=[]
trainer_history_eval_set=[]
for record in trainer_history_metric:
  # A record counts as an evaluation record when any of its keys contains "eval";
  # everything else is treated as a training record.
  is_eval_record=any("eval" in key for key in record)
  destination=trainer_history_eval_set if is_eval_record else trainer_history_training_set
  destination.append(record)
#preview the first two records of each list
print(f'First two item in training history' )
pprint.pprint(trainer_history_training_set[:2])
print(f'\nFirst two item in eval history')
pprint.pprint(trainer_history_eval_set[:2])

# Let's convert these into DataFrames so that we can graph them using matplotlib
Loss curves = a good visualization of your model's performance over time.


Ideally, the loss curve will trend downwards.

In [None]:
trainer_history_training_df=pd.DataFrame(trainer_history_training_set)
trainer_history_eval_df=pd.DataFrame(trainer_history_eval_set)
trainer_history_training_df

In [None]:
#plot training and evaluation loss against epoch on a single axis
import matplotlib.pyplot as plt
fig,ax=plt.subplots(figsize=(10,6))
ax.plot(trainer_history_training_df['epoch'],trainer_history_training_df['loss'],label='Training loss')
ax.plot(trainer_history_eval_df['epoch'],trainer_history_eval_df['eval_loss'],label='Eval loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Text Classification fine tuning DistilBert training and evaluation loss over time')
ax.legend()
plt.show()

# Pushing our model to the Hugging Face Hub
 Why do this?

 So we can share our model.

 Other people can try it out.

 We can keep a history of different versions.

 To write to Hugging Face:

 - If on Google Colab: set up a "token" with "read" and "write" access.
 - If on a local machine: set up huggingface-cli.

 To save to the Hugging Face Hub we can use the Trainer.push_to_hub method: https://huggingface.co/docs/transformers/v4.47.1/en/main_classes/trainer#transformers.Trainer.push_to_hub




In [None]:
#Save our model to the Hugging Face Hub
# SECURITY: never hard-code an access token in a notebook. The token that was
# previously written in this cell must be treated as leaked and revoked at
# https://huggingface.co/settings/tokens.
import os
# Read the token from the HF_TOKEN environment variable (e.g. exported from a Colab
# secret). When it is unset, None is passed; the Hub client is then expected to fall
# back to locally stored credentials (huggingface-cli login) -- verify for your setup.
model_upload_url=trainer.push_to_hub(
    commit_message='Uploading food not food text classifier model',
    token=os.environ.get('HF_TOKEN')
)
print(f'Model successfully uploaded to Hugging Face Hub with url {model_upload_url}')


# Evaluations

Evaluating a model is as important as training a model.

We can make predictions on our test data using the trainer.predict() method. https://huggingface.co/docs/transformers/v4.47.1/en/main_classes/trainer#transformers.Trainer.predict

In [None]:
predictions_all=trainer.predict(tokenized_dataset['test'])
predictions_values=predictions_all.predictions
predictions_metrics=predictions_all.metrics

print(f'[INFO] Predictions metrics on test data:')
predictions_metrics

In [None]:
predictions_values

These are our logits (the raw output of the model) --> prediction probabilities with torch.softmax --> predicted labels

In [None]:
predictions_all.predictions[0]

In [None]:
#softmax  get all the values between 0 and 1 and sum of total values is 1
#This is what known as predictions probablities as in the model is assigning this value to how "likely" the prediction is given sample
#note these values dont suggest how 'right' our model is
torch.softmax(torch.tensor(predictions_all.predictions[0]),dim=0)


In [None]:
import torch
from sklearn.metrics import accuracy_score
#logits -> prediction probabilities (softmax across dim 1)
logits=torch.tensor(predictions_values)
preds_probs=logits.softmax(dim=1)
#prediction probabilities -> predicted labels (index of the largest probability)
preds_labels=preds_probs.argmax(dim=1)
#true labels of the test split
true_labels=tokenized_dataset['test']['label']
#fraction of predicted labels that match the true labels
accuracy=accuracy_score(true_labels,preds_labels)
accuracy

**Note**: if you want a good evaluation method, make predictions on your entire dataset, then look at the predictions which are wrong but have a high prediction probability. For example, get the top 100-1000 and go through all of the examples where the model's prediction had a high probability but was incorrect -> this often leads to great insights about the data.

In [None]:
predictions_all.label_ids

### Exploring our model prediction probablities

It is a  very good way to evaluate a model by sorting prediction probablities and seeing where the model went wrong.

In [None]:
#Make a Df of test predictions
test_predictions_df=pd.DataFrame({
    "text":tokenized_dataset['test']['text'],
    'true_labels':true_labels,
    'pred_labels':preds_labels,
    'pred_probs':torch.max(preds_probs,dim=1).values
})
test_predictions_df.head()

In [None]:
#show 10 examples with low prediction probablities
test_predictions_df.sort_values('pred_probs',ascending=True).head(10)

# Making and inspecting predictions on custom data


# Discussing ways to make predictions (inference)

 **Note**: Whenever you hear the word 'inference', it means using a model to make predictions on data.

 Two main ways to perform inference:
 1. **Pipeline**: using transformers.pipeline to load our model and perform text classification.
 2. **PyTorch mode**: using a combination of AutoTokenizer and transformers.AutoModelForSequenceClassification, passing each our model name.

 Each mode supports:
  1. Predictions one at a time (fast for a few samples, but slower with many samples).
  2. Batches of predictions at a time (faster, but only up to a point: e.g. predicting 32 samples at a time may be faster than one at a time, but going to 128 may not give much more of a speed-up).

In [None]:
local_model_path='models/learn_hf_food_not_food_text_classifier-distilbert-base-uncased'

huggingface_model_path="AinaSiddiqui/learn_hf_food_not_food_text_classifier-distilbert-base-uncased"

In [None]:
#pick the device used for making predictions
#Note: generally, the faster the hardware accelerator, the faster the predictions
def set_device():
  """
  Return the torch.device to run on.

  Preference order: 'cuda' when CUDA is available, then 'mps' when the MPS
  backend is both available and built, otherwise 'cpu'.
  """
  if torch.cuda.is_available():
    return torch.device('cuda')
  mps_backend=torch.backends.mps
  if mps_backend.is_available() and mps_backend.is_built():
    return torch.device('mps')
  return torch.device('cpu')
Device=set_device()
print(f'Using Device {Device}')



#Making Predictions with Pipeline


In [None]:
from transformers import pipeline
#set the batch size
BATCH_SIZE=32 #reminder prediction speed often increases with higher batch size  (e.g 1->32 but can saturate at higher batch size)
#create an instance of transformers.pipeline
food_not_food_classifier=pipeline('text-classification',
                                  model=local_model_path,
                                  device=Device,
                                  top_k=1,
                                  batch_size=BATCH_SIZE)
food_not_food_classifier

In [None]:
test_custom_sentence='Apple is a fruit'
food_not_food_classifier(test_custom_sentence)

In [None]:
test_not_food_sentence="I have a deep learning computer and I love it"
food_not_food_classifier(test_not_food_sentence)

In [None]:
#Use model from Hugging face pipeline
from transformers import pipeline
#set the batch size
BATCH_SIZE=32 #reminder prediction speed often increases with higher batch size  (e.g 1->32 but can saturate at higher batch size)
#create an instance of transformers.pipeline
food_not_food_classifier=pipeline('text-classification',
                                  model=huggingface_model_path,
                                  device=Device,
                                  top_k=1,
                                  batch_size=BATCH_SIZE)
food_not_food_classifier

In [None]:
test_not_food_sentence="I have a deep learning computer and I love it"
food_not_food_classifier(test_not_food_sentence)

#Making Multiple Predictions at the same time with batch prediction

In [None]:
sentences = [
    # Food-related Sentences
    "I had pancakes and syrup for breakfast.",
    "Lunch was a big bowl of salad with grilled chicken.",
    "For dinner, I prepared spaghetti with marinara sauce.",
    "I grabbed an apple and a granola bar on my way out.",
    "He made a smoothie with bananas, strawberries, and yogurt.",

    # Not Food-related Sentences
    "The car was parked on the side of the road.",
    "We discussed the quarterly sales report in the meeting.",
    "She enjoys reading books about history.",
    "The weather today is sunny with a cool breeze.",
    "He went jogging around the neighborhood.",


]


In [None]:
food_not_food_classifier(sentences)

In [None]:
# Food-related Sentences
food_sentences = [
    "I savored a warm chocolate chip cookie with cold milk.",
    "Grilled salmon with roasted vegetables was served for dinner.",
    "She indulged in a decadent slice of cheesecake.",
    "For breakfast, I had scrambled eggs with whole-grain toast.",
    "He enjoyed a juicy burger with crispy fries.",
    "The aroma of freshly baked croissants filled the air.",
    "We shared a plate of spicy buffalo wings.",
    "Her favorite snack is hummus with carrot sticks.",
    "The chef prepared a mouth-watering beef stir-fry.",
    "I treated myself to a refreshing ice cream cone.",
]

# Not Food-related Sentences
not_food_sentences = [
    "The city skyline was breathtaking at sunset.",
    "She loves playing with her adorable kitten.",
    "The new smartphone features advanced camera technology.",
    "He went hiking in the beautiful mountains.",
    "The teacher explained complex math concepts clearly.",
    "We visited the museum to see the art exhibit.",
    "The park was filled with children playing tag.",
    "She practices yoga to relax and unwind.",
    "The company launched a new sustainable energy project.",
    "He wrote a compelling story about his childhood memories.",
]

In [None]:
food_not_food_classifier(food_sentences)

In [None]:
food_not_food_classifier(not_food_sentences)

In [None]:
confusing_captions = [
    "Sizzling hot and fresh out of the oven.",
    "A sweet treat to brighten up your day.",
    "Juicy and tender, just the way I like it.",
    "The perfect blend of flavors.",
    "A delicious view to wake up to.",
    "Tasty and addictive, beware!",
    "Freshly brewed and ready to go.",
    "A mouth-watering masterpiece.",
    "Savoring the flavor of success.",
    "The icing on the cake.",
]
food_not_food_classifier(confusing_captions)

#Time our model across larger samples

In [None]:
import time
# `sentences` is repeated 1000 times, so this holds len(sentences)*1000 items
sentences_1000=sentences*1000

#let's see how long it takes to make predictions on all sentences (one at a time)
print(f'[INFO] Number of sentences: {len(sentences_1000)}')
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() is wall-clock time and can jump if the system clock changes.
start_time_one_at_a_time=time.perf_counter()
for sent in sentences_1000:
  #Make a prediction
  food_not_food_classifier(sent)
end_time_one_at_a_time=time.perf_counter()

total_time_one_at_a_time=end_time_one_at_a_time-start_time_one_at_a_time
avg_time_per_pred=total_time_one_at_a_time/len(sentences_1000)
print(f'[INFO] Total Time for making a predictions on {len(sentences_1000)} sentences: {total_time_one_at_a_time}s')
print(f'Avg time per predictions: {avg_time_per_pred}s')


In [None]:
#Let's now use the pipeline in batch mode
# Each round passes the whole list in a single call and times that call.
for i in [10,100,1000]:
  sentences_big=sentences*i
  print(f'[INFO] Number of sentences {len(sentences_big)}')
  # perf_counter() is a monotonic, high-resolution clock intended for intervals
  start_time=time.perf_counter()
  #Predict on all sentences in batch mode
  food_not_food_classifier(sentences_big)
  end_time=time.perf_counter()

  total_time=end_time-start_time
  avg_time_per_sent_batch=total_time/len(sentences_big)
  print(f'Inferance time for {len(sentences_big)} sentences: {round(total_time,6)} s')
  print(f'Avg inferance time per sent : {round(avg_time_per_sent_batch,8)} s')
  print()

Making Predictions with PyTorch

Steps to make predictions with PyTorch
 1. Create the tokenizer with AutoTokenizer
 2. Create the model with AutoModelForSequenceClassification
 3. Tokenize the text with the tokenizer from step 1
 4. Make predictions with the model from step 2
 5. Format predictions

In [None]:
from transformers import AutoTokenizer,AutoModelForSequenceClassification
#setup the mdel path
model_path='AinaSiddiqui/learn_hf_food_not_food_text_classifier-distilbert-base-uncased'
#create an example to predict on
sample_food_text='A delicious photo of a plate of scrambled eggs,toast and bacon'

#prepare the tokenizer
tokenizer=AutoTokenizer.from_pretrained(model_path)
inputs=tokenizer(sample_food_text,return_tensors='pt')

In [None]:
#load the model
model=AutoModelForSequenceClassification.from_pretrained(model_path)

In [None]:
# Run the model on the tokenized sample inside torch.inference_mode()
import torch

model.eval()  # put the model in evaluation mode before predicting
with torch.inference_mode():
  # `**inputs` unpacks the tokenizer output so every key is passed as a keyword argument
  output = model(**inputs)
  # The same call with the arguments written out by name
  # (assumes `inputs` holds only these two keys -- TODO confirm)
  input_ids = inputs['input_ids']
  attention_mask = inputs['attention_mask']
  outputs_verbose = model(input_ids=input_ids, attention_mask=attention_mask)
output

In [None]:
# Display the result of the explicit-keyword call so it can be compared with `output`
outputs_verbose

In [None]:
# Turn the raw logits into a predicted class index and the highest softmax probability
logits = output.logits
predicted_class_id = logits.argmax().item()
class_probabilities = torch.softmax(logits, dim=1)
prediction_probablity = class_probabilities.max().item()
prediction_probablity

In [None]:
print(f'Text :{sample_food_text}')
# Translate the predicted class index into its label name via the model config
predicted_label = model.config.id2label[predicted_class_id]
print(f"Predicted label: {predicted_label}")
print(f'Prediction Probablity {prediction_probablity}')

In [None]:
# Classify the same sample with `food_not_food_classifier` to compare against the manual PyTorch result
food_not_food_classifier(sample_food_text)

# Putting it all together

In [None]:
# 1.Import necessary packages
import pprint
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import datasets
import evaluate
from transformers import pipeline
from transformers import TrainingArguments,Trainer
from transformers import AutoTokenizer,AutoModelForSequenceClassification

# 2. Names used throughout the training / saving pipeline
DATASET_NAME = "mrdbourke/learn_hf_food_not_food_image_captions"
MODEL_NAME = 'distilbert/distilbert-base-uncased'
MODEL_SAVE_DIR_NAME = 'models/learn_hf_food_not_food_text_classifier-distilbert-base-uncased'

# 3. Make sure the local save directory exists (missing parents are created; an existing directory is fine)
print(f'Creating Directory for saving models {MODEL_SAVE_DIR_NAME} ')
model_save_dir = Path(MODEL_SAVE_DIR_NAME)
model_save_dir.mkdir(parents=True, exist_ok=True)

# 4. Load the dataset and convert its labels to integer IDs
print()
print(f'Downloading Dataset "{DATASET_NAME}" from Hugging Face HUB')
dataset = datasets.load_dataset(DATASET_NAME)

# Build both lookup tables from the distinct labels of the 'train' split
unique_labels = dataset['train'].unique('label')
id2label = dict(enumerate(unique_labels))
label2id = {label: idx for idx, label in id2label.items()}


def map_labels_to_number(example):
  """Replace the example's 'label' value with its integer ID from `label2id`.

  The example is modified in place and also returned.
  """
  label_name = example['label']
  example['label'] = label2id[label_name]
  return example


dataset = dataset['train'].map(map_labels_to_number)

# Hold out 20% of the examples as the test split; the fixed seed keeps the split repeatable
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# 5. Load the tokenizer for MODEL_NAME and apply it to every split of the dataset
print()
print(f'[INFO] Tokenizing text for our model training with {MODEL_NAME}')
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)


def tokenize_text(examples):
  """Tokenize the 'text' field of a batch of examples with padding and truncation enabled."""
  texts = examples['text']
  return tokenizer(texts, padding=True, truncation=True)


# batched=True hands `tokenize_text` up to `batch_size` examples per call
tokenized_dataset = dataset.map(tokenize_text, batched=True, batch_size=1000)

# 6. Set up the evaluation metric
print()
print('[INFO]Setting up an evaluation metric')
accuracy_metric = evaluate.load('accuracy')


def compute_accuracy(predictions_and_labels):
  """Compute accuracy for a (predictions, labels) pair.

  Predictions with two or more dimensions (for example one row of per-class
  scores per example, [[a, b], [a, b]]) are reduced to class indices with
  argmax along axis 1 so they can be compared with labels such as [0, 0, 1].
  """
  predictions, labels = predictions_and_labels
  has_per_class_scores = len(predictions.shape) >= 2
  if has_per_class_scores:
    predictions = np.argmax(predictions, axis=1)
  return accuracy_metric.compute(predictions=predictions, references=labels)


# Load the base model for sequence classification, configured with our label mappings
print()
print(f'[INFO] LOADING MODEL: {MODEL_NAME}')
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)
print('[INFO] MODEL LOADING COMPLETE')

# Training hyperparameters, i.e. the settings we choose as developers
training_args = TrainingArguments(
    output_dir=model_save_dir,
    # Optimisation settings
    learning_rate=1e-4,
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    seed=42,
    use_cpu=False,
    # Evaluate, log and checkpoint once per epoch
    eval_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=3,
    load_best_model_at_end=True,
    # Reporting and Hub options
    report_to='none',
    push_to_hub=False,
    hub_private_repo=False,
)

# Bundle the model, arguments, data splits, tokenizer and metric function into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_accuracy,
)

# Train the model
print()
print(f'[INFO] Commencing model training')
result = trainer.train()

# Save the model to the local directory
print()
print(f'[INFO] Saving the model into local directory')
trainer.save_model(output_dir=model_save_dir)

# Push the model to the Hugging Face Hub.
# SECURITY: an access token must never be hard-coded in a notebook, because anyone
# who can read the file can use it. The token is read from the HF_TOKEN environment
# variable instead; when the variable is unset, `token=None` lets the Hub client fall
# back to credentials stored by a previous login.
# NOTE(review): the token that used to be embedded here should be treated as
# compromised and revoked in the Hugging Face account settings.
import os
hub_token = os.environ.get('HF_TOKEN')

print()
print(f'[INFO] Uploading the model to Hugging face hub')
model_upload_url = trainer.push_to_hub(
    commit_message='Uploading food not food text classifier (putting it all together)',
    token=hub_token,
)
print()
print(f'[INFO] Model Upload complete, model availabel at {model_upload_url}')

# Evaluate the model on the held-out test split
print(f'[INFO] Performing evaluation on test dataset')
predictions_all = trainer.predict(tokenized_dataset['test'])
predictions_values = predictions_all.predictions
predictions_metrics = predictions_all.metrics

print(f'[INFO] Predictions metrics on test dataset')
pprint.pprint(predictions_metrics)

# Turning our model into a demo

A demo lets you share the model with others so they can try it out

and share insights on how the model could be improved.

We are going to create a machine learning demo as a Gradio app.

### Create a function to perform inference

1. Take a string as input
2. Set up a text classification pipeline
3. Get the output from the pipeline
4. Return the output from step 3 as a dictionary in the format `{"label_1": probability_1, "label_2": probability_2}`

In [None]:
from typing import Dict

# Create a function that takes an input string and returns label -> score
# NOTE(review): this definition rebinds the name `food_not_food_classifier`, which
# earlier cells in this notebook also call; re-running those cells after this one
# will call this function instead of whatever the name was bound to before.
from functools import lru_cache


@lru_cache(maxsize=1)
def _load_food_not_food_pipeline():
  """Build the text-classification pipeline once and reuse it on later calls."""
  return pipeline('text-classification',
                  model='AinaSiddiqui/learn_hf_food_not_food_text_classifier-distilbert-base-uncased',
                  batch_size=32,
                  device='cuda' if torch.cuda.is_available() else "cpu",
                  top_k=None #top_k=None means return all possible labels
                  )


def food_not_food_classifier(test:str)->Dict[str,float]:
  """Classify a sentence as food / not food.

  Args:
    test: The input sentence to classify. The parameter name is kept unchanged
      for compatibility; the body previously read an undefined name `text`
      instead of this parameter, which is the bug fixed here.

  Returns:
    A dictionary mapping each label returned by the pipeline to its score,
    which is the format `gr.Label` accepts.
  """
  classifier = _load_food_not_food_pipeline()
  # [0] selects the results for the single input sentence
  outputs = classifier(test)[0]
  # Reformat the list of {'label': ..., 'score': ...} items for Gradio
  return {item['label']: item['score'] for item in outputs}

food_not_food_classifier('Yo! We are building local demo')


In [None]:
# Try the function on a sentence that is about food
food_not_food_classifier('My fav food is biryani')

### Building  a small Gradio demo to run locally

1. Import gradio.
2. Create a gradio interface
3. launch the interface

In [None]:
# Build and launch a local Gradio demo around `food_not_food_classifier`
import gradio as gr

# Example sentences offered in the UI (one single-item list per example)
example_inputs = [
    ['I wipped up a fresh batch of code, but it seems to have an syntax error'],
    ['A plate of pancake and strawberry icecream'],
]
label_output = gr.Label(num_top_classes=2)

demo = gr.Interface(
    fn=food_not_food_classifier,
    inputs='text',
    outputs=label_output,
    title='FOOD NOT FOOD CLASSIFIER',
    description='A text classifier to determine if a sentence is about food or not',
    examples=example_inputs,
)

# Start the demo
demo.launch()

## Making our demo publicly accessible

There are two main ways to make our demo publicly accessible with Hugging Face Spaces:
1. Manually - we can go to huggingface.co/spaces --> "Create new Space", add our files and publish.
2. Programmatically - we can use the Hugging Face Hub Python API to add our files to a Space with code.

To create a Space programmatically we are going to need three files:
 1. `app.py` - the main file containing the functionality of our app.
 2. `requirements.txt` - the dependencies our app requires.
 3. `README.md` - explains what our project/demo is about and adds some metadata in YAML format.

 To create the above files we'll use the following file structure.

```
demos/
└── food_not_food_text_classifier/
    ├── app.py
    ├── README.md
    └── requirements.txt
```

In [None]:
# Create the top-level folder that stores our demos
from pathlib import Path
demos_dir = Path('../demos')
demos_dir.mkdir(exist_ok=True)

# Create the sub-folder for the food / not-food text classifier demo.
# The folder name (spelled 'classifer') is the one the %%writefile cells write into.
food_not_food_text_classifier_demo_dir = demos_dir / 'food_not_food_text_classifer'
food_not_food_text_classifier_demo_dir.mkdir(exist_ok=True)

## Making an app.py file
Our 'app.py' will contain the main logic of our application to run.

When we upload it to Hugging face Spaces,Spaces will try to run app.py file automatically.

In our app.py file we want to
 1. Import Packages
 2. Define our function to use our model (this will work with gradio)
 3. Create a demo
 4. Launch the demo

 To create each of our files we are going to use the `%%writefile` cell magic.

In [None]:
%%writefile ../demos/food_not_food_text_classifer/app.py
# 1. Import packages
import torch
import gradio as gr
from functools import lru_cache
from typing import Dict
from transformers import pipeline


# 2. Define the function that uses our model
@lru_cache(maxsize=1)
def _load_food_not_food_pipeline():
  """Build the text-classification pipeline once and reuse it for every request."""
  return pipeline('text-classification',
                  model='AinaSiddiqui/learn_hf_food_not_food_text_classifier-distilbert-base-uncased',
                  batch_size=32,
                  device='cuda' if torch.cuda.is_available() else "cpu",
                  top_k=None #top_k=None means return all possible labels
                  )


def food_not_food_classifier(test:str)->Dict[str,float]:
  """Classify a sentence as food / not food.

  Args:
    test: The input sentence to classify. The parameter name is kept unchanged
      for compatibility; the body previously read an undefined name `text`
      instead of this parameter, which raised a NameError on every call.

  Returns:
    A dictionary mapping each label returned by the pipeline to its score,
    which is the format `gr.Label` accepts.
  """
  classifier = _load_food_not_food_pipeline()
  # [0] selects the results for the single input sentence
  outputs = classifier(test)[0]
  # Reformat the list of {'label': ..., 'score': ...} items for Gradio
  return {item['label']: item['score'] for item in outputs}


# 3. Create a gradio interface
demo=gr.Interface(
    fn=food_not_food_classifier,
    inputs='text',
    outputs=gr.Label(num_top_classes=2),
    title='Food Not Food text Classifier',
    description='A text classifier to determine if a sentence is about food or not',
    examples=[['I wipped up a fresh batch of code, but it seems to have an syntax error'],
              ['A plate of pancake and strawberry icecream']]

)

# 4. Launch the interface when this file is run as a script.
# The module name of a directly executed file is '__main__'; the previous
# comparison against 'main.py' was never true, so the demo never started.
if __name__=='__main__':
  demo.launch()


### Making a Readme file
This file is in Markdown format, with a special YAML block at the top.

The YAML block is used for metadata and settings.

In [None]:
%%writefile ../demos/food_not_food_text_classifer/README.md
---

title: FOOD NOT FOOD TEXT CLASSIFIER
emoji: 🍕🍗🚫
colorFrom: blue
colorTo: yellow
sdk: gradio
sdk_version: 5.8.0
app_file: app.py
pinned: false
license: apache-2.0
---
# 🍕🍗🚫 Food Not Food text Classifier

Small demo to showcase a text classifier to determine if the sentence is about food or not food.

A small DistilBERT model fine-tuned on a small synthetic dataset of 250 generated food / not food image captions.


## Making a Requirements.txt file

When you upload app.py file to Hugging face Spaces,it will attempt to run it automatically.

And just like running the file locally, we need to make sure all of the required packages are available

Otherwise our Space will throw an error like
```
 Traceback (most recent call last):
  File "app.py", line 1, in <module>
    import torch
ModuleNotFoundError: No module named 'torch'
```


In [None]:
%%writefile ../demos/food_not_food_text_classifer/requirements.txt
torch
transformers


# Uploading our demo to Hugging Face Spaces

1. Import the necessary packages
2. Define what we want to upload
3. Create a repo
4. Get the full name of the repo
5. Upload the contents of our local demo folder (under `../demos/`) to the Hugging Face Hub
6. Check that it all works by inspecting the Space

In [None]:
# Import the Hugging Face Hub helpers used for uploading
import os
from huggingface_hub import (create_repo,get_full_repo_name,upload_file,upload_folder)

# Parameters for uploading our Space.
# The folder path must match the directory the earlier cells create and write to
# (note its spelling: 'classifer'); the previous value pointed at a differently
# spelled folder that those cells never create.
LOCAL_DEMO_FOLDER_PATH_TO_UPLOAD="../demos/food_not_food_text_classifer/"
HF_TARGET_SPACE_NAME='learn_hf_food_not_food_text_classifier'
HF_REPO_TYPE='space'
HF_SPACE_SDK='gradio'
# SECURITY: an access token must never be hard-coded in a notebook. It is read from
# the HF_TOKEN environment variable; when unset, None lets the Hub client fall back
# to credentials stored by a previous login.
# NOTE(review): the token that used to be embedded in this cell should be treated as
# compromised and revoked in the Hugging Face account settings.
HF_TOKEN=os.environ.get('HF_TOKEN')

# Create the repo (exist_ok=True makes this a no-op when it already exists)
print(f'[INFO] Create a Hugging Face Hub with name {HF_TARGET_SPACE_NAME}')
create_repo(
    repo_id=HF_TARGET_SPACE_NAME,
    repo_type=HF_REPO_TYPE,
    private=False,
    space_sdk=HF_SPACE_SDK,
    exist_ok=True,
    token=HF_TOKEN
)

# Get the full repo name (namespace/name) for the authenticated account
hf_full_repo_name=get_full_repo_name(model_id=HF_TARGET_SPACE_NAME,token=HF_TOKEN)
print(f'[INFO] Get Full Hugging Face Hub repo name {hf_full_repo_name}')

# Upload our demo folder to the root of the repo.
# `path_in_repo` is the keyword upload_folder accepts for the destination inside
# the repo; the previous `path=` keyword is not a parameter of that function.
print(f'[INFO] Uploading {LOCAL_DEMO_FOLDER_PATH_TO_UPLOAD} to repo {hf_full_repo_name}')
folder_upload_url=upload_folder(
    repo_id=hf_full_repo_name,
    folder_path=LOCAL_DEMO_FOLDER_PATH_TO_UPLOAD,
    path_in_repo=".",
    token=HF_TOKEN,
    repo_type=HF_REPO_TYPE,
    commit_message='Uploading food not food text classifier demo from our notebook',
)
print(f'[INFO] Demo Uploaded Successfully with commit URL {folder_upload_url}')

In [None]:
# Log in to the Hugging Face Hub. A bare `huggingface-cli login` line is a shell
# command, not Python, so it fails with a SyntaxError inside a code cell; `login()`
# is the Python equivalent and prompts for the token interactively.
# NOTE(review): cells that push to the Hub need these credentials, so this cell
# should be run before them.
from huggingface_hub import login
login()
