In [2]:
!pip install -q transformers datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import tensorflow as tf
import pandas as pd
import numpy as np

## Task

* Given the text of a news article, predict which of the four AG News topics (world, sports, business, sci/tech) it belongs to


## Preprocess Data

In [4]:
from datasets import load_dataset

# Load the AG News dataset by name; the result holds a "train" and a "test" split.
dataset = load_dataset("ag_news")
# Bare last expression: show the split names, features and row counts.
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [5]:
# Convert each split to a pandas DataFrame for exploration and text clean-up.
# Dataset.to_pandas() converts the split directly, instead of letting the
# DataFrame constructor iterate over all examples one by one.
train_df = dataset["train"].to_pandas()
test_df = dataset["test"].to_pandas()
train_df.head()

Unnamed: 0,text,label
0,Wall St. Bears Claw Back Into the Black (Reute...,2
1,Carlyle Looks Toward Commercial Aerospace (Reu...,2
2,Oil and Economy Cloud Stocks' Outlook (Reuters...,2
3,Iraq Halts Oil Exports from Main Southern Pipe...,2
4,"Oil prices soar to all-time record, posing new...",2


In [6]:
# view a sample text
train_df['text'].iloc[2]

"Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\\about the economy and the outlook for earnings are expected to\\hang over the stock market next week during the depth of the\\summer doldrums."

In [7]:
# Count the training rows per class id to see how balanced the classes are.
label_counts = train_df["label"].value_counts()
label_counts

2    30000
3    30000
1    30000
0    30000
Name: label, dtype: int64

0 = world,
1 = sports,
2 = business,
3 = sci/tech

### One-hot encode the labels

In [8]:
# Expand the integer class id into one indicator column per class
# (label_0 ... label_3).
# - dtype is pinned so the columns hold 0/1 on every pandas version
#   (pandas >= 2.0 would otherwise produce True/False columns).
# - The guard keeps the cell re-runnable: once "label" has been expanded it no
#   longer exists, and calling get_dummies on it again would raise.
if "label" in train_df.columns:
    train_df = pd.get_dummies(train_df, columns=["label"], dtype="uint8")
train_df

Unnamed: 0,text,label_0,label_1,label_2,label_3
0,Wall St. Bears Claw Back Into the Black (Reute...,0,0,1,0
1,Carlyle Looks Toward Commercial Aerospace (Reu...,0,0,1,0
2,Oil and Economy Cloud Stocks' Outlook (Reuters...,0,0,1,0
3,Iraq Halts Oil Exports from Main Southern Pipe...,0,0,1,0
4,"Oil prices soar to all-time record, posing new...",0,0,1,0
...,...,...,...,...,...
119995,Pakistan's Musharraf Says Won't Quit as Army C...,1,0,0,0
119996,Renteria signing a top-shelf deal Red Sox gene...,0,1,0,0
119997,Saban not going to Dolphins yet The Miami Dolp...,0,1,0,0
119998,Today's NFL games PITTSBURGH at NY GIANTS Time...,0,1,0,0


### Clean up the text

In [9]:
import nltk
import string
import re
from nltk.stem import PorterStemmer

nltk.download("punkt")

def preprocess_text(text):
  """Normalise a raw news string into space-separated, stemmed, lower-case tokens.

  Steps, in order: lower-case, turn backslashes into spaces, delete the
  remaining punctuation, delete digit runs, tokenize with NLTK and
  Porter-stem every token.

  Args:
    text: the raw article text (a ``str``).

  Returns:
    A single string made of the stemmed tokens joined by one space.
  """
  text = text.lower()

  # The raw AG News strings carry a backslash where a line break used to be
  # (e.g. "worries\about"). Deleting it together with the other punctuation
  # would fuse the two neighbouring words into one token, so turn it into a
  # space first.
  text = text.replace("\\", " ")

  # remove punctuation
  text = text.translate(str.maketrans("", "", string.punctuation))

  # remove numbers
  text = re.sub(r'\d+', '', text)

  # stemming -- build the stemmer once and keep it on the function object, so
  # applying this to a whole DataFrame column does not construct one per row
  stemmer = getattr(preprocess_text, "_stemmer", None)
  if stemmer is None:
    stemmer = preprocess_text._stemmer = PorterStemmer()
  tokens = nltk.word_tokenize(text)
  stemmed_text = ' '.join(stemmer.stem(token) for token in tokens)

  return stemmed_text

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [11]:
# Swap the raw text column for its cleaned, stemmed form: preprocess_text is
# called once per row and the result takes the place of the "text" column.
train_df = train_df.assign(text=train_df["text"].map(preprocess_text))
train_df

Unnamed: 0,text,label_0,label_1,label_2,label_3
0,wall st bear claw back into the black reuter r...,0,0,1,0
1,carlyl look toward commerci aerospac reuter re...,0,0,1,0
2,oil and economi cloud stock outlook reuter reu...,0,0,1,0
3,iraq halt oil export from main southern pipeli...,0,0,1,0
4,oil price soar to alltim record pose new menac...,0,0,1,0
...,...,...,...,...,...
119995,pakistan musharraf say wont quit as armi chief...,1,0,0,0
119996,renteria sign a topshelf deal red sox gener ma...,0,1,0,0
119997,saban not go to dolphin yet the miami dolphin ...,0,1,0,0
119998,today nfl game pittsburgh at ny giant time pm ...,0,1,0,0


### Train test split

In [12]:
# train test split
from sklearn.model_selection import train_test_split

# Features are the text column; targets are all remaining columns of train_df.
# test_size is left at scikit-learn's default. random_state pins the shuffle so
# that re-running the notebook reproduces exactly the same split.
X_train, X_test, y_train, y_test = train_test_split(
    train_df["text"],
    train_df.drop(["text"], axis=1),
    random_state=42,
)

### Tokenizer

In [13]:
from transformers import AutoTokenizer

# Load the tokenizer published under the same checkpoint name
# ("distilbert-base-uncased") as the model that is fine-tuned later on.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [14]:
# Peek at the first raw training example (its text and integer label).
dataset["train"][0]

{'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.",
 'label': 2}

In [15]:
def preprocess_function(examples):
  """Tokenize the ``"text"`` entry of a batch of dataset examples.

  The module-level ``tokenizer`` is called with ``truncation=True``; no
  padding is requested here.

  Args:
    examples: mapping with a ``"text"`` entry, as handed over by ``dataset.map``.

  Returns:
    Whatever ``tokenizer`` returns for those texts.
  """
  batch_text = examples["text"]
  encoding = tokenizer(batch_text, truncation=True)
  return encoding

In [20]:
# Tokenize every split; batched=True passes preprocess_function a batch of
# examples per call instead of a single example.
# NOTE(review): this maps over the raw `dataset`; the stemmed text in train_df
# and the X_train/y_train split built earlier are not used here -- confirm
# that this is intended.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [16]:
from transformers import DataCollatorWithPadding

# Collator later passed as collate_fn when the tf.data pipelines are built;
# return_tensors="tf" makes it produce TensorFlow tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

## Evaluate

In [17]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0


In [18]:
import evaluate

# Load the "accuracy" metric; compute_metrics calls accuracy.compute(...) on it.
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [19]:
import numpy as np


def compute_metrics(eval_pred):
    """Score model outputs with the module-level ``accuracy`` metric.

    Args:
        eval_pred: a ``(predictions, labels)`` pair. The arg-max over axis 1
            of ``predictions`` is taken as the predicted class id.

    Returns:
        The result of ``accuracy.compute`` for the predicted ids against
        ``labels``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

## Train

In [21]:
from transformers import create_optimizer
import tensorflow as tf

batch_size = 16
# Must equal the number of epochs actually trained (the training cell fits for
# 3 epochs): the schedule below is sized from it, so a larger value here means
# the learning-rate decay is only partly completed when training stops.
num_epochs = 3
batches_per_epoch = len(tokenized_dataset["train"]) // batch_size
# Total number of optimizer steps the schedule is spread over.
total_train_steps = batches_per_epoch * num_epochs
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

In [23]:
from transformers import TFAutoModelForSequenceClassification

# Human-readable names for the four AG News class ids. Storing them in the
# model config lets model.config.id2label[class_id] return a topic name, and
# the mapping is kept with the model when it is saved.
id2label = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}
label2id = {name: class_id for class_id, name in id2label.items()}

# Pretrained DistilBERT body with a sequence-classification head sized for
# the four classes.
model = TFAutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [24]:
# Build the tf.data pipelines. Both reuse the batch_size constant that
# batches_per_epoch (and therefore the learning-rate schedule) was derived
# from, so changing the batch size in one place keeps everything in step.
tf_train_set = model.prepare_tf_dataset(
    tokenized_dataset["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)

# The held-out split is not shuffled.
tf_validation_set = model.prepare_tf_dataset(
    tokenized_dataset["test"],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [25]:
# No loss is passed on purpose: per the Transformers documentation, its Keras
# models fall back to a built-in, task-appropriate loss when compile() is
# given none.
model.compile(optimizer=optimizer)  # No loss argument!

In [26]:
from transformers.keras_callbacks import KerasMetricCallback

# Keras callback that evaluates compute_metrics (accuracy) on
# tf_validation_set; it is handed to model.fit via callbacks=[...].
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)

In [27]:
# Train for num_epochs -- the same constant total_train_steps was computed
# from -- so the learning-rate schedule spans exactly the training run.
model.fit(
    tf_train_set,
    validation_data=tf_validation_set,
    epochs=num_epochs,
    callbacks=[metric_callback],
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7bd1f002fb50>

In [30]:
from transformers import AutoTokenizer
text = "Australia has the highest percentage of Kangaroos"

# Tokenize the single sentence into TensorFlow tensors and run a forward pass.
inputs = tokenizer(text, return_tensors="tf")
logits = model(**inputs).logits

# Index of the largest logit = predicted class id
# (0 = world, 1 = sports, 2 = business, 3 = sci/tech).
predicted_class_id = int(tf.math.argmax(logits, axis=-1)[0])

# Print with two separate statements: joining the calls with a comma would
# make the cell's last expression a tuple and echo a stray `(None, None)`.
print(logits)
print(predicted_class_id)

# model.config.id2label[predicted_class_id]

tf.Tensor([[ 2.59918   -2.0999012 -2.4402075  1.2873685]], shape=(1, 4), dtype=float32)
0


(None, None)

In [31]:
# Save the fine-tuned model into a directory given as a relative path.
model.save_pretrained("distilbert_text_classification_model")

### Loading the saved fine-tuned model

In [1]:
from transformers import TFAutoModelForSequenceClassification

# Reload the fine-tuned classifier from disk.
# NOTE(review): the training section saves the model to the relative directory
# "distilbert_text_classification_model", while this loads from a Google Drive
# path. Presumably the folder was copied to Drive by hand and Drive was mounted
# outside this notebook -- confirm, otherwise this cell fails on a clean run.
model = TFAutoModelForSequenceClassification.from_pretrained('/content/drive/MyDrive/distilbert_text_classification_model')

All model checkpoint layers were used when initializing TFDistilBertForSequenceClassification.

All the layers of TFDistilBertForSequenceClassification were initialized from the model checkpoint at /content/drive/MyDrive/distilbert_text_classification_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [2]:
from transformers import AutoTokenizer
import tensorflow as tf
text = "Australia has the highest percentage of Kangaroos"

# The tokenizer has to be loaded again in this session before it can be used.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize the single sentence into TensorFlow tensors and run a forward pass
# through the reloaded model.
inputs = tokenizer(text, return_tensors="tf")
logits = model(**inputs).logits

# Index of the largest logit = predicted class id
# (0 = world, 1 = sports, 2 = business, 3 = sci/tech).
predicted_class_id = int(tf.math.argmax(logits, axis=-1)[0])

# Print with two separate statements: joining the calls with a comma would
# make the cell's last expression a tuple and echo a stray `(None, None)`.
print(logits)
print(predicted_class_id)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tf.Tensor([[ 2.59918   -2.0999012 -2.4402075  1.2873685]], shape=(1, 4), dtype=float32)
0


(None, None)

In [3]:
# Inspect what the tokenizer produced for the sentence: input_ids and attention_mask.
inputs

{'input_ids': <tf.Tensor: shape=(1, 10), dtype=int32, numpy=
array([[  101,  2660,  2038,  1996,  3284,  7017,  1997, 21652,  2015,
          102]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 10), dtype=int32, numpy=array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int32)>}

0 = world,
1 = sports,
2 = business,
3 = sci/tech

In [4]:
from transformers import AutoTokenizer
import tensorflow as tf


tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def predict_text(text):
  """Classify one piece of text with the loaded model and return its class id.

  The logits and the chosen class id are also printed.

  Args:
    text: the string to classify.

  Returns:
    int: index of the largest logit for the first input
    (0 = world, 1 = sports, 2 = business, 3 = sci/tech).
  """
  # truncation=True mirrors the tokenization used for training, so over-long
  # text is cut by the tokenizer rather than passed on at full length.
  inputs = tokenizer(text, truncation=True, return_tensors="tf")

  logits = model(**inputs).logits

  predicted_class_id = int(tf.math.argmax(logits, axis=-1)[0])

  # Two plain statements instead of a throw-away `print(...), print(...)` tuple.
  print(logits)
  print(predicted_class_id)
  return predicted_class_id

# Try the classifier on three hand-written sentences. Each call prints its
# logits and class id; only the last call's return value is echoed as the
# cell output.
predict_text("Belgium soccer team wins the world cup every year")
predict_text("Bitcoin will eventually run the world")
predict_text("The royal family owns a large country")

tf.Tensor([[ 1.3730699  4.8763914 -2.8066082 -3.2761738]], shape=(1, 4), dtype=float32)
1
tf.Tensor([[-0.12236271 -4.801699   -0.6200071   3.260698  ]], shape=(1, 4), dtype=float32)
3
tf.Tensor([[ 2.3290505 -2.7871583  1.164522  -2.0756993]], shape=(1, 4), dtype=float32)
0


0

In [5]:
predict_text("Maradona is the top scorer in soccer ever")

tf.Tensor([[ 1.8160654  3.7035215 -1.974746  -3.5851183]], shape=(1, 4), dtype=float32)
1


1

In [6]:
predict_text("Spain has a large population")

tf.Tensor([[ 3.0845523  -2.8736486  -0.6422905  -0.63165694]], shape=(1, 4), dtype=float32)
0


0