In [1]:
!pip install transformers datasets evaluate accelerate

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.29.1-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.3/297.3 kB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import pandas as pd

df = pd.read_csv('spam.csv')
df.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [4]:
df_spam = df[df['Category']=='spam']
df_spam.shape

(747, 2)

In [5]:
df_ham = df[df['Category']=='ham']
df_ham.shape

(4825, 2)

In [6]:
df_ham_downsampled = df_ham.sample(df_spam.shape[0])
df_ham_downsampled.shape

(747, 2)

In [7]:
df_balanced = pd.concat([df_ham_downsampled, df_spam])
df_balanced.shape

(1494, 2)

In [8]:
df_balanced['Category'].value_counts()

Category
ham     747
spam    747
Name: count, dtype: int64

In [9]:
# Convert the category to a binary variable
df_balanced['spam'] = df_balanced['Category'].apply(lambda x: 1 if x=='spam' else 0)

In [10]:
# drop the old category column
df_balanced.drop('Category', axis='columns', inplace=True)
df_balanced.head()

Unnamed: 0,Message,spam
2584,"Goodmorning, today i am late for 1hr.",0
4539,Dare i ask... Any luck with sorting out the car?,0
1744,I love to wine and dine my lady!,0
1477,I'm watching lotr w my sis dis aft. So u wan 2...,0
1068,Meeting u is my work. . . Tel me when shall i ...,0


In [11]:
# rename columns to 'text' and 'label'
df_balanced.rename(columns={'Message': 'text'}, inplace=True)
df_balanced.rename(columns={'spam': 'label'}, inplace=True)
df_balanced.head()

Unnamed: 0,text,label
2584,"Goodmorning, today i am late for 1hr.",0
4539,Dare i ask... Any luck with sorting out the car?,0
1744,I love to wine and dine my lady!,0
1477,I'm watching lotr w my sis dis aft. So u wan 2...,0
1068,Meeting u is my work. . . Tel me when shall i ...,0


In [12]:
# Split the data into training and testing
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(df_balanced, test_size=0.2, random_state=42)

In [13]:
print(train_data.shape)
print(test_data.shape)

(1195, 2)
(299, 2)


In [32]:
from datasets import Dataset
ds_train = Dataset.from_dict({"text": train_data['text'], "label": train_data['label']})
ds_test = Dataset.from_dict({"text": test_data['text'], "label": test_data['label']})


In [33]:
print(ds_test)

Dataset({
    features: ['text', 'label'],
    num_rows: 299
})


In [34]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [35]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [37]:
tokenized_data_train = ds_train.map(preprocess_function, batched=True)
tokenized_data_test = ds_test.map(preprocess_function, batched=True)

Map:   0%|          | 0/1195 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/299 [00:00<?, ? examples/s]

In [38]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [39]:
import evaluate

accuracy = evaluate.load("accuracy")

In [40]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [41]:
id2label = {0: "ham", 1: "spam"}
label2id = {"ham": 0, "spam": 1}

In [42]:
from transformers import create_optimizer
import tensorflow as tf

batch_size = 16
num_epochs = 5
batches_per_epoch = len(tokenized_data_train) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

In [43]:
# Check TensorFlow and Transformers versions
import tensorflow as tf
import transformers

print("TensorFlow version:", tf.__version__)
print("Transformers version:", transformers.__version__)

TensorFlow version: 2.15.0
Transformers version: 4.38.2


In [44]:
from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [45]:
tf_train_set = model.prepare_tf_dataset(
    tokenized_data_train,
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_data_test,
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

In [46]:
import tensorflow as tf

model.compile(optimizer=optimizer)  # No loss argument!

In [47]:
from transformers.keras_callbacks import KerasMetricCallback

metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)

In [48]:
callbacks = [metric_callback]

In [49]:
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=callbacks)

Epoch 1/3


Cause: for/else statement not yet supported


Cause: for/else statement not yet supported
Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x7a39ed0cfa60>

In [50]:
model.save('spam-detector')



In [52]:
!pip install tensorflowjs

Collecting tensorflowjs
  Downloading tensorflowjs-4.17.0-py3-none-any.whl (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.1/89.1 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting tensorflow-decision-forests>=1.5.0 (from tensorflowjs)
  Downloading tensorflow_decision_forests-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (15.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.5/15.5 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
Collecting packaging~=23.1 (from tensorflowjs)
  Downloading packaging-23.2-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.0/53.0 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting tensorflow<3,>=2.13.0 (from tensorflowjs)
  Downloading tensorflow-2.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (589.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m589.8/589.8 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00

In [53]:
!tensorflowjs_converter --input_format=tf_saved_model \
                       --output_format=tfjs_layers_model \
                       spam-detector \
                       spam-detector-js


Traceback (most recent call last):
  File "/usr/local/bin/tensorflowjs_converter", line 8, in <module>
    sys.exit(pip_main())
  File "/usr/local/lib/python3.10/dist-packages/tensorflowjs/converters/converter.py", line 958, in pip_main
    main([' '.join(sys.argv[1:])])
  File "/usr/local/lib/python3.10/dist-packages/tensorflowjs/converters/converter.py", line 962, in main
    convert(argv[0].split(' '))
  File "/usr/local/lib/python3.10/dist-packages/tensorflowjs/converters/converter.py", line 948, in convert
    _dispatch_converter(input_format, output_format, args, quantization_dtype_map,
  File "/usr/local/lib/python3.10/dist-packages/tensorflowjs/converters/converter.py", line 720, in _dispatch_converter
    raise ValueError(
ValueError: Unsupported input_format - output_format pair: tf_saved_model - tfjs_layers_model


In [57]:
!zip -r /content/drive/MyDrive/model.zip spam-detector/

  adding: spam-detector/ (stored 0%)
  adding: spam-detector/variables/ (stored 0%)
  adding: spam-detector/variables/variables.data-00000-of-00001 (deflated 27%)
  adding: spam-detector/variables/variables.index (deflated 78%)
  adding: spam-detector/fingerprint.pb (stored 0%)
  adding: spam-detector/saved_model.pb (deflated 92%)
  adding: spam-detector/keras_metadata.pb (deflated 94%)
  adding: spam-detector/assets/ (stored 0%)


In [56]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
