In [None]:
!pip install --quiet tensorflow tensorflow_hub tensorflow_text pandas numpy matplotlib seaborn transformers

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import transformers
from sklearn.model_selection import train_test_split

In [None]:
# Read the raw CSV and normalise the column names in a single chained
# expression. The 'v1'/'v2' headers (when present) become 'label'/'text'.
df = (
    pd.read_csv('data.csv', encoding='latin-1')
    .rename(columns={'v1': 'label', 'v2': 'text'})
)

# Summary statistics for each label, shown as the cell's output.
df.groupby('label').describe()

Unnamed: 0_level_0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
ham,4825.0,2793.421762,1604.646764,0.0,1409.0,2794.0,4184.0,5571.0
spam,747.0,2734.331995,1634.377702,2.0,1227.0,2718.0,4149.5,5567.0


**Dataset**

In [None]:
# Preview the first rows of the frame to sanity-check the 'label' and 'text' columns.
df.head()

Unnamed: 0.1,Unnamed: 0,label,text
0,0,ham,"Go until jurong point, crazy.. Available only ..."
1,1,ham,Ok lar... Joking wif u oni...
2,2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,3,ham,U dun say so early hor... U c already then say...
4,4,ham,"Nah I don't think he goes to usf, he lives aro..."


Convert the label from a string (`ham` / `spam`) to an integer (`0` = ham, `1` = spam)

In [None]:
# Encode the target as integers: 1 for 'spam', 0 for anything else.
# The dtype guard keeps this cell idempotent: once the column already holds
# integers, `df['label'] == 'spam'` is False for every row, so an unguarded
# np.where would silently overwrite all labels with 0 on a second run.
if not pd.api.types.is_integer_dtype(df['label']):
    df['label'] = np.where(df['label'] == 'spam', 1, 0)

In [None]:
# Stratified split into train, validation and test sets.

# Step 1: hold out 30% of the rows, stratifying on the label.
train_text, temp_text, train_labels, temp_labels = train_test_split(
    df['text'],
    df['label'],
    test_size=0.3,
    stratify=df['label'],
    random_state=42,
)

# Step 2: cut the held-out portion in half -> validation and test sets.
val_text, test_text, val_labels, test_labels = train_test_split(
    temp_text,
    temp_labels,
    test_size=0.5,
    stratify=temp_labels,
    random_state=42,
)

In [None]:
# Load the DistilBERT tokenizer and encoder from the same checkpoint.
# The tokenizer class has to match the checkpoint: loading
# 'distilbert-base-uncased' through BertTokenizer raises a class-mismatch
# warning and produces an extra `token_type_ids` entry that the model below
# has no input for.
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)
distilbert_encoder = transformers.TFDistilBertModel.from_pretrained('distilbert-base-uncased')

# Define the input pipelines
max_length = 128
batch_size = 32


def encode_texts(texts):
    """Tokenize an iterable of strings into fixed-width encoder inputs.

    Every sequence is truncated or padded to exactly ``max_length`` tokens.
    Padding to a fixed width (instead of to the longest text in the split)
    keeps the arrays compatible with a model whose inputs are declared with
    shape ``(max_length,)``, no matter how short the texts in a split are.
    """
    return tokenizer(list(texts), truncation=True, padding='max_length', max_length=max_length)


def make_dataset(encodings, labels):
    """Pair tokenizer output with labels and batch them into a tf.data.Dataset."""
    return tf.data.Dataset.from_tensor_slices((dict(encodings), labels)).batch(batch_size)


train_encodings = encode_texts(train_text)
val_encodings = encode_texts(val_text)
test_encodings = encode_texts(test_text)

train_dataset = make_dataset(train_encodings, train_labels)
val_dataset = make_dataset(val_encodings, val_labels)
test_dataset = make_dataset(test_encodings, test_labels)


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


Downloading tf_model.h5:   0%|          | 0.00/363M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['activation_13', 'vocab_layer_norm', 'vocab_transform', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [None]:
# Define the model architecture: DistilBERT encoder followed by a small dense head.

# Seed TensorFlow before the new layers are created so that the randomly
# initialised head and the dropout masks are repeatable between runs, in line
# with the fixed random_state used for the data splits. Some GPU kernels may
# still introduce small run-to-run differences.
tf.random.set_seed(42)

# Symbolic inputs; the names let Keras route the matching entries of the
# feature dicts in the datasets to the right tensor.
input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')

# Element 0 of the encoder output holds the per-token hidden states; slicing
# [:, 0] keeps only the first position of every sequence (the [CLS] token for
# BERT-style tokenizers) as a fixed-size summary of the text.
distilbert_outputs = distilbert_encoder(input_ids, attention_mask=attention_mask)
pooled_output = distilbert_outputs[0][:, 0]

# Classification head: dropout -> 256-unit ReLU layer -> single sigmoid unit
# that outputs the probability of the positive class (label 1).
dropout = tf.keras.layers.Dropout(0.3)(pooled_output)
dense = tf.keras.layers.Dense(256, activation='relu')(dropout)
output = tf.keras.layers.Dense(1, activation='sigmoid')(dense)

model = tf.keras.models.Model(inputs=[input_ids, attention_mask], outputs=output)


In [None]:
# Configure training: Adam with a learning rate of 2e-5, binary cross-entropy
# as the loss for the single sigmoid output, and accuracy as the reported metric.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [None]:
# Fit on the training batches and score the validation batches after every
# epoch; the returned History object keeps the per-epoch metrics.
epochs = 5
history = model.fit(
    train_dataset,
    epochs=epochs,
    validation_data=val_dataset,
)

Epoch 1/5


  inputs = self._flatten_to_reference_inputs(inputs)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Score the trained model on the held-out test batches.
# evaluate() returns the loss followed by the compiled metrics, in that order.
test_loss, test_accuracy = model.evaluate(test_dataset)

# Report both numbers, one per line.
for metric_name, metric_value in (('Test Loss:', test_loss), ('Test Accuracy:', test_accuracy)):
    print(metric_name, metric_value)

Test Loss: 0.021957039833068848
Test Accuracy: 0.9940191507339478
