In [17]:
import pandas as pd
import tensorflow as tf
import tensorflow.keras
from transformers import BertTokenizer, TFBertForSequenceClassification

In [11]:
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=5)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### A. Loading Dataset

In [3]:
data = pd.read_csv("./Text Classification Documentation - 2440023002 - Andreas Christianto.csv")

texts = data['Text'].tolist()
labels = data['Label'].tolist()

In [4]:
data.head()

Unnamed: 0,Text,Label
0,Budget to set scene for election\n \n Gordon B...,0
1,Army chiefs in regiments decision\n \n Militar...,0
2,Howard denies split over ID cards\n \n Michael...,0
3,Observers to monitor UK election\n \n Minister...,0
4,Kilroy names election seat target\n \n Ex-chat...,0


### B. Tokenize

In [5]:
def convert_example_to_feature(text, label):
    inputs = tokenizer(text, max_length=128, truncation=True, padding='max_length', add_special_tokens=True, return_token_type_ids=False, return_tensors='tf')
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'label': label
    }

features = [convert_example_to_feature(text, label) for text, label in zip(texts, labels)]

### C. Train Test Split

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
input_ids = [f['input_ids'] for f in features]
attention_masks = [f['attention_mask'] for f in features]
labels = [f['label'] for f in features]

train_input_ids, test_input_ids, train_attention_masks, test_attention_masks, train_labels, test_labels = train_test_split(input_ids, attention_masks, labels, test_size=0.2, random_state=42)

In [8]:
train_dataset = tf.data.Dataset.from_tensor_slices(({
    'input_ids': tf.convert_to_tensor(train_input_ids),
    'attention_mask': tf.convert_to_tensor(train_attention_masks)
}, tf.convert_to_tensor(train_labels)))

test_dataset = tf.data.Dataset.from_tensor_slices(({
    'input_ids': tf.convert_to_tensor(test_input_ids),
    'attention_mask': tf.convert_to_tensor(test_attention_masks)
}, tf.convert_to_tensor(test_labels)))

# Shuffle and batch the training dataset
train_dataset = train_dataset.shuffle(100).batch(8).repeat(2)
test_dataset = test_dataset.batch(8)


### D. Model Training

In [13]:
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [14]:
history = model.fit(train_dataset, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


### E. Model Evaluation

In [15]:
results = model.evaluate(test_dataset)
print(f"Test Loss: {results[0]}")
print(f"Test Accuracy: {results[1]}")

Test Loss: 0.10718000680208206
Test Accuracy: 0.9707865118980408
