In [1]:
# Install dependencies
!pip install numpy
!pip install pandas
!pip install tensorflow
!pip install scikit-learn
!pip install transformers



In [2]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizerFast, TFBertForSequenceClassification
from keras.callbacks import ModelCheckpoint

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Report whether TensorFlow can see a GPU.
gpu_name = tf.test.gpu_device_name()
if not gpu_name:
    print("No GPU with TensorFlow support found")
else:
    print('Default GPU Device Details: {}'.format(gpu_name))

# Probe for a TPU; a ValueError from the probe is treated as "no TPU present".
tpu = None
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except ValueError:
    tpu = None

# Choose the distribution strategy that matches the detected hardware.
if not tpu:
    # Default strategy: works on CPU and on a single GPU.
    strategy = tf.distribute.get_strategy()
    print('Not running on TPU')
else:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)


No GPU with TensorFlow support found
Not running on TPU
REPLICAS:  1


In [4]:
# Load the dataset (training and validation are stored as separate CSV files)
print("Loading dataset")
df_train = pd.read_csv('../data/training.csv')
df_val = pd.read_csv('../data/validation.csv')

# Encode labels as integer class ids.
# The encoder is fitted on the training labels only and then reused for the
# validation labels so both sets share one label -> id mapping. transform()
# raises ValueError if validation contains a label never seen in training.
# NOTE(review): assumes validation.csv stores the same raw label values as
# training.csv - confirm.
le = LabelEncoder()
df_train['cyberbullying_type'] = le.fit_transform(df_train['cyberbullying_type'])
df_val['cyberbullying_type'] = le.transform(df_val['cyberbullying_type'])

# Extract texts and labels as plain Python lists.
# The pandas method is `tolist()`; `toList()` does not exist and raises
# AttributeError.
train_texts, train_labels = df_train['tweet_text'].tolist(), df_train['cyberbullying_type'].tolist()
val_texts, val_labels = df_val['tweet_text'].tolist(), df_val['cyberbullying_type'].tolist()

Tokenizing


In [None]:
# Tokenize both splits and wrap them in batched tf.data pipelines
print("Tokenizing...")

# Use the custom-trained tokenizer stored with the project's model files.
# To use the stock vocabulary instead, swap in:
# tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('../models/bert/tokenizer/')

# Same settings for both splits: truncation and padding enabled, with a
# maximum length of 512. Training texts are encoded first, then validation.
train_encodings, val_encodings = (
    tokenizer(texts, truncation=True, padding=True, max_length=512)
    for texts in (train_texts, val_texts)
)

# Training pipeline: (features dict, label) pairs, shuffled with a
# 1000-element buffer and served in batches of 8.
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels))
train_dataset = train_dataset.shuffle(1000).batch(8)

# Validation pipeline: no shuffling, batches of 32.
val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_encodings), val_labels))
val_dataset = val_dataset.batch(32)

In [5]:
# Build and compile the model inside the distribution strategy chosen during
# accelerator detection. With the default (CPU / single GPU) strategy the scope
# is a no-op; under TPUStrategy the model variables must be created inside it.
with strategy.scope():
    # Load the custom BERT model from the same SavedModel directory that the
    # save cell further down writes to.
    # To start from the stock pre-trained checkpoint instead, use:
    # model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(le.classes_))
    model = tf.keras.models.load_model('../models/bert/tf_saved_model/')

    # Define optimizer, loss, and metrics.
    # from_logits=True: the loss applies softmax itself.
    # NOTE(review): assumes the loaded model outputs raw logits - confirm.
    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metrics = ['accuracy']

    # Compile the model
    print("Compiling...")
    model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# Create a callback that saves the model's weights whenever validation
# accuracy improves.
# - Taken from tf.keras so it comes from the same Keras implementation as the
#   model, optimizer and loss above.
# - save_weights_only=True matches the stated intent (weights, not the full
#   model).
# - The '.weights.h5' suffix is used because newer Keras releases require it
#   for weight-only checkpoints; older releases accept it as a normal HDF5 name.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath='model.{epoch:02d}-{val_loss:.2f}.weights.h5',
    monitor='val_accuracy',
    save_best_only=True,
    save_weights_only=True,
    mode='max',
    verbose=1,
)

# Train the model.
# The checkpoint callback must be passed to fit(); otherwise it is never
# invoked and no checkpoint files are written.
print("Training...")
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=3,
    callbacks=[checkpoint],
)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Compiling...
Training...
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [9]:
# Persist the trained model to the project's SavedModel directory.
# This is the same path the training cell loads from, so the earlier export
# there is replaced.
model.save(filepath='../models/bert/tf_saved_model')



INFO:tensorflow:Assets written to: ./tf_saved_model\assets


INFO:tensorflow:Assets written to: ./tf_saved_model\assets


In [10]:
# Write the tokenizer files next to the model. This stays the last expression
# of the cell so the returned tuple of written file paths is displayed.
tokenizer.save_pretrained(save_directory='../models/bert/tokenizer/')

('./bert_tokenizer/tokenizer_config.json',
 './bert_tokenizer/special_tokens_map.json',
 './bert_tokenizer/vocab.txt',
 './bert_tokenizer/added_tokens.json',
 './bert_tokenizer/tokenizer.json')