## Huggingface BERT model finetuning with native Tensorflow

In [1]:
# Dependencies

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm
import numpy as np
import tensorflow as tf
import nltk
from collections import Counter
import string
import itertools
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import tensorflow_text as text
import tensorflow_hub as hub
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import GlobalMaxPool1D, BatchNormalization, Dense, RNN, GRU, LSTM, TimeDistributed, Bidirectional, Activation, Embedding, Input, Conv1D, Dropout
import tensorflow as tf
import keras.backend as K
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification
from scipy.special import softmax
from transformers import pipeline
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments
from tensorflow.keras import mixed_precision
from tensorflow.keras.utils import to_categorical
from datasets import DatasetDict, Dataset

# tf.config.set_visible_devices([], 'GPU')
plt.style.use('ggplot')

# Define mixed precision policy
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce RTX 3070, compute capability 8.6


In [3]:
# Data imports
with open('reviews.txt', 'r') as f:
  reviews = f.read()
with open('labels.txt', 'r') as f:
  labels = f.read()
  
  
reviews = reviews.split('\n')
labels = labels.split('\n')
labels = [1 if label == "positive" else 0 for label in labels]

# reviews = reviews[:10000]
# labels = labels[:10000]

In [4]:
reviews[:3]

['bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   ',
 'story of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orchestra audience i

In [4]:
ds = Dataset.from_dict({'reviews': reviews, 'labels': labels})
ds

Dataset({
    features: ['reviews', 'labels'],
    num_rows: 25001
})

In [5]:
dataset = ds.train_test_split()

In [6]:
checkpoint = 'distilbert-base-uncased'
batch_size = 16
max_length = 300

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(train_dataset):
    return tokenizer(train_dataset['reviews'], padding='max_length', truncation=True, max_length = max_length)
  
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/18750 [00:00<?, ? examples/s]

Map:   0%|          | 0/6251 [00:00<?, ? examples/s]

In [7]:
train_dataset = tokenized_dataset['train']
test_dataset = tokenized_dataset['test']

train_dataset = train_dataset.remove_columns(['reviews']).with_format('tensorflow')
test_dataset = test_dataset.remove_columns(['reviews']).with_format('tensorflow')

train_features = {x: train_dataset[x] for x in tokenizer.model_input_names}
train_set = tf.data.Dataset.from_tensor_slices((train_features, train_dataset['labels'])).shuffle(1000).batch(batch_size)

test_features = {x: test_dataset[x] for x in tokenizer.model_input_names}
test_set = tf.data.Dataset.from_tensor_slices((test_features, test_dataset['labels'])).shuffle(1000).batch(batch_size)

In [8]:
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
model.layers[0].trainable = False

model.compile(
  optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001),
  loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True),
  metrics = ['accuracy']
)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_transform', 'vocab_layer_norm', 'vocab_projector', 'activation_13']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_19', 'pre_classifier', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

In [9]:
history = model.fit(train_set,
                   validation_data = test_set,
                   epochs = 3,
                   batch_size = batch_size)

Epoch 1/3
Epoch 2/3
Epoch 3/3


## Fine tuning Huggigface BERT model with trainer object

In [None]:
X_train, X_test, y_train, y_test = train_test_split(reviews, labels, test_size = 0.2, random_state = 1)

In [None]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [None]:
train_encodings = tokenizer(X_train, truncation = True, padding = True)
test_encodings = tokenizer(X_test, truncation = True, padding = True)

In [None]:
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train))
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test))

In [None]:
from transformers import TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments

training_args = TFTrainingArguments(
  output_dir = './results',
  num_train_epochs = 2,
  per_device_eval_batch_size = 4,
  per_device_train_batch_size = 4,
  warmup_steps = 500,
  weight_decay = 0.01,
  logging_dir = './logs',
  logging_steps = 10,
  eval_steps = 50
)

In [None]:
with training_args.strategy.scope():
  model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
  
trainer = TFTrainer(
  model = model,
  args = training_args,
  train_dataset = train_dataset,
  eval_dataset = test_dataset
)

trainer.train()

In [None]:
predictions = trainer.predict(test_dataset)

In [None]:
print('Model accuracy: {}'.format(sum([tf.math.argmax(predictions[0], axis = -1).numpy().tolist()[i] == y_test[i] for i in range(len(y_test))]) / len(y_test)))