## Model fine tuning in tensorflow and huggingface

In [1]:
# Dependencies

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm
import numpy as np
import tensorflow as tf
import nltk
from collections import Counter
import string
import itertools
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import tensorflow_text as text
import tensorflow_hub as hub
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import GlobalMaxPool1D, BatchNormalization, Dense, RNN, GRU, LSTM, TimeDistributed, Bidirectional, Activation, Embedding, Input, Conv1D, Dropout
import tensorflow as tf
import keras.backend as K
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification
from scipy.special import softmax
from transformers import pipeline
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments
from tensorflow.keras import mixed_precision
from tensorflow.keras.utils import to_categorical
from datasets import Dataset


# tf.config.set_visible_devices([], 'GPU')
plt.style.use('ggplot')

# Define mixed precision policy
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce RTX 3070, compute capability 8.6


In [2]:
# Read the data
amz_reviews = pd.read_csv('amazon_cells_labelled.txt', sep = '\t', names = ['review', 'label'])
amz_reviews.head(3)

Unnamed: 0,review,label
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1


In [3]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(amz_reviews['review'], amz_reviews['label'], test_size = 0.2, random_state = 42)

In [4]:
# Load the tokenizer and tokenize data
checkpoint = 'bert-base-cased'

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenized_data_train = tokenizer(X_train.to_list(), return_tensors = 'np', padding = True)
tokenized_data_test = tokenizer(X_test.to_list(), return_tensors = 'np', padding = True)

labels_train = np.array(y_train)
labels_test = np.array(y_test)

print(tokenized_data_train['input_ids'][0])

[  101 17554   112   189  2080  2965   119   102     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0]


In [5]:
# Load the model
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels = 2)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Model compilation and parameters
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True)
model.compile(optimizer = tf.keras.optimizers.Adam(5e-6), loss = loss, metrics = ['accuracy'])
model.layers[0].trainable = False

# model.bert.encoder.layer[i].trainable = False

In [7]:
# Fit the model to data
model.fit(dict(tokenized_data_train),
         labels_train,
         validation_data = (dict(tokenized_data_test), labels_test),
         epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x18e9de8e4c0>

In [39]:
# Model evaluation
results = model.predict(dict(tokenized_data_test))['logits']

y_test_probabilities = tf.nn.softmax(results)
y_test_predictions = np.argmax(y_test_probabilities, axis = -1)

print(accuracy_score(y_test_predictions, y_test))

1.0


### Model optimization with Hugging Face arrow dataset format

In [32]:
# Convert python dataframe to Hugging Face arrow dataset
hg_amz_review = Dataset.from_pandas(amz_reviews)

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Function to tokenize data
def tokenize_dataset(data):
  return tokenizer(data['review'])

# Tokenize the dataset
dataset = hg_amz_review.map(tokenize_dataset)

# Load model
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)

# TF dataset
tf_dataset = model.prepare_tf_dataset(dataset = dataset, batch_size = 16, shuffle = True, tokenizer = tokenizer)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [33]:
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True)
model.compile(optimizer = tf.keras.optimizers.Adam(5e-6), loss = loss, metrics = ['accuracy'])

In [34]:
model.fit(tf_dataset, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1983f3247f0>

In [37]:
model.predict(tf_dataset)



TFSequenceClassifierOutput(loss=None, logits=array([[-3.07 ,  2.809],
       [-3.379,  3.025],
       [-2.969,  2.906],
       ...,
       [ 3.455, -3.11 ],
       [-3.518,  3.234],
       [ 3.018, -2.799]], dtype=float16), hidden_states=None, attentions=None)