# Fine Tuning For BERT Models with TPU

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import tensorflow as tf
import os

# Note that the `tpu` argument is for Colab-only
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])

tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
print("All devices: ", tf.config.list_logical_devices('TPU'))

All devices:  [LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:0', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:1', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:2', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:3', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:4', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:5', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:6', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:7', device_type='TPU')]


In [3]:
strategy = tf.distribute.TPUStrategy(resolver)

In [4]:
!pip install tensorflow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 5.3 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 77.8 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 66.3 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.24.0


## Import Library

In [6]:
import numpy as np
import pandas as pd

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [7]:
df = pd.read_csv('/content/drive/MyDrive/DataSets/HepsiBurada/hepsiburada.csv', encoding = "utf-8")
df.head()

Unnamed: 0,Rating,Review
0,1,3 yıldır tık demedi. :)
1,1,3 yıldır kullanıyorum müthiş
2,1,Ürün bugün elime geçti çok fazla inceleme fırs...
3,1,Almaya karar verdim. Hemencecik geldi. Keyifle...
4,1,Günlük kullanımınızı çok çok iyi karsılıyor kı...


In [8]:
df.Rating.value_counts()

1    229821
0     13676
Name: Rating, dtype: int64

In [9]:
df.Rating = df.Rating.map({1:0, 0:1})

In [10]:
df.Rating.value_counts()

0    229821
1     13676
Name: Rating, dtype: int64

In [11]:
X = df['Review'].values
y = df['Rating'].values

## Tokenization

### Fixing token counts of all documents

In [12]:
from transformers import AutoTokenizer   

tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")

max_len = 0

# For every sentence...
max_token = []
for sent in X:

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    
    input_ids = tokenizer.encode(sent.lower(), add_special_tokens = True) 
    max_token.append(len(input_ids))

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

Downloading:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/251k [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (758 > 512). Running this sequence through the model will result in indexing errors


Max sentence length:  758


In [13]:
text = "Ankara Türkiye'nin başkentidir.".lower()
 
tokenizer.tokenize(text)

['ankara', 'türkiye', "'", 'nin', 'başkenti', '##dir', '.']

In [14]:
# tokenizer.encode(text.lower())

In [15]:
# tokenizer.encode(text.lower() , add_special_tokens = True)

In [16]:
sentence = "Ankara Türkiye'nin başkentidir".lower()

tokens = tokenizer.encode(sentence, add_special_tokens = True)

print(tokens)
print(len(tokens))

[2, 11462, 13604, 11, 2491, 9925, 2067, 3]
8


In [17]:
sentence = "Ankara Türkiye'nin başkentidir".lower()

tokens = tokenizer.encode_plus(sentence, add_special_tokens=True)

print(tokens)

{'input_ids': [2, 11462, 13604, 11, 2491, 9925, 2067, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}


In [18]:
liste = [5, 10, 8, 9, 12, 15]

print(np.array(liste) < 10)
print(sum(np.array(liste) < 10))

[ True False  True  True False False]
3


In [19]:
np.array(max_token).mean()

36.4837390193719

In [20]:
sum(np.array(max_token) < 160) / len(max_token) # %98 is more than 160 length.

0.9877000537994308

In [21]:
def transformation(X):
  # set array dimensions
  seq_len = 160
  num_samples = len(X)

  # initialize empty zero arrays
  Xids = np.zeros((num_samples, seq_len))
  Xmask = np.zeros((num_samples, seq_len))

    
  for i, phrase in enumerate(X):
      tokens = tokenizer.encode_plus(phrase.lower(), max_length=seq_len, truncation = True,
                                      padding='max_length', add_special_tokens = True) 
      # assign tokenized outputs to respective rows in numpy arrays
      Xids[i, :] = tokens['input_ids']
      Xmask[i, :] = tokens['attention_mask']
  return Xids, Xmask

In [22]:
Xids, Xmask = transformation(X)

In [23]:
Xids.shape

(243497, 160)

In [24]:
Xmask.shape

(243497, 160)

In [25]:
sentence = "Ankara Türkiye'nin başkentidir.".lower()

tokens = tokenizer.encode_plus(sentence.lower(), max_length=20, truncation=True,
                                      padding='max_length', add_special_tokens=True)
print(tokens)

{'input_ids': [2, 11462, 13604, 11, 2491, 9925, 2067, 18, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [26]:
np.zeros((10, 10))

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [27]:
Xids

array([[2.0000e+00, 2.3000e+01, 5.1420e+03, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [2.0000e+00, 2.3000e+01, 5.1420e+03, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [2.0000e+00, 2.7820e+03, 3.1200e+03, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       ...,
       [2.0000e+00, 3.8070e+03, 9.2500e+03, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [2.0000e+00, 3.1560e+03, 4.1650e+03, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [2.0000e+00, 1.6807e+04, 1.9870e+03, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00]])

In [28]:
Xmask

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.]])

In [29]:
# from tensorflow.keras.utils import to_categorical

# l = np.array([1, 0, 2, 3, 4])
# to_categorical(l, 5) 

In [30]:
np.array([1, 0, 1, 0, 0]).reshape(-1,1) #reshape(5,1)

array([[1],
       [0],
       [1],
       [0],
       [0]])

In [31]:
y

array([0, 0, 0, ..., 0, 0, 0])

In [32]:
labels = y.reshape(-1,1)
labels

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [33]:
import tensorflow as tf

dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, labels)) # tf.data.Dataset.from_tensors((Xids, Xmask, labels))
dataset

<TensorSliceDataset element_spec=(TensorSpec(shape=(160,), dtype=tf.float64, name=None), TensorSpec(shape=(160,), dtype=tf.float64, name=None), TensorSpec(shape=(1,), dtype=tf.int64, name=None))>

In [34]:
import tensorflow as tf
dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, labels))

def map_func(Xids, Xmask, labels):
    # we convert our three-item tuple into a two-item tuple where the input item is a dictionary
    return {'input_ids': Xids, 'attention_mask': Xmask}, labels

# then we use the dataset map method to apply this transformation
dataset = dataset.map(map_func)

In [35]:
dataset

<MapDataset element_spec=({'input_ids': TensorSpec(shape=(160,), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(160,), dtype=tf.float64, name=None)}, TensorSpec(shape=(1,), dtype=tf.int64, name=None))>

## Train Test Split

In [36]:
batch_size = 32

# shuffle and batch
# fit into a batch of 32
dataset = dataset.shuffle(243500, reshuffle_each_iteration = False).batch(batch_size, drop_remainder = True)

In [37]:
dataset

<BatchDataset element_spec=({'input_ids': TensorSpec(shape=(32, 160), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(32, 160), dtype=tf.float64, name=None)}, TensorSpec(shape=(32, 1), dtype=tf.int64, name=None))>

In [38]:
# set split size (90% training data) and calculate training set size
split = 0.9
size = int(len(dataset)*split) #int((Xids.shape[0]/batch_size)*split)

# get training and validation sets
train_ds = dataset.take(size)
val_ds = dataset.skip(size)

In [39]:
print(len(dataset))
print(len(dataset)*0.9)
print(int(len(dataset)*0.9))

7609
6848.1
6848


In [40]:
size

6848

In [41]:
Xids.shape

(243497, 160)

In [42]:
#batch_size=32

int(Xids.shape[0]/batch_size) # Total batch.

7609

## Adamw optimization

In [43]:
!pip install -q tf-models-official

[K     |████████████████████████████████| 2.3 MB 5.4 MB/s 
[K     |████████████████████████████████| 352 kB 78.1 MB/s 
[K     |████████████████████████████████| 5.8 MB 73.8 MB/s 
[K     |████████████████████████████████| 1.1 MB 71.1 MB/s 
[K     |████████████████████████████████| 1.3 MB 76.8 MB/s 
[K     |████████████████████████████████| 636 kB 80.6 MB/s 
[K     |████████████████████████████████| 118 kB 85.8 MB/s 
[K     |████████████████████████████████| 238 kB 82.3 MB/s 
[K     |████████████████████████████████| 43 kB 1.9 MB/s 
[K     |████████████████████████████████| 38.2 MB 116.7 MB/s 
[K     |████████████████████████████████| 588.3 MB 7.9 kB/s 
[K     |████████████████████████████████| 6.0 MB 71.5 MB/s 
[K     |████████████████████████████████| 439 kB 84.4 MB/s 
[K     |████████████████████████████████| 1.7 MB 74.7 MB/s 
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [44]:
len(train_ds)

6848

In [45]:
from official.nlp import optimization 

epochs = 3
#batch_size = 32

steps_per_epoch = len(train_ds)

num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1 * num_train_steps)

init_lr = 2e-5  # 3e-5, 5e-5
optimizer = optimization.create_optimizer(init_lr = init_lr,
                                          num_train_steps = num_train_steps,
                                          num_warmup_steps = num_warmup_steps,
                                          optimizer_type = 'adamw')      

## Creating Model

In [46]:
def create_model():
    from transformers import TFAutoModel
    model = TFAutoModel.from_pretrained("dbmdz/bert-base-turkish-cased")
    input_ids = tf.keras.layers.Input(shape = (160,), name = 'input_ids', dtype = 'int32')
    attention_mask = tf.keras.layers.Input(shape = (160,), name = 'attention_mask', dtype = 'int32')

    embeddings = model.bert(input_ids = input_ids, attention_mask = attention_mask)["pooler_output"] #[1]

    x = tf.keras.layers.Dense(160, activation = 'relu')(embeddings)
    x = tf.keras.layers.Dropout(0.2, name = "dropout")(x)
    y = tf.keras.layers.Dense(1, activation = 'sigmoid', name = 'outputs')(x)

    return tf.keras.Model(inputs=[input_ids, attention_mask], outputs = y)

In [47]:
with strategy.scope():
  
  optimizer = optimizer #tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
  loss = tf.keras.losses.BinaryCrossentropy() # loss = keras.losses.BinaryCrossentropy()
  recall = tf.keras.metrics.Recall()
  model3 = create_model()
  model3.compile(optimizer = optimizer, loss = loss, metrics = [recall])

Downloading:   0%|          | 0.00/545M [00:00<?, ?B/s]

Some layers from the model checkpoint at dbmdz/bert-base-turkish-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at dbmdz/bert-base-turkish-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [48]:
pd.Series(y).value_counts(normalize = True)

0    0.943835
1    0.056165
dtype: float64

In [49]:
#weights = {0:0.05, 1:0.95}

In [50]:
model3.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 160)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 160)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  110617344   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 160,                                           

In [51]:
# tf.autograph.set_verbosity(10)

In [52]:
import logging
logging.getLogger("tensorflow").setLevel(logging.ERROR)

In [53]:
history = model3.fit(
    train_ds, validation_data= val_ds,
    epochs=epochs)

Epoch 1/3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()
Epoch 2/3
Epoch 3/3


## Model evaluation

In [54]:
# model3.predict(train_ds)

TypeError: ignored

In [None]:
# from sklearn.metrics import classification_report, confusion_matrix

# y_pred = model3.predict(val_ds) >= 0.5

# y_test = []
# for i in val_ds:
#   for j in np.array(i[1]):
#     y_test.append(j)

# print(classification_report(y_test, y_pred))

TypeError: ignored

In [None]:
# y_train_pred = model3.predict(train_ds) >= 0.5

# y_train = []
# for i in train_ds:
#   for j in np.array(i[1]):
#     y_train.append(j)

# print(classification_report(y_train, y_train_pred)) 

TypeError: ignored

In [None]:
for i in val_ds:
  print(np.array(i[1]))
  break

[[0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]]


In [None]:
# from sklearn.metrics import precision_recall_curve, average_precision_score
# import matplotlib.pyplot as plt

# y_pred_proba = model3.predict(val_ds)
# precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
# plt.plot([1,0],[0,1],'k--')
# plt.plot(recall, precision)
# plt.xlabel('recall')
# plt.ylabel('precision')
# plt.title('precision recall curve')
# plt.show()

TypeError: ignored

In [None]:
# average_precision_score(y_test, y_pred_proba)

In [55]:
model3.save("sentiment_model_without_weighted.h5")

In [None]:
!pip install -q tf-models-official

In [None]:
# import tensorflow as tf
# import official.nlp.optimization as adamw
# from tensorflow.keras.models import load_model


# model4 = load_model('/content/drive/MyDrive/sentiment_model_without_weighted.h5', custom_objects={'AdamWeightDecay':adamw.AdamWeightDecay})

In [56]:
# initialize tokenizer from transformers
from transformers import AutoTokenizer, TFAutoModel

tokenizers = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")

def prep_data(text):
    # tokenize to get input IDs and attention mask tensors
    tokens = tokenizers.encode_plus(text.lower(), max_length=160,
                                   truncation=True, padding='max_length',
                                   add_special_tokens=True,
                                   return_tensors='tf')
  
    return {'input_ids': tokens['input_ids'],      #tf.cast(tokens['input_ids'], tf.int32)
            'attention_mask': tokens['attention_mask']} #tf.cast(tokens['attention_mask'], tf.int32)

In [None]:
review1 = "çok beğendim herkese tavsiye ederim"
review2 = "süper ürün aynı gün elime geçti"
review3 = "büyük bir hayal kırıklığı yaşadım bu ürünü bu markaya yakıştıramadım"
review4 = "kelimelerle tarif edilemez"
review5 = "tasarımı harika ancak kargo çok geç geldi ve ürün açılmıştı hayal kırıklığı gerçekten"
review6 = "hiç resimde gösterildiği gibi değil"
review7 = "kötü yorumlar gözümü korkutmuştu ancak hiçbir sorun yaşamadım teşekkürler"
review8 = "hiç bu kadar kötü bir satıcıya denk gelmemiştim ürün siparişimi iptal ediyorum"
review9 = "tam bir fiyat performans ürünü"
review10 = "beklediğim gibi çıkmadı"
review11 = "on numara beş yıldız"
review12 = "bu kargoyu biran önce bırakın. yoksa üründe sıkıntı yok"
review13 = "televizyonun görüntü kalitesi çok kötü, dün akşam evde arkadaşlarla toplandık. yedik, içtik, sohbet ettik. Sonra televizyonda Türkiye - İngiltere maçını seyrettik. "
review14 = '1 gün gibi kısa bir sürede elime geçti. Ve bu fıyata süper bir ürün tavsiye ederim. Lakin eli büyük olan kişiler daha büyük modelini tercih edebilirler ortaboy ürün. Teşekkürler '
review15 = "alınca anlarsın anyayı konyayı"
review16 = "çöpe atacak paran varsa alabilirsin"
review17= "Telefon fena değil"
review18 = "al da gününü gör"
review19 = "Ürün süper ama satıcı ve kargo berbat"
review20= "kargo süper ama ürün berbat"
review21 = "Aldigim TV cok kaliteli diye dusunmustum, sonradan cok da iyi bir TV olmadigini dusundum, ama neyse yine de memnunum."
reviews = [review1, review2, review3, review4, review5, review6, review7, review8, review9, review10, review11, review12, review13, review14, review15, review16, review17, review18, review19, review20, review21]

In [None]:
probs = []
for i in reviews:

  in_tensor = prep_data(i)

  prob = model3.predict(in_tensor)[0][0]

  probs.append(prob)

NameError: ignored

In [None]:
probs

[]

In [None]:
classes  = (np.array(probs) >= 0.5).astype("int")
classes

In [None]:
my_dict = {"Review":reviews, "prob":probs, "classes":classes}

In [None]:
pd.set_option('display.max_colwidth', None)
pd.DataFrame(my_dict)

In [None]:
# There is too many errors...