In [2]:
import pandas as pd

In [3]:
from google.colab import drive

In [4]:
# Mount Google Drive into the Colab filesystem at /content/gdrive so the
# dataset stored there can be read by later cells.
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [6]:
# Location of the processed reviews CSV on the mounted Drive.
reviews_csv_path = "/content/gdrive/MyDrive/Colab Notebooks/reviews-outliers-the_story_of_success_processed.csv"
dataset = pd.read_csv(reviews_csv_path)

In [7]:
# (rows, columns) of the loaded frame.
dataset.shape

(8872, 5)

In [8]:
# Preview the first five rows of the loaded frame.
dataset.head(5)

Unnamed: 0.1,Unnamed: 0,stars,review,date,country
0,0,0,Journalism and pseudoscience. Having read lots...,2017-11-12,Reino Unido
1,1,1,Salient and grounded. ​Gladwell argues that su...,2018-03-03,Reino Unido
2,2,1,Interesting enough but certainly no revelation...,2014-06-23,Reino Unido
3,3,1,Fascinating and thought-provoking. It takes a ...,2019-01-17,Reino Unido
4,4,1,Enjoyable read but scientifically it makes me ...,2020-02-18,Reino Unido


In [9]:
# NOTE(review): drop() returns a new frame and the result is not assigned, so
# this cell only displays the reduced view; `dataset` itself still contains
# the 'date' and 'country' columns afterwards.
dataset.drop(columns=['date', 'country'])

Unnamed: 0.1,Unnamed: 0,stars,review
0,0,0,Journalism and pseudoscience. Having read lots...
1,1,1,Salient and grounded. ​Gladwell argues that su...
2,2,1,Interesting enough but certainly no revelation...
3,3,1,Fascinating and thought-provoking. It takes a ...
4,4,1,Enjoyable read but scientifically it makes me ...
...,...,...,...
8867,8867,1,"This one is a weird one for me to write, and I..."
8868,8868,1,This has got to be Malcolm Gladwell’s best boo...
8869,8869,0,This is one of those books that give popular n...
8870,8870,1,My first exposure to Gladwell. SO was more or ...


In [13]:
# Model input is the review text; the target is the `stars` column.
X = dataset["review"]
y = dataset["stars"]

In [14]:
from sklearn.model_selection import train_test_split

In [15]:

SEED = 42

# Hold out 20% of the data as the test set, then take 20% of the remaining
# rows as the validation set. Both splits are stratified on the labels and
# share the same random seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=SEED,
    stratify=y,
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.20,
    random_state=SEED,
    stratify=y_train,
)

In [16]:
# Install the Hugging Face `transformers` package into the runtime (quiet mode).
# NOTE(review): the version is not pinned, so a later re-run may install a
# different release than the one this notebook was written against.
!pip install -q transformers

[K     |████████████████████████████████| 3.8 MB 12.0 MB/s 
[K     |████████████████████████████████| 596 kB 40.6 MB/s 
[K     |████████████████████████████████| 6.5 MB 28.4 MB/s 
[K     |████████████████████████████████| 67 kB 5.1 MB/s 
[K     |████████████████████████████████| 895 kB 42.6 MB/s 
[?25h

In [17]:
from transformers import BertTokenizer
from transformers import TFBertForSequenceClassification
import tensorflow as tf

In [18]:
# Tokenizer for the 'bert-base-uncased' checkpoint, with lower-casing enabled.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [19]:
def convert_example_to_feature(review):
  """Tokenize one review into fixed-length BERT inputs.

  Relies on the notebook-level `tokenizer` and `max_length`, which are looked
  up when the function is called (not when it is defined).

  Args:
    review: Text of a single review.

  Returns:
    The result of `tokenizer.encode_plus`; `encode_examples` reads its
    'input_ids', 'token_type_ids' and 'attention_mask' entries.
  """
  return tokenizer.encode_plus(
      review,
      add_special_tokens=True,     # add [CLS], [SEP]
      max_length=max_length,       # max length of the text that can go to BERT
      # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
      padding='max_length',        # add [PAD] tokens up to max_length
      # Request truncation explicitly instead of relying on the implicit
      # fallback that makes the tokenizer emit a warning for every run.
      truncation=True,
      return_attention_mask=True,  # add attention mask to not focus on pad tokens
  )

In [20]:
# Sequence length passed to the tokenizer as `max_length`; can be up to 512 for BERT
max_length = 512
# Number of examples per batch when the encoded datasets are batched
batch_size = 6

In [21]:
def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
  """Bundle the three BERT inputs into a name-keyed dict next to the label.

  Returns:
    A `(features, label)` tuple where `features` maps 'input_ids',
    'token_type_ids' and 'attention_mask' to the corresponding arguments.
  """
  features = dict(
      input_ids=input_ids,
      token_type_ids=token_type_ids,
      attention_mask=attention_masks,
  )
  return features, label

In [23]:
import tensorflow as ts

In [24]:
import numpy as np

In [25]:
# Preview the first five training reviews.
X_train.head(5)

7377    Interesting and profound. Wow loved it. For so...
3611                  Paper back. Very interesting book !
6553    Gladwell's least impressive work. Tipping poin...
3367    Success is a mixture. The myth of success hold...
6218                     Five Stars. Thought provoking...
Name: review, dtype: object

In [26]:
def encode_examples(x_train, y_train, limit=-1):
  """Encode reviews and labels into a `tf.data.Dataset` of BERT inputs.

  Args:
    x_train: pandas Series of review strings.
    y_train: pandas Series of labels, aligned row-for-row with `x_train`.
    limit: When positive, only the first `limit` rows are encoded. Any other
      value (the default is -1) encodes every row.

  Returns:
    A dataset whose elements are the `(features, label)` pairs produced by
    `map_example_to_dict`.
  """
  if limit > 0:
    # Slice reviews and labels together so they stay aligned. The previous
    # code called `x_train.take(limit)`, which treats its argument as
    # positional indices rather than a row count, and it never shortened the
    # labels at all.
    x_train = x_train.iloc[:limit]
    y_train = y_train.iloc[:limit]

  # Per-example lists from which the final TensorFlow dataset is sliced.
  input_ids_list = []
  token_type_ids_list = []
  attention_mask_list = []
  for review in x_train:
    # The encode()/decode() round trip is kept from the original code.
    bert_input = convert_example_to_feature(review.encode().decode())
    input_ids_list.append(bert_input['input_ids'])
    token_type_ids_list.append(bert_input['token_type_ids'])
    attention_mask_list.append(bert_input['attention_mask'])

  # Tuple order must match the positional parameters of map_example_to_dict.
  return tf.data.Dataset.from_tensor_slices(
      (input_ids_list, attention_mask_list, token_type_ids_list, y_train)
  ).map(map_example_to_dict)

In [27]:
# Encode every split and batch it. Only the training split is shuffled
# (shuffle buffer of 10000) before batching.
X_train_encoded = (
    encode_examples(X_train, y_train)
    .shuffle(10000)
    .batch(batch_size)
)
X_valid_encoded = encode_examples(X_valid, y_valid).batch(batch_size)
X_test_encoded = encode_examples(X_test, y_test).batch(batch_size)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [28]:

# Fine-tuning hyperparameters.
# Recommended Adam learning rates are 5e-5, 3e-5 and 2e-5.
learning_rate = 2e-5
# A single epoch is used here; more epochs might be better as long as the
# model does not start to overfit.
number_of_epochs = 1

# Start from the pretrained 'bert-base-uncased' checkpoint.
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/511M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
# Labels are integer class ids rather than one-hot vectors, so the sparse
# variants of categorical cross-entropy and accuracy are used. The loss is
# configured with from_logits=True.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

# Adam optimizer driven by the learning rate chosen above.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [30]:
# Fine-tune on the training batches, passing the validation batches as
# validation_data; the returned history object is kept in `bert_history`.
bert_history = model.fit(
    X_train_encoded,
    epochs=number_of_epochs,
    validation_data=X_valid_encoded,
)



In [31]:
# Save only the model weights, using the checkpoint prefix
# ./checkpoints_v4/my_checkpoint_v4
model.save_weights('./checkpoints_v4/my_checkpoint_v4')

In [32]:
# Save the entire model as a SavedModel.
!mkdir -p saved_model
model.save('saved_model_v4/my_model_v4')



INFO:tensorflow:Assets written to: saved_model_v4/my_model_v4/assets


INFO:tensorflow:Assets written to: saved_model_v4/my_model_v4/assets


In [40]:
# Score the fine-tuned model on the test batches and unpack the two values
# returned by evaluate().
# NOTE(review): this rebinds `loss`, which previously held the Keras loss
# object passed to model.compile().
test_scores = model.evaluate(X_test_encoded)
loss, accuracy = test_scores

print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')

Loss: 0.06995943188667297
Accuracy: 0.9701408743858337


In [None]:
# test_sentence = "great book. Great reading, It will widen your horizon and see things in different perspective."
# predict_input = tokenizer.encode(test_sentence,
# truncation=True,
# padding=True,
# return_tensors="tf")

In [None]:
# test_sentence = "Extremely boring!. I love reading, and I honestly tried my best but I simply couldn't finish this book! Gladwell goes on and on and spends full chapters saying something that could have been done in a paragraph... So many pointless things... I felt such a waste of my time trying to finish the book. If I could I would ask my money back!"

# predict_input = tokenizer.encode(test_sentence,
# truncation=True,
# padding=True,
# return_tensors="tf")